liqian, 1 year ago
Current commit 408fea41bc
3 files changed, 30 insertions and 15 deletions
  1. ad_feature_data_sample.py  (+3, -2)
  2. ad_generate_train_test.py  (+10, -5)
  3. ad_xgboost_train.py  (+17, -8)

ad_feature_data_sample.py  (+3, -2)

@@ -8,11 +8,12 @@ import time
 def neg_under_sample(data_dir, sample_data_dir, dt):
     # Undersampling: discard negative samples with 30% probability, pos:neg ≈ 1:10
     data_df = pd.read_csv(f"{data_dir}/{dt}.csv")
+    pos_df = data_df[data_df['share_status'] == 1].copy()
     neg_df = data_df[data_df['share_status'] == 0].copy()
     neg_df['rand'] = [random.uniform(0, 1) for i in range(neg_df.shape[0])]
-    sample_neg_df = neg_df[neg_df['rand'] > 0.3]
+    rate = 1 - pos_df.shape[0] / neg_df.shape[0]
+    sample_neg_df = neg_df[neg_df['rand'] > rate]
     sample_neg_df = sample_neg_df.drop(columns=['rand'])
-    pos_df = data_df[data_df['share_status'] == 1].copy()
     sample_df = pd.concat([pos_df, sample_neg_df])
     sample_df = sample_df.sample(frac=1.0).reset_index(drop=True)
     # write to csv
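
A minimal standalone sketch (toy data, hypothetical, not part of this commit) of what the new rate-based filter does: since rate = 1 - pos/neg, each negative row survives the rand > rate test with probability pos/neg, so the sampled negatives end up roughly equal in count to the positives, rather than the earlier fixed 30% drop.

import random
import pandas as pd

# Toy frame: 100 positives, 1000 negatives; column name mirrors the diff.
data_df = pd.DataFrame({'share_status': [1] * 100 + [0] * 1000})
pos_df = data_df[data_df['share_status'] == 1].copy()
neg_df = data_df[data_df['share_status'] == 0].copy()
neg_df['rand'] = [random.uniform(0, 1) for _ in range(neg_df.shape[0])]
rate = 1 - pos_df.shape[0] / neg_df.shape[0]    # keep-probability per negative = pos/neg = 0.1
sample_neg_df = neg_df[neg_df['rand'] > rate].drop(columns=['rand'])
print(pos_df.shape[0], sample_neg_df.shape[0])  # 100 vs roughly 100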

ad_generate_train_test.py  (+10, -5)

@@ -6,15 +6,20 @@ import os
 if __name__ == '__main__':
     now_date = datetime.datetime.today()
     dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+    train_test_data_dir = './data/train_test_data'
+    if not os.path.exists(train_test_data_dir):
+        os.makedirs(train_test_data_dir)
+    # training set
     data_df_list = []
-    for days in range(2, 9):
+    for days in range(3, 9):
         cur_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=days), '%Y%m%d')
         print(f"cur_dt = {cur_dt}")
         cur_dt_df = pd.read_csv(f"./data/sample_train_data/{cur_dt}.csv")
         data_df_list.append(cur_dt_df)
     all_df = pd.concat(data_df_list)
     print(f"all data num: {all_df.shape[0]}")
-    train_test_data_dir = './data/train_test_data'
-    if not os.path.exists(train_test_data_dir):
-        os.makedirs(train_test_data_dir)
-    all_df.to_csv(f'{train_test_data_dir}/train_test_{dt}.csv', index=False)
+    all_df.to_csv(f'{train_test_data_dir}/train_{dt}.csv', index=False)
+    # test set
+    test_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=2), '%Y%m%d')
+    test_df = pd.read_csv(f"./data/sample_train_data/{test_dt}.csv")
+    test_df.to_csv(f'{train_test_data_dir}/test_{dt}.csv', index=False)
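
A quick sketch (only the date arithmetic, no assumptions about the data files) of the windows the new split implies when the script runs today: the six daily files from 8 down to 3 days ago are concatenated into train_{dt}.csv, and the file from 2 days ago is held out as test_{dt}.csv.

import datetime

now_date = datetime.datetime.today()
# Training window: six daily files, 8 days ago through 3 days ago.
train_dts = [datetime.datetime.strftime(now_date - datetime.timedelta(days=d), '%Y%m%d')
             for d in range(3, 9)]
# Held-out test day: 2 days ago.
test_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=2), '%Y%m%d')
print(train_dts, test_dt)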

ad_xgboost_train.py  (+17, -8)

@@ -8,22 +8,31 @@ from sklearn import metrics
 now_date = datetime.datetime.today()
 dt = datetime.datetime.strftime(now_date, '%Y%m%d')
 # 1. Read data
-data = pd.read_csv(f'./data/train_test_data/train_test_{dt}.csv')
-print(data.shape)
+# data = pd.read_csv(f'./data/train_test_data/train_test_{dt}.csv')
+# print(data.shape)
+train_data = pd.read_csv(f'./data/train_test_data/train_{dt}.csv')
+print(train_data.shape)
+test_data = pd.read_csv(f'./data/train_test_data/test_{dt}.csv')
+print(test_data.shape)
 # 2. Split into x and y
-data_columns = data.columns.values.tolist()
-x = data[data_columns[:-1]]
-y = data[data_columns[-1]]
-print(f"x_shape: {x.shape}, y_shape: {y.shape}")
+# data_columns = data.columns.values.tolist()
+# x = data[data_columns[:-1]]
+# y = data[data_columns[-1]]
+# print(f"x_shape: {x.shape}, y_shape: {y.shape}")
+data_columns = train_data.columns.values.tolist()
+x_train = train_data[data_columns[:-1]]
+y_train = train_data[data_columns[-1]]
+x_test = test_data[data_columns[:-1]]
+y_test = test_data[data_columns[-1]]
 # 3. Train/test split
-x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
+# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
 print(f"x_train_shape: {x_train.shape}")
 print(f"x_test_shape: {x_test.shape}")
 # 4. Model training
 xgb_model = XGBClassifier(
     objective='binary:logistic',
     learning_rate=0.3,
-    max_depth=10,
+    max_depth=5,
     eval_metric=['error', 'logloss', 'auc']
 )
 xgb_model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)])
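
A hedged sketch of how the fitted model could then be scored on the held-out day; this evaluation step is not in the commit. It reuses xgb_model, x_test, y_test, and the already-imported metrics module from ad_xgboost_train.py, and predict, predict_proba, accuracy_score, and roc_auc_score are standard XGBClassifier/sklearn calls.

# Evaluate on the day-2 test set written by ad_generate_train_test.py.
y_pred = xgb_model.predict(x_test)              # hard class predictions
y_prob = xgb_model.predict_proba(x_test)[:, 1]  # probability of the positive class
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("auc:", metrics.roc_auc_score(y_test, y_prob))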