Browse Source

add ad_model_run.sh & update

liqian 1 year ago
parent
commit
12a6e9827f
4 changed files with 58 additions and 13 deletions
  1. 7 2
      ad_feature_data_sample.py
  2. 6 9
      ad_feature_process.py
  3. 2 2
      ad_generate_train_test.py
  4. 43 0
      ad_model_run.sh

+ 7 - 2
ad_feature_data_sample.py

@@ -32,5 +32,10 @@ if __name__ == '__main__':
     data_dir = './data/train_data'
     sample_data_dir = './data/sample_train_data'
     now_date = datetime.datetime.today()
-    dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=1), '%Y%m%d')
-    neg_under_sample(data_dir=data_dir, sample_data_dir=sample_data_dir, dt=dt)
+    # dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=1), '%Y%m%d')
+    # neg_under_sample(data_dir=data_dir, sample_data_dir=sample_data_dir, dt=dt)
+
+    for days in range(3, 19):
+        cur_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=days), '%Y%m%d')
+        print(f"cur_dt = {cur_dt}")
+        neg_under_sample(data_dir=data_dir, sample_data_dir=sample_data_dir, dt=cur_dt)

+ 6 - 9
ad_feature_process.py

@@ -3,9 +3,6 @@ import time
 import datetime
 import pandas as pd
 from odps import ODPS
-# from config import set_config
-#
-# config_, env = set_config()
 
 # ODPS服务配置
 odps_config = {
@@ -168,12 +165,12 @@ if __name__ == '__main__':
     table = 'admodel_data_train'
     # dt = '20230725'
     now_date = datetime.datetime.today()
-    dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=1), '%Y%m%d')
-    df = daily_data_process(project=project, table=table, features=features, dt=dt, app_type=0)
-    # print(df.shape)
-    # print(df.columns)
+    # dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=1), '%Y%m%d')
+    # df = daily_data_process(project=project, table=table, features=features, dt=dt, app_type=0)
     # print(time.time() - st_time)
 
-
-
+    for days in range(10, 19):
+        cur_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=days), '%Y%m%d')
+        print(f"cur_dt = {cur_dt}")
+        df = daily_data_process(project=project, table=table, features=features, dt=cur_dt, app_type=0)
 

+ 2 - 2
ad_generate_train_test.py

@@ -11,7 +11,7 @@ if __name__ == '__main__':
         os.makedirs(train_test_data_dir)
     # 训练集
     data_df_list = []
-    for days in range(3, 9):
+    for days in range(4, 19):
         cur_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=days), '%Y%m%d')
         print(f"cur_dt = {cur_dt}")
         cur_dt_df = pd.read_csv(f"./data/sample_train_data/{cur_dt}.csv")
@@ -20,7 +20,7 @@ if __name__ == '__main__':
     print(f"all data num: {all_df.shape[0]}")
     all_df.to_csv(f'{train_test_data_dir}/train_{dt}.csv', index=False)
     # 测试集
-    test_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=2), '%Y%m%d')
+    test_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=3), '%Y%m%d')
     test_df = pd.read_csv(f"./data/sample_train_data/{test_dt}.csv")
     print(f"test data num: {test_df.shape[0]}")
     test_df.to_csv(f'{train_test_data_dir}/test_{dt}.csv', index=False)

+ 43 - 0
ad_model_run.sh

@@ -0,0 +1,43 @@
+#!/bin/bash
+#conda active rov-offline-py38
+#cd /data/rov-offline
+now_date=`date +"%Y%m%d"`
+echo "now_date: $now_date"
+
+# 1. 下载训练所用特征数据并做处理
+python ad_feature_process.py
+if [ $? -ne 0 ];
+then
+  msg = "[ERROR] ad_feature_process.py"
+  echo $msg
+  exit -1
+fi
+
+# 2. 对样本进行采样
+python ad_feature_data_sample.py
+if [ $? -ne 0 ];
+then
+  msg = "[ERROR] ad_feature_data_sample.py"
+  echo $msg
+  exit -1
+fi
+
+# 3. 训练集和测试集生成
+python ad_generate_train_test.py
+if [ $? -ne 0 ];
+then
+  msg = "[ERROR] ad_generate_train_test.py"
+  echo $msg
+  exit -1
+fi
+
+# 4. 模型训练
+python ad_xgboost_train.py
+if [ $? -ne 0 ];
+then
+  msg = "[ERROR] ad_xgboost_train.py"
+  echo $msg
+  exit -1
+fi
+
+# 5. 离线预测