2 years ago · 3c492741d7
--- a/ad_feature_process.py
+++ b/ad_feature_process.py
@@ -1,5 +1,6 @@
 
				 import os.path
			
 
				 import time
			
 
				+import datetime
			
 
				 
			
 
				 import pandas as pd
			
 
				 from utils import get_data_from_odps
			
@@ -72,22 +73,8 @@ train_feature = [
 
				 ]
			
 
				 
			
 
				 
			
 
				-def get_feature_data(project, table, features, dt):
			
 
				+def get_feature_data(project, table, features, dt, app_type):
			
 
				     """获取特征数据"""
			
 
				-    # records = get_data_from_odps(date=dt, project=project, table=table)
			
 
				-    # feature_data = []
			
 
				-    # i = 0
			
 
				-    # for record in records:
			
 
				-    #     if i > 300000:
			
 
				-    #         break
			
 
				-    #     item = {}
			
 
				-    #     for feature_name in features:
			
 
				-    #         item[feature_name] = record[feature_name]
			
 
				-    #     feature_data.append(item)
			
 
				-    #     i += 1
			
 
				-    # feature_df = pd.DataFrame(feature_data)
			
 
				-    # return feature_df
			
 
				-
			
 
				     odps = ODPS(
			
 
				         access_id=config_.ODPS_CONFIG['ACCESSID'],
			
 
				         secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
			
@@ -95,7 +82,7 @@ def get_feature_data(project, table, features, dt):
 
				         endpoint=config_.ODPS_CONFIG['ENDPOINT'],
			
 
				     )
			
 
				     feature_data = []
			
 
				-    sql = f"select * from {project}.{table} where dt={dt} and apptype=0"
			
 
				+    sql = f"select * from {project}.{table} where dt={dt} and apptype={app_type}"
			
 
				     with odps.execute_sql(sql).open_reader() as reader:
			
 
				         for record in reader:
			
 
				             # print(record)
			
@@ -110,11 +97,12 @@ def get_feature_data(project, table, features, dt):
 
				 def daily_data_process(project, table, features, dt, app_type):
			
 
				     """每日特征处理"""
			
 
				     print('step 1: get feature data')
			
 
				-    feature_initial_df = get_feature_data(project=project, table=table, features=features, dt=dt)
			
 
				+    feature_initial_df = get_feature_data(project=project, table=table, features=features, dt=dt, app_type=app_type)
			
 
				     print(f"feature_initial_df shape: {feature_initial_df.shape}")
			
 
				     print('step 2: process')
			
 
				     feature_initial_df['apptype'] = feature_initial_df['apptype'].astype(int)
			
 
				-    feature_df = feature_initial_df[feature_initial_df['apptype'] == app_type].copy()
			
 
				+    # feature_df = feature_initial_df[feature_initial_df['apptype'] == app_type].copy()
			
 
				+    feature_df = feature_initial_df.copy()
			
 
				     # 增加此次是否有广告字段 'ad_status' 1: 有广告, 0: 无广告
			
 
				     feature_df['ad_status'] = feature_df.apply(func=lambda x: 1 if x['ad_mid'] == x['mid'] else 0, axis=1)
			
 
				     feature_df['share_videoid'].fillna(0, inplace=True)
			
@@ -174,13 +162,13 @@ if __name__ == '__main__':
 
				     st_time = time.time()
			
 
				     project = 'loghubods'
			
 
				     table = 'admodel_data_train'
			
 
				-    dt = '20230725'
			
 
				+    # dt = '20230725'
			
 
				+    now_date = datetime.datetime.today()
			
 
				+    dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=1), '%Y%m%d')
			
 
				     df = daily_data_process(project=project, table=table, features=features, dt=dt, app_type=0)
			
 
				-    print(df.shape)
			
 
				-    print(df.columns)
			
 
				-    # df.to_csv(f'./data/{dt}.csv', index=False)
			
 
				-    # get_feature_data(project=project, table=table, features=features, dt=dt)
			
 
				-    print(time.time() - st_time)
			
 
				+    # print(df.shape)
			
 
				+    # print(df.columns)
			
 
				+    # print(time.time() - st_time)