1 year ago · 035cdd3d00
--- a/ad_feature_process.py
+++ b/ad_feature_process.py
@@ -1,5 +1,13 @@
 
				+import os.path
			
 
				+import time
			
 
				+
			
 
				 import pandas as pd
			
 
				 from utils import get_data_from_odps
			
 
				+from odps.df import DataFrame
			
 
				+from odps import ODPS
			
 
				+from config import set_config
			
 
				+
			
 
				+config_, env = set_config()
			
 
				 
			
 
				 features = [
			
 
				     'apptype',
			
@@ -66,15 +74,37 @@ train_feature = [
 
				 
			
 
				 def get_feature_data(project, table, features, dt):
			
 
				     """获取特征数据"""
			
 
				-    records = get_data_from_odps(date=dt, project=project, table=table)
			
 
				+    # records = get_data_from_odps(date=dt, project=project, table=table)
			
 
				+    # feature_data = []
			
 
				+    # i = 0
			
 
				+    # for record in records:
			
 
				+    #     if i > 300000:
			
 
				+    #         break
			
 
				+    #     item = {}
			
 
				+    #     for feature_name in features:
			
 
				+    #         item[feature_name] = record[feature_name]
			
 
				+    #     feature_data.append(item)
			
 
				+    #     i += 1
			
 
				+    # feature_df = pd.DataFrame(feature_data)
			
 
				+    # return feature_df
			
 
				+
			
 
				+    odps = ODPS(
			
 
				+        access_id=config_.ODPS_CONFIG['ACCESSID'],
			
 
				+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
			
 
				+        project=project,
			
 
				+        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
			
 
				+    )
			
 
				     feature_data = []
			
 
				-    for record in records:
			
 
				-        item = {}
			
 
				-        for feature_name in features:
			
 
				-            item[feature_name] = record[feature_name]
			
 
				-        feature_data.append(item)
			
 
				-    feature_df = pd.DataFrame(feature_data)
			
 
				-    return feature_df
			
 
				+    sql = f"select * from {project}.{table} where dt={dt} and apptype=0"
			
 
				+    with odps.execute_sql(sql).open_reader() as reader:
			
 
				+        for record in reader:
			
 
				+            # print(record)
			
 
				+            item = {}
			
 
				+            for feature_name in features:
			
 
				+                item[feature_name] = record[feature_name]
			
 
				+            feature_data.append(item)
			
 
				+        feature_df = pd.DataFrame(feature_data)
			
 
				+        return feature_df
			
 
				 
			
 
				 
			
 
				 def daily_data_process(project, table, features, dt, app_type):
			
@@ -84,14 +114,14 @@ def daily_data_process(project, table, features, dt, app_type):
 
				     print(f"feature_initial_df shape: {feature_initial_df.shape}")
			
 
				     print('step 2: process')
			
 
				     feature_initial_df['apptype'] = feature_initial_df['apptype'].astype(int)
			
 
				-    feature_df = feature_initial_df[feature_initial_df['apptype'] == app_type]
			
 
				+    feature_df = feature_initial_df[feature_initial_df['apptype'] == app_type].copy()
			
 
				     # 增加此次是否有广告字段 'ad_status' 1: 有广告, 0: 无广告
			
 
				-    feature_df['ad_status'] = feature_df.apply(func=lambda x: 1 if x['ad_mid'] == x['mid'] else 0)
			
 
				+    feature_df['ad_status'] = feature_df.apply(func=lambda x: 1 if x['ad_mid'] == x['mid'] else 0, axis=1)
			
 
				     feature_df['share_videoid'].fillna(0, inplace=True)
			
 
				     feature_df['share_videoid'] = feature_df['share_videoid'].astype(int)
			
 
				     feature_df['videoid'] = feature_df['videoid'].astype(int)
			
 
				     # 增加此次是否分享了该视频 'share_status' 1: 分享, 0: 为分享
			
 
				-    feature_df['share_status'] = feature_df.apply(func=lambda x: 1 if x['share_videoid'] == x['videoid'] else 0)
			
 
				+    feature_df['share_status'] = feature_df.apply(func=lambda x: 1 if x['share_videoid'] == x['videoid'] else 0, axis=1)
			
 
				     # 缺失值填充
			
 
				     feature_df.fillna(0, inplace=True)
			
 
				     # 数据类型校正
			
@@ -132,17 +162,25 @@ def daily_data_process(project, table, features, dt, app_type):
 
				     print('step 3: get train_df')
			
 
				     train_df = feature_df[train_feature]
			
 
				     print(f"train_df shape: {train_df.shape}")
			
 
				+    # 写入csv
			
 
				+    train_data_dir = './data/train_data'
			
 
				+    if not os.path.exists(train_data_dir):
			
 
				+        os.makedirs(train_data_dir)
			
 
				+    train_df.to_csv(f"{train_data_dir}/{dt}.csv")
			
 
				     return train_df
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				+    st_time = time.time()
			
 
				     project = 'loghubods'
			
 
				     table = 'admodel_data_train'
			
 
				     dt = '20230725'
			
 
				     df = daily_data_process(project=project, table=table, features=features, dt=dt, app_type=0)
			
 
				     print(df.shape)
			
 
				     print(df.columns)
			
 
				-    df.to_csv(f'./data/{dt}.csv')
			
 
				+    # df.to_csv(f'./data/{dt}.csv', index=False)
			
 
				+    # get_feature_data(project=project, table=table, features=features, dt=dt)
			
 
				+    print(time.time() - st_time)