liqian 1 年之前
父节点
当前提交
107b2b3096
共有 1 个文件被更改，包括 110 次插入和 29 次删除
  1. 110 29
      ad_feature_process.py

+ 110 - 29
ad_feature_process.py

@@ -3,34 +3,64 @@ from utils import get_data_from_odps
 
 features = [
     'apptype',
-    'videoid',
+    'subsessionid',
     'mid',
+    'videoid',
     'ad_mid',
-    'mid_preview_count',
-    'mid_view_count',
-    'mid_view_count_pv',
-    'mid_play_count',
-    'mid_play_count_pv',
-    'mid_share_count',
-    'mid_share_count_pv',
-    'mid_return_count',
-    'mid_share_rate',
-    'mid_return_rate',
-    'video_preview_count_uv',
-    'video_preview_count_pv',
-    'video_view_count_uv',
-    'video_view_count_pv',
-    'video_play_count_uv',
-    'video_play_count_pv',
-    'video_share_count_uv',
-    'video_share_count_pv',
-    'video_return_count',
-    'video_ctr_uv',
-    'video_ctr_pv',
-    'video_share_rate_uv',
-    'video_share_rate_pv',
-    'video_return_rate'
     'share_videoid'
+    'mid_preview_count_30day',
+    'mid_view_count_30day',
+    'mid_view_count_pv_30day',
+    'mid_play_count_30day',
+    'mid_play_count_pv_30day',
+    'mid_share_count_30day',
+    'mid_share_count_pv_30day',
+    'mid_return_count_30day',
+    'mid_share_rate_30day',
+    'mid_return_rate_30day',
+    'video_preview_count_uv_30day',
+    'video_preview_count_pv_30day',
+    'video_view_count_uv_30day',
+    'video_view_count_pv_30day',
+    'video_play_count_uv_30day',
+    'video_play_count_pv_30day',
+    'video_share_count_uv_30day',
+    'video_share_count_pv_30day',
+    'video_return_count_30day',
+    'video_ctr_uv_30day',
+    'video_ctr_pv_30day',
+    'video_share_rate_uv_30day',
+    'video_share_rate_pv_30day',
+    'video_return_rate_30day',
+]
+
+# Columns selected for the frame returned by daily_data_process: the *_30day
+# statistic columns plus the 'ad_status' and 'share_status' flags that
+# daily_data_process derives.
+train_feature = [
+    'mid_preview_count_30day',
+    'mid_view_count_30day',
+    'mid_view_count_pv_30day',
+    'mid_play_count_30day',
+    'mid_play_count_pv_30day',
+    'mid_share_count_30day',
+    'mid_share_count_pv_30day',
+    'mid_return_count_30day',
+    'mid_share_rate_30day',
+    'mid_return_rate_30day',
+    'video_preview_count_uv_30day',
+    'video_preview_count_pv_30day',
+    'video_view_count_uv_30day',
+    'video_view_count_pv_30day',
+    'video_play_count_uv_30day',
+    'video_play_count_pv_30day',
+    'video_share_count_uv_30day',
+    'video_share_count_pv_30day',
+    'video_return_count_30day',
+    'video_ctr_uv_30day',
+    'video_ctr_pv_30day',
+    'video_share_rate_uv_30day',
+    'video_share_rate_pv_30day',
+    'video_return_rate_30day',
+    'ad_status',
+    'share_status'
 ]
 
 
@@ -47,7 +77,58 @@ def get_feature_data(project, table, features, dt):
     return feature_df
 
 
-def daily_data_process(project, table, features, dt):
-    feature_df = get_feature_data(project=project, table=table, features=features, dt=dt)
-    feature_df['']
-    pass
+def daily_data_process(project, table, features, dt, app_type):
+    """Build the daily training frame for one app type.
+
+    Loads the feature rows through ``get_feature_data``, keeps the rows whose
+    ``apptype`` equals ``app_type``, derives the ``ad_status`` and
+    ``share_status`` flags, fills missing values with 0, casts the count and
+    rate columns, and returns only the ``train_feature`` columns.
+
+    Args:
+        project: Forwarded unchanged to ``get_feature_data``.
+        table: Forwarded unchanged to ``get_feature_data``.
+        features: Forwarded unchanged to ``get_feature_data``.
+        dt: Forwarded unchanged to ``get_feature_data``.
+        app_type: Integer app type to keep; compared with ``apptype`` after
+            that column has been cast to int.
+
+    Returns:
+        A DataFrame restricted to the columns listed in ``train_feature``.
+    """
+    feature_initial_df = get_feature_data(project=project, table=table, features=features, dt=dt)
+    feature_initial_df['apptype'] = feature_initial_df['apptype'].astype(int)
+    # Copy the filtered rows so the column assignments below write to an
+    # independent frame instead of a slice of feature_initial_df.
+    feature_df = feature_initial_df[feature_initial_df['apptype'] == app_type].copy()
+    # 'ad_status' -- 1: has an ad (ad_mid equals mid), 0: no ad.
+    # Compared column-wise: DataFrame.apply without axis=1 hands whole
+    # columns, not rows, to the callback, so a per-row lambda cannot work.
+    feature_df['ad_status'] = (feature_df['ad_mid'] == feature_df['mid']).astype(int)
+    # Assign the filled column back rather than calling fillna(inplace=True)
+    # on a column selection, which may act on a temporary object.
+    feature_df['share_videoid'] = feature_df['share_videoid'].fillna(0).astype(int)
+    feature_df['videoid'] = feature_df['videoid'].astype(int)
+    # 'share_status' -- 1: this video was shared (share_videoid equals
+    # videoid), 0: not shared.
+    feature_df['share_status'] = (feature_df['share_videoid'] == feature_df['videoid']).astype(int)
+    # Fill the remaining missing values.
+    feature_df = feature_df.fillna(0)
+    # Correct the column dtypes.
+    type_int_columns = [
+        'mid_preview_count_30day',
+        'mid_view_count_30day',
+        'mid_view_count_pv_30day',
+        'mid_play_count_30day',
+        'mid_play_count_pv_30day',
+        'mid_share_count_30day',
+        'mid_share_count_pv_30day',
+        'mid_return_count_30day',
+        'video_preview_count_uv_30day',
+        'video_preview_count_pv_30day',
+        'video_view_count_uv_30day',
+        'video_view_count_pv_30day',
+        'video_play_count_uv_30day',
+        'video_play_count_pv_30day',
+        'video_share_count_uv_30day',
+        'video_share_count_pv_30day',
+        'video_return_count_30day',
+    ]
+    type_float_columns = [
+        'mid_share_rate_30day',
+        'mid_return_rate_30day',
+        'video_ctr_uv_30day',
+        'video_ctr_pv_30day',
+        'video_share_rate_uv_30day',
+        'video_share_rate_pv_30day',
+        'video_return_rate_30day',
+    ]
+    # astype returns a new object, so the result must be assigned back for
+    # the cast to take effect.
+    dtype_by_column = {column_name: int for column_name in type_int_columns}
+    dtype_by_column.update({column_name: float for column_name in type_float_columns})
+    feature_df = feature_df.astype(dtype_by_column)
+    # Keep only the columns needed for training.
+    train_df = feature_df[train_feature]
+    return train_df
+
+
+
+
+