|
@@ -3,34 +3,64 @@ from utils import get_data_from_odps
|
|
|
|
|
|
# The 24 `mid_*` / `video_*` statistics columns carrying the `_30day` suffix.
# Kept in one place because both lists below need exactly the same names in
# exactly the same order.
_STAT_FEATURES_30DAY = [
    'mid_preview_count_30day',
    'mid_view_count_30day',
    'mid_view_count_pv_30day',
    'mid_play_count_30day',
    'mid_play_count_pv_30day',
    'mid_share_count_30day',
    'mid_share_count_pv_30day',
    'mid_return_count_30day',
    'mid_share_rate_30day',
    'mid_return_rate_30day',
    'video_preview_count_uv_30day',
    'video_preview_count_pv_30day',
    'video_view_count_uv_30day',
    'video_view_count_pv_30day',
    'video_play_count_uv_30day',
    'video_play_count_pv_30day',
    'video_share_count_uv_30day',
    'video_share_count_pv_30day',
    'video_return_count_30day',
    'video_ctr_uv_30day',
    'video_ctr_pv_30day',
    'video_share_rate_uv_30day',
    'video_share_rate_pv_30day',
    'video_return_rate_30day',
]

# Raw column names: the identifier columns followed by the 30-day statistics.
# NOTE(review): presumably this is the list handed to get_feature_data() as
# `features` -- confirm against the caller.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which previously merged 'share_videoid'
# with the first statistics column into one bogus name.
features = [
    'apptype',
    'subsessionid',
    'mid',
    'videoid',
    'ad_mid',
    'share_videoid',
] + _STAT_FEATURES_30DAY

# Columns selected for the frame returned by daily_data_process(): the 30-day
# statistics plus the two flags ('ad_status', 'share_status') that function
# derives.
train_feature = _STAT_FEATURES_30DAY + [
    'ad_status',
    'share_status',
]
|
|
|
|
|
|
|
|
@@ -47,7 +77,58 @@ def get_feature_data(project, table, features, dt):
|
|
|
return feature_df
|
|
|
|
|
|
|
|
|
def daily_data_process(project, table, features, dt, app_type):
    """Daily feature processing: build the training frame for one app type.

    Loads the feature rows via ``get_feature_data``, keeps the rows whose
    ``apptype`` equals ``app_type``, derives the ``ad_status`` and
    ``share_status`` flags, fills missing values with 0, casts the count
    columns to ``int`` and the rate columns to ``float``, and returns only the
    columns named in ``train_feature``.

    Args:
        project: Forwarded unchanged to ``get_feature_data``.
        table: Forwarded unchanged to ``get_feature_data``.
        features: Column names forwarded to ``get_feature_data``. The result
            must contain 'apptype', 'mid', 'ad_mid', 'videoid',
            'share_videoid' and every column listed in ``train_feature``
            other than the two derived flags.
        dt: Forwarded unchanged to ``get_feature_data``.
        app_type: Integer app type to keep; compared against the
            int-cast 'apptype' column.

    Returns:
        A DataFrame restricted to the ``train_feature`` columns.

    Raises:
        ValueError: If a value cannot be cast to the required numeric type.
        KeyError: If a required column is missing from the loaded data.
    """
    feature_initial_df = get_feature_data(project=project, table=table, features=features, dt=dt)
    feature_initial_df['apptype'] = feature_initial_df['apptype'].astype(int)
    # Take an explicit copy: the boolean filter yields a selection of the
    # loaded frame, and the column assignments below must not be chained
    # assignments on that selection.
    feature_df = feature_initial_df[feature_initial_df['apptype'] == app_type].copy()

    # Add the 'ad_status' flag -- 1: 'ad_mid' equals 'mid' for this row, 0: otherwise.
    # Compared column-wise; the previous DataFrame.apply call omitted axis=1,
    # so the lambda was handed whole columns instead of rows.
    feature_df['ad_status'] = (feature_df['ad_mid'] == feature_df['mid']).astype(int)

    # A missing 'share_videoid' becomes 0 before the integer cast. The result
    # is assigned back instead of using inplace=True on a column selection.
    feature_df['share_videoid'] = feature_df['share_videoid'].fillna(0).astype(int)
    feature_df['videoid'] = feature_df['videoid'].astype(int)
    # Add the 'share_status' flag -- 1: this video was shared, 0: not shared.
    feature_df['share_status'] = (feature_df['share_videoid'] == feature_df['videoid']).astype(int)

    # Fill the remaining missing values.
    feature_df = feature_df.fillna(0)

    # Correct the column dtypes.
    type_int_columns = [
        'mid_preview_count_30day',
        'mid_view_count_30day',
        'mid_view_count_pv_30day',
        'mid_play_count_30day',
        'mid_play_count_pv_30day',
        'mid_share_count_30day',
        'mid_share_count_pv_30day',
        'mid_return_count_30day',
        'video_preview_count_uv_30day',
        'video_preview_count_pv_30day',
        'video_view_count_uv_30day',
        'video_view_count_pv_30day',
        'video_play_count_uv_30day',
        'video_play_count_pv_30day',
        'video_share_count_uv_30day',
        'video_share_count_pv_30day',
        'video_return_count_30day',
    ]
    type_float_columns = [
        'mid_share_rate_30day',
        'mid_return_rate_30day',
        'video_ctr_uv_30day',
        'video_ctr_pv_30day',
        'video_share_rate_uv_30day',
        'video_share_rate_pv_30day',
        'video_return_rate_30day',
    ]
    # astype returns a new object, so the result has to be kept; the earlier
    # per-column loops discarded it and left every dtype unchanged.
    column_dtypes = {column_name: int for column_name in type_int_columns}
    column_dtypes.update({column_name: float for column_name in type_float_columns})
    feature_df = feature_df.astype(column_dtypes)

    # Keep only the columns needed for training.
    train_df = feature_df[train_feature]
    return train_df
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|