123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134 |
- import pandas as pd
- from utils import get_data_from_odps
# Field names copied out of each source record: identifier fields first,
# then the per-user ("mid_*") and per-video ("video_*") *_30day statistics.
features = [
    # Identifiers
    'apptype',
    'subsessionid',
    'mid',
    'videoid',
    'ad_mid',
    # NOTE: the trailing comma matters. Without it Python concatenates this
    # literal with the next one into 'share_videoidmid_preview_count_30day',
    # silently dropping both real field names.
    'share_videoid',
    # Per-user statistics
    'mid_preview_count_30day',
    'mid_view_count_30day',
    'mid_view_count_pv_30day',
    'mid_play_count_30day',
    'mid_play_count_pv_30day',
    'mid_share_count_30day',
    'mid_share_count_pv_30day',
    'mid_return_count_30day',
    'mid_share_rate_30day',
    'mid_return_rate_30day',
    # Per-video statistics
    'video_preview_count_uv_30day',
    'video_preview_count_pv_30day',
    'video_view_count_uv_30day',
    'video_view_count_pv_30day',
    'video_play_count_uv_30day',
    'video_play_count_pv_30day',
    'video_share_count_uv_30day',
    'video_share_count_pv_30day',
    'video_return_count_30day',
    'video_ctr_uv_30day',
    'video_ctr_pv_30day',
    'video_share_rate_uv_30day',
    'video_share_rate_pv_30day',
    'video_return_rate_30day',
]
# Columns kept in the final training frame, in output order:
# per-user statistics, per-video statistics, then the two derived flags.
train_feature = [
    # Per-user ("mid_*") statistics
    "mid_preview_count_30day",
    "mid_view_count_30day",
    "mid_view_count_pv_30day",
    "mid_play_count_30day",
    "mid_play_count_pv_30day",
    "mid_share_count_30day",
    "mid_share_count_pv_30day",
    "mid_return_count_30day",
    "mid_share_rate_30day",
    "mid_return_rate_30day",
] + [
    # Per-video ("video_*") statistics
    "video_preview_count_uv_30day",
    "video_preview_count_pv_30day",
    "video_view_count_uv_30day",
    "video_view_count_pv_30day",
    "video_play_count_uv_30day",
    "video_play_count_pv_30day",
    "video_share_count_uv_30day",
    "video_share_count_pv_30day",
    "video_return_count_30day",
    "video_ctr_uv_30day",
    "video_ctr_pv_30day",
    "video_share_rate_uv_30day",
    "video_share_rate_pv_30day",
    "video_return_rate_30day",
] + [
    # Flags derived in daily_data_process
    "ad_status",
    "share_status",
]
def get_feature_data(project, table, features, dt):
    """Fetch feature data and return the requested fields as a DataFrame.

    Args:
        project: Project name, forwarded to ``get_data_from_odps``.
        table: Table name, forwarded to ``get_data_from_odps``.
        features: Names of the fields to copy out of every record.
        dt: Forwarded to ``get_data_from_odps`` as ``date``
            (presumably the partition date -- confirm against the helper).

    Returns:
        A DataFrame with one row per record and one column per distinct name
        in ``features``, in first-seen order. The columns are present even
        when no records come back, so callers can index them safely.
    """
    records = get_data_from_odps(date=dt, project=project, table=table)
    # De-duplicate while preserving order so the column layout is the same
    # one the per-record dicts would produce on their own.
    columns = list(dict.fromkeys(features))
    rows = [{name: record[name] for name in columns} for record in records]
    # Passing columns= explicitly keeps the schema when `rows` is empty;
    # pd.DataFrame([]) alone would have no columns at all.
    return pd.DataFrame(rows, columns=columns)
def daily_data_process(project, table, features, dt, app_type):
    """Build the daily training frame for a single app type.

    Loads the raw feature rows via ``get_feature_data``, keeps only the rows
    whose ``apptype`` equals ``app_type``, derives the ``ad_status`` and
    ``share_status`` flags, fills missing values with 0, casts the count
    columns to int and the rate columns to float, and returns the columns
    listed in the module-level ``train_feature``.

    Args:
        project: Forwarded to ``get_feature_data``.
        table: Forwarded to ``get_feature_data``.
        features: Field names to load; must include every column used here
            ('apptype', 'mid', 'ad_mid', 'videoid', 'share_videoid' and the
            statistic columns).
        dt: Forwarded to ``get_feature_data``.
        app_type: Integer app type to keep (compared after casting the
            ``apptype`` column to int).

    Returns:
        A DataFrame restricted to the ``train_feature`` columns.

    Raises:
        KeyError: If a required column is missing from the loaded data.
    """
    feature_initial_df = get_feature_data(project=project, table=table, features=features, dt=dt)
    feature_initial_df['apptype'] = feature_initial_df['apptype'].astype(int)
    # Take an explicit copy: the filtered frame is modified below, and writing
    # into a slice of the original is unreliable (SettingWithCopyWarning).
    feature_df = feature_initial_df[feature_initial_df['apptype'] == app_type].copy()

    # 'ad_status' -- 1: this row has an ad (ad_mid equals mid), 0: no ad.
    # Compared column-wise; rows where either side is missing count as 0.
    feature_df['ad_status'] = (feature_df['ad_mid'] == feature_df['mid']).astype(int)

    # Normalise both video id columns to int before comparing them.
    feature_df['share_videoid'] = feature_df['share_videoid'].fillna(0).astype(int)
    feature_df['videoid'] = feature_df['videoid'].astype(int)
    # 'share_status' -- 1: this video was shared, 0: not shared.
    feature_df['share_status'] = (feature_df['share_videoid'] == feature_df['videoid']).astype(int)

    # Fill the remaining missing values.
    feature_df = feature_df.fillna(0)

    # Correct the column dtypes. astype returns a new object, so the result
    # has to be assigned back for the conversion to take effect.
    type_int_columns = [
        'mid_preview_count_30day',
        'mid_view_count_30day',
        'mid_view_count_pv_30day',
        'mid_play_count_30day',
        'mid_play_count_pv_30day',
        'mid_share_count_30day',
        'mid_share_count_pv_30day',
        'mid_return_count_30day',
        'video_preview_count_uv_30day',
        'video_preview_count_pv_30day',
        'video_view_count_uv_30day',
        'video_view_count_pv_30day',
        'video_play_count_uv_30day',
        'video_play_count_pv_30day',
        'video_share_count_uv_30day',
        'video_share_count_pv_30day',
        'video_return_count_30day',
    ]
    type_float_columns = [
        'mid_share_rate_30day',
        'mid_return_rate_30day',
        'video_ctr_uv_30day',
        'video_ctr_pv_30day',
        'video_share_rate_uv_30day',
        'video_share_rate_pv_30day',
        'video_return_rate_30day',
    ]
    column_types = {name: int for name in type_int_columns}
    column_types.update({name: float for name in type_float_columns})
    feature_df = feature_df.astype(column_types)

    # Keep only the columns needed for training.
    return feature_df[train_feature]
|