# ad_feature_process.py
import os

import pandas as pd

from utils import get_data_from_odps
  3. features = [
  4. 'apptype',
  5. 'subsessionid',
  6. 'mid',
  7. 'videoid',
  8. 'ad_mid',
  9. 'share_videoid',
  10. 'mid_preview_count_30day',
  11. 'mid_view_count_30day',
  12. 'mid_view_count_pv_30day',
  13. 'mid_play_count_30day',
  14. 'mid_play_count_pv_30day',
  15. 'mid_share_count_30day',
  16. 'mid_share_count_pv_30day',
  17. 'mid_return_count_30day',
  18. 'mid_share_rate_30day',
  19. 'mid_return_rate_30day',
  20. 'video_preview_count_uv_30day',
  21. 'video_preview_count_pv_30day',
  22. 'video_view_count_uv_30day',
  23. 'video_view_count_pv_30day',
  24. 'video_play_count_uv_30day',
  25. 'video_play_count_pv_30day',
  26. 'video_share_count_uv_30day',
  27. 'video_share_count_pv_30day',
  28. 'video_return_count_30day',
  29. 'video_ctr_uv_30day',
  30. 'video_ctr_pv_30day',
  31. 'video_share_rate_uv_30day',
  32. 'video_share_rate_pv_30day',
  33. 'video_return_rate_30day',
  34. ]
  35. train_feature = [
  36. 'mid_preview_count_30day',
  37. 'mid_view_count_30day',
  38. 'mid_view_count_pv_30day',
  39. 'mid_play_count_30day',
  40. 'mid_play_count_pv_30day',
  41. 'mid_share_count_30day',
  42. 'mid_share_count_pv_30day',
  43. 'mid_return_count_30day',
  44. 'mid_share_rate_30day',
  45. 'mid_return_rate_30day',
  46. 'video_preview_count_uv_30day',
  47. 'video_preview_count_pv_30day',
  48. 'video_view_count_uv_30day',
  49. 'video_view_count_pv_30day',
  50. 'video_play_count_uv_30day',
  51. 'video_play_count_pv_30day',
  52. 'video_share_count_uv_30day',
  53. 'video_share_count_pv_30day',
  54. 'video_return_count_30day',
  55. 'video_ctr_uv_30day',
  56. 'video_ctr_pv_30day',
  57. 'video_share_rate_uv_30day',
  58. 'video_share_rate_pv_30day',
  59. 'video_return_rate_30day',
  60. 'ad_status',
  61. 'share_status',
  62. ]
  63. def get_feature_data(project, table, features, dt):
  64. """获取特征数据"""
  65. records = get_data_from_odps(date=dt, project=project, table=table)
  66. feature_data = []
  67. for record in records:
  68. item = {}
  69. for feature_name in features:
  70. item[feature_name] = record[feature_name]
  71. feature_data.append(item)
  72. feature_df = pd.DataFrame(feature_data)
  73. return feature_df
  74. def daily_data_process(project, table, features, dt, app_type):
  75. """每日特征处理"""
  76. print('step 1: get feature data')
  77. feature_initial_df = get_feature_data(project=project, table=table, features=features, dt=dt)
  78. print(f"feature_initial_df shape: {feature_initial_df.shape}")
  79. print('step 2: process')
  80. feature_initial_df['apptype'] = feature_initial_df['apptype'].astype(int)
  81. feature_df = feature_initial_df[feature_initial_df['apptype'] == app_type]
  82. # 增加此次是否有广告字段 'ad_status' 1: 有广告, 0: 无广告
  83. feature_df['ad_status'] = feature_df.apply(func=lambda x: 1 if x['ad_mid'] == x['mid'] else 0)
  84. feature_df['share_videoid'].fillna(0, inplace=True)
  85. feature_df['share_videoid'] = feature_df['share_videoid'].astype(int)
  86. feature_df['videoid'] = feature_df['videoid'].astype(int)
  87. # 增加此次是否分享了该视频 'share_status' 1: 分享, 0: 为分享
  88. feature_df['share_status'] = feature_df.apply(func=lambda x: 1 if x['share_videoid'] == x['videoid'] else 0)
  89. # 缺失值填充
  90. feature_df.fillna(0, inplace=True)
  91. # 数据类型校正
  92. type_int_columns = [
  93. 'mid_preview_count_30day',
  94. 'mid_view_count_30day',
  95. 'mid_view_count_pv_30day',
  96. 'mid_play_count_30day',
  97. 'mid_play_count_pv_30day',
  98. 'mid_share_count_30day',
  99. 'mid_share_count_pv_30day',
  100. 'mid_return_count_30day',
  101. 'video_preview_count_uv_30day',
  102. 'video_preview_count_pv_30day',
  103. 'video_view_count_uv_30day',
  104. 'video_view_count_pv_30day',
  105. 'video_play_count_uv_30day',
  106. 'video_play_count_pv_30day',
  107. 'video_share_count_uv_30day',
  108. 'video_share_count_pv_30day',
  109. 'video_return_count_30day',
  110. ]
  111. for column_name in type_int_columns:
  112. feature_df[column_name].astype(int)
  113. type_float_columns = [
  114. 'mid_share_rate_30day',
  115. 'mid_return_rate_30day',
  116. 'video_ctr_uv_30day',
  117. 'video_ctr_pv_30day',
  118. 'video_share_rate_uv_30day',
  119. 'video_share_rate_pv_30day',
  120. 'video_return_rate_30day',
  121. ]
  122. for column_name in type_float_columns:
  123. feature_df[column_name].astype(float)
  124. print(f"feature_df shape: {feature_df.shape}")
  125. # 获取所需的字段
  126. print('step 3: get train_df')
  127. train_df = feature_df[train_feature]
  128. print(f"train_df shape: {train_df.shape}")
  129. return train_df
  130. if __name__ == '__main__':
  131. project = 'loghubods'
  132. table = 'admodel_data_train'
  133. dt = '20230725'
  134. df = daily_data_process(project=project, table=table, features=features, dt=dt, app_type=0)
  135. print(df.shape)
  136. print(df.columns)
  137. df.to_csv(f'./data/{dt}.csv')