ad_predict_video_data_process.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. import os.path
  2. import time
  3. import datetime
  4. import pandas as pd
  5. from odps import ODPS
  6. # ODPS服务配置
  7. odps_config = {
  8. 'ENDPOINT': 'http://service.cn.maxcompute.aliyun.com/api',
  9. 'ACCESSID': 'LTAIWYUujJAm7CbH',
  10. 'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
  11. }
  12. features = [
  13. 'apptype',
  14. 'videoid',
  15. 'video_preview_count_uv_30day',
  16. 'video_preview_count_pv_30day',
  17. 'video_view_count_uv_30day',
  18. 'video_view_count_pv_30day',
  19. 'video_play_count_uv_30day',
  20. 'video_play_count_pv_30day',
  21. 'video_share_count_uv_30day',
  22. 'video_share_count_pv_30day',
  23. 'video_return_count_30day',
  24. 'video_ctr_uv_30day',
  25. 'video_ctr_pv_30day',
  26. 'video_share_rate_uv_30day',
  27. 'video_share_rate_pv_30day',
  28. 'video_return_rate_30day',
  29. ]
  30. def get_feature_data(project, table, dt, app_type):
  31. """获取特征数据"""
  32. odps = ODPS(
  33. access_id=odps_config['ACCESSID'],
  34. secret_access_key=odps_config['ACCESSKEY'],
  35. project=project,
  36. endpoint=odps_config['ENDPOINT'],
  37. )
  38. feature_data = []
  39. sql = f"select * from {project}.{table} where dt={dt} and apptype={app_type}"
  40. with odps.execute_sql(sql).open_reader() as reader:
  41. for record in reader:
  42. # print(record)
  43. item = {}
  44. for feature_name in features:
  45. item[feature_name] = record[feature_name]
  46. feature_data.append(item)
  47. feature_df = pd.DataFrame(feature_data)
  48. return feature_df
  49. def user_data_process(project, table, dt, app_type):
  50. """每日特征处理"""
  51. print('step 1: get video feature data')
  52. feature_initial_df = get_feature_data(project=project, table=table, dt=dt, app_type=app_type)
  53. print(f"feature_initial_df shape: {feature_initial_df.shape}")
  54. print('step 2: process')
  55. feature_initial_df['apptype'] = feature_initial_df['apptype'].astype(int)
  56. feature_df = feature_initial_df.copy()
  57. # 缺失值填充
  58. feature_df.fillna(0, inplace=True)
  59. # 数据类型校正
  60. type_int_columns = [
  61. 'video_preview_count_uv_30day',
  62. 'video_preview_count_pv_30day',
  63. 'video_view_count_uv_30day',
  64. 'video_view_count_pv_30day',
  65. 'video_play_count_uv_30day',
  66. 'video_play_count_pv_30day',
  67. 'video_share_count_uv_30day',
  68. 'video_share_count_pv_30day',
  69. 'video_return_count_30day',
  70. ]
  71. for column_name in type_int_columns:
  72. feature_df[column_name] = feature_df[column_name].astype(int)
  73. type_float_columns = [
  74. 'video_ctr_uv_30day',
  75. 'video_ctr_pv_30day',
  76. 'video_share_rate_uv_30day',
  77. 'video_share_rate_pv_30day',
  78. 'video_return_rate_30day',
  79. ]
  80. for column_name in type_float_columns:
  81. feature_df[column_name] = feature_df[column_name].astype(float)
  82. print(f"feature_df shape: {feature_df.shape}")
  83. print('step 3: add new video feature')
  84. # 补充新用户默认数据(使用均值)
  85. new_video_feature = {
  86. 'apptype': app_type,
  87. 'videoid': '-1',
  88. 'video_preview_count_uv_30day': int(feature_df['video_preview_count_uv_30day'].mean()),
  89. 'video_preview_count_pv_30day': int(feature_df['video_preview_count_pv_30day'].mean()),
  90. 'video_view_count_uv_30day': int(feature_df['video_view_count_uv_30day'].mean()),
  91. 'video_view_count_pv_30day': int(feature_df['video_view_count_pv_30day'].mean()),
  92. 'video_play_count_uv_30day': int(feature_df['video_play_count_uv_30day'].mean()),
  93. 'video_play_count_pv_30day': int(feature_df['video_play_count_pv_30day'].mean()),
  94. 'video_share_count_uv_30day': int(feature_df['video_share_count_uv_30day'].mean()),
  95. 'video_share_count_pv_30day': int(feature_df['video_share_count_pv_30day'].mean()),
  96. 'video_return_count_30day': int(feature_df['video_return_count_30day'].mean()),
  97. }
  98. new_video_feature['video_ctr_uv_30day'] = float(
  99. new_video_feature['video_play_count_uv_30day'] / new_video_feature['video_view_count_uv_30day'] + 1)
  100. new_video_feature['video_ctr_pv_30day'] = float(
  101. new_video_feature['video_play_count_pv_30day'] / new_video_feature['video_view_count_pv_30day'] + 1)
  102. new_video_feature['video_share_rate_uv_30day'] = float(
  103. new_video_feature['video_share_count_uv_30day'] / new_video_feature['video_play_count_uv_30day'] + 1)
  104. new_video_feature['video_share_rate_pv_30day'] = float(
  105. new_video_feature['video_share_count_pv_30day'] / new_video_feature['video_play_count_pv_30day'] + 1)
  106. new_video_feature['video_return_rate_30day'] = float(
  107. new_video_feature['video_return_count_30day'] / new_video_feature['video_view_count_pv_30day'] + 1)
  108. new_video_feature_df = pd.DataFrame([new_video_feature])
  109. video_df = pd.concat([feature_df, new_video_feature_df])
  110. print(f"video_df shape: {video_df.shape}")
  111. print(f"step 4: to csv")
  112. # 写入csv
  113. predict_data_dir = './data/predict_data'
  114. if not os.path.exists(predict_data_dir):
  115. os.makedirs(predict_data_dir)
  116. video_df.to_csv(f"{predict_data_dir}/video_feature.csv", index=False)
  117. if __name__ == '__main__':
  118. st_time = time.time()
  119. project = 'loghubods'
  120. table = 'admodel_testset_video'
  121. # dt = '20230725'
  122. now_date = datetime.datetime.today()
  123. dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=1), '%Y%m%d')
  124. user_data_process(project=project, table=table, dt=dt, app_type=0)
  125. print(time.time() - st_time)