# ad_xgboost_predict_data_generate.py (2.2 KB)
  1. import os
  2. import pandas as pd
  3. predict_data_dir = './data/predict_data'
  4. user_filename = 'user_feature.csv'
  5. video_filename = 'video_feature.csv'
  6. def read_csv_data(filepath):
  7. if os.path.exists(filepath):
  8. data = pd.read_csv(filepath, sep=',', engine='python', iterator=True)
  9. chunk_size = 1000000
  10. chunks = []
  11. loop = True
  12. while loop:
  13. try:
  14. chunk_data = data.get_chunk(chunk_size)
  15. chunks.append(chunk_data)
  16. except StopIteration:
  17. loop = False
  18. df = pd.concat(chunks, ignore_index=True)
  19. return df
  20. else:
  21. print("Don't have this file!")
  22. return None
  23. if __name__ == '__main__':
  24. # 1. 获取用户特征数据
  25. user_filepath = f"{predict_data_dir}/{user_filename}"
  26. user_df = read_csv_data(filepath=user_filepath)
  27. # 2. 获取视频特征数据
  28. video_filepath = f"{predict_data_dir}/{video_filename}"
  29. video_df = read_csv_data(filepath=video_filepath)
  30. # 3. 用户特征和视频特征进行拼接
  31. video_features = [
  32. 'videoid',
  33. 'video_preview_count_uv_30day',
  34. 'video_preview_count_pv_30day',
  35. 'video_view_count_uv_30day',
  36. 'video_view_count_pv_30day',
  37. 'video_play_count_uv_30day',
  38. 'video_play_count_pv_30day',
  39. 'video_share_count_uv_30day',
  40. 'video_share_count_pv_30day',
  41. 'video_return_count_30day',
  42. 'video_ctr_uv_30day',
  43. 'video_ctr_pv_30day',
  44. 'video_share_rate_uv_30day',
  45. 'video_share_rate_pv_30day',
  46. 'video_return_rate_30day',
  47. ]
  48. merge_df_list = []
  49. for ind, row in video_df.iterrows():
  50. merge_df_temp = user_df.copy()
  51. for feature in video_features:
  52. merge_df_temp[feature] = row[feature]
  53. merge_df_list.append(merge_df_temp)
  54. merge_df = pd.concat(merge_df_list, ignore_index=True)
  55. # 4. 拼接广告特征ad_status
  56. for ad_status in [0, 1]:
  57. res_df = merge_df.copy()
  58. res_df['ad_status'] = ad_status
  59. # 写入csv
  60. predict_data_dir = './data/predict_data'
  61. if not os.path.exists(predict_data_dir):
  62. os.makedirs(predict_data_dir)
  63. res_df.to_csv(f"{predict_data_dir}/predict_data_{ad_status}.csv", index=False)