# ad_xgboost_predict_data_generate.py
# Generates the prediction input for the ad XGBoost model by cross-joining
# user features with video features and appending an ad_status column.
  1. import os
  2. import time
  3. import pandas as pd
  4. predict_data_dir = './data/predict_data'
  5. user_filename = 'user_feature.csv'
  6. video_filename = 'video_feature.csv'
  7. def read_csv_data(filepath):
  8. if os.path.exists(filepath):
  9. data = pd.read_csv(filepath, sep=',', engine='python', iterator=True)
  10. chunk_size = 1000000
  11. chunks = []
  12. loop = True
  13. while loop:
  14. try:
  15. chunk_data = data.get_chunk(chunk_size)
  16. chunks.append(chunk_data)
  17. except StopIteration:
  18. loop = False
  19. df = pd.concat(chunks, ignore_index=True)
  20. return df
  21. else:
  22. print("Don't have this file!")
  23. return None
  24. if __name__ == '__main__':
  25. st_time = time.time()
  26. # 1. 获取用户特征数据
  27. user_filepath = f"{predict_data_dir}/{user_filename}"
  28. user_df = read_csv_data(filepath=user_filepath)
  29. user_df = user_df[user_df['mid'] != '-1']
  30. print(f"user_df shape: {user_df.shape}")
  31. # 2. 获取视频特征数据
  32. video_filepath = f"{predict_data_dir}/{video_filename}"
  33. video_df = read_csv_data(filepath=video_filepath)
  34. video_df = video_df[video_df['videoid' != '-1']]
  35. print(f"video_df shape: {video_df.shape}")
  36. # 3. 用户特征和视频特征进行拼接
  37. video_features = [
  38. 'videoid',
  39. 'video_preview_count_uv_30day',
  40. 'video_preview_count_pv_30day',
  41. 'video_view_count_uv_30day',
  42. 'video_view_count_pv_30day',
  43. 'video_play_count_uv_30day',
  44. 'video_play_count_pv_30day',
  45. 'video_share_count_uv_30day',
  46. 'video_share_count_pv_30day',
  47. 'video_return_count_30day',
  48. 'video_ctr_uv_30day',
  49. 'video_ctr_pv_30day',
  50. 'video_share_rate_uv_30day',
  51. 'video_share_rate_pv_30day',
  52. 'video_return_rate_30day',
  53. ]
  54. predict_data_dir = './data/predict_data'
  55. file_list = [f"{predict_data_dir}/predict_data_0.csv", f"{predict_data_dir}/predict_data_1.csv"]
  56. for file in file_list:
  57. try:
  58. os.remove(file)
  59. except:
  60. continue
  61. merge_df_list = []
  62. for ind, row in video_df.iterrows():
  63. merge_df_temp = user_df.copy()
  64. for feature in video_features:
  65. merge_df_temp[feature] = row[feature]
  66. merge_df_list.append(merge_df_temp)
  67. if ind % 100 == 0:
  68. merge_df = pd.concat(merge_df_list, ignore_index=True)
  69. print(f"ind: {ind}, merge_df shape: {merge_df.shape}")
  70. # 4. 拼接广告特征ad_status
  71. for ad_status in [0, 1]:
  72. res_df = merge_df.copy()
  73. res_df['ad_status'] = ad_status
  74. # 写入csv
  75. if not os.path.exists(predict_data_dir):
  76. os.makedirs(predict_data_dir)
  77. res_df.to_csv(f"{predict_data_dir}/predict_data_{ad_status}.csv", index=False, mode='a')
  78. merge_df_list = []
  79. if len(merge_df_list) > 0:
  80. merge_df = pd.concat(merge_df_list, ignore_index=True)
  81. print(f"merge_df shape: {merge_df.shape}")
  82. # 4. 拼接广告特征ad_status
  83. for ad_status in [0, 1]:
  84. res_df = merge_df.copy()
  85. res_df['ad_status'] = ad_status
  86. # 写入csv
  87. if not os.path.exists(predict_data_dir):
  88. os.makedirs(predict_data_dir)
  89. res_df.to_csv(f"{predict_data_dir}/predict_data_{ad_status}.csv", index=False, mode='a')
  90. print(f"{time.time() - st_time}s")