# generate_data.py
"""
Created on Mon Mar 18, 2024
@author: luojunhui
"""
import json
import os
import sys
from datetime import datetime, timedelta

from tqdm import tqdm

# Make the project root importable when the script is run from the repo root.
sys.path.append(os.getcwd())
from functions.odps_function import PyODPS
  12. def generate_hourly_strings(start_date, end_date):
  13. """
  14. Generate hourly date_str
  15. :param start_date:
  16. :param end_date:
  17. :return:
  18. """
  19. start = datetime.strptime(start_date, '%Y%m%d%H')
  20. end = datetime.strptime(end_date, '%Y%m%d%H')
  21. current = start
  22. date_strings = []
  23. while current <= end:
  24. date_strings.append(current.strftime('%Y%m%d%H'))
  25. current += timedelta(hours=1)
  26. return date_strings
  27. def generate_daily_strings(start_date, end_date):
  28. """
  29. Generate daily date_str
  30. :param start_date:
  31. :param end_date:
  32. :return:
  33. """
  34. start = datetime.strptime(start_date, '%Y%m%d')
  35. end = datetime.strptime(end_date, '%Y%m%d')
  36. current = start
  37. date_strings = []
  38. while current <= end:
  39. date_strings.append(current.strftime('%Y%m%d'))
  40. current += timedelta(days=1)
  41. return date_strings
  42. def generate_label_date(now_dt):
  43. """
  44. Generate date in 3 days
  45. :param now_dt:
  46. :return:
  47. """
  48. now_date = datetime.strptime(now_dt, "%Y%m%d%H")
  49. three_date = now_date + timedelta(days=4)
  50. return three_date.strftime("%Y%m%d")
  51. class VideoDataGenerator(object):
  52. """
  53. 生成训练数据,测试数据
  54. """
  55. def __init__(self):
  56. self.oo = PyODPS()
  57. def get_hour_data(self, dt):
  58. """
  59. 获取小时级的新视频
  60. :param dt: 小时参数
  61. :return:
  62. """
  63. sql = f"""select * from loghubods.conten_quality_base_hour where dt = '{dt}';"""
  64. hour_data = self.oo.select(sql)
  65. result = []
  66. for line in hour_data:
  67. obj = {
  68. "uid": line['uid'],
  69. "video_id": line['videoid'],
  70. "type": line['type'],
  71. "channel": line['channel'],
  72. "fst": line['flowpool_start_type'],
  73. "fsl": line['flowpool_start_level'],
  74. "fet": line['flowpool_end_type'],
  75. "fel": line['flowpool_end_level'],
  76. "f_view": line['flowpool_distribute_view_times'],
  77. "f_share": line['flowpool_share_times'],
  78. "f_return": line['flowpool_return_users'],
  79. "f3_view": line['flowpool_3days_distribute_view_times'],
  80. "f3_share": line['flowpool_3days_share_times'],
  81. "f3_return": line['flowpool_3days_return_users'],
  82. "ros_dms": line['ros_dms'],
  83. "rov_dms": line['rov_dms'],
  84. "ros_sls": line['ros_sls'],
  85. "rov_sls": line['rov_sls'],
  86. "fans": line['fans'],
  87. "view_count_user_30days": line['view_cnt_user_30days'],
  88. "share_count_user_30days": line['share_cnt_user_30days'],
  89. "return_count_user_30days": line['return_cnt_user_30days'],
  90. "rov_user": line['rov_user'],
  91. "str_user": line['str_user'], # share / view
  92. "out_user_id": line['out_user_id'],
  93. "mode": line['strategy'],
  94. "out_play_cnt": line['out_play_cnt'],
  95. "out_like_cnt": line['out_like_cnt'],
  96. "out_share_cnt": line['out_share_cnt'],
  97. "out_collection_cnt": line['out_collection_cnt'],
  98. "up_level_time_hour": line['up_level_time_hour'],
  99. "dt": line['dt']
  100. }
  101. result.append(obj)
  102. return result
  103. def get_daily_data(self, dt):
  104. """
  105. 天级表里面存储了视频的表现 label, 通过小时级的 video_id 去获取视频的表现
  106. :param dt: 20240101
  107. :return: data_list
  108. """
  109. sql = f"""select * from loghubods.conten_quality_base where dt = '{dt}';"""
  110. data = self.oo.select(sql)
  111. result = [
  112. {
  113. "video_id": item['videoid'],
  114. "total_view": item['flowpool_distribute_view_times'],
  115. "total_share": item['flowpool_share_times'],
  116. "total_return": item['flowpool_return_users'],
  117. "3day_view": item['flowpool_3days_distribute_view_times'],
  118. "3day_share": item['flowpool_3days_share_times'],
  119. "3day_return": item['flowpool_3days_return_users'],
  120. "dt": item['dt']
  121. } for item in data
  122. ]
  123. return result
  124. if __name__ == '__main__':
  125. # date_list = generate_hourly_strings("2024010100", "2024013123")
  126. date_list = generate_daily_strings("20240101", "20240228")
  127. V = VideoDataGenerator()
  128. L = {}
  129. # print(date_list)
  130. for date_str in tqdm(date_list):
  131. L[date_str] = {}
  132. # data_list = V.get_hour_data(date_str)
  133. data_list = V.get_daily_data(date_str)
  134. for obj in tqdm(data_list):
  135. video_id = obj['video_id']
  136. L[date_str][video_id] = obj
  137. with open('data/jan_feb_label.json', 'w') as f:
  138. f.write(json.dumps(L, ensure_ascii=False, indent=4))