cal_hour_score.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  # -*- coding: utf-8 -*-
  2. import sys
  3. import traceback
  4. import math
  5. import pandas as pd
  6. from my_utils import send_msg_to_feishu
  7. from my_config import set_config
  8. from log import Log
  # Load the project configuration. set_config() is unpacked into two values;
  # only the first one is used by the code in this script.
  config_, _ = set_config()
  # Logger instance used by the script entry point for progress and error messages.
  log_ = Log()
  # Column layout of one row of the hourly video CSV, in file order.
  # data_group() reads each value by its position in this list, so the order
  # must match the order of the fields in the data file.
  features = [
      'apptype',
      'code',
      'videoid',
      'lastonehour_preview',  # users pre-exposed in the past hour
      'lastonehour_view',  # users exposed in the past hour
      'lastonehour_play',  # users who played in the past hour
      'lastonehour_share',  # users who shared in the past hour
      'lastonehour_return',  # shared in the past hour, users who returned in the past hour
      'lastonehour_preview_total',  # number of pre-exposures in the past hour
      'lastonehour_view_total',  # number of exposures in the past hour
      'lastonehour_play_total',  # number of plays in the past hour
      'lastonehour_share_total',  # number of shares in the past hour
      'platform_return',
      'lastonehour_show',  # not split by region
      'lastonehour_show_region',  # grouped by region
      'lasttwohour_share',  # users who shared in hour h-2
      'lasttwohour_return_now',  # shared in hour h-2, users who returned in the past hour
      'lasttwohour_return',  # shared in hour h-2, users who returned in hour h-2
      'lastthreehour_share',  # users who shared in hour h-3
      'lastthreehour_return_now',  # shared in hour h-3, users who returned in the past hour
      'lastthreehour_return',  # shared in hour h-3, users who returned in hour h-3
      # The "*_new" columns count returns brought back by shares from the
      # corresponding region: the share is restricted to the region, the
      # return is not.
      'lastonehour_return_new',  # shared in the past hour, users who returned in the past hour
      'lasttwohour_return_now_new',  # shared in hour h-2, users who returned in the past hour
      'lasttwohour_return_new',  # shared in hour h-2, users who returned in hour h-2
      'lastthreehour_return_now_new',  # shared in hour h-3, users who returned in the past hour
      'lastthreehour_return_new',  # shared in hour h-3, users who returned in hour h-3
      'platform_return_new',  # platform-distributed returns
  ]
  def data_group(data_path):
      """Aggregate the hourly video CSV by video id, summing every metric column.

      The first line of the file is treated as a header and skipped. Every
      other line is split on commas and read positionally according to the
      module-level ``features`` list. Lines with fewer fields than
      ``features`` has entries are ignored.

      Args:
          data_path: Path of the comma-separated data file.

      Returns:
          A DataFrame with one row per video id, in first-seen order. The
          first column is ``videoid`` (string), followed by one summed integer
          column per metric, in ``features`` order. The ``apptype`` and
          ``code`` columns are dropped. If the file has no usable data rows
          the frame is empty but still carries all of these columns.

      Raises:
          OSError: If the file cannot be opened.
          ValueError: If a metric field cannot be parsed as a number.
      """
      # Identifier columns are not summed; everything else is a metric.
      key_columns = ('apptype', 'code', 'videoid')
      metric_columns = [
          (position, name)
          for position, name in enumerate(features)
          if name not in key_columns
      ]
      video_position = features.index('videoid')
      field_count = len(features)

      totals = {}
      # The context manager closes the file even when a row fails to parse.
      with open(data_path) as data_file:
          next(data_file, None)  # skip the header line
          for line in data_file:
              items = line.strip().split(",")
              if len(items) < field_count:
                  continue
              video_id = items[video_position]
              row = totals.setdefault(video_id, {'videoid': video_id})
              for position, name in metric_columns:
                  # Values may be written as floats ("3.0"); int() truncates
                  # them toward zero before they are summed.
                  row[name] = row.get(name, 0) + int(float(items[position]))

      # Passing the columns explicitly keeps the schema stable for empty input.
      columns = ['videoid'] + [name for _, name in metric_columns]
      return pd.DataFrame(list(totals.values()), columns=columns)
  def cal_score(data_df):
      """Compute the six hourly scores for every video.

      Args:
          data_df: DataFrame with one row per video. It must contain
              ``videoid`` plus the columns ``lastonehour_view``,
              ``lastonehour_share``, ``lastonehour_return``,
              ``lasttwohour_share``, ``lasttwohour_return_now``,
              ``lasttwohour_return``, ``lastthreehour_share``,
              ``lastthreehour_return_now`` and ``lastthreehour_return``.
              The input frame is not modified.

      Returns:
          A DataFrame with the columns ``videoid`` and ``hour_score1`` to
          ``hour_score6``, in the same row order as the input.
      """
      df = data_df.copy()

      view = df['lastonehour_view']
      share = df['lastonehour_share']
      returned = df['lastonehour_return']
      returned_now_2h = df['lasttwohour_return_now']
      returned_now_3h = df['lastthreehour_return_now']

      # The added constants in numerators and denominators are smoothing terms.
      # share_rate_view = (share + 1) / (view + 1000)
      share_rate_view = (share + 1) / (view + 1000)
      # back_rate = (return + 1) / (share + 10), for the h-1, h-2 and h-3 shares
      back_rate = (returned + 1) / (share + 10)
      back_rate_2h = (returned_now_2h + 1) / (df['lasttwohour_share'] + 10)
      back_rate_3h = (returned_now_3h + 1) / (df['lastthreehour_share'] + 10)
      log_back = (returned + 1).apply(math.log)

      # Return retention of the h-2 shares, capped at 1.
      return_retention_2h = (
          (returned_now_2h + 1) / (df['lasttwohour_return'] + 5)
      ).clip(upper=1)
      # Return retention of the h-3 shares, capped at 0.8.
      return_retention_3h = (
          (returned_now_3h + 1) / (df['lastthreehour_return'] + 10)
      ).clip(upper=0.8)

      # score1 = return / (view + 5)
      df['hour_score1'] = returned / (view + 5)
      # score2 = return * (1 + h-2 retention + h-3 retention) / (view + 1000)
      df['hour_score2'] = (
          returned * (1 + return_retention_2h + return_retention_3h)
      ) / (view + 1000)
      # score3 = (h-3 return_now + h-2 return_now + return) / (view + 1000)
      df['hour_score3'] = (returned_now_3h + returned_now_2h + returned) / (view + 1000)
      # score4 = share_rate_view * back_rate * log_back
      df['hour_score4'] = share_rate_view * back_rate * log_back
      # score5 = share_rate_view * (back_rate + back_rate_2h + back_rate_3h) * log_back
      df['hour_score5'] = share_rate_view * (back_rate + back_rate_2h + back_rate_3h) * log_back
      # score6 = return / (view + 5) * back_rate
      df['hour_score6'] = returned / (view + 5) * back_rate

      score_columns = [
          'videoid',
          'hour_score1',
          'hour_score2',
          'hour_score3',
          'hour_score4',
          'hour_score5',
          'hour_score6',
      ]
      return df[score_columns]
  if __name__ == "__main__":
      # Script entry point: aggregate one hourly data file, score it and write
      # the scores next to it. Any failure is logged and reported to Feishu.
      try:
          # The first command-line argument is the tag embedded in the input
          # and output file names.
          now_date = sys.argv[1]
          log_.info(f"now_date: {now_date}")

          data_path = f"./data/hour_video_data_{now_date}.csv"
          data_df = data_group(data_path=data_path)
          log_.info(f"hour data_df shape: {data_df.shape}")

          hour_score_path = f"./data/hour_score_{now_date}.csv"
          score_df = cal_score(data_df=data_df)
          score_df.to_csv(hour_score_path, index=False)
          log_.info(f"hour score_df shape: {score_df.shape}")
      except Exception as e:
          # Top-level boundary: format the traceback once and reuse it for both
          # the log entry and the alert message.
          error_traceback = traceback.format_exc()
          log_.error(f"rank 小时级分值更新失败, exception: {e}, traceback: {error_traceback}")
          send_msg_to_feishu(
              webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
              key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
              msg_text=f"rov-offline{config_.ENV_TEXT} - rank 小时级分值更新失败\n"
                       f"exception: {e}\n"
                       f"traceback: {error_traceback}"
          )