cal_24h_score.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
# coding: utf-8
  2. import sys
  3. import json
  4. import math
  5. import pandas as pd
  6. features = [
  7. 'apptype',
  8. 'videoid',
  9. 'preview人数', # 过去24h预曝光人数
  10. 'view人数', # 过去24h曝光人数
  11. 'play人数', # 过去24h播放人数
  12. 'share人数', # 过去24h分享人数
  13. '回流人数', # 过去24h分享,过去24h回流人数
  14. 'preview次数', # 过去24h预曝光次数
  15. 'view次数', # 过去24h曝光次数
  16. 'play次数', # 过去24h播放次数
  17. 'share次数', # 过去24h分享次数
  18. 'platform_return',
  19. 'platform_preview',
  20. 'platform_preview_total',
  21. 'platform_show',
  22. 'platform_show_total',
  23. 'platform_view',
  24. 'platform_view_total',
  25. ]
  26. def data_group(data_path):
  27. """将数据按照videoid聚合(求和)"""
  28. f = open(data_path)
  29. index = 0
  30. data_dict = {}
  31. while True:
  32. line = f.readline()
  33. if not line:
  34. break
  35. if index == 0:
  36. index += 1
  37. continue
  38. index += 1
  39. items = line.strip().split(",")
  40. # print(items)
  41. if len(items) < len(features):
  42. continue
  43. video_id = items[1]
  44. if video_id not in data_dict:
  45. data_dict[video_id] = {'videoid': video_id}
  46. for i, feature in enumerate(features):
  47. if feature in ['apptype', 'videoid']:
  48. continue
  49. data_dict[video_id][feature] = int(float(items[i]))
  50. else:
  51. for i, feature in enumerate(features):
  52. if feature in ['apptype', 'videoid']:
  53. continue
  54. data_dict[video_id][feature] = data_dict[video_id][feature] + int(float(items[i]))
  55. f.close()
  56. data_list = [item for video_id, item in data_dict.items()]
  57. data_df = pd.DataFrame(data_list)
  58. return data_df
  59. def cal_score(data_df):
  60. """计算score"""
  61. df = data_df.copy()
  62. # score1 = 回流/(view+10)
  63. df['24h_score1'] = df['回流人数'] / (df['view人数'] + 10)
  64. score_df = df[['videoid', '24h_score1']]
  65. # print(score_df)
  66. return score_df
  67. if __name__ == "__main__":
  68. # 1.load data
  69. now_date = sys.argv[1]
  70. print(f"now_date: {now_date}")
  71. data_path = f"./data/24h_video_data_{now_date}.csv"
  72. data_df = data_group(data_path=data_path)
  73. print(f"data_df shape: {data_df.shape}")
  74. hour_score_path = f"./data/24h_score_{now_date}.csv"
  75. score_df = cal_score(data_df=data_df)
  76. score_df.to_csv(hour_score_path, index=False)
  77. print(f"score_df shape: {score_df.shape}")