分析-模拟在线打分的AUC.py

import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
import sys
from scipy.stats import pearsonr
from itertools import combinations
from bisect import bisect_left, bisect_right, insort
def apply_title(row):
    # Strip embedded newlines from a title value; return the value unchanged on failure.
    try:
        return row.replace("\n", "")
    except Exception as e:
        print(str(e))
        return row
def func_make_data(file_path: str):
    # file_path may contain several CSV paths joined by commas.
    df_list = [pd.read_csv(file) for file in file_path.split(",")]
    df = pd.concat(df_list, ignore_index=True)
    print(df.columns.tolist())
    # df["title"] = df["title"].apply(apply_title)
    for col in [
        '日期', '小时', 'rank',
        '曝光量', '分享次数', '多层回流人数', 'return_rate', 'return_cnt',
        'str', 'rosn', 'rovn', 'vovh24',
        'score_563', 'score_562', 'score_567',
        'fmrov', 'hasreturnrovscore',
        'vov_score_563',  # needed below for score_563_offline
        'vov_score_562', 'vov_score_567'
    ]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    for col in ['实验组']:
        if col not in df.columns:
            df[col] = "无"
        else:
            df[col] = df[col].astype(str)
    # Simulated offline scores: the 562 variant multiplies in the vov score,
    # the 563/567 variants add it with different weights.
    df["score_552_offline"] = df["fmrov"] * (1 + df["hasreturnrovscore"])
    df["score_563_offline"] = df["fmrov"] * (1 + df["hasreturnrovscore"]) + 0.1 * df["vov_score_563"]
    df["score_562_offline"] = df["fmrov"] * (1 + df["hasreturnrovscore"]) * (1 + 1 * df["vov_score_562"])
    df["score_567_offline"] = df["fmrov"] * (1 + df["hasreturnrovscore"]) + 0.05 * df["vov_score_567"]
    df.fillna(0, inplace=True)
    df = df[df["实验组"] != "未知"].reset_index(drop=True)
    return df
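# Usage sketch: func_make_data accepts one or more CSV paths joined by commas,
# e.g. (the second path below is hypothetical, for illustration only)
#   df = func_make_data("~/Downloads/20241115_top1000.csv")
#   df = func_make_data("~/Downloads/20241114_top1000.csv,~/Downloads/20241115_top1000.csv")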
# Pearson correlation between two columns within one group (e.g. one hour).
def calculate_correlation(group, col_a, col_b):
    a = group[col_a]
    b = group[col_b]
    return pearsonr(a, b)[0]  # keep only the correlation coefficient, drop the p-value
def calculate_auc_v2(group, col_a, col_b):
    # Pairwise concordance ("AUC") of col_b against col_a: the share of pairs,
    # taken in increasing order of col_a, whose col_b values are also in increasing
    # order; ties on col_b count as half a success.
    sorted_group = group.sort_values(by=col_a).reset_index(drop=True)
    success_count = 0
    b_list = []  # sorted list of the col_b values seen so far (rows with smaller or equal col_a)
    for _, row in sorted_group.iterrows():
        current_b = row[col_b]
        # Binary search for current_b among the values already inserted:
        # pos_left of them are strictly smaller, each tied value contributes 0.5.
        pos_left = bisect_left(b_list, current_b)
        pos_right = bisect_right(b_list, current_b)
        pos = pos_left + (pos_right - pos_left) / 2
        success_count += pos
        # Insert the current col_b value, keeping b_list sorted.
        insort(b_list, current_b)
    # Normalize by the total number of pairs.
    total_combinations = len(group) * (len(group) - 1) / 2
    success_probability = success_count / total_combinations if total_combinations > 0 else 0
    return success_probability
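# A minimal sanity-check sketch (not part of the original pipeline), assuming the same
# tie handling as calculate_auc_v2: a brute-force O(n^2) version of the pairwise
# statistic, useful for verifying the bisect-based implementation on a small group,
# e.g. assert abs(calculate_auc_v2(g, "rovn", "score_563")
#                 - calculate_auc_bruteforce(g, "rovn", "score_563")) < 1e-9.
def calculate_auc_bruteforce(group, col_a, col_b):
    sorted_group = group.sort_values(by=col_a).reset_index(drop=True)
    b = sorted_group[col_b].tolist()
    n = len(b)
    success_count = 0.0
    for i in range(n):
        for j in range(i):
            # Pair (j, i): row j has col_a <= row i's col_a because of the sort.
            if b[j] < b[i]:
                success_count += 1    # concordant pair
            elif b[j] == b[i]:
                success_count += 0.5  # tie on col_b counts as half a success
    total_combinations = n * (n - 1) / 2
    return success_count / total_combinations if total_combinations > 0 else 0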
def func(df, rank_limit, col_a, col_b):
    # For each (hour, experiment group) pair, compute the pairwise AUC of col_b
    # against col_a over the rows whose rank is within rank_limit.
    c = df[df["rank"] <= rank_limit].groupby(['小时', '实验组']).apply(calculate_auc_v2, col_a=col_a, col_b=col_b).reset_index()
    c.columns = ['小时', '实验组', col_a + "-" + col_b]
    print("Finished computing {} vs {}.".format(col_a, col_b))
    return c
try:
    date_train = sys.argv[1]
except Exception as e:
    # Fall back to a local sample file when no path is passed on the command line.
    date_train = "~/Downloads/20241115_top1000.csv"
df = func_make_data(date_train)
for rank_limit in [100, 500, 1000]:
    print("date_train: {}, rank_limit: {}".format(date_train, rank_limit))
    # Online scores against the two targets (vovh24 and rovn) ...
    df_01 = func(df, rank_limit, "vovh24", "score_563")
    df_02 = func(df, rank_limit, "vovh24", "score_562")
    df_03 = func(df, rank_limit, "vovh24", "score_567")
    df_04 = func(df, rank_limit, "rovn", "score_563")
    df_05 = func(df, rank_limit, "rovn", "score_562")
    df_06 = func(df, rank_limit, "rovn", "score_567")
    # ... and the simulated offline scores against the same targets.
    df_07 = func(df, rank_limit, "vovh24", "score_563_offline")
    df_08 = func(df, rank_limit, "vovh24", "score_562_offline")
    df_09 = func(df, rank_limit, "vovh24", "score_567_offline")
    df_10 = func(df, rank_limit, "rovn", "score_563_offline")
    df_11 = func(df, rank_limit, "rovn", "score_562_offline")
    df_12 = func(df, rank_limit, "rovn", "score_567_offline")
    df_list = [df_01, df_02, df_03, df_04, df_05, df_06, df_07, df_08, df_09, df_10, df_11, df_12]
    df_merged = pd.concat(df_list, axis=1)
    # Each sub-frame has three columns (小时, 实验组, metric); keep the first 小时 column
    # plus the metric column (index 3*i+2) of every sub-frame, dropping 实验组.
    df_select = df_merged.iloc[:, [0] + [3*i+2 for i in range(len(df_list))]]
    df_select.to_csv("产品0_20241115_top1000-相关性-top{}.csv".format(rank_limit), index=False)