分析-模拟在线打分的AUC.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. import pandas as pd
  2. import numpy as np
  3. import xgboost as xgb
  4. from tqdm import tqdm
  5. import sys
  6. from scipy.stats import pearsonr
  7. from itertools import combinations
  8. from bisect import bisect_left, bisect_right, insort
  9. def apply_title(row):
  10. try:
  11. return row.replace("\n", "")
  12. except Exception as e:
  13. print(str(e))
  14. return row
  15. def func_make_data(file_path: str):
  16. df_list = [pd.read_csv(file) for file in file_path.split(",")]
  17. df = pd.concat(df_list, ignore_index=True)
  18. print(df.columns.tolist())
  19. # df["title"] = df["title"].apply(apply_title)
  20. for col in [
  21. '日期', '小时', 'rank',
  22. '曝光量', '分享次数', '多层回流人数', 'return_rate', 'return_cnt',
  23. 'str', 'rosn', 'rovn', 'vovh24',
  24. 'score_552', 'score_562', 'score_567',
  25. 'fmrov', 'hasreturnrovscore', 'vov_score_562', 'vov_score_567'
  26. ]:
  27. df[col] = pd.to_numeric(df[col], errors='coerce')
  28. for col in ['实验组']:
  29. if col not in df.columns:
  30. df[col] = "无"
  31. else:
  32. df[col] = df[col].astype(str)
  33. df["score_552_offline"] = df["fmrov"] * (1 + df["hasreturnrovscore"])
  34. df["score_562_offline"] = df["fmrov"] * (1 + df["hasreturnrovscore"]) * (1 + 1 * df["vov_score_562"])
  35. df["score_567_offline"] = df["fmrov"] * (1 + df["hasreturnrovscore"]) + 0.05 * df["vov_score_567"]
  36. df.fillna(0, inplace=True)
  37. df = df[df["实验组"] != "未知"].reset_index(drop=True)
  38. return df
  39. # 计算每个 hour 的皮尔逊相关系数
  40. def calculate_correlation(group, col_a, col_b):
  41. a = group[col_a]
  42. b = group[col_b]
  43. return pearsonr(a, b)[0] # 取皮尔逊相关系数
  44. def calculate_auc_v2(group, col_a, col_b):
  45. sorted_group = group.sort_values(by=col_a).reset_index(drop=True)
  46. success_count = 0
  47. b_list = [] # 维护一个有序的 col_b 列值列表
  48. for index, row in sorted_group.iterrows():
  49. current_a = row[col_a]
  50. current_b = row[col_b]
  51. # 使用二分法查找 b_list 中当前值的位置
  52. pos_left = bisect_left(b_list, current_b)
  53. pos_right = bisect_right(b_list, current_b)
  54. pos = pos_left + (pos_right - pos_left)/2
  55. # 成功的计数:所有当前值之前的值都小于等于 current_a
  56. success_count += pos # 在 b_list 中的值个数即为成功计数
  57. # 插入当前 col_b 值到 b_list 中
  58. insort(b_list, current_b)
  59. # 计算成功概率
  60. total_combinations = len(group) * (len(group) - 1) / 2
  61. success_probability = success_count / total_combinations if total_combinations > 0 else 0
  62. return success_probability
  63. def func(df, rank_limit, col_a, col_b):
  64. c = df[df["rank"] <= rank_limit].groupby(['小时', '实验组']).apply(calculate_auc_v2, col_a=col_a, col_b=col_b).reset_index()
  65. c.columns = ['小时', '实验组', col_a + "-" + col_b]
  66. print("完成:{}和{}的计算。".format(col_a, col_b))
  67. return c
  68. try:
  69. date_train = sys.argv[1]
  70. except Exception as e:
  71. date_train = "~/Downloads/20241109_top1000(1).csv"
  72. df = func_make_data(date_train)
  73. for rank_limit in [100, 500, 1000]:
  74. print("date_train:rank_limit:{}-{}".format(date_train, rank_limit))
  75. df_01 = func(df, rank_limit, "vovh24", "score_552")
  76. df_02 = func(df, rank_limit, "vovh24", "score_562")
  77. df_03 = func(df, rank_limit, "vovh24", "score_567")
  78. df_04 = func(df, rank_limit, "rovn", "score_552")
  79. df_05 = func(df, rank_limit, "rovn", "score_562")
  80. df_06 = func(df, rank_limit, "rovn", "score_567")
  81. df_07 = func(df, rank_limit, "vovh24", "score_552_offline")
  82. df_08 = func(df, rank_limit, "vovh24", "score_562_offline")
  83. df_09 = func(df, rank_limit, "vovh24", "score_567_offline")
  84. df_10 = func(df, rank_limit, "rovn", "score_552_offline")
  85. df_11 = func(df, rank_limit, "rovn", "score_562_offline")
  86. df_12 = func(df, rank_limit, "rovn", "score_567_offline")
  87. df_list = [df_01, df_02, df_03, df_04, df_05, df_06, df_07, df_08, df_09, df_10, df_11, df_12]
  88. df_merged = pd.concat(df_list, axis=1)
  89. df_select = df_merged.iloc[:, [0] + [3*i+2 for i in range(len(df_list))]]
  90. df_select.to_csv("产品4_20241109_top1000-相关性-top{}.csv".format(rank_limit), index=False)