"""Ranking-agreement report between recommendation signals and the final score.

Reads one or more CSV exports, computes for every (hour, experiment group) the
fraction of row pairs that two columns order the same way, and writes one CSV
per rank cut-off with the experiment groups side by side.
"""
import sys
from bisect import bisect_left, bisect_right, insort
from itertools import combinations

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

# xgboost and tqdm are not referenced by any code in this file; they are
# imported on a best-effort basis so the report still runs where they are
# not installed.
try:
    import xgboost as xgb
except ImportError:
    xgb = None
try:
    from tqdm import tqdm
except ImportError:
    tqdm = None

HOUR_COL = "小时"
EXPERIMENT_COL = "实验组"

# Columns coerced to numbers by func_make_data (unparseable cells become NaN).
NUMERIC_COLUMNS = (
    '日期', '小时', 'rank', '曝光量', '分享次数', '多层回流人数',
    'return_rate', 'return_cnt', 'str', 'rosn', 'rovn', 'vovh24',
    'score', 'fmrov', 'hasreturnrovscore', 'alpha_vov', 'vovscore',
)

DEFAULT_INPUT = "~/Downloads/20241116.csv"
RANK_LIMITS = (100, 500, 1000)
EXPERIMENTS = ("563", "562", "567")
METRIC_PAIRS = (("vovh24", "score"), ("rovn", "score"))
# NOTE(review): the date in the output name is fixed and does not follow the
# input file name -- confirm whether that is intended.
OUTPUT_TEMPLATE = "20241116-相关性-top{}.csv"


def apply_title(row):
    """Return *row* with newline characters removed.

    Values that cannot be processed this way (for example ``None`` or a float)
    are reported on stdout and returned unchanged.
    """
    try:
        return row.replace("\n", "")
    except (AttributeError, TypeError) as e:
        print(str(e))
        return row


def func_make_data(file_path: str):
    """Load and normalise the input data.

    Args:
        file_path: One CSV path, or several paths separated by commas.
            Whitespace around each path and empty segments are ignored.

    Returns:
        A DataFrame with the columns in ``NUMERIC_COLUMNS`` coerced to
        numbers, the experiment-group column as strings ("无" when the column
        is absent), a derived ``p_rov`` column, missing values replaced by 0,
        and rows whose experiment group is "未知" removed.

    Raises:
        KeyError: If one of ``NUMERIC_COLUMNS`` is missing from the input.
    """
    paths = [part.strip() for part in file_path.split(",") if part.strip()]
    df = pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)
    print(df.columns.tolist())
    # Newline stripping of a "title" column via apply_title is currently
    # disabled.
    for col in NUMERIC_COLUMNS:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    if EXPERIMENT_COL not in df.columns:
        df[EXPERIMENT_COL] = "无"
    else:
        df[EXPERIMENT_COL] = df[EXPERIMENT_COL].astype(str)
    # Computed before the fill below, so a missing input yields p_rov == 0.
    df["p_rov"] = df["fmrov"] * (1 + df["hasreturnrovscore"])
    df = df.fillna(0)
    return df[df[EXPERIMENT_COL] != "未知"].reset_index(drop=True)


def calculate_correlation(group, col_a, col_b):
    """Return the Pearson correlation coefficient of two columns of *group*."""
    return pearsonr(group[col_a], group[col_b])[0]


def calculate_auc_v2(group, col_a, col_b):
    """Return the share of row pairs that *col_a* and *col_b* order alike.

    Rows are visited in ascending *col_a* order. Each row scores one point for
    every earlier row with a strictly smaller *col_b* and half a point for
    every earlier row with an equal *col_b*. The total is divided by the
    number of row pairs, ``n * (n - 1) / 2``.

    NOTE(review): rows tied on *col_a* get no special treatment; they are
    scored in whatever order ``sort_values`` leaves them.

    Returns:
        A float in ``[0, 1]``, or ``0`` when *group* has fewer than two rows.
    """
    ordered_b = group.sort_values(by=col_a)[col_b].tolist()
    seen_b = []  # col_b values visited so far, kept sorted
    success_count = 0
    for current_b in ordered_b:
        lo = bisect_left(seen_b, current_b)
        hi = bisect_right(seen_b, current_b)
        success_count += lo + (hi - lo) / 2
        # Same position insort() would pick, without searching again.
        seen_b.insert(hi, current_b)

    total_combinations = len(group) * (len(group) - 1) / 2
    return success_count / total_combinations if total_combinations > 0 else 0


def func(df, rank_limit, col_a, col_b):
    """Score *col_a* against *col_b* per (hour, experiment group).

    Args:
        df: Data with "rank", the hour column, the experiment-group column,
            *col_a* and *col_b*.
        rank_limit: Only rows with ``rank <= rank_limit`` are used.
        col_a: Column that defines the visiting order.
        col_b: Column whose ordering is compared against *col_a*.

    Returns:
        A DataFrame with the hour, the experiment group and a
        ``"<col_a>-<col_b>"`` column holding ``calculate_auc_v2`` per group.
        It has no rows when nothing passes the rank filter.
    """
    metric_name = col_a + "-" + col_b
    top = df[df["rank"] <= rank_limit]
    if top.empty:
        c = pd.DataFrame(columns=[HOUR_COL, EXPERIMENT_COL, metric_name])
    else:
        # Hand only the needed columns to apply(); dict.fromkeys drops the
        # duplicate when col_a == col_b.
        value_cols = list(dict.fromkeys((col_a, col_b)))
        scores = top.groupby([HOUR_COL, EXPERIMENT_COL])[value_cols].apply(
            calculate_auc_v2, col_a=col_a, col_b=col_b
        )
        c = scores.reset_index()
        c.columns = [HOUR_COL, EXPERIMENT_COL, metric_name]
    print("完成:{}和{}的计算。".format(col_a, col_b))
    return c


def _merge_by_hour(metric_frames, experiments):
    """Lay the metric of every (frame, experiment) side by side, keyed by hour."""
    pieces = []
    names = []
    for frame in metric_frames:
        metric_name = frame.columns[2]
        for experiment in experiments:
            rows = frame[frame[EXPERIMENT_COL] == experiment]
            pieces.append(rows.set_index(HOUR_COL)[metric_name])
            names.append(metric_name)
    # Align on the hour value rather than on row position, so a group that
    # lacks an hour leaves a gap instead of shifting its later values up.
    merged = pd.concat(pieces, axis=1, keys=range(len(pieces)))
    merged.columns = names
    return merged.sort_index().rename_axis(HOUR_COL).reset_index()


def build_report(df, rank_limit):
    """Return the per-hour comparison table for one rank cut-off.

    The first column is the hour. It is followed, for each pair in
    ``METRIC_PAIRS``, by one column per entry of ``EXPERIMENTS`` in that
    order. All columns of one pair share the name ``"<col_a>-<col_b>"``.
    """
    frames = [func(df, rank_limit, col_a, col_b) for col_a, col_b in METRIC_PAIRS]
    return _merge_by_hour(frames, EXPERIMENTS)


def main(argv=None):
    """Build and write one report per rank cut-off.

    Args:
        argv: Command-line arguments (defaults to ``sys.argv``). ``argv[1]``
            is the input path list; ``DEFAULT_INPUT`` is used when absent.
    """
    argv = sys.argv if argv is None else argv
    date_train = argv[1] if len(argv) > 1 else DEFAULT_INPUT

    df = func_make_data(date_train)
    for rank_limit in RANK_LIMITS:
        print("date_train:rank_limit:{}-{}".format(date_train, rank_limit))
        report = build_report(df, rank_limit)
        report.to_csv(OUTPUT_TEMPLATE.format(rank_limit), index=False)


if __name__ == "__main__":
    main()