1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- import pandas as pd
- import numpy as np
- import xgboost as xgb
- from tqdm import tqdm
- import sys
- from scipy.stats import pearsonr
- from itertools import combinations
- from bisect import bisect_left, bisect_right, insort
def apply_title(row):
    """Return ``row`` with every newline character removed.

    The cleanup is best-effort: if calling ``row.replace`` raises (for
    example because ``row`` has no ``replace`` method), the error message is
    printed and ``row`` is returned unchanged.
    """
    try:
        cleaned = row.replace("\n", "")
    except Exception as err:
        # Report the problem but keep going with the untouched value.
        print(str(err))
        return row
    else:
        return cleaned
def func_make_data(file_path: str):
    """Load one or more CSV files and prepare them for the ranking metrics.

    Args:
        file_path: One CSV path, or several paths joined with ``","``.

    Returns:
        A single DataFrame (index reset) in which the metric columns are
        numeric, ``实验组`` is a string column (``"无"`` when the input has no
        such column), ``p_rov`` is ``fmrov * (1 + hasreturnrovscore)``, every
        missing value is replaced by 0, and rows whose ``实验组`` equals
        ``"未知"`` are removed.

    Raises:
        KeyError: If one of the expected metric columns is absent.
    """
    frames = []
    for path in file_path.split(","):
        frames.append(pd.read_csv(path))
    df = pd.concat(frames, ignore_index=True)
    print(df.columns.tolist())

    # NOTE: title cleanup through apply_title is currently not applied here.
    numeric_columns = (
        '日期', '小时', 'rank',
        '曝光量', '分享次数', '多层回流人数', 'return_rate', 'return_cnt',
        'str', 'rosn', 'rovn', 'vovh24',
        'score', 'fmrov', 'hasreturnrovscore', 'alpha_vov', 'vovscore',
    )
    for name in numeric_columns:
        # Values that cannot be parsed become NaN (and later 0 via fillna).
        df[name] = pd.to_numeric(df[name], errors='coerce')

    group_column = '实验组'
    if group_column in df.columns:
        df[group_column] = df[group_column].astype(str)
    else:
        df[group_column] = "无"

    # Computed before fillna, so a NaN in either factor yields p_rov == 0.
    df["p_rov"] = df["fmrov"] * (1 + df["hasreturnrovscore"])
    df.fillna(0, inplace=True)

    is_known_group = df["实验组"] != "未知"
    return df[is_known_group].reset_index(drop=True)
# Compute the Pearson correlation coefficient for each hour
def calculate_correlation(group, col_a, col_b):
    """Return the Pearson correlation coefficient of two columns of ``group``.

    Args:
        group: Table-like object indexable by column name (e.g. one group of
            a ``groupby``).
        col_a: Name of the first column.
        col_b: Name of the second column.

    Returns:
        The correlation coefficient reported by ``scipy.stats.pearsonr``;
        the accompanying p-value is discarded.
    """
    coefficient, _p_value = pearsonr(group[col_a], group[col_b])
    return coefficient
def calculate_auc_v2(group, col_a, col_b) -> float:
    """Score how well the ordering by ``col_b`` agrees with the ordering by ``col_a``.

    Rows are sorted by ``col_a`` in ascending order. Walking through them in
    that order, each row earns one point for every earlier row with a
    strictly smaller ``col_b`` and half a point for every earlier row with an
    equal ``col_b``. The points are divided by the number of row pairs,
    ``n * (n - 1) / 2``.

    Args:
        group: DataFrame holding both columns (e.g. one ``groupby`` group).
        col_a: Column that defines the reference ordering.
        col_b: Column whose ordering is being evaluated.

    Returns:
        A float in ``[0, 1]``: 1.0 when ``col_b`` strictly increases along
        ``col_a``, 0.0 when it strictly decreases, and 0.0 when ``group`` has
        fewer than two rows.

    NOTE(review): rows tied on ``col_a`` are scored in whatever order the
    sort leaves them, so such pairs are not treated symmetrically — confirm
    whether ties on ``col_a`` need dedicated handling.
    """
    # Only the col_b values matter once the rows are ordered by col_a, so
    # iterate over that column directly instead of building one Series per
    # row with iterrows().
    ordered_b = group.sort_values(by=col_a)[col_b].tolist()

    success_count = 0.0
    seen_b = []  # col_b values of the rows already visited, kept sorted
    for current_b in ordered_b:
        # Binary search: how many earlier values are smaller / equal.
        smaller = bisect_left(seen_b, current_b)
        smaller_or_equal = bisect_right(seen_b, current_b)
        success_count += smaller + (smaller_or_equal - smaller) / 2
        insort(seen_b, current_b)

    n_rows = len(ordered_b)
    total_combinations = n_rows * (n_rows - 1) / 2
    if total_combinations <= 0:
        # Zero or one row: there is no pair to compare.
        return 0.0
    return success_count / total_combinations
def func(df, rank_limit, col_a, col_b):
    """Compute the ``calculate_auc_v2`` score per hour and experiment group.

    Args:
        df: DataFrame with at least ``rank``, ``小时``, ``实验组``, ``col_a``
            and ``col_b`` columns.
        rank_limit: Only rows with ``rank <= rank_limit`` are scored.
        col_a: Column passed to ``calculate_auc_v2`` as the reference ordering.
        col_b: Column passed to ``calculate_auc_v2`` as the evaluated ordering.

    Returns:
        A DataFrame with the columns ``小时``, ``实验组`` and
        ``"<col_a>-<col_b>"``, one row per (hour, experiment group).
    """
    top_rows = df[df["rank"] <= rank_limit]
    per_group = top_rows.groupby(['小时', '实验组'])
    scores = per_group.apply(calculate_auc_v2, col_a=col_a, col_b=col_b)

    c = scores.reset_index()
    c.columns = ['小时', '实验组', col_a + "-" + col_b]
    print("完成:{}和{}的计算。".format(col_a, col_b))
    return c
# Input CSV path(s): the first command-line argument when given, otherwise
# the default file below.
date_train = sys.argv[1] if len(sys.argv) > 1 else "~/Downloads/20241116.csv"
df = func_make_data(date_train)

for rank_limit in [100, 500, 1000]:
    print("date_train:rank_limit:{}-{}".format(date_train, rank_limit))
    df_01 = func(df, rank_limit, "vovh24", "score")
    df_02 = func(df, rank_limit, "rovn", "score")

    # One frame per (metric table, experiment group), metric table outermost.
    df_list = [
        df_tmp[df_tmp["实验组"] == experiment].reset_index(drop=True)
        for df_tmp in [df_01, df_02]
        for experiment in ["563", "562", "567"]
    ]

    # Place the frames side by side; rows are matched by their reset index.
    df_merged = pd.concat(df_list, axis=1)

    # Every frame contributes three columns, so the score columns sit at
    # positions 2, 5, 8, ...; keep them plus the very first (hour) column.
    score_positions = list(range(2, 3 * len(df_list), 3))
    df_select = df_merged.iloc[:, [0] + score_positions]
    df_select.to_csv("20241116-相关性-top{}.csv".format(rank_limit), index=False)
|