|
@@ -8,7 +8,8 @@ from hdfs import InsecureClient
|
|
|
|
|
|
client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")
|
|
client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")
|
|
|
|
|
|
-SEGMENT_BASE_PATH = "/dw/recommend/model/36_score_calibration_file"
|
|
|
|
|
|
+SEGMENT_BASE_PATH = os.environ.get("SEGMENT_BASE_PATH", "/dw/recommend/model/36_score_calibration_file")
|
|
|
|
+PREDICT_CACHE_PATH = os.environ.get("PREDICT_CACHE_PATH", "/root/zhaohp/XGB/predict_cache")
|
|
|
|
|
|
|
|
|
|
def read_predict_from_local_txt(txt_file) -> list:
|
|
def read_predict_from_local_txt(txt_file) -> list:
|
|
@@ -81,10 +82,7 @@ def segment_calc_diff_rate_by_score(df: pd.DataFrame, segment_file_path: str, st
|
|
segment_score_avg=('score', 'mean'),
|
|
segment_score_avg=('score', 'mean'),
|
|
).reset_index()
|
|
).reset_index()
|
|
group_df['segment_true_score'] = group_df['segment_label_sum'] / group_df['segment_label_cnt']
|
|
group_df['segment_true_score'] = group_df['segment_label_sum'] / group_df['segment_label_cnt']
|
|
- group_df['segment_diff_rate_origin'] = (group_df['segment_score_avg'] / group_df['segment_true_score'] - 1).mask(group_df['segment_true_score'] == 0, 0)
|
|
|
|
-
|
|
|
|
- # 使用滑动窗口计算当前值以及上下两行的平均值,作为新的diff_rate
|
|
|
|
- group_df['segment_diff_rate'] = group_df['segment_diff_rate_origin'].rolling(window=5, center=True, min_periods=1).mean()
|
|
|
|
|
|
+ group_df['segment_diff_rate'] = (group_df['segment_score_avg'] / group_df['segment_true_score'] - 1).mask(group_df['segment_true_score'] == 0, 0)
|
|
|
|
|
|
# 完整的分段文件保存
|
|
# 完整的分段文件保存
|
|
csv_data = group_df.to_csv(sep="\t", index=False)
|
|
csv_data = group_df.to_csv(sep="\t", index=False)
|
|
@@ -100,7 +98,7 @@ def segment_calc_diff_rate_by_score(df: pd.DataFrame, segment_file_path: str, st
|
|
return merged_df, filtered_df
|
|
return merged_df, filtered_df
|
|
|
|
|
|
|
|
|
|
-def read_and_calibration_predict(predict_path: str, step=100) -> [pd.DataFrame, pd.DataFrame]:
|
|
|
|
|
|
+def read_and_calibration_predict(predict_path: str, step=100) -> [pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
|
"""
|
|
"""
|
|
读取评估结果,并进行校准
|
|
读取评估结果,并进行校准
|
|
"""
|
|
"""
|
|
@@ -127,12 +125,26 @@ def read_and_calibration_predict(predict_path: str, step=100) -> [pd.DataFrame,
|
|
).reset_index()
|
|
).reset_index()
|
|
grouped_df['true_ctcvr'] = grouped_df['conv'] / grouped_df['view']
|
|
grouped_df['true_ctcvr'] = grouped_df['conv'] / grouped_df['view']
|
|
|
|
|
|
- return grouped_df, segment_df
|
|
|
|
|
|
+ return df, grouped_df, segment_df
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def predict_local_save_for_auc(old_df: pd.DataFrame, new_df: pd.DataFrame):
    """
    Save a local copy of the evaluation results, used for computing AUC.

    For each of the two frames, two CSV files (no header, no index column)
    are written under ``PREDICT_CACHE_PATH``:

    * ``old_1.csv`` / ``new_1.csv`` -- the ``label`` and ``score`` columns
    * ``old_2.csv`` / ``new_2.csv`` -- the ``label`` and ``score_2`` columns

    The target directory is created first if it does not exist, so the
    export does not fail on a machine where the cache path has never been
    set up.

    :param old_df: frame saved under the ``old`` prefix; must contain the
        ``label``, ``score`` and ``score_2`` columns.
    :param new_df: frame saved under the ``new`` prefix; same required columns.
    :raises KeyError: if a required column is missing from either frame.
    :raises OSError: if the cache directory cannot be created or written to.
    """
    # Local import keeps this block self-contained regardless of how the
    # module-level imports are arranged.
    import os

    # DataFrame.to_csv refuses to write into a non-existent directory, so make
    # sure the cache directory exists before the first write.
    if PREDICT_CACHE_PATH:
        os.makedirs(PREDICT_CACHE_PATH, exist_ok=True)

    frames = (("old", old_df), ("new", new_df))
    # File suffix -> name of the score column exported next to ``label``.
    score_columns = (("1", "score"), ("2", "score_2"))

    for key, frame in frames:
        for suffix, score_col in score_columns:
            frame[["label", score_col]].to_csv(
                f"{PREDICT_CACHE_PATH}/{key}_{suffix}.csv",
                index=False,
                header=False,
            )
|
|
|
|
|
|
|
|
|
|
def _main(old_predict_path: str, new_predict_path: str, calibration_file: str, analyse_file: str):
|
|
def _main(old_predict_path: str, new_predict_path: str, calibration_file: str, analyse_file: str):
|
|
- old_group_df, old_segment_df = read_and_calibration_predict(old_predict_path)
|
|
|
|
- new_group_df, new_segment_df = read_and_calibration_predict(new_predict_path)
|
|
|
|
|
|
+ old_df, old_group_df, old_segment_df = read_and_calibration_predict(old_predict_path)
|
|
|
|
+ new_df, new_group_df, new_segment_df = read_and_calibration_predict(new_predict_path)
|
|
|
|
+
|
|
|
|
+ predict_local_save_for_auc(old_df, new_df)
|
|
|
|
|
|
# 分段文件保存, 此处保留的最后使用的分段文件,不是所有的分段
|
|
# 分段文件保存, 此处保留的最后使用的分段文件,不是所有的分段
|
|
new_segment_df.to_csv(calibration_file, sep='\t', index=False, header=False)
|
|
new_segment_df.to_csv(calibration_file, sep='\t', index=False, header=False)
|