
feat: add logic for saving model scores locally

zhaohaipeng, 8 months ago
parent
commit 383d06c462
4 changed files with 89 additions and 11 deletions
  1. model/auc.py (+6 -0)
  2. model/model_predict_analyse.py (+20 -5)
  3. script/t.py (+54 -0)
  4. vov/data_download.py (+9 -6)

+ 6 - 0
model/auc.py

@@ -0,0 +1,6 @@
+import os
+import sys
+
+if __name__ == '__main__':
+    print("a b c")
+    print(os.environ.get("PREDICT_CACHE_PATH"))

+ 20 - 5
model/model_predict_analyse.py

@@ -8,7 +8,8 @@ from hdfs import InsecureClient
 
 client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")
 
-SEGMENT_BASE_PATH = "/dw/recommend/model/36_score_calibration_file"
+SEGMENT_BASE_PATH = os.environ.get("SEGMENT_BASE_PATH", "/dw/recommend/model/36_score_calibration_file")
+PREDICT_CACHE_PATH = os.environ.get("PREDICT_CACHE_PATH", "/root/zhaohp/XGB/predict_cache")
 
 
 def read_predict_from_local_txt(txt_file) -> list:
@@ -97,7 +98,7 @@ def segment_calc_diff_rate_by_score(df: pd.DataFrame, segment_file_path: str, st
     return merged_df, filtered_df
 
 
-def read_and_calibration_predict(predict_path: str, step=100) -> [pd.DataFrame, pd.DataFrame]:
+def read_and_calibration_predict(predict_path: str, step=100) -> [pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     Read the evaluation results and apply calibration
     """
@@ -124,12 +125,26 @@ def read_and_calibration_predict(predict_path: str, step=100) -> [pd.DataFrame,
     ).reset_index()
     grouped_df['true_ctcvr'] = grouped_df['conv'] / grouped_df['view']
 
-    return grouped_df, segment_df
+    return df, grouped_df, segment_df
+
+
+def predict_local_save_for_auc(old_df: pd.DataFrame, new_df: pd.DataFrame):
+    """
+    Save a local copy of the evaluation results, used for AUC calculation
+    """
+    d = {"old": old_df, "new": new_df}
+    for key in d:
+        df = d[key][['label', "score"]]
+        df.to_csv(f"{PREDICT_CACHE_PATH}/{key}_1.csv", index=False, header=False)
+        df = d[key][['label', "score_2"]]
+        df.to_csv(f"{PREDICT_CACHE_PATH}/{key}_2.csv", index=False, header=False)
 
 
 def _main(old_predict_path: str, new_predict_path: str, calibration_file: str, analyse_file: str):
-    old_group_df, old_segment_df = read_and_calibration_predict(old_predict_path)
-    new_group_df, new_segment_df = read_and_calibration_predict(new_predict_path)
+    old_df, old_group_df, old_segment_df = read_and_calibration_predict(old_predict_path)
+    new_df, new_group_df, new_segment_df = read_and_calibration_predict(new_predict_path)
+
+    predict_local_save_for_auc(old_df, new_df)
 
     # Segment file save: only the last segment file actually used is kept here, not all segments
     new_segment_df.to_csv(calibration_file, sep='\t', index=False, header=False)
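
Note: SEGMENT_BASE_PATH and PREDICT_CACHE_PATH now fall back to os.environ.get defaults, so both can be overridden per run. A minimal sketch of the override pattern, using illustrative /tmp paths (only the variable names and defaults come from the diff):

import os

# Hypothetical per-run override; set before the module-level constants are evaluated.
os.environ["PREDICT_CACHE_PATH"] = "/tmp/predict_cache"
os.environ["SEGMENT_BASE_PATH"] = "/tmp/segment_files"

# Same fallback pattern as the module-level constants in model_predict_analyse.py.
predict_cache = os.environ.get("PREDICT_CACHE_PATH", "/root/zhaohp/XGB/predict_cache")
segment_base = os.environ.get("SEGMENT_BASE_PATH", "/dw/recommend/model/36_score_calibration_file")
print(predict_cache, segment_base)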

+ 54 - 0
script/t.py

@@ -0,0 +1,54 @@
+import json
+import pandas as pd
+
+feature = ["1_vovh0分子",
+           "2_vovh0分子",
+           "2_vovh1分子",
+           "3_vovh0分子",
+           "3_vovh1分子",
+           "3_vovh2分子",
+           "4_vovh0分子",
+           "4_vovh1分子",
+           "4_vovh3分子",
+           "7_vovh0分子",
+           "7_vovh1分子",
+           "7_vovh6分子",
+           "13_vovh0分子",
+           "13_vovh1分子",
+           "13_vovh12分子",
+           "25_vovh0分子",
+           "25_vovh1分子",
+           "25_vovh24分子",
+           "1_vovd0分子",
+           "2_vovd0分子",
+           "2_vovd1分子",
+           "3_vovd0分子",
+           "3_vovd1分子",
+           "3_vovd2分子",
+           "1_vovh分母",
+           "2_vovh分母",
+           "3_vovh分母",
+           "4_vovh分母",
+           "7_vovh分母",
+           "13_vovh分母",
+           "25_vovh分母",
+           "1_vovd分母",
+           "2_vovd分母",
+           "3_vovd分母", ]
+
+with open("/Users/zhao/Desktop/1.json", "r") as file:
+    result = json.loads(file.read())
+
+data = []
+for item in result:
+    d = {
+        'score': item['score'],
+        "vovScore": item["scoresMap"]['vovScore'],
+        '小时': item['allFeatureMap']['weightKey']
+    }
+    for f in feature:
+        d[f] = item['allFeatureMap'][f]
+
+    data.append(d)
+df = pd.DataFrame.from_records(data)
+df.to_csv("/Users/zhao/Desktop/1.csv", index=False)

+ 9 - 6
vov/data_download.py

@@ -8,7 +8,7 @@ from client import ODPSClient
 
 odps_client = ODPSClient.ODPSClient()
 
-dt_list = ["20241030", "20241031", "20241101", "20241102"]
+dt_list = ["20241108", "20241109", "20241110"]
 hh_list = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17",
            "18", "19", "20", "21", "22", "23"]
 
@@ -22,10 +22,13 @@ def read_odps(dt: str, hh: str):
     with open(f"{VOV_BASE_PATH}/sql/vovh24_feature.sql", "r") as f:
         sql = f.read()
 
-    real_sql = (sql.replace("${bizdate}", dt)
-                .replace("${hh}", hh))
+    real_sql = (
+        sql.replace("${bizdate}", dt)
+        .replace("${hh}", hh)
+        .replace("${apptype}", "4")
+    )
     print(f"Executing for dt: {dt}, hh: {hh}")
-    odps_client.execute_sql_result_save_file(real_sql, f"{VOV_BASE_PATH}/csv/{dt}{hh}.csv")
+    odps_client.execute_sql_result_save_file(real_sql, f"{VOV_BASE_PATH}/csv/{dt}{hh}_feature.csv")
 
 
 # Parallel execution function
@@ -54,11 +57,11 @@ def download():
     for dt in dt_list:
         csv_list = []
         for hh in hh_list:
-            csv_list.append(f"{VOV_BASE_PATH}/csv/{dt}{hh}.csv")
+            csv_list.append(f"{VOV_BASE_PATH}/csv/{dt}{hh}_feature.csv")
 
         df_list = [pd.read_csv(file) for file in csv_list]
         df = pd.concat(df_list, ignore_index=True)
-        df.to_csv(f"{VOV_BASE_PATH}/csv/{dt}.csv", index=False)
+        df.to_csv(f"{VOV_BASE_PATH}/csv/{dt}_feature.csv", index=False)
 
 
 def _main():
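
Note: the SQL template now needs a ${apptype} placeholder in addition to ${bizdate} and ${hh}. A minimal sketch of the substitution step in isolation, with a hypothetical one-line template (the real template is read from sql/vovh24_feature.sql):

# Hypothetical template; the real one is read from sql/vovh24_feature.sql.
sql = "SELECT * FROM t WHERE dt = '${bizdate}' AND hh = '${hh}' AND apptype = ${apptype}"

real_sql = (
    sql.replace("${bizdate}", "20241108")
    .replace("${hh}", "00")
    .replace("${apptype}", "4")
)
print(real_sql)
# -> SELECT * FROM t WHERE dt = '20241108' AND hh = '00' AND apptype = 4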