Pārlūkot izejas kodu

用于过滤的vov xgb模型22

zhangbo 6 mēneši atpakaļ
vecāks
revīzija
eccf27d40a
1 mainītis faili ar 34 papildinājumiem un 11 dzēšanām
  1. 34 11
      write_redis/tree_model_xgb_vov.py

+ 34 - 11
write_redis/tree_model_xgb_vov.py

@@ -4,7 +4,7 @@ import numpy as np
 import xgboost as xgb
 from tqdm import tqdm
 import sys
-
+import glob
 # def func_make_data(file_path: str):
 #     df = pd.read_csv(file_path)
 #     for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
@@ -67,9 +67,18 @@ import sys
 # profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
 # print("总盈亏:{} 纯盈利:{}".format(profit_loss_value, profit_value))
 # 2 分类模型
+
+def apply_title(row):
+    """Return ``row`` with newlines removed when it is a str; any other value (e.g. a missing title) is returned unchanged."""
+    if isinstance(row, str):
+        return row.replace("\n", "")
+    return row
 def func_make_data(file_path: str):
-    df = pd.read_csv(file_path)
-    df["title"] = df["title"].apply(lambda x: x.replace("\n", ""))
+    df_list = [pd.read_csv(file) for file in file_path.split(",")]
+    df = pd.concat(df_list, ignore_index=True)
+
+    # df = pd.read_csv(file_path)
+    df["title"] = df["title"].apply(apply_title)
     for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
                 '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012', '1_vov0_分子', '1_vov0_分母',
                 '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
@@ -102,15 +111,22 @@ try:
     date_train = sys.argv[1]
     date_test = sys.argv[2]
 except Exception as e:
-    date_train = "20240919.csv"
-    date_test = "20240920.csv"
+    # date_train = "20241010.csv"
+    # date_test = "20241011.csv"
+    date_train = "20241009_train.csv,20241010_train.csv"
+    date_test = "20241010_predict.csv"
+    # date_train = "20240924.csv,20240923.csv,20240922.csv,20240921.csv,20240920.csv,20240919.csv"
+    # date_train = "20240915.csv"
+    # date_test = "20240916.csv"
+    # date_train = "20240924_new.csv"
+    # date_test = "20240925_new.csv"
 
 
 df, trains_array,trains_label_array = func_make_data(date_train)
 header = df.columns.tolist()
 # print(header) # print(df.dtypes)
 model = xgb.XGBClassifier(
-    n_estimators=100,
+    n_estimators=1000,
     learning_rate=0.01,
     max_depth=5,
     min_child_weight=1,
@@ -129,19 +145,26 @@ df_test, tests_array, _ =  func_make_data(date_test)
 y_pred = model.predict_proba(tests_array)[:, 1]
 df_test["y_pred"] = y_pred
 
-condition_choose = ((df_test['y_pred'] <= 0.2)
-                     # & ((df_test['1_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
-                    & (df_test.index <= 10000)
+condition_choose = ((df_test['y_pred'] <= 0.1)
+                    # & ((df_test['1_vov0_分母'] > 100))
+                    & ((df_test['4_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
+                    # & (df_test.index <= 10000)
+                    & ((df_test["1_vov0"] - df_test["2_vov0"] < 0.1))
+                    # & ((df_test["1_vov0"] - df_test["2_vov0"] <= 0.1) | (df_test["2_vov0"] <= 0) | (df_test["1_vov0"] <= 0.2))
                     )
 profit_theshold = 0.3
 condition_choose_real = condition_choose & (df_test['vov0'] <= profit_theshold)
 df_test["condition_choose"] = condition_choose
-df_test[["vid","title","曝光占比","vov0", "condition_choose"]].to_csv("new_" + date_test, sep="\t", index=False)
+
+condition_fuck =condition_choose & (df_test['vov0'] > profit_theshold)
+df_test["condition_fuck"] = condition_fuck
+
+df_test[["vid","title","曝光占比","vov0", "condition_choose", "condition_fuck"]].to_csv("new_" + date_test, sep="\t", index=False)
 
 choose_bad = condition_choose.sum()
 choose_bad_realbad = condition_choose_real.sum()
 acc = choose_bad_realbad / choose_bad
-print("acc:{} 分子={} 分母={} 总视频数={} 盈利计算标注vov0大于:{}".format(acc, choose_bad_realbad, choose_bad, df_test.size, profit_theshold))
+print("acc:{} 分子={} 分母={} 总视频数={} 盈利计算标注vov0大于:{}".format(acc, choose_bad_realbad, choose_bad, df_test.shape[0], profit_theshold))
 
 surface = df_test.loc[condition_choose, '曝光占比'].sum()
 surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()