|
@@ -4,7 +4,7 @@ import numpy as np
|
|
|
import xgboost as xgb
|
|
|
from tqdm import tqdm
|
|
|
import sys
|
|
|
-
|
|
|
+import glob
|
|
|
# def func_make_data(file_path: str):
|
|
|
# df = pd.read_csv(file_path)
|
|
|
# for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
|
|
@@ -67,9 +67,18 @@ import sys
|
|
|
# profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
|
|
|
# print("总盈亏:{} 纯盈利:{}".format(profit_loss_value, profit_value))
|
|
|
# 2 分类模型
|
|
|
+
|
|
|
def apply_title(row):
    """Return ``row`` with newline characters removed when it is a string.

    Intended to be applied element-wise to a column of titles. Any value
    that is not a ``str`` (for example ``None`` or a float NaN standing in
    for a missing title) is returned unchanged.

    Args:
        row: A single cell value, normally a title string.

    Returns:
        The title without ``"\\n"`` characters, or the original value when
        it is not a string.
    """
    # An explicit type check replaces the previous catch-all ``except``:
    # it keeps the "leave non-strings alone" behavior without silently
    # swallowing unrelated errors.
    if isinstance(row, str):
        return row.replace("\n", "")
    return row
|
|
|
def func_make_data(file_path: str):
|
|
|
- df = pd.read_csv(file_path)
|
|
|
- df["title"] = df["title"].apply(lambda x: x.replace("\n", ""))
|
|
|
+ df_list = [pd.read_csv(file) for file in file_path.split(",")]
|
|
|
+ df = pd.concat(df_list, ignore_index=True)
|
|
|
+
|
|
|
+ # df = pd.read_csv(file_path)
|
|
|
+ df["title"] = df["title"].apply(apply_title)
|
|
|
for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
|
|
|
'3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012', '1_vov0_分子', '1_vov0_分母',
|
|
|
'2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
|
|
@@ -102,15 +111,22 @@ try:
|
|
|
date_train = sys.argv[1]
|
|
|
date_test = sys.argv[2]
|
|
|
except Exception as e:
|
|
|
- date_train = "20240919.csv"
|
|
|
- date_test = "20240920.csv"
|
|
|
+ # date_train = "20241010.csv"
|
|
|
+ # date_test = "20241011.csv"
|
|
|
+ date_train = "20241009_train.csv,20241010_train.csv"
|
|
|
+ date_test = "20241010_predict.csv"
|
|
|
+ # date_train = "20240924.csv,20240923.csv,20240922.csv,20240921.csv,20240920.csv,20240919.csv"
|
|
|
+ # date_train = "20240915.csv"
|
|
|
+ # date_test = "20240916.csv"
|
|
|
+ # date_train = "20240924_new.csv"
|
|
|
+ # date_test = "20240925_new.csv"
|
|
|
|
|
|
|
|
|
df, trains_array,trains_label_array = func_make_data(date_train)
|
|
|
header = df.columns.tolist()
|
|
|
# print(header) # print(df.dtypes)
|
|
|
model = xgb.XGBClassifier(
|
|
|
- n_estimators=100,
|
|
|
+ n_estimators=1000,
|
|
|
learning_rate=0.01,
|
|
|
max_depth=5,
|
|
|
min_child_weight=1,
|
|
@@ -129,19 +145,26 @@ df_test, tests_array, _ = func_make_data(date_test)
|
|
|
y_pred = model.predict_proba(tests_array)[:, 1]
|
|
|
df_test["y_pred"] = y_pred
|
|
|
|
|
|
-condition_choose = ((df_test['y_pred'] <= 0.2)
|
|
|
- # & ((df_test['1_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
|
|
|
- & (df_test.index <= 10000)
|
|
|
+condition_choose = ((df_test['y_pred'] <= 0.1)
|
|
|
+ # & ((df_test['1_vov0_分母'] > 100))
|
|
|
+ & ((df_test['4_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
|
|
|
+ # & (df_test.index <= 10000)
|
|
|
+ & ((df_test["1_vov0"] - df_test["2_vov0"] < 0.1))
|
|
|
+ # & ((df_test["1_vov0"] - df_test["2_vov0"] <= 0.1) | (df_test["2_vov0"] <= 0) | (df_test["1_vov0"] <= 0.2))
|
|
|
)
|
|
|
profit_theshold = 0.3
|
|
|
condition_choose_real = condition_choose & (df_test['vov0'] <= profit_theshold)
|
|
|
df_test["condition_choose"] = condition_choose
|
|
|
-df_test[["vid","title","曝光占比","vov0", "condition_choose"]].to_csv("new_" + date_test, sep="\t", index=False)
|
|
|
+
|
|
|
+condition_fuck =condition_choose & (df_test['vov0'] > profit_theshold)
|
|
|
+df_test["condition_fuck"] = condition_fuck
|
|
|
+
|
|
|
+df_test[["vid","title","曝光占比","vov0", "condition_choose", "condition_fuck"]].to_csv("new_" + date_test, sep="\t", index=False)
|
|
|
|
|
|
choose_bad = condition_choose.sum()
|
|
|
choose_bad_realbad = condition_choose_real.sum()
|
|
|
acc = choose_bad_realbad / choose_bad
|
|
|
-print("acc:{} 分子={} 分母={} 总视频数={} 盈利计算标注vov0大于:{}".format(acc, choose_bad_realbad, choose_bad, df_test.size, profit_theshold))
|
|
|
+print("acc:{} 分子={} 分母={} 总视频数={} 盈利计算标注vov0大于:{}".format(acc, choose_bad_realbad, choose_bad, df_test.shape[0], profit_theshold))
|
|
|
|
|
|
surface = df_test.loc[condition_choose, '曝光占比'].sum()
|
|
|
surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()
|