|
@@ -0,0 +1,243 @@
|
|
|
|
+import pandas as pd
|
|
|
|
+from sklearn.metrics import roc_auc_score
|
|
|
|
+import numpy as np
|
|
|
|
+import xgboost as xgb
|
|
|
|
+from tqdm import tqdm
|
|
|
|
+import sys
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def func_make_data(file_path: str, label_threshold: float = 0.25):
    """Load one CSV file and derive the feature matrix and binary labels.

    Args:
        file_path: Path of the CSV file to read. It must contain a ``title``
            column plus every numeric column listed in ``numeric_cols`` below.
        label_threshold: A row is labelled 1 when its ``vov0`` value is
            strictly greater than this cut-off, otherwise 0.

    Returns:
        A ``(df, feature_array, label_array)`` tuple: the cleaned DataFrame
        (with the added ``12_change``, ``23_change``, ``34_change`` and
        ``label`` columns), the 2-D array of feature values, and the 1-D
        array of 0/1 labels.
    """
    # Names of the per-window ratio columns; each one also has a
    # "<name>_分子" / "<name>_分母" companion column in the file.
    window_cols = [
        '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0',
        '2_vov01', '3_vov01', '4_vov01', '5_vov01',
        '3_vov012', '4_vov012', '5_vov012',
    ]
    numeric_cols = (
        ['曝光占比', 'vov0', '分子', '分母']
        + window_cols
        + [name + suffix for name in window_cols for suffix in ('_分子', '_分母')]
    )

    df = pd.read_csv(file_path)

    # A missing title is read as NaN (a float), which has no .replace();
    # normalise to str first so blank or non-string titles cannot crash the load.
    df["title"] = df["title"].fillna("").astype(str).str.replace("\n", "", regex=False)

    # Unparseable cells become NaN here and are zero-filled just below.
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df.fillna(0, inplace=True)

    # Differences between consecutive *_vov0 columns.
    df["12_change"] = df["1_vov0"] - df["2_vov0"]
    df["23_change"] = df["2_vov0"] - df["3_vov0"]
    df["34_change"] = df["3_vov0"] - df["4_vov0"]

    # NOTE(review): the *_vov01 / *_vov012 columns appear twice in this list,
    # so they are duplicated in the feature matrix. Kept as-is to preserve the
    # returned array's width (22 columns) -- confirm whether this is intended.
    features_name = (
        window_cols
        + ["12_change", "23_change", "34_change"]
        + ['2_vov01', '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012']
    )
    feature_array = df[features_name].values

    df["label"] = (df["vov0"] > label_threshold).astype(int)
    label_array = df["label"].values
    return df, feature_array, label_array
|
|
|
|
+
|
|
|
|
import os

# Train/test CSV paths come from the command line. If either argument is
# missing, both fall back to the built-in sample file names.
try:
    date_train = sys.argv[1]
    date_test = sys.argv[2]
except IndexError:
    date_train = "20240919.csv"
    date_test = "20240920.csv"


df, trains_array, trains_label_array = func_make_data(date_train)
header = df.columns.tolist()

model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=8,
    scale_pos_weight=1,
    random_state=2024,
    seed=2024,
)
model.fit(trains_array, trains_label_array)

# Score the test file; column 1 of predict_proba is the positive-class probability.
df_test, tests_array, _ = func_make_data(date_test)
y_pred = model.predict_proba(tests_array)[:, 1]
df_test["y_pred"] = y_pred

# Rows selected by the model: low predicted probability, restricted to
# index labels up to 10000.
condition_choose = (df_test['y_pred'] <= 0.2) & (df_test.index <= 10000)
profit_theshold = 0.3
# Selected rows whose actual vov0 is at or below the profit threshold.
condition_choose_real = condition_choose & (df_test['vov0'] <= profit_theshold)
df_test["condition_choose"] = condition_choose

# Prefix the file name, not the whole path, so a test file given with a
# directory component is written next to the input instead of into a
# non-existent "new_<dir>" directory.
output_path = os.path.join(os.path.dirname(date_test), "new_" + os.path.basename(date_test))
df_test[["vid", "title", "曝光占比", "vov0", "condition_choose"]].to_csv(output_path, sep="\t", index=False)

choose_bad = condition_choose.sum()
choose_bad_realbad = condition_choose_real.sum()
# No selected rows means the ratio is undefined; report nan instead of dividing by zero.
acc = choose_bad_realbad / choose_bad if choose_bad else float("nan")
# len(df_test) is the row count; DataFrame.size would be rows * columns.
print("acc:{} 分子={} 分母={} 总视频数={} 盈利计算标注vov0大于:{}".format(acc, choose_bad_realbad, choose_bad, len(df_test), profit_theshold))

# Summed '曝光占比' over all selected rows, and over the subset at or below the threshold.
surface = df_test.loc[condition_choose, '曝光占比'].sum()
surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()
print("总影响面:{} 盈利影响面:{} 亏损影响面:{}".format(round(surface, 6), round(surface_income, 6), round(surface-surface_income, 6)))

# Per-row value: '分母' weighted by the distance of vov0 from the threshold.
df_test["profit_loss_value"] = df_test['分母'] * (df_test['vov0'] - profit_theshold)
profit_loss_value = df_test.loc[condition_choose, 'profit_loss_value'].sum()
profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
# The efficiency ratio is undefined when the denominator sum is zero; report nan then.
profit_efficiency = profit_loss_value / profit_value if profit_value else float("nan")
print("总盈亏:{} 纯盈利:{} 纯亏损:{} 盈利效率:{}".format(round(profit_loss_value, 1), round(profit_value, 1), round(profit_loss_value-profit_value, 1), round(profit_efficiency, 6)))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+"""
|
|
|
|
+https://zhuanlan.zhihu.com/p/688993572
|
|
|
|
+"""
|