123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266 |
- import pandas as pd
- from sklearn.metrics import roc_auc_score
- import numpy as np
- import xgboost as xgb
- from tqdm import tqdm
- import sys
- import glob
def apply_title(row):
    """Return *row* with every newline character removed.

    Intended for use with ``Series.apply`` on a text column. Values that
    are not ``str`` (for example the float NaN pandas uses for missing
    cells, or ``None``) are returned unchanged.

    Args:
        row: A single cell value.

    Returns:
        The newline-free string when *row* is a ``str``; otherwise *row*
        itself, untouched.
    """
    # An explicit type check replaces the former blanket ``except Exception``,
    # which could hide unrelated errors while producing the same result.
    if isinstance(row, str):
        return row.replace("\n", "")
    return row
def func_make_data(file_path: str, label_threshold: float = 0.25):
    """Load one or more CSV files and build the feature matrix and labels.

    Args:
        file_path: Comma-separated list of CSV paths. Surrounding whitespace
            around each path is ignored and empty entries are skipped.
        label_threshold: A row is labelled 1 when its ``vov0`` value is
            strictly greater than this threshold, otherwise 0.

    Returns:
        A ``(df, feature_array, label_array)`` tuple where ``df`` is the
        concatenated and cleaned DataFrame (with the added ``12_change``,
        ``23_change``, ``34_change`` and ``label`` columns),
        ``feature_array`` is ``df[features_name].values`` and
        ``label_array`` is ``df["label"].values``.

    Raises:
        ValueError: If *file_path* contains no usable path.
    """
    paths = [part.strip() for part in file_path.split(",")]
    paths = [part for part in paths if part]
    if not paths:
        raise ValueError("file_path must contain at least one CSV path")

    df = pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)

    # Titles may contain embedded newlines; strip them from string cells.
    df["title"] = df["title"].apply(apply_title)

    numeric_columns = [
        '曝光占比', 'vov0', '分子', '分母',
        '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0',
        '2_vov01', '3_vov01', '4_vov01', '5_vov01',
        '3_vov012', '4_vov012', '5_vov012',
        '1_vov0_分子', '1_vov0_分母', '2_vov0_分子', '2_vov0_分母',
        '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母',
        '5_vov0_分子', '5_vov0_分母',
        '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母',
        '4_vov01_分子', '4_vov01_分母', '5_vov01_分子', '5_vov01_分母',
        '3_vov012_分子', '3_vov012_分母', '4_vov012_分子', '4_vov012_分母',
        '5_vov012_分子', '5_vov012_分母',
    ]
    # Unparseable cells become NaN here and are zero-filled just below.
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    # One whole-frame fill after all columns are coerced is sufficient;
    # note that it also replaces missing values in non-numeric columns
    # (such as "title") with 0.
    df.fillna(0, inplace=True)

    # Differences between consecutive "<k>_vov0" columns.
    df["12_change"] = df["1_vov0"] - df["2_vov0"]
    df["23_change"] = df["2_vov0"] - df["3_vov0"]
    df["34_change"] = df["3_vov0"] - df["4_vov0"]

    # NOTE(review): the "*_vov01" / "*_vov012" names appear twice, so those
    # columns are duplicated in the feature matrix. The duplication is kept
    # so the matrix width (22 columns) is unchanged; confirm whether it is
    # intentional before removing it.
    features_name = [
        '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0',
        '2_vov01', '3_vov01', '4_vov01', '5_vov01',
        '3_vov012', '4_vov012', '5_vov012',
        "12_change", "23_change", "34_change",
        '2_vov01', '3_vov01', '4_vov01', '5_vov01',
        '3_vov012', '4_vov012', '5_vov012',
    ]
    feature_array = df[features_name].values

    # Vectorized equivalent of applying ``1 if x > threshold else 0`` per row.
    df["label"] = (df["vov0"] > label_threshold).astype(int)
    label_array = df["label"].values
    return df, feature_array, label_array
import os  # used only below to build the output file path

# ---------------------------------------------------------------------------
# Command line: <script> <train_csvs> <test_csvs>
# Both arguments are passed to func_make_data, which accepts a
# comma-separated list of CSV paths. When fewer than two arguments are
# supplied, both built-in defaults are used.
# ---------------------------------------------------------------------------
if len(sys.argv) >= 3:
    date_train = sys.argv[1]
    date_test = sys.argv[2]
else:
    date_train = "20241009_train.csv,20241010_train.csv"
    date_test = "20241010_predict.csv"


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if not denominator:
        return float("nan")
    return numerator / denominator


# ---- Train ----------------------------------------------------------------
df, trains_array, trains_label_array = func_make_data(date_train)
# NOTE(review): `header` is not used anywhere in the visible script; kept in
# case other code relies on it -- confirm before removing.
header = df.columns.tolist()
model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=8,
    scale_pos_weight=1,
    random_state=2024,
    seed=2024,
)
model.fit(trains_array, trains_label_array)

# ---- Predict --------------------------------------------------------------
df_test, tests_array, _ = func_make_data(date_test)
# Column 1 of predict_proba is the probability of the positive class (label 1).
y_pred = model.predict_proba(tests_array)[:, 1]
df_test["y_pred"] = y_pred

# ---- Select rows ----------------------------------------------------------
# A row is selected when all three conditions hold:
#   1. its predicted positive-class probability is at most 0.1,
#   2. at least one of the 2/3/4 "_vov0_分母" columns exceeds 50,
#   3. "1_vov0" minus "2_vov0" is below 0.1.
low_score = df_test['y_pred'] <= 0.1
enough_volume = (
    (df_test['4_vov0_分母'] > 50)
    | (df_test['2_vov0_分母'] > 50)
    | (df_test['3_vov0_分母'] > 50)
)
small_drop = (df_test["1_vov0"] - df_test["2_vov0"]) < 0.1
condition_choose = low_score & enough_volume & small_drop

profit_threshold = 0.3
# Selected rows whose actual vov0 is at or below the threshold ...
condition_choose_real = condition_choose & (df_test['vov0'] <= profit_threshold)
# ... and selected rows whose actual vov0 is above it.
condition_choose_wrong = condition_choose & (df_test['vov0'] > profit_threshold)
df_test["condition_choose"] = condition_choose
# The emitted column name is part of the output file format; keep it as-is.
df_test["condition_fuck"] = condition_choose_wrong

# ---- Write the per-row report ---------------------------------------------
# Prefix only the file name with "new_" so that a test path containing a
# directory is written next to its input instead of into "new_<dir>/...".
output_dir, output_name = os.path.split(date_test)
output_path = os.path.join(output_dir, "new_" + output_name)
df_test[["vid", "title", "曝光占比", "vov0", "condition_choose", "condition_fuck"]].to_csv(
    output_path, sep="\t", index=False
)

# ---- Summary metrics ------------------------------------------------------
choose_bad = condition_choose.sum()
choose_bad_realbad = condition_choose_real.sum()
# NaN (rather than a division warning) when no row was selected.
acc = _safe_ratio(choose_bad_realbad, choose_bad)
print("acc:{} 分子={} 分母={} 总视频数={} 盈利计算标注vov0大于:{}".format(acc, choose_bad_realbad, choose_bad, df_test.shape[0], profit_threshold))

# Sum of the "曝光占比" column over all selected rows and over the
# selected rows at or below the threshold.
surface = df_test.loc[condition_choose, '曝光占比'].sum()
surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()
print("总影响面:{} 盈利影响面:{} 亏损影响面:{}".format(round(surface, 6), round(surface_income, 6), round(surface-surface_income, 6)))

# Per-row value: "分母" times the distance of vov0 from the threshold.
df_test["profit_loss_value"] = df_test['分母'] * (df_test['vov0'] - profit_threshold)
profit_loss_value = df_test.loc[condition_choose, 'profit_loss_value'].sum()
profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
efficiency = _safe_ratio(profit_loss_value, profit_value)
print("总盈亏:{} 纯盈利:{} 纯亏损:{} 盈利效率:{}".format(round(profit_loss_value, 1), round(profit_value, 1), round(profit_loss_value-profit_value, 1), round(efficiency, 6)))
- """
- https://zhuanlan.zhihu.com/p/688993572
- """
|