import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np
import xgboost as xgb
from tqdm import tqdm
import sys
import glob

# def func_make_data(file_path: str):
#     df = pd.read_csv(file_path)
#     for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
#                 '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012', '1_vov0_分子', '1_vov0_分母',
#                 '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
#                 '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
#                 '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
#                 '4_vov012_分母', '5_vov012_分子', '5_vov012_分母']:
#         df[col] = pd.to_numeric(df[col], errors='coerce')
#
#     df.fillna(0, inplace=True)
#     df["12_change"] = df["1_vov0"] - df["2_vov0"]
#     df["23_change"] = df["2_vov0"] - df["3_vov0"]
#     df["34_change"] = df["3_vov0"] - df["4_vov0"]
#     features_name = ['1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01', '3_vov01', '4_vov01', '5_vov01',
#                      '3_vov012', '4_vov012', '5_vov012',
#                      "12_change", "23_change", "34_change",
#                      '2_vov01', '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012',
#                      '1_vov0_分子', '1_vov0_分母',
#                      '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
#                      '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
#                      '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
#                      '4_vov012_分母', '5_vov012_分子', '5_vov012_分母'
#                      ]
#     feature_array = df[features_name].values
#     df["reg_label"] = df["vov0"].apply(lambda x: float(x))
#     label_array = df["reg_label"].values
#     return df, feature_array, label_array
#
# df, trains_array, trains_label_array = func_make_data("20240919.csv")
# header = df.columns.tolist()
# print(header)
# print(df.dtypes)
#
# 1 Regression model
# model = xgb.XGBRegressor(objective='reg:squarederror',
#                          learning_rate=0.01,
#                          n_estimators=100,
#                          max_depth=3)
# model.fit(trains_array, trains_label_array)
# df_test, tests_array, _ = func_make_data("20240920.csv")
# y_pred = model.predict(tests_array)
# df_test["y_pred"] = y_pred
#
# condition_choose = ((df_test['y_pred'] <= 0.15)
#                     & ((df_test['1_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
#                     )
# condition_choose_real = condition_choose & (df_test['vov0'] <= 0.30)
# profit_threshold = 0.34
#
# choose_bad = condition_choose.sum()
# choose_bad_realbad = condition_choose_real.sum()
# acc = choose_bad_realbad / choose_bad
# print("acc:{} numerator={} denominator={} total_videos={}".format(acc, choose_bad_realbad, choose_bad, df_test.size))
#
# surface = df_test.loc[condition_choose, '曝光占比'].sum()
# surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()
# print("total impact:{} profit impact:{}".format(surface, surface_income))
#
# df_test["profit_loss_value"] = df_test['分母'] * (df_test['vov0'] - profit_threshold)
# profit_loss_value = df_test.loc[condition_choose, 'profit_loss_value'].sum()
# profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
# print("total P&L:{} net profit:{}".format(profit_loss_value, profit_value))


# 2 Classification model
def apply_title(row):
    # Strip embedded newlines so each video stays on one row in the tab-separated output.
    try:
        return row.replace("\n", "")
    except Exception:
        return row


def func_make_data(file_path: str):
    # file_path may be a single CSV or a comma-separated list of CSVs; they are concatenated.
    df_list = [pd.read_csv(file) for file in file_path.split(",")]
    df = pd.concat(df_list, ignore_index=True)
    # df = pd.read_csv(file_path)
    df["title"] = df["title"].apply(apply_title)
    for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
                '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012', '1_vov0_分子', '1_vov0_分母',
                '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
                '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
                '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
                '4_vov012_分母', '5_vov012_分子', '5_vov012_分母']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df.fillna(0, inplace=True)
    # Differences between consecutive vov0 estimates.
    df["12_change"] = df["1_vov0"] - df["2_vov0"]
    df["23_change"] = df["2_vov0"] - df["3_vov0"]
    df["34_change"] = df["3_vov0"] - df["4_vov0"]
    features_name = ['1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0',
                     '2_vov01', '3_vov01', '4_vov01', '5_vov01',
                     '3_vov012', '4_vov012', '5_vov012',
                     "12_change", "23_change", "34_change"
                     # , '1_vov0_分子', '1_vov0_分母',
                     # '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母',
                     # '5_vov0_分子',
                     # '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
                     # '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
                     # '4_vov012_分母', '5_vov012_分子', '5_vov012_分母'
                     ]
    feature_array = df[features_name].values
    # Binary label: 1 if the realized vov0 exceeds 0.25, else 0.
    df["label"] = df["vov0"].apply(lambda x: 1 if x > 0.25 else 0)
    label_array = df["label"].values
    return df, feature_array, label_array


try:
    date_train = sys.argv[1]
    date_test = sys.argv[2]
except Exception:
    # date_train = "20241010.csv"
    # date_test = "20241011.csv"
    date_train = "20241009_train.csv,20241010_train.csv"
    date_test = "20241010_predict.csv"
    # date_train = "20240924.csv,20240923.csv,20240922.csv,20240921.csv,20240920.csv,20240919.csv"
    # date_train = "20240915.csv"
    # date_test = "20240916.csv"
    # date_train = "20240924_new.csv"
    # date_test = "20240925_new.csv"

df, trains_array, trains_label_array = func_make_data(date_train)
header = df.columns.tolist()
# print(header)
# print(df.dtypes)

model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=8,
    scale_pos_weight=1,
    random_state=2024,
    # verbose=True,
)
model.fit(trains_array, trains_label_array)

df_test, tests_array, _ = func_make_data(date_test)
y_pred = model.predict_proba(tests_array)[:, 1]
df_test["y_pred"] = y_pred

# Pick videos the model scores as low-vov0 (P(vov0 > 0.25) <= 0.1) that also have enough
# denominator support and no large drop between the first two vov0 estimates.
condition_choose = ((df_test['y_pred'] <= 0.1)
                    # & ((df_test['1_vov0_分母'] > 100))
                    & ((df_test['4_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
                    # & (df_test.index <= 10000)
                    & ((df_test["1_vov0"] - df_test["2_vov0"] < 0.1))
                    # & ((df_test["1_vov0"] - df_test["2_vov0"] <= 0.1) | (df_test["2_vov0"] <= 0) | (df_test["1_vov0"] <= 0.2))
                    )
profit_threshold = 0.3
condition_choose_real = condition_choose & (df_test['vov0'] <= profit_threshold)
df_test["condition_choose"] = condition_choose

condition_loss = condition_choose & (df_test['vov0'] > profit_threshold)
df_test["condition_loss"] = condition_loss
df_test[["vid", "title", "曝光占比", "vov0", "condition_choose", "condition_loss"]].to_csv(
    "new_" + date_test, sep="\t", index=False)

choose_bad = condition_choose.sum()
choose_bad_realbad = condition_choose_real.sum()
acc = choose_bad_realbad / choose_bad
print("acc:{} numerator={} denominator={} total_videos={} profit threshold (vov0 >):{}".format(
    acc, choose_bad_realbad, choose_bad, df_test.shape[0], profit_threshold))
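# Optional check: AUC of the classifier's scores against the test day's binary label
# (func_make_data sets label = 1 when vov0 > 0.25). Guarded because roc_auc_score
# requires both classes to be present in df_test.
if df_test["label"].nunique() > 1:
    auc = roc_auc_score(df_test["label"], df_test["y_pred"])
    print("test auc:{}".format(round(auc, 6)))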
surface = df_test.loc[condition_choose, '曝光占比'].sum()
surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()
print("total impact:{} profit impact:{} loss impact:{}".format(
    round(surface, 6), round(surface_income, 6), round(surface - surface_income, 6)))

# Profit/loss per video: denominator-weighted distance of the realized vov0 from the threshold.
df_test["profit_loss_value"] = df_test['分母'] * (df_test['vov0'] - profit_threshold)
profit_loss_value = df_test.loc[condition_choose, 'profit_loss_value'].sum()
profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
print("total P&L:{} net profit:{} net loss:{} profit efficiency:{}".format(
    round(profit_loss_value, 1), round(profit_value, 1), round(profit_loss_value - profit_value, 1),
    round(profit_loss_value / profit_value, 6)))

# 3 Multi-class model: probabilities
# names = []
# with open("a_features_recsys.txt", "r") as file:
#     for line in file.readlines():
#         line = line.strip()
#         names.append(line)
#
# trains = []
# trains_label = []
# # trains_id = []
# with open("20240728.txt", "r") as file:
#     for line in tqdm(file.readlines()):
#         lines = line.strip().split("\t")
#         label = lines[0]
#         trains_label.append(int(label))
#         m = dict([(l.split(":")[0], l.split(":")[1]) for l in lines[1:]])
#         row = [float(m.get(name, "0.0")) for name in names]
#         trains.append(row)
#         # for key in m.keys():
#         #     if key.startswith("cid_"):
#         #         trains_id.append(key.replace("cid_", ""))
#         #         break
#
# trains_array = np.array(trains)
# trains_label_array = np.array(trains_label)
# print("train samples={} positive={} rate={}".format(
#     len(trains_label),
#     sum(trains_label),
#     format(1.0 * sum(trains_label) / len(trains_label), ".6f"))
# )
#
#
# model = xgb.XGBClassifier(
#     n_estimators=2000,
#     learning_rate=0.01,
#     max_depth=5,
#     min_child_weight=1,
#     gamma=0,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     objective='binary:logistic',
#     nthread=8,
#     scale_pos_weight=1,
#     random_state=2024,
#     seed=2024,
#     verbose=True,
# )
# model.fit(trains_array, trains_label_array)
#
# tests = []
# tests_label = []
# # tests_id = []
# with open("20240729.txt", "r") as file:
#     for line in tqdm(file.readlines()):
#         lines = line.strip().split("\t")
#         label = lines[0]
#         tests_label.append(int(label))
#         m = dict([(l.split(":")[0], l.split(":")[1]) for l in lines[1:]])
#         row = [float(m.get(name, "0.0")) for name in names]
#         tests.append(row)
#         # for key in m.keys():
#         #     if key.startswith("cid_"):
#         #         tests_id.append(key.replace("cid_", ""))
#         #         break
# tests_array = np.array(tests)
# tests_label_array = np.array(tests_label)
# print("test samples={} positive={} rate={}".format(
#     len(tests_label),
#     sum(tests_label),
#     format(1.0 * sum(tests_label) / len(tests_label), ".6f"))
# )
#
# # Make predictions
# y_pred = model.predict(tests_array)
# probabilities = model.predict_proba(tests_array)
#
#
# auc = roc_auc_score(tests_label_array, probabilities[:, 1])
# print("auc:{}".format(auc))

"""
https://zhuanlan.zhihu.com/p/688993572
"""
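# Assumed invocation (the script name is a placeholder): pass one or more comma-separated
# training CSVs and a single test CSV, e.g.
#     python vov_filter.py 20241009_train.csv,20241010_train.csv 20241010_predict.csv
# The chosen/loss flags are written to "new_" + <test csv> as a tab-separated file.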