# tree_model_xgb_vov.py
  1. import pandas as pd
  2. from sklearn.metrics import roc_auc_score
  3. import numpy as np
  4. import xgboost as xgb
  5. from tqdm import tqdm
  6. import sys
  7. import glob
  8. # def func_make_data(file_path: str):
  9. # df = pd.read_csv(file_path)
  10. # for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
  11. # '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012', '1_vov0_分子', '1_vov0_分母',
  12. # '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
  13. # '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
  14. # '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
  15. # '4_vov012_分母', '5_vov012_分子', '5_vov012_分母']:
  16. # df[col] = pd.to_numeric(df[col], errors='coerce')
  17. #
  18. # df.fillna(0, inplace=True)
  19. # df["12_change"] = df["1_vov0"] - df["2_vov0"]
  20. # df["23_change"] = df["2_vov0"] - df["3_vov0"]
  21. # df["34_change"] = df["3_vov0"] - df["4_vov0"]
  22. # features_name = ['1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01', '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012'
  23. # ,"12_change", "23_change", "34_change"
  24. # ,'2_vov01', '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012'
  25. # ,'1_vov0_分子', '1_vov0_分母',
  26. # '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
  27. # '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
  28. # '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
  29. # '4_vov012_分母', '5_vov012_分子', '5_vov012_分母'
  30. # ]
  31. # feature_array = df[features_name].values
  32. # df["reg_label"] = df["vov0"].apply(lambda x: float(x))
  33. # label_array = df["reg_label"].values
  34. # return df, feature_array, label_array
  35. #
  36. # df, trains_array,trains_label_array = func_make_data("20240919.csv")
  37. # header = df.columns.tolist()
  38. # print(header) # print(df.dtypes)
  39. # # 1 回归模型
  40. # model = xgb.XGBRegressor(objective='reg:squarederror',
  41. # learning_rate=0.01,
  42. # n_estimators=100,
  43. # max_depth=3)
  44. # model.fit(trains_array, trains_label_array)
  45. # df_test, tests_array, _ = func_make_data("20240920.csv")
  46. # y_pred = model.predict(tests_array)
  47. # df_test["y_pred"] = y_pred
  48. #
  49. # condition_choose = ((df_test['y_pred'] <= 0.15)
  50. # & ((df_test['1_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
  51. #
  52. # )
  53. # condition_choose_real = condition_choose & (df_test['vov0'] <= 0.30)
  54. # profit_theshold = 0.34
  55. #
  56. # choose_bad = condition_choose.sum()
  57. # choose_bad_realbad = condition_choose_real.sum()
  58. # acc = choose_bad_realbad / choose_bad
  59. # print("acc:{} 分子={} 分母={} 总视频数={}".format(acc, choose_bad_realbad, choose_bad, df_test.size))
  60. #
  61. # surface = df_test.loc[condition_choose, '曝光占比'].sum()
  62. # surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()
  63. # print("总影响面:{} 收益影响面:{}".format(surface, surface_income))
  64. #
  65. # df_test["profit_loss_value"] = df_test['分母'] * (df_test['vov0'] - profit_theshold)
  66. # profit_loss_value = df_test.loc[condition_choose, 'profit_loss_value'].sum()
  67. # profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
  68. # print("总盈亏:{} 纯盈利:{}".format(profit_loss_value, profit_value))
  69. # 2 分类模型
  70. def apply_title(row):
  71. try:
  72. return row.replace("\n", "")
  73. except Exception as e:
  74. return row
  75. def func_make_data(file_path: str):
  76. df_list = [pd.read_csv(file) for file in file_path.split(",")]
  77. df = pd.concat(df_list, ignore_index=True)
  78. # df = pd.read_csv(file_path)
  79. df["title"] = df["title"].apply(apply_title)
  80. for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
  81. '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012', '1_vov0_分子', '1_vov0_分母',
  82. '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
  83. '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
  84. '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
  85. '4_vov012_分母', '5_vov012_分子', '5_vov012_分母']:
  86. df[col] = pd.to_numeric(df[col], errors='coerce')
  87. df.fillna(0, inplace=True)
  88. df["12_change"] = df["1_vov0"] - df["2_vov0"]
  89. df["23_change"] = df["2_vov0"] - df["3_vov0"]
  90. df["34_change"] = df["3_vov0"] - df["4_vov0"]
  91. features_name = ['1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01', '3_vov01', '4_vov01', '5_vov01',
  92. '3_vov012', '4_vov012', '5_vov012'
  93. , "12_change", "23_change", "34_change"
  94. , '2_vov01', '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012'
  95. # , '1_vov0_分子', '1_vov0_分母',
  96. # '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母',
  97. # '5_vov0_分子',
  98. # '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
  99. # '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
  100. # '4_vov012_分母', '5_vov012_分子', '5_vov012_分母'
  101. ]
  102. feature_array = df[features_name].values
  103. df["label"] = df["vov0"].apply(lambda x: 1 if x > 0.25 else 0)
  104. label_array = df["label"].values
  105. return df, feature_array, label_array
  106. try:
  107. date_train = sys.argv[1]
  108. date_test = sys.argv[2]
  109. except Exception as e:
  110. # date_train = "20241010.csv"
  111. # date_test = "20241011.csv"
  112. date_train = "20241009_train.csv,20241010_train.csv"
  113. date_test = "20241010_predict.csv"
  114. # date_train = "20240924.csv,20240923.csv,20240922.csv,20240921.csv,20240920.csv,20240919.csv"
  115. # date_train = "20240915.csv"
  116. # date_test = "20240916.csv"
  117. # date_train = "20240924_new.csv"
  118. # date_test = "20240925_new.csv"
  119. df, trains_array,trains_label_array = func_make_data(date_train)
  120. header = df.columns.tolist()
  121. # print(header) # print(df.dtypes)
  122. model = xgb.XGBClassifier(
  123. n_estimators=1000,
  124. learning_rate=0.01,
  125. max_depth=5,
  126. min_child_weight=1,
  127. gamma=0,
  128. subsample=0.8,
  129. colsample_bytree=0.8,
  130. objective= 'binary:logistic',
  131. nthread=8,
  132. scale_pos_weight=1,
  133. random_state=2024,
  134. seed=2024,
  135. # verbose=True,
  136. )
  137. model.fit(trains_array, trains_label_array)
  138. df_test, tests_array, _ = func_make_data(date_test)
  139. y_pred = model.predict_proba(tests_array)[:, 1]
  140. df_test["y_pred"] = y_pred
  141. condition_choose = ((df_test['y_pred'] <= 0.1)
  142. # & ((df_test['1_vov0_分母'] > 100))
  143. & ((df_test['4_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
  144. # & (df_test.index <= 10000)
  145. & ((df_test["1_vov0"] - df_test["2_vov0"] < 0.1))
  146. # & ((df_test["1_vov0"] - df_test["2_vov0"] <= 0.1) | (df_test["2_vov0"] <= 0) | (df_test["1_vov0"] <= 0.2))
  147. )
  148. profit_theshold = 0.3
  149. condition_choose_real = condition_choose & (df_test['vov0'] <= profit_theshold)
  150. df_test["condition_choose"] = condition_choose
  151. condition_fuck =condition_choose & (df_test['vov0'] > profit_theshold)
  152. df_test["condition_fuck"] = condition_fuck
  153. df_test[["vid","title","曝光占比","vov0", "condition_choose", "condition_fuck"]].to_csv("new_" + date_test, sep="\t", index=False)
  154. choose_bad = condition_choose.sum()
  155. choose_bad_realbad = condition_choose_real.sum()
  156. acc = choose_bad_realbad / choose_bad
  157. print("acc:{} 分子={} 分母={} 总视频数={} 盈利计算标注vov0大于:{}".format(acc, choose_bad_realbad, choose_bad, df_test.shape[0], profit_theshold))
  158. surface = df_test.loc[condition_choose, '曝光占比'].sum()
  159. surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()
  160. print("总影响面:{} 盈利影响面:{} 亏损影响面:{}".format(round(surface, 6), round(surface_income, 6), round(surface-surface_income, 6)))
  161. df_test["profit_loss_value"] = df_test['分母'] * (df_test['vov0'] - profit_theshold)
  162. profit_loss_value = df_test.loc[condition_choose, 'profit_loss_value'].sum()
  163. profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
  164. print("总盈亏:{} 纯盈利:{} 纯亏损:{} 盈利效率:{}".format(round(profit_loss_value, 1), round(profit_value, 1), round(profit_loss_value-profit_value, 1), round(profit_loss_value/profit_value, 6)))
  165. # 3 多分类模型 概率
  166. # names = []
  167. # with open("a_features_recsys.txt", "r") as file:
  168. # for line in file.readlines():
  169. # line = line.strip()
  170. # names.append(line)
  171. #
  172. # trains = []
  173. # trains_label = []
  174. # # trains_id = []
  175. # with open("20240728.txt", "r") as file:
  176. # for line in tqdm(file.readlines()):
  177. # lines = line.strip().split("\t")
  178. # label = lines[0]
  179. # trains_label.append(int(label))
  180. # m = dict([(l.split(":")[0], l.split(":")[1]) for l in lines[1:]])
  181. # row = [float(m.get(name, "0.0")) for name in names]
  182. # trains.append(row)
  183. # # for key in m.keys():
  184. # # if key.startswith("cid_"):
  185. # # trains_id.append(key.replace("cid_", ""))
  186. # # break
  187. #
  188. # trains_array = np.array(trains)
  189. # trains_label_array = np.array(trains_label)
  190. # print("train samples={} positive={} rate={}".format(
  191. # len(trains_label),
  192. # sum(trains_label),
  193. # format(1.0 * sum(trains_label)/len(trains_label), ".6f"))
  194. # )
  195. #
  196. #
  197. # model = xgb.XGBClassifier(
  198. # n_estimators=2000,
  199. # learning_rate=0.01,
  200. # max_depth=5,
  201. # min_child_weight=1,
  202. # gamma=0,
  203. # subsample=0.8,
  204. # colsample_bytree=0.8,
  205. # objective= 'binary:logistic',
  206. # nthread=8,
  207. # scale_pos_weight=1,
  208. # random_state=2024,
  209. # seed=2024,
  210. # verbose=True,
  211. # )
  212. # model.fit(trains_array, trains_label_array)
  213. #
  214. # tests = []
  215. # tests_label = []
  216. # # tests_id = []
  217. # with open("20240729.txt", "r") as file:
  218. # for line in tqdm(file.readlines()):
  219. # lines = line.strip().split("\t")
  220. # label = lines[0]
  221. # tests_label.append(int(label))
  222. # m = dict([(l.split(":")[0], l.split(":")[1]) for l in lines[1:]])
  223. # row = [float(m.get(name, "0.0")) for name in names]
  224. # tests.append(row)
  225. # # for key in m.keys():
  226. # # if key.startswith("cid_"):
  227. # # tests_id.append(key.replace("cid_", ""))
  228. # # break
  229. # tests_array = np.array(tests)
  230. # tests_label_array = np.array(tests_label)
  231. # print("test samples={} positive={} rate={}".format(
  232. # len(tests_label),
  233. # sum(tests_label),
  234. # format(1.0 * sum(tests_label)/len(tests_label), ".6f"))
  235. # )
  236. #
  237. # # 进行预测
  238. # y_pred = model.predict(tests_array)
  239. # probabilities = model.predict_proba(tests_array)
  240. #
  241. #
  242. #
  243. # auc = roc_auc_score(tests_label_array, probabilities[:, 1])
  244. # print("auc:{}".format(auc))
  245. """
  246. https://zhuanlan.zhihu.com/p/688993572
  247. """