# tree_model_xgb_vov.py
  1. import pandas as pd
  2. from sklearn.metrics import roc_auc_score
  3. import numpy as np
  4. import xgboost as xgb
  5. from tqdm import tqdm
  6. import sys
  7. # def func_make_data(file_path: str):
  8. # df = pd.read_csv(file_path)
  9. # for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
  10. # '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012', '1_vov0_分子', '1_vov0_分母',
  11. # '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
  12. # '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
  13. # '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
  14. # '4_vov012_分母', '5_vov012_分子', '5_vov012_分母']:
  15. # df[col] = pd.to_numeric(df[col], errors='coerce')
  16. #
  17. # df.fillna(0, inplace=True)
  18. # df["12_change"] = df["1_vov0"] - df["2_vov0"]
  19. # df["23_change"] = df["2_vov0"] - df["3_vov0"]
  20. # df["34_change"] = df["3_vov0"] - df["4_vov0"]
  21. # features_name = ['1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01', '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012'
  22. # ,"12_change", "23_change", "34_change"
  23. # ,'2_vov01', '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012'
  24. # ,'1_vov0_分子', '1_vov0_分母',
  25. # '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
  26. # '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
  27. # '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
  28. # '4_vov012_分母', '5_vov012_分子', '5_vov012_分母'
  29. # ]
  30. # feature_array = df[features_name].values
  31. # df["reg_label"] = df["vov0"].apply(lambda x: float(x))
  32. # label_array = df["reg_label"].values
  33. # return df, feature_array, label_array
  34. #
  35. # df, trains_array,trains_label_array = func_make_data("20240919.csv")
  36. # header = df.columns.tolist()
  37. # print(header) # print(df.dtypes)
  38. # # 1 回归模型
  39. # model = xgb.XGBRegressor(objective='reg:squarederror',
  40. # learning_rate=0.01,
  41. # n_estimators=100,
  42. # max_depth=3)
  43. # model.fit(trains_array, trains_label_array)
  44. # df_test, tests_array, _ = func_make_data("20240920.csv")
  45. # y_pred = model.predict(tests_array)
  46. # df_test["y_pred"] = y_pred
  47. #
  48. # condition_choose = ((df_test['y_pred'] <= 0.15)
  49. # & ((df_test['1_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
  50. #
  51. # )
  52. # condition_choose_real = condition_choose & (df_test['vov0'] <= 0.30)
  53. # profit_theshold = 0.34
  54. #
  55. # choose_bad = condition_choose.sum()
  56. # choose_bad_realbad = condition_choose_real.sum()
  57. # acc = choose_bad_realbad / choose_bad
  58. # print("acc:{} 分子={} 分母={} 总视频数={}".format(acc, choose_bad_realbad, choose_bad, df_test.size))
  59. #
  60. # surface = df_test.loc[condition_choose, '曝光占比'].sum()
  61. # surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()
  62. # print("总影响面:{} 收益影响面:{}".format(surface, surface_income))
  63. #
  64. # df_test["profit_loss_value"] = df_test['分母'] * (df_test['vov0'] - profit_theshold)
  65. # profit_loss_value = df_test.loc[condition_choose, 'profit_loss_value'].sum()
  66. # profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
  67. # print("总盈亏:{} 纯盈利:{}".format(profit_loss_value, profit_value))
# 2. Binary classification model
  69. def func_make_data(file_path: str):
  70. df = pd.read_csv(file_path)
  71. df["title"] = df["title"].apply(lambda x: x.replace("\n", ""))
  72. for col in ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
  73. '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012', '1_vov0_分子', '1_vov0_分母',
  74. '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母', '5_vov0_分子',
  75. '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
  76. '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
  77. '4_vov012_分母', '5_vov012_分子', '5_vov012_分母']:
  78. df[col] = pd.to_numeric(df[col], errors='coerce')
  79. df.fillna(0, inplace=True)
  80. df["12_change"] = df["1_vov0"] - df["2_vov0"]
  81. df["23_change"] = df["2_vov0"] - df["3_vov0"]
  82. df["34_change"] = df["3_vov0"] - df["4_vov0"]
  83. features_name = ['1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01', '3_vov01', '4_vov01', '5_vov01',
  84. '3_vov012', '4_vov012', '5_vov012'
  85. , "12_change", "23_change", "34_change"
  86. , '2_vov01', '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012'
  87. # , '1_vov0_分子', '1_vov0_分母',
  88. # '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母',
  89. # '5_vov0_分子',
  90. # '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母', '4_vov01_分子',
  91. # '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母', '4_vov012_分子',
  92. # '4_vov012_分母', '5_vov012_分子', '5_vov012_分母'
  93. ]
  94. feature_array = df[features_name].values
  95. df["label"] = df["vov0"].apply(lambda x: 1 if x > 0.25 else 0)
  96. label_array = df["label"].values
  97. return df, feature_array, label_array
  98. try:
  99. date_train = sys.argv[1]
  100. date_test = sys.argv[2]
  101. except Exception as e:
  102. date_train = "20240919.csv"
  103. date_test = "20240920.csv"
  104. df, trains_array,trains_label_array = func_make_data(date_train)
  105. header = df.columns.tolist()
  106. # print(header) # print(df.dtypes)
  107. model = xgb.XGBClassifier(
  108. n_estimators=100,
  109. learning_rate=0.01,
  110. max_depth=5,
  111. min_child_weight=1,
  112. gamma=0,
  113. subsample=0.8,
  114. colsample_bytree=0.8,
  115. objective= 'binary:logistic',
  116. nthread=8,
  117. scale_pos_weight=1,
  118. random_state=2024,
  119. seed=2024,
  120. # verbose=True,
  121. )
  122. model.fit(trains_array, trains_label_array)
  123. df_test, tests_array, _ = func_make_data(date_test)
  124. y_pred = model.predict_proba(tests_array)[:, 1]
  125. df_test["y_pred"] = y_pred
  126. condition_choose = ((df_test['y_pred'] <= 0.2)
  127. # & ((df_test['1_vov0_分母'] > 50) | (df_test['2_vov0_分母'] > 50) | (df_test['3_vov0_分母'] > 50))
  128. & (df_test.index <= 10000)
  129. )
  130. profit_theshold = 0.3
  131. condition_choose_real = condition_choose & (df_test['vov0'] <= profit_theshold)
  132. df_test["condition_choose"] = condition_choose
  133. df_test[["vid","title","曝光占比","vov0", "condition_choose"]].to_csv("new_" + date_test, sep="\t", index=False)
  134. choose_bad = condition_choose.sum()
  135. choose_bad_realbad = condition_choose_real.sum()
  136. acc = choose_bad_realbad / choose_bad
  137. print("acc:{} 分子={} 分母={} 总视频数={} 盈利计算标注vov0大于:{}".format(acc, choose_bad_realbad, choose_bad, df_test.size, profit_theshold))
  138. surface = df_test.loc[condition_choose, '曝光占比'].sum()
  139. surface_income = df_test.loc[condition_choose_real, '曝光占比'].sum()
  140. print("总影响面:{} 盈利影响面:{} 亏损影响面:{}".format(round(surface, 6), round(surface_income, 6), round(surface-surface_income, 6)))
  141. df_test["profit_loss_value"] = df_test['分母'] * (df_test['vov0'] - profit_theshold)
  142. profit_loss_value = df_test.loc[condition_choose, 'profit_loss_value'].sum()
  143. profit_value = df_test.loc[condition_choose_real, 'profit_loss_value'].sum()
  144. print("总盈亏:{} 纯盈利:{} 纯亏损:{} 盈利效率:{}".format(round(profit_loss_value, 1), round(profit_value, 1), round(profit_loss_value-profit_value, 1), round(profit_loss_value/profit_value, 6)))
# 3. Multi-class model probabilities (commented-out experiment)
  146. # names = []
  147. # with open("a_features_recsys.txt", "r") as file:
  148. # for line in file.readlines():
  149. # line = line.strip()
  150. # names.append(line)
  151. #
  152. # trains = []
  153. # trains_label = []
  154. # # trains_id = []
  155. # with open("20240728.txt", "r") as file:
  156. # for line in tqdm(file.readlines()):
  157. # lines = line.strip().split("\t")
  158. # label = lines[0]
  159. # trains_label.append(int(label))
  160. # m = dict([(l.split(":")[0], l.split(":")[1]) for l in lines[1:]])
  161. # row = [float(m.get(name, "0.0")) for name in names]
  162. # trains.append(row)
  163. # # for key in m.keys():
  164. # # if key.startswith("cid_"):
  165. # # trains_id.append(key.replace("cid_", ""))
  166. # # break
  167. #
  168. # trains_array = np.array(trains)
  169. # trains_label_array = np.array(trains_label)
  170. # print("train samples={} positive={} rate={}".format(
  171. # len(trains_label),
  172. # sum(trains_label),
  173. # format(1.0 * sum(trains_label)/len(trains_label), ".6f"))
  174. # )
  175. #
  176. #
  177. # model = xgb.XGBClassifier(
  178. # n_estimators=2000,
  179. # learning_rate=0.01,
  180. # max_depth=5,
  181. # min_child_weight=1,
  182. # gamma=0,
  183. # subsample=0.8,
  184. # colsample_bytree=0.8,
  185. # objective= 'binary:logistic',
  186. # nthread=8,
  187. # scale_pos_weight=1,
  188. # random_state=2024,
  189. # seed=2024,
  190. # verbose=True,
  191. # )
  192. # model.fit(trains_array, trains_label_array)
  193. #
  194. # tests = []
  195. # tests_label = []
  196. # # tests_id = []
  197. # with open("20240729.txt", "r") as file:
  198. # for line in tqdm(file.readlines()):
  199. # lines = line.strip().split("\t")
  200. # label = lines[0]
  201. # tests_label.append(int(label))
  202. # m = dict([(l.split(":")[0], l.split(":")[1]) for l in lines[1:]])
  203. # row = [float(m.get(name, "0.0")) for name in names]
  204. # tests.append(row)
  205. # # for key in m.keys():
  206. # # if key.startswith("cid_"):
  207. # # tests_id.append(key.replace("cid_", ""))
  208. # # break
  209. # tests_array = np.array(tests)
  210. # tests_label_array = np.array(tests_label)
  211. # print("test samples={} positive={} rate={}".format(
  212. # len(tests_label),
  213. # sum(tests_label),
  214. # format(1.0 * sum(tests_label)/len(tests_label), ".6f"))
  215. # )
  216. #
  217. # # 进行预测
  218. # y_pred = model.predict(tests_array)
  219. # probabilities = model.predict_proba(tests_array)
  220. #
  221. #
  222. #
  223. # auc = roc_auc_score(tests_label_array, probabilities[:, 1])
  224. # print("auc:{}".format(auc))
  225. """
  226. https://zhuanlan.zhihu.com/p/688993572
  227. """