feature.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. import glob
  2. import os.path
  3. import numpy as np
  4. import pandas as pd
  5. import xgboost as xgb
  6. from model.XGBModel import XGBModel
  7. features = [
  8. "cpa",
  9. "b2_3h_ctr",
  10. "b2_3h_ctcvr",
  11. "b2_3h_cvr",
  12. "b2_3h_conver",
  13. "b2_3h_ecpm",
  14. "b2_3h_click",
  15. "b2_3h_conver*log(view)",
  16. "b2_3h_conver*ctcvr",
  17. "b2_6h_ctr",
  18. "b2_6h_ctcvr",
  19. "b2_6h_cvr",
  20. "b2_6h_conver",
  21. "b2_6h_ecpm",
  22. "b2_6h_click",
  23. "b2_6h_conver*log(view)",
  24. "b2_6h_conver*ctcvr",
  25. "b2_12h_ctr",
  26. "b2_12h_ctcvr",
  27. "b2_12h_cvr",
  28. "b2_12h_conver",
  29. "b2_12h_ecpm",
  30. "b2_12h_click",
  31. "b2_12h_conver*log(view)",
  32. "b2_12h_conver*ctcvr",
  33. "b2_1d_ctr",
  34. "b2_1d_ctcvr",
  35. "b2_1d_cvr",
  36. "b2_1d_conver",
  37. "b2_1d_ecpm",
  38. "b2_1d_click",
  39. "b2_1d_conver*log(view)",
  40. "b2_1d_conver*ctcvr",
  41. "b2_3d_ctr",
  42. "b2_3d_ctcvr",
  43. "b2_3d_cvr",
  44. "b2_3d_conver",
  45. "b2_3d_ecpm",
  46. "b2_3d_click",
  47. "b2_3d_conver*log(view)",
  48. "b2_3d_conver*ctcvr",
  49. "b2_7d_ctr",
  50. "b2_7d_ctcvr",
  51. "b2_7d_cvr",
  52. "b2_7d_conver",
  53. "b2_7d_ecpm",
  54. "b2_7d_click",
  55. "b2_7d_conver*log(view)",
  56. "b2_7d_conver*ctcvr",
  57. "b3_3h_ctr",
  58. "b3_3h_ctcvr",
  59. "b3_3h_cvr",
  60. "b3_3h_conver",
  61. "b3_3h_ecpm",
  62. "b3_3h_click",
  63. "b3_3h_conver*log(view)",
  64. "b3_3h_conver*ctcvr",
  65. "b3_6h_ctr",
  66. "b3_6h_ctcvr",
  67. "b3_6h_cvr",
  68. "b3_6h_conver",
  69. "b3_6h_ecpm",
  70. "b3_6h_click",
  71. "b3_6h_conver*log(view)",
  72. "b3_6h_conver*ctcvr",
  73. "b3_12h_ctr",
  74. "b3_12h_ctcvr",
  75. "b3_12h_cvr",
  76. "b3_12h_conver",
  77. "b3_12h_ecpm",
  78. "b3_12h_click",
  79. "b3_12h_conver*log(view)",
  80. "b3_12h_conver*ctcvr",
  81. "b3_1d_ctr",
  82. "b3_1d_ctcvr",
  83. "b3_1d_cvr",
  84. "b3_1d_conver",
  85. "b3_1d_ecpm",
  86. "b3_1d_click",
  87. "b3_1d_conver*log(view)",
  88. "b3_1d_conver*ctcvr",
  89. "b3_3d_ctr",
  90. "b3_3d_ctcvr",
  91. "b3_3d_cvr",
  92. "b3_3d_conver",
  93. "b3_3d_ecpm",
  94. "b3_3d_click",
  95. "b3_3d_conver*log(view)",
  96. "b3_3d_conver*ctcvr",
  97. "b3_7d_ctr",
  98. "b3_7d_ctcvr",
  99. "b3_7d_cvr",
  100. "b3_7d_conver",
  101. "b3_7d_ecpm",
  102. "b3_7d_click",
  103. "b3_7d_conver*log(view)",
  104. "b3_7d_conver*ctcvr",
  105. "b4_3h_ctr",
  106. "b4_3h_ctcvr",
  107. "b4_3h_cvr",
  108. "b4_3h_conver",
  109. "b4_3h_ecpm",
  110. "b4_3h_click",
  111. "b4_3h_conver*log(view)",
  112. "b4_3h_conver*ctcvr",
  113. "b4_6h_ctr",
  114. "b4_6h_ctcvr",
  115. "b4_6h_cvr",
  116. "b4_6h_conver",
  117. "b4_6h_ecpm",
  118. "b4_6h_click",
  119. "b4_6h_conver*log(view)",
  120. "b4_6h_conver*ctcvr",
  121. "b4_12h_ctr",
  122. "b4_12h_ctcvr",
  123. "b4_12h_cvr",
  124. "b4_12h_conver",
  125. "b4_12h_ecpm",
  126. "b4_12h_click",
  127. "b4_12h_conver*log(view)",
  128. "b4_12h_conver*ctcvr",
  129. "b4_1d_ctr",
  130. "b4_1d_ctcvr",
  131. "b4_1d_cvr",
  132. "b4_1d_conver",
  133. "b4_1d_ecpm",
  134. "b4_1d_click",
  135. "b4_1d_conver*log(view)",
  136. "b4_1d_conver*ctcvr",
  137. "b4_3d_ctr",
  138. "b4_3d_ctcvr",
  139. "b4_3d_cvr",
  140. "b4_3d_conver",
  141. "b4_3d_ecpm",
  142. "b4_3d_click",
  143. "b4_3d_conver*log(view)",
  144. "b4_3d_conver*ctcvr",
  145. "b4_7d_ctr",
  146. "b4_7d_ctcvr",
  147. "b4_7d_cvr",
  148. "b4_7d_conver",
  149. "b4_7d_ecpm",
  150. "b4_7d_click",
  151. "b4_7d_conver*log(view)",
  152. "b4_7d_conver*ctcvr",
  153. "b5_3h_ctr",
  154. "b5_3h_ctcvr",
  155. "b5_3h_cvr",
  156. "b5_3h_conver",
  157. "b5_3h_ecpm",
  158. "b5_3h_click",
  159. "b5_3h_conver*log(view)",
  160. "b5_3h_conver*ctcvr",
  161. "b5_6h_ctr",
  162. "b5_6h_ctcvr",
  163. "b5_6h_cvr",
  164. "b5_6h_conver",
  165. "b5_6h_ecpm",
  166. "b5_6h_click",
  167. "b5_6h_conver*log(view)",
  168. "b5_6h_conver*ctcvr",
  169. "b5_12h_ctr",
  170. "b5_12h_ctcvr",
  171. "b5_12h_cvr",
  172. "b5_12h_conver",
  173. "b5_12h_ecpm",
  174. "b5_12h_click",
  175. "b5_12h_conver*log(view)",
  176. "b5_12h_conver*ctcvr",
  177. "b5_1d_ctr",
  178. "b5_1d_ctcvr",
  179. "b5_1d_cvr",
  180. "b5_1d_conver",
  181. "b5_1d_ecpm",
  182. "b5_1d_click",
  183. "b5_1d_conver*log(view)",
  184. "b5_1d_conver*ctcvr",
  185. "b5_3d_ctr",
  186. "b5_3d_ctcvr",
  187. "b5_3d_cvr",
  188. "b5_3d_conver",
  189. "b5_3d_ecpm",
  190. "b5_3d_click",
  191. "b5_3d_conver*log(view)",
  192. "b5_3d_conver*ctcvr",
  193. "b5_7d_ctr",
  194. "b5_7d_ctcvr",
  195. "b5_7d_cvr",
  196. "b5_7d_conver",
  197. "b5_7d_ecpm",
  198. "b5_7d_click",
  199. "b5_7d_conver*log(view)",
  200. "b5_7d_conver*ctcvr",
  201. "b8_3h_ctr",
  202. "b8_3h_ctcvr",
  203. "b8_3h_cvr",
  204. "b8_3h_conver",
  205. "b8_3h_ecpm",
  206. "b8_3h_click",
  207. "b8_3h_conver*log(view)",
  208. "b8_3h_conver*ctcvr",
  209. "b8_6h_ctr",
  210. "b8_6h_ctcvr",
  211. "b8_6h_cvr",
  212. "b8_6h_conver",
  213. "b8_6h_ecpm",
  214. "b8_6h_click",
  215. "b8_6h_conver*log(view)",
  216. "b8_6h_conver*ctcvr",
  217. "b8_12h_ctr",
  218. "b8_12h_ctcvr",
  219. "b8_12h_cvr",
  220. "b8_12h_conver",
  221. "b8_12h_ecpm",
  222. "b8_12h_click",
  223. "b8_12h_conver*log(view)",
  224. "b8_12h_conver*ctcvr",
  225. "b8_1d_ctr",
  226. "b8_1d_ctcvr",
  227. "b8_1d_cvr",
  228. "b8_1d_conver",
  229. "b8_1d_ecpm",
  230. "b8_1d_click",
  231. "b8_1d_conver*log(view)",
  232. "b8_1d_conver*ctcvr",
  233. "b8_3d_ctr",
  234. "b8_3d_ctcvr",
  235. "b8_3d_cvr",
  236. "b8_3d_conver",
  237. "b8_3d_ecpm",
  238. "b8_3d_click",
  239. "b8_3d_conver*log(view)",
  240. "b8_3d_conver*ctcvr",
  241. "b8_7d_ctr",
  242. "b8_7d_ctcvr",
  243. "b8_7d_cvr",
  244. "b8_7d_conver",
  245. "b8_7d_ecpm",
  246. "b8_7d_click",
  247. "b8_7d_conver*log(view)",
  248. "b8_7d_conver*ctcvr",
  249. "b6_7d_ctr",
  250. "b6_7d_ctcvr",
  251. "b6_7d_cvr",
  252. "b6_7d_conver",
  253. "b6_7d_ecpm",
  254. "b6_7d_click",
  255. "b6_7d_conver*log(view)",
  256. "b6_7d_conver*ctcvr",
  257. "b6_14d_ctr",
  258. "b6_14d_ctcvr",
  259. "b6_14d_cvr",
  260. "b6_14d_conver",
  261. "b6_14d_ecpm",
  262. "b6_14d_click",
  263. "b6_14d_conver*log(view)",
  264. "b6_14d_conver*ctcvr",
  265. "b7_7d_ctr",
  266. "b7_7d_ctcvr",
  267. "b7_7d_cvr",
  268. "b7_7d_conver",
  269. "b7_7d_ecpm",
  270. "b7_7d_click",
  271. "b7_7d_conver*log(view)",
  272. "b7_7d_conver*ctcvr",
  273. "b7_14d_ctr",
  274. "b7_14d_ctcvr",
  275. "b7_14d_cvr",
  276. "b7_14d_conver",
  277. "b7_14d_ecpm",
  278. "b7_14d_click",
  279. "b7_14d_conver*log(view)",
  280. "b7_14d_conver*ctcvr",
  281. "viewAll",
  282. "clickAll",
  283. "converAll",
  284. "incomeAll",
  285. "ctr_all",
  286. "ctcvr_all",
  287. "cvr_all",
  288. "ecpm_all",
  289. "timediff_view",
  290. "timediff_click",
  291. "timediff_conver",
  292. "actionstatic_view",
  293. "actionstatic_click",
  294. "actionstatic_conver",
  295. "actionstatic_income",
  296. "actionstatic_ctr",
  297. "actionstatic_ctcvr",
  298. "actionstatic_cvr",
  299. "e1_tags_3d_matchnum",
  300. "e1_tags_3d_maxscore",
  301. "e1_tags_3d_avgscore",
  302. "e1_tags_7d_matchnum",
  303. "e1_tags_7d_maxscore",
  304. "e1_tags_7d_avgscore",
  305. "e1_tags_14d_matchnum",
  306. "e1_tags_14d_maxscore",
  307. "e1_tags_14d_avgscore",
  308. "e2_tags_3d_matchnum",
  309. "e2_tags_3d_maxscore",
  310. "e2_tags_3d_avgscore",
  311. "e2_tags_7d_matchnum",
  312. "e2_tags_7d_maxscore",
  313. "e2_tags_7d_avgscore",
  314. "e2_tags_14d_matchnum",
  315. "e2_tags_14d_maxscore",
  316. "e2_tags_14d_avgscore",
  317. "d1_feature_3h_ctr",
  318. "d1_feature_3h_ctcvr",
  319. "d1_feature_3h_cvr",
  320. "d1_feature_3h_conver",
  321. "d1_feature_3h_ecpm",
  322. "d1_feature_6h_ctr",
  323. "d1_feature_6h_ctcvr",
  324. "d1_feature_6h_cvr",
  325. "d1_feature_6h_conver",
  326. "d1_feature_6h_ecpm",
  327. "d1_feature_12h_ctr",
  328. "d1_feature_12h_ctcvr",
  329. "d1_feature_12h_cvr",
  330. "d1_feature_12h_conver",
  331. "d1_feature_12h_ecpm",
  332. "d1_feature_1d_ctr",
  333. "d1_feature_1d_ctcvr",
  334. "d1_feature_1d_cvr",
  335. "d1_feature_1d_conver",
  336. "d1_feature_1d_ecpm",
  337. "d1_feature_3d_ctr",
  338. "d1_feature_3d_ctcvr",
  339. "d1_feature_3d_cvr",
  340. "d1_feature_3d_conver",
  341. "d1_feature_3d_ecpm",
  342. "d1_feature_7d_ctr",
  343. "d1_feature_7d_ctcvr",
  344. "d1_feature_7d_cvr",
  345. "d1_feature_7d_conver",
  346. "d1_feature_7d_ecpm",
  347. "vid_rank_ctr_1d",
  348. "vid_rank_ctr_3d",
  349. "vid_rank_ctr_7d",
  350. "vid_rank_ctr_14d",
  351. "vid_rank_ctcvr_1d",
  352. "vid_rank_ctcvr_3d",
  353. "vid_rank_ctcvr_7d",
  354. "vid_rank_ctcvr_14d",
  355. "vid_rank_ecpm_1d",
  356. "vid_rank_ecpm_3d",
  357. "vid_rank_ecpm_7d",
  358. "vid_rank_ecpm_14d"
  359. ]
  360. def load_model_and_score(model_path, feature_map):
  361. model = xgb.Booster()
  362. model.load_model(f"{model_path}/data/XGBoostClassificationModel")
  363. model.set_param({"missing": 0.0})
  364. values = np.array([
  365. float(feature_map.get(feature, 0.0))
  366. for feature in features
  367. ], dtype=np.float32)
  368. dm = xgb.DMatrix(values.reshape(1, -1), missing=0.0)
  369. return float(model.predict(dm, output_margin=False)[0])
  370. def _multi_importance_flat_map(importance_map: dict):
  371. result = []
  372. all_features = set(key for inner_dict in importance_map.values() for key in inner_dict.keys())
  373. for feature in all_features:
  374. item = {
  375. "feature": feature,
  376. }
  377. for key in importance_map:
  378. if feature in importance_map[key]:
  379. item[key] = importance_map[key][feature]
  380. result.append(item)
  381. return result
  382. def _main():
  383. model_path = "/Users/zhao/Desktop/tzld/XGB/35_ad_model"
  384. all_model = glob.glob(f"{model_path}/*")
  385. model_dict = {}
  386. for e in all_model:
  387. if "model_xgb_351_1000_v2" in e:
  388. model_dict[e] = XGBModel(model_file=f"{e}/data/XGBoostClassificationModel", features=features)
  389. weight_dict = {}
  390. cover_dict = {}
  391. gain_dict = {}
  392. for key in model_dict:
  393. dt = os.path.basename(key)[-9:]
  394. weight_dict[dt] = model_dict[key].feature_weight_importance()
  395. cover_dict[dt] = model_dict[key].feature_cover_importance()
  396. gain_dict[dt] = model_dict[key].feature_gain_importance()
  397. weight = _multi_importance_flat_map(dict(sorted(weight_dict.items())))
  398. cover = _multi_importance_flat_map(dict(sorted(cover_dict.items())))
  399. gain = _multi_importance_flat_map(dict(sorted(gain_dict.items())))
  400. pd.DataFrame(weight).to_csv("/Users/zhao/Desktop/weight.csv", index=False)
  401. pd.DataFrame(cover).to_csv("/Users/zhao/Desktop/cover.csv", index=False)
  402. pd.DataFrame(gain).to_csv("/Users/zhao/Desktop/gain.csv", index=False)
  403. if __name__ == '__main__':
  404. _main()