process_feature.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. import time
  2. import numpy as np
  3. add_feature = [
  4. 'all_return_day1_return_count', # -- 1/3/7/14日内总回流 #12
  5. 'all_return_day3_return_count',
  6. 'all_return_day7_return_count',
  7. 'all_return_day14_return_count',
  8. 'three_return_day1_return_count', # -- 1/3/7/14日内前三层回流 #14
  9. 'three_return_day3_return_count',
  10. 'three_return_day7_return_count',
  11. 'three_return_day14_return_count',
  12. 'four_up_return_day1_return_count', # -- 1/3/7/14日内四+层回流 #15
  13. 'four_up_return_day3_return_count',
  14. 'four_up_return_day7_return_count',
  15. 'four_up_return_day14_return_count',
  16. 'one_return_day1_return_count', # -- 1/3/7/14日内一层回流 #13
  17. 'one_return_day3_return_count',
  18. 'one_return_day7_return_count',
  19. 'one_return_day14_return_count',
  20. 'four_up_return_div_three_return_day1', # -- 1/3/7/14日内四+层回流/前三层回流 #23
  21. 'four_up_return_div_three_return_day3',
  22. 'four_up_return_div_three_return_day7',
  23. 'four_up_return_div_three_return_day14',
  24. 'all_return_day1_view_day1_return_count', # -- 1/3/7/14日内曝光在1/3/7/14日内回流 #8
  25. 'all_return_day3_view_day3_return_count',
  26. 'all_return_day7_view_day7_return_count',
  27. 'all_return_day14_view_day14_return_count',
  28. 'three_return_day1_view_day1_return_count', # -- 1/3/7/14日内曝光在1/3/7/14日内前三层回流 #10
  29. 'three_return_day3_view_day3_return_count',
  30. 'three_return_day7_view_day7_return_count',
  31. 'three_return_day14_view_day14_return_count',
  32. 'four_up_return_day1_view_day1_return_count', # -- 1/3/7/14日内曝光在1/3/7/14日内四+层回流 # 11
  33. 'four_up_return_day3_view_day3_return_count',
  34. 'four_up_return_day7_view_day7_return_count',
  35. 'four_up_return_day14_view_day14_return_count',
  36. 'one_return_day1_view_day1_return_count', ##-- 1/3/7/14日内曝光在1/3/7/14日内一层回流 #9
  37. 'one_return_day3_view_day3_return_count',
  38. 'one_return_day7_view_day7_return_count',
  39. 'one_return_day14_view_day14_return_count',
  40. 'all_return_day1_on_day1_return_count', # 前day1+1 / day1+3/day1+7/day1+14 到前 day1+1日内曝光在 day1的总回流 #16
  41. 'all_return_day3_on_day1_return_count',
  42. 'all_return_day7_on_day1_return_count',
  43. 'all_return_day14_on_day1_return_count',
  44. 'four_up_return_day1_view_day1_return_div_three_d1', # -- 1/3/7/14日内曝光在1/3/7/14日内四+层回流/前三层回流 #22
  45. 'four_up_return_day3_view_day3_return_div_three_d3',
  46. 'four_up_return_day7_view_day7_return_div_three_d7',
  47. 'four_up_return_day14_view_day14_return_div_three_d14',
  48. 'day1ctr', # -- 1/3/7/14/30/60日内播放/曝光 #17
  49. 'day3ctr',
  50. 'day7ctr',
  51. 'day14ctr',
  52. 'day30ctr',
  53. 'day60ctr',
  54. 'day1sov', # -- 1/3/7/14/30/60日内分享/曝光 #18
  55. 'day3sov',
  56. 'day7sov',
  57. 'day14sov',
  58. 'day30sov',
  59. 'day60sov',
  60. 'day1rov', # -- 1/3/7/14日内曝光的回流/曝光 #19
  61. 'day3rov',
  62. 'day7rov',
  63. 'day14rov',
  64. 'day1soc', # -- 1/3/7/14/30/60日内分享/播放 #20
  65. 'day3soc',
  66. 'day7soc',
  67. 'day14soc',
  68. 'day30soc',
  69. 'day60soc',
  70. 'day1roc', # -- 1/3/7/14日内曝光的回流/播放 #21
  71. 'day3roc',
  72. 'day7roc',
  73. 'day14roc',
  74. 'oneday_day1rov', # -- 1/3/7/14日内曝光在今日的回流/ 1/3/7/14日内曝光 #24
  75. 'oneday_day3rov',
  76. 'oneday_day7rov',
  77. 'oneday_day14rov',
  78. 'futre7dayreturn'
  79. ,'todyviewcount_rank'
  80. ,'day1viewcount_rank'
  81. ]
  82. featurename = [
  83. 'dt',
  84. 'videoid',
  85. 'day1playcount',
  86. 'day1returncount',
  87. 'day1sharecount',
  88. 'day1viewcount',
  89. 'day14playcount',
  90. 'day14returncount',
  91. 'day14sharecount',
  92. 'day14viewcount',
  93. 'day30playcount',
  94. 'day30returncount',
  95. 'day30sharecount',
  96. 'day30viewcount',
  97. 'day3playcount',
  98. 'day3returncount',
  99. 'day3sharecount',
  100. 'day3viewcount',
  101. 'day60playcount',
  102. 'day60returncount',
  103. 'day60sharecount',
  104. 'day60viewcount',
  105. 'day7playcount',
  106. 'day7returncount',
  107. 'day7sharecount',
  108. 'day7viewcount',
  109. 'videocategory11',
  110. 'videocategory12',
  111. 'videocategory45',
  112. 'videocategory49',
  113. 'videocategory1',
  114. 'videocategory2',
  115. 'videocategory3',
  116. 'videocategory4',
  117. 'videocategory5',
  118. 'videocategory6',
  119. 'videocategory7',
  120. 'videocategory8',
  121. 'videocategory9',
  122. 'videocategory85',
  123. 'videocategory10',
  124. 'videocategory555',
  125. 'usercategory1',
  126. 'usercategory2',
  127. 'usercategory3',
  128. 'usercategory4',
  129. 'usercategory5',
  130. 'usercategory6',
  131. 'usercategory7',
  132. 'usercategory8',
  133. 'usercategory9',
  134. 'usercategory10',
  135. 'usercategory11',
  136. 'usercategory12',
  137. 'usercategory45',
  138. 'usercategory49',
  139. 'usercategory85',
  140. 'usercategory555',
  141. 'todyviewcount',
  142. 'day5returncount_1_stage',
  143. 'day5returncount_2_stage',
  144. 'day5returncount_3_stage',
  145. 'day5returncount_4_stage',
  146. 'stage_one_retrn',
  147. 'stage_two_retrn',
  148. 'stage_three_retrn',
  149. 'stage_four_retrn']
  150. words = ['videotags','words_without_tags']
  151. featurename = featurename + add_feature + words
  152. # 首页特征
  153. root_page_1day = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount']
  154. root_page_3day = ['day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount']
  155. root_page_7day = ['day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount']
  156. root_page_14day = ['day14playcount', 'day14returncount', 'day14sharecount', 'day14viewcount']
  157. root_page_30day = ['day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount']
  158. root_page_60day = ['day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount']
  159. return_5day = ['day5returncount_1_stage', 'day5returncount_2_stage', 'day5returncount_3_stage',
  160. 'day5returncount_4_stage']
  161. cate_feat = ['videocategory1', 'videocategory10', 'videocategory11', 'videocategory12',
  162. 'videocategory2', 'videocategory3', 'videocategory4', 'videocategory45',
  163. 'videocategory49', 'videocategory5', 'videocategory6',
  164. 'videocategory7', 'videocategory8', 'videocategory85', 'videocategory9', 'videocategory555']
  165. one_hot_feature = ['videotags','words_without_tags','videoid']
  166. #
  167. features = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount', 'day30playcount', 'day30returncount',
  168. 'day30sharecount', 'day30viewcount', 'day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount',
  169. 'day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount', 'day7playcount', 'day7returncount',
  170. 'day7sharecount', 'day7viewcount', 'usercategory1', 'usercategory2', 'usercategory3', 'usercategory4',
  171. 'usercategory5', 'usercategory6', 'usercategory7', 'usercategory8', 'usercategory9', 'usercategory10',
  172. 'usercategory11', 'usercategory12', 'usercategory45', 'usercategory49', 'usercategory85','usercategory555',
  173. 'todyviewcount',
  174. 'day5returncount_1_stage', 'day5returncount_2_stage', 'day5returncount_3_stage', 'day5returncount_4_stage',
  175. 'stage_one_retrn', 'stage_two_retrn', 'stage_three_retrn', 'stage_four_retrn', 'all_return_day1_return_count',
  176. 'all_return_day3_return_count', 'all_return_day7_return_count', 'all_return_day14_return_count',
  177. 'three_return_day1_return_count', 'three_return_day3_return_count', 'three_return_day7_return_count',
  178. 'three_return_day14_return_count', 'four_up_return_day1_return_count', 'four_up_return_day3_return_count',
  179. 'four_up_return_day7_return_count', 'four_up_return_day14_return_count', 'one_return_day1_return_count',
  180. 'one_return_day3_return_count', 'one_return_day7_return_count', 'one_return_day14_return_count',
  181. 'four_up_return_div_three_return_day1', 'four_up_return_div_three_return_day3',
  182. 'four_up_return_div_three_return_day7', 'four_up_return_div_three_return_day14',
  183. 'all_return_day1_view_day1_return_count', 'all_return_day3_view_day3_return_count',
  184. 'all_return_day7_view_day7_return_count', 'all_return_day14_view_day14_return_count',
  185. 'three_return_day1_view_day1_return_count', 'three_return_day3_view_day3_return_count',
  186. 'three_return_day7_view_day7_return_count', 'three_return_day14_view_day14_return_count',
  187. 'four_up_return_day1_view_day1_return_count', 'four_up_return_day3_view_day3_return_count',
  188. 'four_up_return_day7_view_day7_return_count', 'four_up_return_day14_view_day14_return_count',
  189. 'one_return_day1_view_day1_return_count', 'one_return_day3_view_day3_return_count',
  190. 'one_return_day7_view_day7_return_count', 'one_return_day14_view_day14_return_count',
  191. 'all_return_day1_on_day1_return_count', 'all_return_day3_on_day1_return_count',
  192. 'all_return_day7_on_day1_return_count', 'all_return_day14_on_day1_return_count',
  193. 'four_up_return_day1_view_day1_return_div_three_d1', 'four_up_return_day3_view_day3_return_div_three_d3',
  194. 'four_up_return_day7_view_day7_return_div_three_d7', 'four_up_return_day14_view_day14_return_div_three_d14',
  195. 'day1ctr', 'day3ctr', 'day7ctr', 'day14ctr', 'day30ctr', 'day60ctr', 'day1sov', 'day3sov', 'day7sov',
  196. 'day14sov', 'day30sov', 'day60sov', 'day1rov', 'day3rov', 'day7rov', 'day14rov', 'day1soc', 'day3soc',
  197. 'day7soc', 'day14soc', 'day30soc', 'day60soc', 'day1roc', 'day3roc', 'day7roc', 'day14roc', 'oneday_day1rov',
  198. 'oneday_day3rov', 'oneday_day7rov', 'oneday_day14rov',
  199. 'day60playcount_divide_day30playcount', 'day60playcount_dif_day30playcount',
  200. 'day60returncount_divide_day30returncount', 'day60returncount_dif_day30returncount',
  201. 'day60sharecount_divide_day30sharecount', 'day60sharecount_dif_day30sharecount',
  202. 'day60viewcount_divide_day30viewcount', 'day60viewcount_dif_day30viewcount',
  203. 'day30playcount_divide_day7playcount', 'day30playcount_dif_day7playcount',
  204. 'day30returncount_divide_day7returncount', 'day30returncount_dif_day7returncount',
  205. 'day30sharecount_divide_day7sharecount', 'day30sharecount_dif_day7sharecount',
  206. 'day30viewcount_divide_day7viewcount', 'day30viewcount_dif_day7viewcount',
  207. 'day7playcount_divide_day3playcount', 'day7playcount_dif_day3playcount',
  208. 'day7returncount_divide_day3returncount', 'day7returncount_dif_day3returncount',
  209. 'day7sharecount_divide_day3sharecount', 'day7sharecount_dif_day3sharecount',
  210. 'day7viewcount_divide_day3viewcount', 'day7viewcount_dif_day3viewcount', 'day3playcount_divide_day1playcount',
  211. 'day3playcount_dif_day1playcount', 'day3returncount_divide_day1returncount',
  212. 'day3returncount_dif_day1returncount', 'day3sharecount_divide_day1sharecount',
  213. 'day3sharecount_dif_day1sharecount', 'day3viewcount_divide_day1viewcount',
  214. 'day3viewcount_dif_day1viewcount']
  215. def filter_recent_features():
  216. print(len(features))
  217. res = [f for f in features if (f.find('30') == -1 and f.find('60') == -1)]
  218. print(len(res))
  219. return res
  220. def cal_feature(df):
  221. start = time.time()
  222. for i in range(len(root_page_1day)):
  223. newfeat_div = root_page_60day[i] + '_divide_' + root_page_30day[i]
  224. # df[newfeat_div] = df.apply(lambda s: s[root_page_30day[i]] / s[root_page_60day[i]]\
  225. # if s[root_page_60day[i]] != 0 else 0, axis=1)
  226. df[newfeat_div] = df[root_page_30day[i]]/ df[root_page_60day[i]]
  227. newfeat_diff = root_page_60day[i] + '_dif_' + root_page_30day[i]
  228. # df[newfeat_diff] = df.apply(lambda s: s[root_page_60day[i]]-s[root_page_30day[i]],\
  229. # axis=1)
  230. df[newfeat_diff] = df[root_page_60day[i]] - df[root_page_30day[i]]
  231. end = time.time()
  232. running_time = end-start
  233. print('stage 1: time cost : %.5f sec' %running_time)
  234. start = time.time()
  235. for i in range(len(root_page_1day)):
  236. newfeat_div = root_page_30day[i] + '_divide_' + root_page_7day[i]
  237. # df[newfeat_div] = df.apply(lambda s: s[root_page_7day[i]] / s[root_page_30day[i]]\
  238. # if s[root_page_30day[i]] != 0 else 0, axis=1)
  239. df[newfeat_div] = df[root_page_7day[i]]/df[root_page_30day[i]]
  240. newfeat_diff = root_page_30day[i] + '_dif_' + root_page_7day[i]
  241. # df[newfeat_diff] = df.apply(lambda s: s[root_page_7day[i]] - s[root_page_3day[i]],\
  242. # axis=1)
  243. df[newfeat_diff] = df[root_page_7day[i]] - df[root_page_3day[i]]
  244. end = time.time()
  245. running_time = end-start
  246. print('stage 2: time cost : %.5f sec' %running_time)
  247. start = time.time()
  248. for i in range(len(root_page_1day)):
  249. newfeat_div = root_page_7day[i] + '_divide_' + root_page_3day[i]
  250. # df[newfeat_div] = df.apply(lambda s: s[root_page_3day[i]] / s[root_page_7day[i]]\
  251. # if s[root_page_7day[i]] != 0 else 0, axis=1)
  252. df[newfeat_div] = df[root_page_3day[i]]/df[root_page_7day[i]]
  253. newfeat_diff = root_page_7day[i] + '_dif_' + root_page_3day[i]
  254. # df[newfeat_diff] = df.apply(lambda s: s[root_page_7day[i]] - s[root_page_3day[i]],\
  255. # axis=1)
  256. df[newfeat_diff] = df[root_page_7day[i]] - df[root_page_3day[i]]
  257. end = time.time()
  258. running_time = end-start
  259. print('stage 3: time cost : %.5f sec' %running_time)
  260. start = time.time()
  261. for i in range(len(root_page_1day)):
  262. newfeat_div = root_page_3day[i] + '_divide_' + root_page_1day[i]
  263. # df[newfeat_div] = df.apply(lambda s: s[root_page_1day[i]] / s[root_page_3day[i]]\
  264. # if s[root_page_3day[i]] != 0 else 0, axis=1)
  265. df[newfeat_div] = df[root_page_1day[i]] / df[root_page_3day[i]]
  266. newfeat_diff = root_page_3day[i] + '_dif_' + root_page_1day[i]
  267. # df[newfeat_diff] = df.apply(lambda s: s[root_page_3day[i]] - s[root_page_1day[i]],\
  268. # axis=1)
  269. df[newfeat_diff] = df[root_page_3day[i]] - df[root_page_1day[i]]
  270. end = time.time()
  271. running_time = end-start
  272. print('stage 4: time cost : %.5f sec' %running_time)
  273. df = df.replace([np.inf, -np.inf], np.nan)
  274. df = df.fillna(0)
  275. return df