process_feature.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. import time
  2. import numpy as np
  3. add_feature = [
  4. 'all_return_day1_return_count', # -- 1/3/7/14日内总回流 #12
  5. 'all_return_day3_return_count',
  6. 'all_return_day7_return_count',
  7. 'all_return_day14_return_count',
  8. 'three_return_day1_return_count', # -- 1/3/7/14日内前三层回流 #14
  9. 'three_return_day3_return_count',
  10. 'three_return_day7_return_count',
  11. 'three_return_day14_return_count',
  12. 'four_up_return_day1_return_count', # -- 1/3/7/14日内四+层回流 #15
  13. 'four_up_return_day3_return_count',
  14. 'four_up_return_day7_return_count',
  15. 'four_up_return_day14_return_count',
  16. 'one_return_day1_return_count', # -- 1/3/7/14日内一层回流 #13
  17. 'one_return_day3_return_count',
  18. 'one_return_day7_return_count',
  19. 'one_return_day14_return_count',
  20. 'four_up_return_div_three_return_day1', # -- 1/3/7/14日内四+层回流/前三层回流 #23
  21. 'four_up_return_div_three_return_day3',
  22. 'four_up_return_div_three_return_day7',
  23. 'four_up_return_div_three_return_day14',
  24. 'all_return_day1_view_day1_return_count', # -- 1/3/7/14日内曝光在1/3/7/14日内回流 #8
  25. 'all_return_day3_view_day3_return_count',
  26. 'all_return_day7_view_day7_return_count',
  27. 'all_return_day14_view_day14_return_count',
  28. 'three_return_day1_view_day1_return_count', # -- 1/3/7/14日内曝光在1/3/7/14日内前三层回流 #10
  29. 'three_return_day3_view_day3_return_count',
  30. 'three_return_day7_view_day7_return_count',
  31. 'three_return_day14_view_day14_return_count',
  32. 'four_up_return_day1_view_day1_return_count', # -- 1/3/7/14日内曝光在1/3/7/14日内四+层回流 # 11
  33. 'four_up_return_day3_view_day3_return_count',
  34. 'four_up_return_day7_view_day7_return_count',
  35. 'four_up_return_day14_view_day14_return_count',
  36. 'one_return_day1_view_day1_return_count', ##-- 1/3/7/14日内曝光在1/3/7/14日内一层回流 #9
  37. 'one_return_day3_view_day3_return_count',
  38. 'one_return_day7_view_day7_return_count',
  39. 'one_return_day14_view_day14_return_count',
  40. 'all_return_day1_on_day1_return_count', # 前day1+1 / day1+3/day1+7/day1+14 到前 day1+1日内曝光在 day1的总回流 #16
  41. 'all_return_day3_on_day1_return_count',
  42. 'all_return_day7_on_day1_return_count',
  43. 'all_return_day14_on_day1_return_count',
  44. 'four_up_return_day1_view_day1_return_div_three_d1', # -- 1/3/7/14日内曝光在1/3/7/14日内四+层回流/前三层回流 #22
  45. 'four_up_return_day3_view_day3_return_div_three_d3',
  46. 'four_up_return_day7_view_day7_return_div_three_d7',
  47. 'four_up_return_day14_view_day14_return_div_three_d14',
  48. 'day1ctr', # -- 1/3/7/14/30/60日内播放/曝光 #17
  49. 'day3ctr',
  50. 'day7ctr',
  51. 'day14ctr',
  52. 'day30ctr',
  53. 'day60ctr',
  54. 'day1sov', # -- 1/3/7/14/30/60日内分享/曝光 #18
  55. 'day3sov',
  56. 'day7sov',
  57. 'day14sov',
  58. 'day30sov',
  59. 'day60sov',
  60. 'day1rov', # -- 1/3/7/14日内曝光的回流/曝光 #19
  61. 'day3rov',
  62. 'day7rov',
  63. 'day14rov',
  64. 'day1soc', # -- 1/3/7/14/30/60日内分享/播放 #20
  65. 'day3soc',
  66. 'day7soc',
  67. 'day14soc',
  68. 'day30soc',
  69. 'day60soc',
  70. 'day1roc', # -- 1/3/7/14日内曝光的回流/播放 #21
  71. 'day3roc',
  72. 'day7roc',
  73. 'day14roc',
  74. 'oneday_day1rov', # -- 1/3/7/14日内曝光在今日的回流/ 1/3/7/14日内曝光 #24
  75. 'oneday_day3rov',
  76. 'oneday_day7rov',
  77. 'oneday_day14rov',
  78. 'futre7dayreturn'
  79. ,'todyviewcount_rank'
  80. ,'day1viewcount_rank'
  81. ]
  82. featurename = [
  83. 'dt',
  84. 'videoid',
  85. 'day1playcount',
  86. 'day1returncount',
  87. 'day1sharecount',
  88. 'day1viewcount',
  89. 'day14playcount',
  90. 'day14returncount',
  91. 'day14sharecount',
  92. 'day14viewcount',
  93. 'day30playcount',
  94. 'day30returncount',
  95. 'day30sharecount',
  96. 'day30viewcount',
  97. 'day3playcount',
  98. 'day3returncount',
  99. 'day3sharecount',
  100. 'day3viewcount',
  101. 'day60playcount',
  102. 'day60returncount',
  103. 'day60sharecount',
  104. 'day60viewcount',
  105. 'day7playcount',
  106. 'day7returncount',
  107. 'day7sharecount',
  108. 'day7viewcount',
  109. 'videocategory11',
  110. 'videocategory12',
  111. 'videocategory45',
  112. 'videocategory49',
  113. 'videocategory1',
  114. 'videocategory2',
  115. 'videocategory3',
  116. 'videocategory4',
  117. 'videocategory5',
  118. 'videocategory6',
  119. 'videocategory7',
  120. 'videocategory8',
  121. 'videocategory9',
  122. 'videocategory85',
  123. 'videocategory10',
  124. 'videocategory555',
  125. 'usercategory1',
  126. 'usercategory2',
  127. 'usercategory3',
  128. 'usercategory4',
  129. 'usercategory5',
  130. 'usercategory6',
  131. 'usercategory7',
  132. 'usercategory8',
  133. 'usercategory9',
  134. 'usercategory10',
  135. 'usercategory11',
  136. 'usercategory12',
  137. 'usercategory45',
  138. 'usercategory49',
  139. 'usercategory85',
  140. 'usercategory555',
  141. 'todyviewcount',
  142. 'day5returncount_1_stage',
  143. 'day5returncount_2_stage',
  144. 'day5returncount_3_stage',
  145. 'day5returncount_4_stage',
  146. 'stage_one_retrn',
  147. 'stage_two_retrn',
  148. 'stage_three_retrn',
  149. 'stage_four_retrn']
  150. words = ['videotags','words_without_tags']
  151. featurename = featurename + add_feature + words
  152. # 首页特征
  153. root_page_1day = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount']
  154. root_page_3day = ['day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount']
  155. root_page_7day = ['day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount']
  156. root_page_14day = ['day14playcount', 'day14returncount', 'day14sharecount', 'day14viewcount']
  157. root_page_30day = ['day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount']
  158. root_page_60day = ['day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount']
  159. return_5day = ['day5returncount_1_stage', 'day5returncount_2_stage', 'day5returncount_3_stage',
  160. 'day5returncount_4_stage']
  161. cate_feat = ['videocategory1', 'videocategory10', 'videocategory11', 'videocategory12',
  162. 'videocategory2', 'videocategory3', 'videocategory4', 'videocategory45',
  163. 'videocategory49', 'videocategory5', 'videocategory6',
  164. 'videocategory7', 'videocategory8', 'videocategory85', 'videocategory9', 'videocategory555']
  165. one_hot_feature = ['videotags','words_without_tags','videoid']
  166. #
  167. def cal_feature(df):
  168. start = time.time()
  169. for i in range(len(root_page_1day)):
  170. newfeat_div = root_page_60day[i] + '_divide_' + root_page_30day[i]
  171. # df[newfeat_div] = df.apply(lambda s: s[root_page_30day[i]] / s[root_page_60day[i]]\
  172. # if s[root_page_60day[i]] != 0 else 0, axis=1)
  173. df[newfeat_div] = df[root_page_30day[i]]/ df[root_page_60day[i]]
  174. newfeat_diff = root_page_60day[i] + '_dif_' + root_page_30day[i]
  175. # df[newfeat_diff] = df.apply(lambda s: s[root_page_60day[i]]-s[root_page_30day[i]],\
  176. # axis=1)
  177. df[newfeat_diff] = df[root_page_60day[i]] - df[root_page_30day[i]]
  178. end = time.time()
  179. running_time = end-start
  180. print('stage 1: time cost : %.5f sec' %running_time)
  181. start = time.time()
  182. for i in range(len(root_page_1day)):
  183. newfeat_div = root_page_30day[i] + '_divide_' + root_page_7day[i]
  184. # df[newfeat_div] = df.apply(lambda s: s[root_page_7day[i]] / s[root_page_30day[i]]\
  185. # if s[root_page_30day[i]] != 0 else 0, axis=1)
  186. df[newfeat_div] = df[root_page_7day[i]]/df[root_page_30day[i]]
  187. newfeat_diff = root_page_30day[i] + '_dif_' + root_page_7day[i]
  188. # df[newfeat_diff] = df.apply(lambda s: s[root_page_7day[i]] - s[root_page_3day[i]],\
  189. # axis=1)
  190. df[newfeat_diff] = df[root_page_7day[i]] - df[root_page_3day[i]]
  191. end = time.time()
  192. running_time = end-start
  193. print('stage 2: time cost : %.5f sec' %running_time)
  194. start = time.time()
  195. for i in range(len(root_page_1day)):
  196. newfeat_div = root_page_7day[i] + '_divide_' + root_page_3day[i]
  197. # df[newfeat_div] = df.apply(lambda s: s[root_page_3day[i]] / s[root_page_7day[i]]\
  198. # if s[root_page_7day[i]] != 0 else 0, axis=1)
  199. df[newfeat_div] = df[root_page_3day[i]]/df[root_page_7day[i]]
  200. newfeat_diff = root_page_7day[i] + '_dif_' + root_page_3day[i]
  201. # df[newfeat_diff] = df.apply(lambda s: s[root_page_7day[i]] - s[root_page_3day[i]],\
  202. # axis=1)
  203. df[newfeat_diff] = df[root_page_7day[i]] - df[root_page_3day[i]]
  204. end = time.time()
  205. running_time = end-start
  206. print('stage 3: time cost : %.5f sec' %running_time)
  207. start = time.time()
  208. for i in range(len(root_page_1day)):
  209. newfeat_div = root_page_3day[i] + '_divide_' + root_page_1day[i]
  210. # df[newfeat_div] = df.apply(lambda s: s[root_page_1day[i]] / s[root_page_3day[i]]\
  211. # if s[root_page_3day[i]] != 0 else 0, axis=1)
  212. df[newfeat_div] = df[root_page_1day[i]] / df[root_page_3day[i]]
  213. newfeat_diff = root_page_3day[i] + '_dif_' + root_page_1day[i]
  214. # df[newfeat_diff] = df.apply(lambda s: s[root_page_3day[i]] - s[root_page_1day[i]],\
  215. # axis=1)
  216. df[newfeat_diff] = df[root_page_3day[i]] - df[root_page_1day[i]]
  217. end = time.time()
  218. running_time = end-start
  219. print('stage 4: time cost : %.5f sec' %running_time)
  220. df = df.replace([np.inf, -np.inf], np.nan)
  221. df = df.fillna(0)
  222. return df