process_feature.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  # Engineered feature columns appended to the raw feature list below.
  # The "#NN" tags are carried over verbatim from the original comments; their
  # meaning is not defined in this file.
  # NOTE(review): spellings such as 'futre7dayreturn' and 'todyviewcount_rank'
  # are kept exactly as-is; presumably they match upstream column names --
  # confirm before renaming.
  add_feature = [
      # Total returns within the last 1/3/7/14 days (#12).
      'all_return_day1_return_count',
      'all_return_day3_return_count',
      'all_return_day7_return_count',
      'all_return_day14_return_count',
      # Returns from the first three layers within the last 1/3/7/14 days (#14).
      'three_return_day1_return_count',
      'three_return_day3_return_count',
      'three_return_day7_return_count',
      'three_return_day14_return_count',
      # Returns from layer four and deeper within the last 1/3/7/14 days (#15).
      'four_up_return_day1_return_count',
      'four_up_return_day3_return_count',
      'four_up_return_day7_return_count',
      'four_up_return_day14_return_count',
      # First-layer returns within the last 1/3/7/14 days (#13).
      'one_return_day1_return_count',
      'one_return_day3_return_count',
      'one_return_day7_return_count',
      'one_return_day14_return_count',
      # Layer-4+ returns / first-three-layer returns, last 1/3/7/14 days (#23).
      'four_up_return_div_three_return_day1',
      'four_up_return_div_three_return_day3',
      'four_up_return_div_three_return_day7',
      'four_up_return_div_three_return_day14',
      # Returns within 1/3/7/14 days from exposures within 1/3/7/14 days (#8).
      'all_return_day1_view_day1_return_count',
      'all_return_day3_view_day3_return_count',
      'all_return_day7_view_day7_return_count',
      'all_return_day14_view_day14_return_count',
      # Same as above, first-three-layer returns only (#10).
      'three_return_day1_view_day1_return_count',
      'three_return_day3_view_day3_return_count',
      'three_return_day7_view_day7_return_count',
      'three_return_day14_view_day14_return_count',
      # Same as above, layer-4+ returns only (#11).
      'four_up_return_day1_view_day1_return_count',
      'four_up_return_day3_view_day3_return_count',
      'four_up_return_day7_view_day7_return_count',
      'four_up_return_day14_view_day14_return_count',
      # Same as above, first-layer returns only (#9).
      'one_return_day1_view_day1_return_count',
      'one_return_day3_view_day3_return_count',
      'one_return_day7_view_day7_return_count',
      'one_return_day14_view_day14_return_count',
      # Total returns on day1 from exposures in the window running from
      # (day1+1 / day1+3 / day1+7 / day1+14) days ago up to (day1+1) days
      # ago (#16).
      'all_return_day1_on_day1_return_count',
      'all_return_day3_on_day1_return_count',
      'all_return_day7_on_day1_return_count',
      'all_return_day14_on_day1_return_count',
      # For exposures within 1/3/7/14 days returning within 1/3/7/14 days:
      # layer-4+ returns / first-three-layer returns (#22).
      'four_up_return_day1_view_day1_return_div_three_d1',
      'four_up_return_day3_view_day3_return_div_three_d3',
      'four_up_return_day7_view_day7_return_div_three_d7',
      'four_up_return_day14_view_day14_return_div_three_d14',
      # Plays / exposures within the last 1/3/7/14/30/60 days (#17).
      'day1ctr',
      'day3ctr',
      'day7ctr',
      'day14ctr',
      'day30ctr',
      'day60ctr',
      # Shares / exposures within the last 1/3/7/14/30/60 days (#18).
      'day1sov',
      'day3sov',
      'day7sov',
      'day14sov',
      'day30sov',
      'day60sov',
      # Returns from exposures within 1/3/7/14 days / exposures (#19).
      'day1rov',
      'day3rov',
      'day7rov',
      'day14rov',
      # Shares / plays within the last 1/3/7/14/30/60 days (#20).
      'day1soc',
      'day3soc',
      'day7soc',
      'day14soc',
      'day30soc',
      'day60soc',
      # Returns from exposures within 1/3/7/14 days / plays (#21).
      'day1roc',
      'day3roc',
      'day7roc',
      'day14roc',
      # Today's returns from exposures within 1/3/7/14 days, divided by the
      # exposures within 1/3/7/14 days (#24).
      'oneday_day1rov',
      'oneday_day3rov',
      'oneday_day7rov',
      'oneday_day14rov',
      'futre7dayreturn',
      'todyviewcount_rank',
      'day1viewcount_rank',
  ]

  # Raw input columns, in the order given in the original list.
  featurename = [
      'dt',
      'videoid',
      'day1playcount',
      'day1returncount',
      'day1sharecount',
      'day1viewcount',
      'day14playcount',
      'day14returncount',
      'day14sharecount',
      'day14viewcount',
      'day30playcount',
      'day30returncount',
      'day30sharecount',
      'day30viewcount',
      'day3playcount',
      'day3returncount',
      'day3sharecount',
      'day3viewcount',
      'day60playcount',
      'day60returncount',
      'day60sharecount',
      'day60viewcount',
      'day7playcount',
      'day7returncount',
      'day7sharecount',
      'day7viewcount',
      'videocategory11',
      'videocategory12',
      'videocategory45',
      'videocategory49',
      'videocategory1',
      'videocategory2',
      'videocategory3',
      'videocategory4',
      'videocategory5',
      'videocategory6',
      'videocategory7',
      'videocategory8',
      'videocategory9',
      'videocategory85',
      'videocategory10',
      'videocategory555',
      'usercategory1',
      'usercategory2',
      'usercategory3',
      'usercategory4',
      'usercategory5',
      'usercategory6',
      'usercategory7',
      'usercategory8',
      'usercategory9',
      'usercategory10',
      'usercategory11',
      'usercategory12',
      'usercategory45',
      'usercategory49',
      'usercategory85',
      'usercategory555',
      'todyviewcount',
      'day5returncount_1_stage',
      'day5returncount_2_stage',
      'day5returncount_3_stage',
      'day5returncount_4_stage',
      'stage_one_retrn',
      'stage_two_retrn',
      'stage_three_retrn',
      'stage_four_retrn',
  ]

  # Text columns, appended last.
  words = ['videotags', 'words_without_tags']

  # Full column list: raw columns, then engineered features, then text columns.
  featurename = featurename + add_feature + words
  # Home-page features: play / return / share / view counts per time window.
  # cal_feature() pairs these lists up by position, so every list must keep the
  # same metric order.
  root_page_1day = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount']
  root_page_3day = ['day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount']
  root_page_7day = ['day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount']
  root_page_14day = ['day14playcount', 'day14returncount', 'day14sharecount', 'day14viewcount']
  root_page_30day = ['day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount']
  root_page_60day = ['day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount']
  return_5day = ['day5returncount_1_stage', 'day5returncount_2_stage', 'day5returncount_3_stage',
                 'day5returncount_4_stage']
  cate_feat = ['videocategory1', 'videocategory10', 'videocategory11', 'videocategory12',
               'videocategory2', 'videocategory3', 'videocategory4', 'videocategory45',
               'videocategory49', 'videocategory5', 'videocategory6',
               'videocategory7', 'videocategory8', 'videocategory85', 'videocategory9',
               'videocategory555']
  one_hot_feature = ['videotags', 'words_without_tags', 'videoid']


  def cal_feature(df):
      """Add ratio and difference features between adjacent time windows.

      For each window pair (60/30, 30/7, 7/3, 3/1 days) and each metric in the
      ``root_page_*`` lists, two columns are added to ``df``:

      * ``<long>_divide_<short>`` = short-window value / long-window value
      * ``<long>_dif_<short>``    = long-window value - short-window value

      The 30-vs-7-day difference used to be computed as 7-day minus 3-day,
      which duplicated the 7-vs-3 feature; it is now 30-day minus 7-day, in
      line with its column name and with the other three window pairs.

      Args:
          df: Table holding the ``root_page_*`` columns for the 1/3/7/30/60-day
              windows (presumably a pandas DataFrame; it must support column
              indexing, ``replace`` and ``fillna``). The new columns are
              written onto this object in place.

      Returns:
          The result of ``df.replace(...).fillna(0)``: the whole table,
          including the pre-existing columns, with +/-inf and NaN replaced
          by 0.
      """
      # No import of these names is visible in this part of the file, so they
      # are imported locally to keep the function self-contained.
      import time

      import numpy as np

      # (long window, short window) per stage, in the original column order.
      window_pairs = [
          (root_page_60day, root_page_30day),
          (root_page_30day, root_page_7day),
          (root_page_7day, root_page_3day),
          (root_page_3day, root_page_1day),
      ]

      for stage_no, (long_cols, short_cols) in enumerate(window_pairs, start=1):
          start = time.time()
          for long_col, short_col in zip(long_cols, short_cols):
              # A zero denominator yields inf/NaN here; both are zeroed below.
              df[long_col + '_divide_' + short_col] = df[short_col] / df[long_col]
              df[long_col + '_dif_' + short_col] = df[long_col] - df[short_col]
          running_time = time.time() - start
          print('stage %d: time cost : %.5f sec' % (stage_no, running_time))

      # Division by zero leaves +/-inf (x/0) or NaN (0/0); map both to 0.
      df = df.replace([np.inf, -np.inf], np.nan)
      df = df.fillna(0)
      return df