# Feature-name registries and windowed ratio/difference feature engineering.
import time

import numpy as np
# Names of the derived statistics columns. Each row below covers one feature
# group over its look-back windows; the "#N" tags are the item numbers carried
# over from the original comments.
add_feature = [
    # Total return-flow within the last 1/3/7/14 days (#12).
    'all_return_day1_return_count', 'all_return_day3_return_count',
    'all_return_day7_return_count', 'all_return_day14_return_count',
    # Return-flow from the first three levels within 1/3/7/14 days (#14).
    'three_return_day1_return_count', 'three_return_day3_return_count',
    'three_return_day7_return_count', 'three_return_day14_return_count',
    # Return-flow from level four and above within 1/3/7/14 days (#15).
    'four_up_return_day1_return_count', 'four_up_return_day3_return_count',
    'four_up_return_day7_return_count', 'four_up_return_day14_return_count',
    # Return-flow from the first level within 1/3/7/14 days (#13).
    'one_return_day1_return_count', 'one_return_day3_return_count',
    'one_return_day7_return_count', 'one_return_day14_return_count',
    # Level-4+ return-flow divided by first-three-level return-flow,
    # within 1/3/7/14 days (#23).
    'four_up_return_div_three_return_day1', 'four_up_return_div_three_return_day3',
    'four_up_return_div_three_return_day7', 'four_up_return_div_three_return_day14',
    # Return-flow within 1/3/7/14 days from impressions made within the same
    # 1/3/7/14 days (#8).
    'all_return_day1_view_day1_return_count', 'all_return_day3_view_day3_return_count',
    'all_return_day7_view_day7_return_count', 'all_return_day14_view_day14_return_count',
    # Same, restricted to first-three-level return-flow (#10).
    'three_return_day1_view_day1_return_count', 'three_return_day3_view_day3_return_count',
    'three_return_day7_view_day7_return_count', 'three_return_day14_view_day14_return_count',
    # Same, restricted to level-4+ return-flow (#11).
    'four_up_return_day1_view_day1_return_count', 'four_up_return_day3_view_day3_return_count',
    'four_up_return_day7_view_day7_return_count', 'four_up_return_day14_view_day14_return_count',
    # Same, restricted to first-level return-flow (#9).
    'one_return_day1_view_day1_return_count', 'one_return_day3_view_day3_return_count',
    'one_return_day7_view_day7_return_count', 'one_return_day14_view_day14_return_count',
    # Total return-flow on day1 from impressions made in the window running
    # from (day1 + 1/3/7/14) days back up to (day1 + 1) days back (#16).
    # NOTE(review): window wording translated from the original comment;
    # confirm the exact bounds against the upstream SQL.
    'all_return_day1_on_day1_return_count', 'all_return_day3_on_day1_return_count',
    'all_return_day7_on_day1_return_count', 'all_return_day14_on_day1_return_count',
    # For impressions within 1/3/7/14 days: level-4+ return-flow divided by
    # first-three-level return-flow (#22).
    'four_up_return_day1_view_day1_return_div_three_d1',
    'four_up_return_day3_view_day3_return_div_three_d3',
    'four_up_return_day7_view_day7_return_div_three_d7',
    'four_up_return_day14_view_day14_return_div_three_d14',
    # Plays / impressions within 1/3/7/14/30/60 days (#17).
    'day1ctr', 'day3ctr', 'day7ctr', 'day14ctr', 'day30ctr', 'day60ctr',
    # Shares / impressions within 1/3/7/14/30/60 days (#18).
    'day1sov', 'day3sov', 'day7sov', 'day14sov', 'day30sov', 'day60sov',
    # Return-flow of impressions / impressions within 1/3/7/14 days (#19).
    'day1rov', 'day3rov', 'day7rov', 'day14rov',
    # Shares / plays within 1/3/7/14/30/60 days (#20).
    'day1soc', 'day3soc', 'day7soc', 'day14soc', 'day30soc', 'day60soc',
    # Return-flow of impressions / plays within 1/3/7/14 days (#21).
    'day1roc', 'day3roc', 'day7roc', 'day14roc',
    # Today's return-flow from impressions within 1/3/7/14 days, divided by
    # the impressions within those 1/3/7/14 days (#24).
    'oneday_day1rov', 'oneday_day3rov', 'oneday_day7rov', 'oneday_day14rov',
    # NOTE(review): 'futre' / 'tody' look like misspellings, but these strings
    # are runtime identifiers, so they are kept exactly as they were.
    'futre7dayreturn',
    'todyviewcount_rank',
    'day1viewcount_rank',
]
# Base column names, in their original order.
featurename = [
    'dt', 'videoid',
    # Play / return / share / view counts per look-back window. The window
    # order (1, 14, 30, 3, 60, 7) is deliberate: it is kept exactly as is.
    'day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount',
    'day14playcount', 'day14returncount', 'day14sharecount', 'day14viewcount',
    'day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount',
    'day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount',
    'day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount',
    'day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount',
    # Video category columns.
    'videocategory11', 'videocategory12', 'videocategory45', 'videocategory49',
    'videocategory1', 'videocategory2', 'videocategory3', 'videocategory4',
    'videocategory5', 'videocategory6', 'videocategory7', 'videocategory8',
    'videocategory9', 'videocategory85', 'videocategory10', 'videocategory555',
    # User category columns.
    'usercategory1', 'usercategory2', 'usercategory3', 'usercategory4',
    'usercategory5', 'usercategory6', 'usercategory7', 'usercategory8',
    'usercategory9', 'usercategory10', 'usercategory11', 'usercategory12',
    'usercategory45', 'usercategory49', 'usercategory85', 'usercategory555',
    'todyviewcount',
    # Five-day return counts and return values, one per stage (1-4).
    'day5returncount_1_stage', 'day5returncount_2_stage',
    'day5returncount_3_stage', 'day5returncount_4_stage',
    'stage_one_retrn', 'stage_two_retrn', 'stage_three_retrn', 'stage_four_retrn',
]
# Text columns; they go last, so the full column order is
# base columns, then derived statistics, then text.
words = ['videotags', 'words_without_tags']
featurename = featurename + add_feature + words
# Home-page features: the same four count metrics for every look-back window.
_ROOT_PAGE_METRICS = ('playcount', 'returncount', 'sharecount', 'viewcount')


def _root_page_columns(days):
    """Return the four count-column names for a ``days``-day window."""
    return ['day%d%s' % (days, metric) for metric in _ROOT_PAGE_METRICS]


root_page_1day = _root_page_columns(1)
root_page_3day = _root_page_columns(3)
root_page_7day = _root_page_columns(7)
root_page_14day = _root_page_columns(14)
root_page_30day = _root_page_columns(30)
root_page_60day = _root_page_columns(60)

# Five-day return counts, one column per stage.
return_5day = [
    'day5returncount_1_stage',
    'day5returncount_2_stage',
    'day5returncount_3_stage',
    'day5returncount_4_stage',
]

# Video category columns (order kept exactly as in the original list).
cate_feat = [
    'videocategory1', 'videocategory10', 'videocategory11', 'videocategory12',
    'videocategory2', 'videocategory3', 'videocategory4', 'videocategory45',
    'videocategory49', 'videocategory5', 'videocategory6', 'videocategory7',
    'videocategory8', 'videocategory85', 'videocategory9', 'videocategory555',
]

# Columns listed for one-hot handling (as the variable name indicates).
one_hot_feature = ['videotags', 'words_without_tags', 'videoid']
# Feature columns, in fixed order: windowed counts, user categories, stage
# returns, derived statistics, then the window-to-window ratio/difference
# columns that cal_feature() writes.
features = [
    # Play / return / share / view counts per window (no 14-day window here).
    'day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount',
    'day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount',
    'day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount',
    'day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount',
    'day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount',
    # User category columns.
    'usercategory1', 'usercategory2', 'usercategory3', 'usercategory4',
    'usercategory5', 'usercategory6', 'usercategory7', 'usercategory8',
    'usercategory9', 'usercategory10', 'usercategory11', 'usercategory12',
    'usercategory45', 'usercategory49', 'usercategory85', 'usercategory555',
    'todyviewcount',
    # Five-day return counts and return values per stage.
    'day5returncount_1_stage', 'day5returncount_2_stage',
    'day5returncount_3_stage', 'day5returncount_4_stage',
    'stage_one_retrn', 'stage_two_retrn', 'stage_three_retrn', 'stage_four_retrn',
    # Return-flow counts by level, within 1/3/7/14 days.
    'all_return_day1_return_count', 'all_return_day3_return_count',
    'all_return_day7_return_count', 'all_return_day14_return_count',
    'three_return_day1_return_count', 'three_return_day3_return_count',
    'three_return_day7_return_count', 'three_return_day14_return_count',
    'four_up_return_day1_return_count', 'four_up_return_day3_return_count',
    'four_up_return_day7_return_count', 'four_up_return_day14_return_count',
    'one_return_day1_return_count', 'one_return_day3_return_count',
    'one_return_day7_return_count', 'one_return_day14_return_count',
    'four_up_return_div_three_return_day1', 'four_up_return_div_three_return_day3',
    'four_up_return_div_three_return_day7', 'four_up_return_div_three_return_day14',
    # Return-flow counts by level for impressions within the same window.
    'all_return_day1_view_day1_return_count', 'all_return_day3_view_day3_return_count',
    'all_return_day7_view_day7_return_count', 'all_return_day14_view_day14_return_count',
    'three_return_day1_view_day1_return_count', 'three_return_day3_view_day3_return_count',
    'three_return_day7_view_day7_return_count', 'three_return_day14_view_day14_return_count',
    'four_up_return_day1_view_day1_return_count', 'four_up_return_day3_view_day3_return_count',
    'four_up_return_day7_view_day7_return_count', 'four_up_return_day14_view_day14_return_count',
    'one_return_day1_view_day1_return_count', 'one_return_day3_view_day3_return_count',
    'one_return_day7_view_day7_return_count', 'one_return_day14_view_day14_return_count',
    'all_return_day1_on_day1_return_count', 'all_return_day3_on_day1_return_count',
    'all_return_day7_on_day1_return_count', 'all_return_day14_on_day1_return_count',
    'four_up_return_day1_view_day1_return_div_three_d1',
    'four_up_return_day3_view_day3_return_div_three_d3',
    'four_up_return_day7_view_day7_return_div_three_d7',
    'four_up_return_day14_view_day14_return_div_three_d14',
    # Rate features.
    'day1ctr', 'day3ctr', 'day7ctr', 'day14ctr', 'day30ctr', 'day60ctr',
    'day1sov', 'day3sov', 'day7sov', 'day14sov', 'day30sov', 'day60sov',
    'day1rov', 'day3rov', 'day7rov', 'day14rov',
    'day1soc', 'day3soc', 'day7soc', 'day14soc', 'day30soc', 'day60soc',
    'day1roc', 'day3roc', 'day7roc', 'day14roc',
    'oneday_day1rov', 'oneday_day3rov', 'oneday_day7rov', 'oneday_day14rov',
    # 60-day vs 30-day window.
    'day60playcount_divide_day30playcount', 'day60playcount_dif_day30playcount',
    'day60returncount_divide_day30returncount', 'day60returncount_dif_day30returncount',
    'day60sharecount_divide_day30sharecount', 'day60sharecount_dif_day30sharecount',
    'day60viewcount_divide_day30viewcount', 'day60viewcount_dif_day30viewcount',
    # 30-day vs 7-day window.
    'day30playcount_divide_day7playcount', 'day30playcount_dif_day7playcount',
    'day30returncount_divide_day7returncount', 'day30returncount_dif_day7returncount',
    'day30sharecount_divide_day7sharecount', 'day30sharecount_dif_day7sharecount',
    'day30viewcount_divide_day7viewcount', 'day30viewcount_dif_day7viewcount',
    # 7-day vs 3-day window.
    'day7playcount_divide_day3playcount', 'day7playcount_dif_day3playcount',
    'day7returncount_divide_day3returncount', 'day7returncount_dif_day3returncount',
    'day7sharecount_divide_day3sharecount', 'day7sharecount_dif_day3sharecount',
    'day7viewcount_divide_day3viewcount', 'day7viewcount_dif_day3viewcount',
    # 3-day vs 1-day window.
    'day3playcount_divide_day1playcount', 'day3playcount_dif_day1playcount',
    'day3returncount_divide_day1returncount', 'day3returncount_dif_day1returncount',
    'day3sharecount_divide_day1sharecount', 'day3sharecount_dif_day1sharecount',
    'day3viewcount_divide_day1viewcount', 'day3viewcount_dif_day1viewcount',
]
def filter_recent_features():
    """Return the entries of ``features`` whose name has neither '30' nor '60'.

    The match is a plain substring test on the feature name. The number of
    features before and after filtering is printed.
    """
    print(len(features))
    recent = []
    for name in features:
        if '30' in name or '60' in name:
            continue
        recent.append(name)
    print(len(recent))
    return recent
def cal_feature(df, window_columns=None):
    """Add ratio and difference features between adjacent look-back windows.

    For each pair of adjacent windows (longer, shorter) and each pair of
    matching metric columns, two columns are written into ``df``:

    * ``<longer>_divide_<shorter>`` = shorter-window value / longer-window value
    * ``<longer>_dif_<shorter>``    = longer-window value - shorter-window value

    The elapsed time of every window pair is printed as ``stage N``.

    Args:
        df: pandas DataFrame holding the windowed count columns. The new
            columns are added to this frame in place.
        window_columns: optional sequence of equally long column-name lists,
            ordered from the longest window to the shortest. When omitted,
            the module-level lists ``root_page_60day``, ``root_page_30day``,
            ``root_page_7day``, ``root_page_3day`` and ``root_page_1day``
            are used.

    Returns:
        A new DataFrame in which every +/-inf and NaN value (in any column,
        not only the new ones) has been replaced by 0.
    """
    if window_columns is None:
        window_columns = (root_page_60day, root_page_30day, root_page_7day,
                          root_page_3day, root_page_1day)
    window_columns = list(window_columns)

    adjacent_windows = zip(window_columns[:-1], window_columns[1:])
    for stage, (longer, shorter) in enumerate(adjacent_windows, 1):
        start = time.time()
        for long_col, short_col in zip(longer, shorter):
            # A zero denominator yields inf/NaN here; both are zeroed below.
            df[long_col + '_divide_' + short_col] = df[short_col] / df[long_col]
            # Always longer minus shorter. The 30-vs-7 stage used to compute
            # day7 - day3, which merely duplicated the 7-vs-3 difference.
            df[long_col + '_dif_' + short_col] = df[long_col] - df[short_col]
        running_time = time.time() - start
        print('stage %d: time cost : %.5f sec' % (stage, running_time))

    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    return df
|