# Column-name constants and windowed ratio/difference feature construction.
#
# NOTE(review): this section uses `time` and `np` (numpy) from module scope;
# the corresponding imports are not visible in this part of the file.

# Additional feature columns. The trailing "#NN" in each comment is the
# feature-group number carried over from the original annotations.
add_feature = [
    # Total returns within the last 1/3/7/14 days (#12)
    'all_return_day1_return_count',
    'all_return_day3_return_count',
    'all_return_day7_return_count',
    'all_return_day14_return_count',
    # Returns from the first three layers within the last 1/3/7/14 days (#14)
    'three_return_day1_return_count',
    'three_return_day3_return_count',
    'three_return_day7_return_count',
    'three_return_day14_return_count',
    # Returns from layer four and above within the last 1/3/7/14 days (#15)
    'four_up_return_day1_return_count',
    'four_up_return_day3_return_count',
    'four_up_return_day7_return_count',
    'four_up_return_day14_return_count',
    # First-layer returns within the last 1/3/7/14 days (#13)
    'one_return_day1_return_count',
    'one_return_day3_return_count',
    'one_return_day7_return_count',
    'one_return_day14_return_count',
    # Layer-four-and-above returns / first-three-layer returns,
    # within the last 1/3/7/14 days (#23)
    'four_up_return_div_three_return_day1',
    'four_up_return_div_three_return_day3',
    'four_up_return_div_three_return_day7',
    'four_up_return_div_three_return_day14',
    # Returns within 1/3/7/14 days for exposures within 1/3/7/14 days (#8)
    'all_return_day1_view_day1_return_count',
    'all_return_day3_view_day3_return_count',
    'all_return_day7_view_day7_return_count',
    'all_return_day14_view_day14_return_count',
    # First-three-layer returns within 1/3/7/14 days for exposures
    # within 1/3/7/14 days (#10)
    'three_return_day1_view_day1_return_count',
    'three_return_day3_view_day3_return_count',
    'three_return_day7_view_day7_return_count',
    'three_return_day14_view_day14_return_count',
    # Layer-four-and-above returns within 1/3/7/14 days for exposures
    # within 1/3/7/14 days (#11)
    'four_up_return_day1_view_day1_return_count',
    'four_up_return_day3_view_day3_return_count',
    'four_up_return_day7_view_day7_return_count',
    'four_up_return_day14_view_day14_return_count',
    # First-layer returns within 1/3/7/14 days for exposures
    # within 1/3/7/14 days (#9)
    'one_return_day1_view_day1_return_count',
    'one_return_day3_view_day3_return_count',
    'one_return_day7_view_day7_return_count',
    'one_return_day14_view_day14_return_count',
    # Total returns on day1 for exposures in the window running from
    # (day1+1 / day1+3 / day1+7 / day1+14) days back up to (day1+1) days
    # back (#16) -- translated from the original note; confirm exact window.
    'all_return_day1_on_day1_return_count',
    'all_return_day3_on_day1_return_count',
    'all_return_day7_on_day1_return_count',
    'all_return_day14_on_day1_return_count',
    # Layer-four-and-above returns / first-three-layer returns, for
    # exposures within 1/3/7/14 days returning within 1/3/7/14 days (#22)
    'four_up_return_day1_view_day1_return_div_three_d1',
    'four_up_return_day3_view_day3_return_div_three_d3',
    'four_up_return_day7_view_day7_return_div_three_d7',
    'four_up_return_day14_view_day14_return_div_three_d14',
    # Plays / exposures within the last 1/3/7/14/30/60 days (#17)
    'day1ctr',
    'day3ctr',
    'day7ctr',
    'day14ctr',
    'day30ctr',
    'day60ctr',
    # Shares / exposures within the last 1/3/7/14/30/60 days (#18)
    'day1sov',
    'day3sov',
    'day7sov',
    'day14sov',
    'day30sov',
    'day60sov',
    # Returns from exposures / exposures within the last 1/3/7/14 days (#19)
    'day1rov',
    'day3rov',
    'day7rov',
    'day14rov',
    # Shares / plays within the last 1/3/7/14/30/60 days (#20)
    'day1soc',
    'day3soc',
    'day7soc',
    'day14soc',
    'day30soc',
    'day60soc',
    # Returns from exposures / plays within the last 1/3/7/14 days (#21)
    'day1roc',
    'day3roc',
    'day7roc',
    'day14roc',
    # Today's returns from exposures within the last 1/3/7/14 days /
    # exposures within the last 1/3/7/14 days (#24)
    'oneday_day1rov',
    'oneday_day3rov',
    'oneday_day7rov',
    'oneday_day14rov',
    'futre7dayreturn',
    'todyviewcount_rank',
    'day1viewcount_rank',
]

# Base column names; extended below with `add_feature` and `words`.
featurename = [
    'dt',
    'videoid',
    'day1playcount',
    'day1returncount',
    'day1sharecount',
    'day1viewcount',
    'day14playcount',
    'day14returncount',
    'day14sharecount',
    'day14viewcount',
    'day30playcount',
    'day30returncount',
    'day30sharecount',
    'day30viewcount',
    'day3playcount',
    'day3returncount',
    'day3sharecount',
    'day3viewcount',
    'day60playcount',
    'day60returncount',
    'day60sharecount',
    'day60viewcount',
    'day7playcount',
    'day7returncount',
    'day7sharecount',
    'day7viewcount',
    'videocategory11',
    'videocategory12',
    'videocategory45',
    'videocategory49',
    'videocategory1',
    'videocategory2',
    'videocategory3',
    'videocategory4',
    'videocategory5',
    'videocategory6',
    'videocategory7',
    'videocategory8',
    'videocategory9',
    'videocategory85',
    'videocategory10',
    'videocategory555',
    'usercategory1',
    'usercategory2',
    'usercategory3',
    'usercategory4',
    'usercategory5',
    'usercategory6',
    'usercategory7',
    'usercategory8',
    'usercategory9',
    'usercategory10',
    'usercategory11',
    'usercategory12',
    'usercategory45',
    'usercategory49',
    'usercategory85',
    'usercategory555',
    'todyviewcount',
    'day5returncount_1_stage',
    'day5returncount_2_stage',
    'day5returncount_3_stage',
    'day5returncount_4_stage',
    'stage_one_retrn',
    'stage_two_retrn',
    'stage_three_retrn',
    'stage_four_retrn',
]

words = ['videotags', 'words_without_tags']
featurename = featurename + add_feature + words

# Home-page features, grouped by look-back window. Every list keeps the same
# order (play, return, share, view) so the lists can be paired element-wise.
root_page_1day = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount']
root_page_3day = ['day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount']
root_page_7day = ['day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount']
root_page_14day = ['day14playcount', 'day14returncount', 'day14sharecount', 'day14viewcount']
root_page_30day = ['day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount']
root_page_60day = ['day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount']

return_5day = [
    'day5returncount_1_stage',
    'day5returncount_2_stage',
    'day5returncount_3_stage',
    'day5returncount_4_stage',
]

cate_feat = [
    'videocategory1',
    'videocategory10',
    'videocategory11',
    'videocategory12',
    'videocategory2',
    'videocategory3',
    'videocategory4',
    'videocategory45',
    'videocategory49',
    'videocategory5',
    'videocategory6',
    'videocategory7',
    'videocategory8',
    'videocategory85',
    'videocategory9',
    'videocategory555',
]

one_hot_feature = ['videotags', 'words_without_tags', 'videoid']


def cal_feature(df):
    """Add ratio and difference features between adjacent look-back windows.

    For each window pair (60d, 30d), (30d, 7d), (7d, 3d) and (3d, 1d), and for
    each of the four counters in the ``root_page_*`` lists, two columns are
    written into ``df``:

    * ``<long>_divide_<short>`` = ``df[short] / df[long]``
    * ``<long>_dif_<short>``    = ``df[long] - df[short]``

    After all columns are added, every +/-inf in the frame is turned into NaN
    and every NaN in the frame (not only in the new columns) is filled with 0,
    so a zero denominator yields 0.

    The new columns are assigned on the ``df`` object passed in; the cleaned
    frame produced by ``replace``/``fillna`` is what gets returned.

    Args:
        df: pandas DataFrame containing all ``root_page_1day`` / ``3day`` /
            ``7day`` / ``30day`` / ``60day`` columns.

    Returns:
        The DataFrame with the new columns, with inf and NaN replaced by 0.
    """
    # (longer window, shorter window), in the order the stages are reported.
    window_pairs = (
        (root_page_60day, root_page_30day),
        (root_page_30day, root_page_7day),
        (root_page_7day, root_page_3day),
        (root_page_3day, root_page_1day),
    )

    for stage, (long_cols, short_cols) in enumerate(window_pairs, start=1):
        start = time.time()
        for long_col, short_col in zip(long_cols, short_cols):
            # Vectorised division; zero denominators give inf/NaN here and
            # are zeroed out once at the end.
            df[long_col + '_divide_' + short_col] = df[short_col] / df[long_col]
            # Always long-window minus short-window. The 30d/7d stage used to
            # compute 7d - 3d by mistake, duplicating the 7d/3d difference.
            df[long_col + '_dif_' + short_col] = df[long_col] - df[short_col]
        running_time = time.time() - start
        print('stage %d: time cost : %.5f sec' % (stage, running_time))

    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    return df