|  | @@ -0,0 +1,993 @@
 | 
											
												
													
														|  | 
 |  | +#!/usr/bin/env python
 | 
											
												
													
														|  | 
 |  | +# coding: utf-8
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[2]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +import warnings
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +warnings.filterwarnings("ignore")
 | 
											
												
													
														|  | 
 |  | +from sklearn.metrics import r2_score
 | 
											
												
													
														|  | 
 |  | +import os
 | 
											
												
													
														|  | 
 |  | +import pandas as pd
 | 
											
												
													
														|  | 
 |  | +import gc
 | 
											
												
													
														|  | 
 |  | +import math
 | 
											
												
													
														|  | 
 |  | +import numpy as np
 | 
											
												
													
														|  | 
 |  | +import time
 | 
											
												
													
														|  | 
 |  | +from sklearn.linear_model import SGDRegressor
 | 
											
												
													
														|  | 
 |  | +from sklearn.linear_model import SGDClassifier
 | 
											
												
													
														|  | 
 |  | +import lightgbm as lgb
 | 
											
												
													
														|  | 
 |  | +from sklearn.model_selection import train_test_split
 | 
											
												
													
														|  | 
 |  | +from sklearn.model_selection import StratifiedKFold
 | 
											
												
													
														|  | 
 |  | +from sklearn import metrics
 | 
											
												
													
														|  | 
 |  | +import pickle
 | 
											
												
													
														|  | 
 |  | +from sklearn.metrics import mean_squared_error
 | 
											
												
													
														|  | 
 |  | +import seaborn as sns
 | 
											
												
													
														|  | 
 |  | +import matplotlib.pylab as plt
 | 
											
												
													
														|  | 
 |  | +from odps import ODPS
 | 
											
												
													
														|  | 
 |  | +from odps.df import DataFrame as odpsdf
 | 
											
												
													
														|  | 
 |  | +from datetime import datetime as dt
 | 
											
												
													
														|  | 
 |  | +import datetime
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[3]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +now_date = datetime.date.today() 
 | 
											
												
													
														|  | 
 |  | +# day = datetime.datetime.strftime(now_date, '%Y%m%d')
 | 
											
												
													
														|  | 
 |  | +diff_1 = datetime.timedelta(days=1)
 | 
											
												
													
														|  | 
 |  | +diff_5 = datetime.timedelta(days=7)
 | 
											
												
													
														|  | 
 |  | +input_dt = now_date - diff_1
 | 
											
												
													
														|  | 
 |  | +input_day = datetime.datetime.strftime(input_dt, '%Y%m%d')
 | 
											
												
													
														|  | 
 |  | +now_day = datetime.datetime.strftime(now_date, '%Y%m%d')
 | 
											
												
													
														|  | 
 |  | +train_dt = now_date - diff_5
 | 
											
												
													
														|  | 
 |  | +train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[4]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +add_feature = [
 | 
											
												
													
														|  | 
 |  | +    'all_return_day1_return_count',  # -- 1/3/7/14日内总回流  #12
 | 
											
												
													
														|  | 
 |  | +    'all_return_day3_return_count',
 | 
											
												
													
														|  | 
 |  | +    'all_return_day7_return_count',
 | 
											
												
													
														|  | 
 |  | +    'all_return_day14_return_count',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'three_return_day1_return_count',  # -- 1/3/7/14日内前三层回流 #14
 | 
											
												
													
														|  | 
 |  | +    'three_return_day3_return_count',
 | 
											
												
													
														|  | 
 |  | +    'three_return_day7_return_count',
 | 
											
												
													
														|  | 
 |  | +    'three_return_day14_return_count',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day1_return_count',  # -- 1/3/7/14日内四+层回流 #15
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day3_return_count',
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day7_return_count',
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day14_return_count',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'one_return_day1_return_count',  # -- 1/3/7/14日内一层回流  #13
 | 
											
												
													
														|  | 
 |  | +    'one_return_day3_return_count',
 | 
											
												
													
														|  | 
 |  | +    'one_return_day7_return_count',
 | 
											
												
													
														|  | 
 |  | +    'one_return_day14_return_count',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_div_three_return_day1',  # -- 1/3/7/14日内四+层回流/前三层回流   #23
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_div_three_return_day3',
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_div_three_return_day7',
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_div_three_return_day14',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'all_return_day1_view_day1_return_count',  # -- 1/3/7/14日内曝光在1/3/7/14日内回流  #8
 | 
											
												
													
														|  | 
 |  | +    'all_return_day3_view_day3_return_count',
 | 
											
												
													
														|  | 
 |  | +    'all_return_day7_view_day7_return_count',
 | 
											
												
													
														|  | 
 |  | +    'all_return_day14_view_day14_return_count',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'three_return_day1_view_day1_return_count',  # -- 1/3/7/14日内曝光在1/3/7/14日内前三层回流 #10
 | 
											
												
													
														|  | 
 |  | +    'three_return_day3_view_day3_return_count',
 | 
											
												
													
														|  | 
 |  | +    'three_return_day7_view_day7_return_count',
 | 
											
												
													
														|  | 
 |  | +    'three_return_day14_view_day14_return_count',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day1_view_day1_return_count',  # -- 1/3/7/14日内曝光在1/3/7/14日内四+层回流  # 11
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day3_view_day3_return_count',
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day7_view_day7_return_count',
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day14_view_day14_return_count',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'one_return_day1_view_day1_return_count',  ##-- 1/3/7/14日内曝光在1/3/7/14日内一层回流 #9
 | 
											
												
													
														|  | 
 |  | +    'one_return_day3_view_day3_return_count',
 | 
											
												
													
														|  | 
 |  | +    'one_return_day7_view_day7_return_count',
 | 
											
												
													
														|  | 
 |  | +    'one_return_day14_view_day14_return_count',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'all_return_day1_on_day1_return_count',  # 前day1+1 / day1+3/day1+7/day1+14 到前 day1+1日内曝光在 day1的总回流   #16
 | 
											
												
													
														|  | 
 |  | +    'all_return_day3_on_day1_return_count',
 | 
											
												
													
														|  | 
 |  | +    'all_return_day7_on_day1_return_count',
 | 
											
												
													
														|  | 
 |  | +    'all_return_day14_on_day1_return_count',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day1_view_day1_return_div_three_d1',  # -- 1/3/7/14日内曝光在1/3/7/14日内四+层回流/前三层回流  #22
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day3_view_day3_return_div_three_d3',
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day7_view_day7_return_div_three_d7',
 | 
											
												
													
														|  | 
 |  | +    'four_up_return_day14_view_day14_return_div_three_d14',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'day1ctr',  # -- 1/3/7/14/30/60日内播放/曝光   #17
 | 
											
												
													
														|  | 
 |  | +    'day3ctr',
 | 
											
												
													
														|  | 
 |  | +    'day7ctr',
 | 
											
												
													
														|  | 
 |  | +    'day14ctr',
 | 
											
												
													
														|  | 
 |  | +    'day30ctr',
 | 
											
												
													
														|  | 
 |  | +    'day60ctr',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'day1sov',  # --  1/3/7/14/30/60日内分享/曝光  #18
 | 
											
												
													
														|  | 
 |  | +    'day3sov',
 | 
											
												
													
														|  | 
 |  | +    'day7sov',
 | 
											
												
													
														|  | 
 |  | +    'day14sov',
 | 
											
												
													
														|  | 
 |  | +    'day30sov',
 | 
											
												
													
														|  | 
 |  | +    'day60sov',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'day1rov',  # -- 1/3/7/14日内曝光的回流/曝光   #19
 | 
											
												
													
														|  | 
 |  | +    'day3rov',
 | 
											
												
													
														|  | 
 |  | +    'day7rov',
 | 
											
												
													
														|  | 
 |  | +    'day14rov',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'day1soc',  # -- 1/3/7/14/30/60日内分享/播放  #20
 | 
											
												
													
														|  | 
 |  | +    'day3soc',
 | 
											
												
													
														|  | 
 |  | +    'day7soc',
 | 
											
												
													
														|  | 
 |  | +    'day14soc',
 | 
											
												
													
														|  | 
 |  | +    'day30soc',
 | 
											
												
													
														|  | 
 |  | +    'day60soc',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'day1roc',  # -- 1/3/7/14日内曝光的回流/播放  #21
 | 
											
												
													
														|  | 
 |  | +    'day3roc',
 | 
											
												
													
														|  | 
 |  | +    'day7roc',
 | 
											
												
													
														|  | 
 |  | +    'day14roc',
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    'oneday_day1rov',  # -- 1/3/7/14日内曝光在今日的回流/ 1/3/7/14日内曝光  #24
 | 
											
												
													
														|  | 
 |  | +    'oneday_day3rov',
 | 
											
												
													
														|  | 
 |  | +    'oneday_day7rov',
 | 
											
												
													
														|  | 
 |  | +    'oneday_day14rov',
 | 
											
												
													
														|  | 
 |  | +    'futre7dayreturn'
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    ,'todyviewcount_rank'
 | 
											
												
													
														|  | 
 |  | +    ,'day1viewcount_rank'
 | 
											
												
													
														|  | 
 |  | +]
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +featurename = [
 | 
											
												
													
														|  | 
 |  | +    'dt',
 | 
											
												
													
														|  | 
 |  | +    'videoid',
 | 
											
												
													
														|  | 
 |  | +    'day1playcount',
 | 
											
												
													
														|  | 
 |  | +    'day1returncount',
 | 
											
												
													
														|  | 
 |  | +    'day1sharecount',
 | 
											
												
													
														|  | 
 |  | +    'day1viewcount',
 | 
											
												
													
														|  | 
 |  | +    'day14playcount',
 | 
											
												
													
														|  | 
 |  | +    'day14returncount',
 | 
											
												
													
														|  | 
 |  | +    'day14sharecount',
 | 
											
												
													
														|  | 
 |  | +    'day14viewcount',
 | 
											
												
													
														|  | 
 |  | +    'day30playcount',
 | 
											
												
													
														|  | 
 |  | +    'day30returncount',
 | 
											
												
													
														|  | 
 |  | +    'day30sharecount',
 | 
											
												
													
														|  | 
 |  | +    'day30viewcount',
 | 
											
												
													
														|  | 
 |  | +    'day3playcount',
 | 
											
												
													
														|  | 
 |  | +    'day3returncount',
 | 
											
												
													
														|  | 
 |  | +    'day3sharecount',
 | 
											
												
													
														|  | 
 |  | +    'day3viewcount',
 | 
											
												
													
														|  | 
 |  | +    'day60playcount',
 | 
											
												
													
														|  | 
 |  | +    'day60returncount',
 | 
											
												
													
														|  | 
 |  | +    'day60sharecount',
 | 
											
												
													
														|  | 
 |  | +    'day60viewcount',
 | 
											
												
													
														|  | 
 |  | +    'day7playcount',
 | 
											
												
													
														|  | 
 |  | +    'day7returncount',
 | 
											
												
													
														|  | 
 |  | +    'day7sharecount',
 | 
											
												
													
														|  | 
 |  | +    'day7viewcount',
 | 
											
												
													
														|  | 
 |  | +    'videocategory11',
 | 
											
												
													
														|  | 
 |  | +    'videocategory12',
 | 
											
												
													
														|  | 
 |  | +    'videocategory45',
 | 
											
												
													
														|  | 
 |  | +    'videocategory49',
 | 
											
												
													
														|  | 
 |  | +    'videocategory1',
 | 
											
												
													
														|  | 
 |  | +    'videocategory2',
 | 
											
												
													
														|  | 
 |  | +    'videocategory3',
 | 
											
												
													
														|  | 
 |  | +    'videocategory4',
 | 
											
												
													
														|  | 
 |  | +    'videocategory5',
 | 
											
												
													
														|  | 
 |  | +    'videocategory6',
 | 
											
												
													
														|  | 
 |  | +    'videocategory7',
 | 
											
												
													
														|  | 
 |  | +    'videocategory8',
 | 
											
												
													
														|  | 
 |  | +    'videocategory9',
 | 
											
												
													
														|  | 
 |  | +    'videocategory85',
 | 
											
												
													
														|  | 
 |  | +    'videocategory10',
 | 
											
												
													
														|  | 
 |  | +    'videocategory555',
 | 
											
												
													
														|  | 
 |  | +    'usercategory1',
 | 
											
												
													
														|  | 
 |  | +    'usercategory2',
 | 
											
												
													
														|  | 
 |  | +    'usercategory3',
 | 
											
												
													
														|  | 
 |  | +    'usercategory4',
 | 
											
												
													
														|  | 
 |  | +    'usercategory5',
 | 
											
												
													
														|  | 
 |  | +    'usercategory6',
 | 
											
												
													
														|  | 
 |  | +    'usercategory7',
 | 
											
												
													
														|  | 
 |  | +    'usercategory8',
 | 
											
												
													
														|  | 
 |  | +    'usercategory9',
 | 
											
												
													
														|  | 
 |  | +    'usercategory10',
 | 
											
												
													
														|  | 
 |  | +    'usercategory11',
 | 
											
												
													
														|  | 
 |  | +    'usercategory12',
 | 
											
												
													
														|  | 
 |  | +    'usercategory45',
 | 
											
												
													
														|  | 
 |  | +    'usercategory49',
 | 
											
												
													
														|  | 
 |  | +    'usercategory85',
 | 
											
												
													
														|  | 
 |  | +    'usercategory555',
 | 
											
												
													
														|  | 
 |  | +    'todyviewcount',
 | 
											
												
													
														|  | 
 |  | +    'day5returncount_1_stage',
 | 
											
												
													
														|  | 
 |  | +    'day5returncount_2_stage',
 | 
											
												
													
														|  | 
 |  | +    'day5returncount_3_stage',
 | 
											
												
													
														|  | 
 |  | +    'day5returncount_4_stage',
 | 
											
												
													
														|  | 
 |  | +    'stage_one_retrn',
 | 
											
												
													
														|  | 
 |  | +    'stage_two_retrn',
 | 
											
												
													
														|  | 
 |  | +    'stage_three_retrn',
 | 
											
												
													
														|  | 
 |  | +    'stage_four_retrn']
 | 
											
												
													
														|  | 
 |  | +words = ['videotags','words_without_tags']
 | 
											
												
													
														|  | 
 |  | +featurename = featurename + add_feature + words
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print(len(featurename))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[5]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def getRovfeaturetable(dt):
 | 
											
												
													
														|  | 
 |  | +    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
 | 
											
												
													
														|  | 
 |  | +                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
 | 
											
												
													
														|  | 
 |  | +                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    featureArray = []
 | 
											
												
													
														|  | 
 |  | +    for record in odps.read_table('rov_feature_add_v1', partition='dt=%s' % dt):
 | 
											
												
													
														|  | 
 |  | +        valueFeature = {}
 | 
											
												
													
														|  | 
 |  | +        for i in featurename:
 | 
											
												
													
														|  | 
 |  | +            if i == 'dt':
 | 
											
												
													
														|  | 
 |  | +                valueFeature[i] = dt
 | 
											
												
													
														|  | 
 |  | +            else:
 | 
											
												
													
														|  | 
 |  | +                valueFeature[i] = record[i]
 | 
											
												
													
														|  | 
 |  | +        featureArray.append(valueFeature)
 | 
											
												
													
														|  | 
 |  | +    featureArray = pd.DataFrame(featureArray)
 | 
											
												
													
														|  | 
 |  | +    print(dt, 'feature table finish')
 | 
											
												
													
														|  | 
 |  | +    return featureArray
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def getRovtestable(dt):
 | 
											
												
													
														|  | 
 |  | +    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
 | 
											
												
													
														|  | 
 |  | +                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
 | 
											
												
													
														|  | 
 |  | +                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    featureArray = []
 | 
											
												
													
														|  | 
 |  | +    for record in odps.read_table('rov_predict_table_add_v1', partition='dt=%s' % dt):
 | 
											
												
													
														|  | 
 |  | +        valueFeature = {}
 | 
											
												
													
														|  | 
 |  | +        for i in featurename:
 | 
											
												
													
														|  | 
 |  | +            if i == 'dt':
 | 
											
												
													
														|  | 
 |  | +                valueFeature[i] = dt
 | 
											
												
													
														|  | 
 |  | +            else:
 | 
											
												
													
														|  | 
 |  | +                valueFeature[i] = record[i]
 | 
											
												
													
														|  | 
 |  | +        featureArray.append(valueFeature)
 | 
											
												
													
														|  | 
 |  | +    featureArray = pd.DataFrame(featureArray)
 | 
											
												
													
														|  | 
 |  | +    print(dt, 'test table finish')
 | 
											
												
													
														|  | 
 |  | +    return featureArray
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def getestingdata(date):
 | 
											
												
													
														|  | 
 |  | +    new_date = dt.strptime(date, '%Y%m%d')
 | 
											
												
													
														|  | 
 |  | +    datelist = []
 | 
											
												
													
														|  | 
 |  | +    testlist = []
 | 
											
												
													
														|  | 
 |  | +    for i in range(0, 1):
 | 
											
												
													
														|  | 
 |  | +        delta = datetime.timedelta(days=i)
 | 
											
												
													
														|  | 
 |  | +        tar_dt = new_date - delta
 | 
											
												
													
														|  | 
 |  | +        datelist.append(tar_dt.strftime("%Y%m%d"))
 | 
											
												
													
														|  | 
 |  | +    print(datelist)
 | 
											
												
													
														|  | 
 |  | +    for tm in datelist:
 | 
											
												
													
														|  | 
 |  | +        testlist.append(getRovtestable(tm))
 | 
											
												
													
														|  | 
 |  | +    testdata = pd.concat(testlist)
 | 
											
												
													
														|  | 
 |  | +    testdata.reset_index(inplace=True)
 | 
											
												
													
														|  | 
 |  | +    testdata = testdata.drop(axis=1, columns='index')
 | 
											
												
													
														|  | 
 |  | +    return testdata
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def getrainingdata(date):
 | 
											
												
													
														|  | 
 |  | +    new_date = dt.strptime(date, '%Y%m%d')
 | 
											
												
													
														|  | 
 |  | +    datelist = []
 | 
											
												
													
														|  | 
 |  | +    trainlist = []
 | 
											
												
													
														|  | 
 |  | +    for i in range(0, 30):
 | 
											
												
													
														|  | 
 |  | +        delta = datetime.timedelta(days=i)
 | 
											
												
													
														|  | 
 |  | +        tar_dt = new_date - delta
 | 
											
												
													
														|  | 
 |  | +        datelist.append(tar_dt.strftime("%Y%m%d"))
 | 
											
												
													
														|  | 
 |  | +    print(datelist)
 | 
											
												
													
														|  | 
 |  | +    for tm in datelist:
 | 
											
												
													
														|  | 
 |  | +        trainlist.append(getRovfeaturetable(tm))
 | 
											
												
													
														|  | 
 |  | +    traindata = pd.concat(trainlist)
 | 
											
												
													
														|  | 
 |  | +    traindata.reset_index(inplace=True)
 | 
											
												
													
														|  | 
 |  | +    traindata = traindata.drop(axis=1, columns='index')
 | 
											
												
													
														|  | 
 |  | +    return traindata
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
# Load the training rows ending at train_day, then the prediction rows for
# input_day. Both calls read from ODPS and print progress as they go.
traindata = getrainingdata(train_day)
data_test_ori_rk = getestingdata(input_day)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[6]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def select_recent_video(df):
 | 
											
												
													
														|  | 
 |  | +    """对每一个视频添加row number,按照日期排序,最后选取最近的那一天"""
 | 
											
												
													
														|  | 
 |  | +    df['dt'] = df['dt'].astype(int)
 | 
											
												
													
														|  | 
 |  | +    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
 | 
											
												
													
														|  | 
 |  | +    df = df[df['rk'] == 1]
 | 
											
												
													
														|  | 
 |  | +    return df
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +data_test_ori = select_recent_video(data_test_ori_rk)
 | 
											
												
													
														|  | 
 |  | +data_test_ori.loc[data_test_ori['dt'] != int(input_day), 'futre7dayreturn'] = 0
 | 
											
												
													
														|  | 
 |  | +data_test_ori = data_test_ori.drop(axis=1, columns='rk')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[7]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
## Deduplicate so that each video has exactly one row per day.
## Both frames are modified in place; the first occurrence of each
## (videoid, dt) pair is the one that is kept.
traindata.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
data_test_ori.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[8]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def basic_cal(df):
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +#     df['weighted_retrn'] = df['stage_one_retrn'].astype('int')*0.4 + \
 | 
											
												
													
														|  | 
 |  | +#                     df['stage_two_retrn'].astype('int')*0.3 + \
 | 
											
												
													
														|  | 
 |  | +#                     df['stage_three_retrn'].astype('int')*0.3
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    df['weighted_retrn'] = df['futre7dayreturn'].astype('int') 
 | 
											
												
													
														|  | 
 |  | +#     df['weighted_retrn'] = df['futr5returncount'].astype('int')
 | 
											
												
													
														|  | 
 |  | +    #  day1viewcount 如果是零,就返回 rov,rov_log 变为零
 | 
											
												
													
														|  | 
 |  | +    df['weighted_retrn_log'] = df.apply(lambda x: np.log(x['weighted_retrn'] + 1),axis=1)
 | 
											
												
													
														|  | 
 |  | +#     df['rov'] = df.apply(lambda x: x['weighted_retrn'] / x['todyviewcount'] \
 | 
											
												
													
														|  | 
 |  | +#                                      if x['todyviewcount']!=0 else 0,axis=1)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +#     df['rov_log'] = df.apply(lambda x: np.log(x['rov'] + 1),axis=1)
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +#     thresh = np.percentile(df[df['weighted_retrn']>0]['weighted_retrn'],50)
 | 
											
												
													
														|  | 
 |  | +    ## 设置回流大于thresh, label就是1, 没有分享或有分享但是回流数是零的标为0
 | 
											
												
													
														|  | 
 |  | +    df['return_back'] = df.apply(lambda x:1 if x['weighted_retrn']> 0 else 0,axis=1)
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    return df 
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +data_train = basic_cal(traindata)
 | 
											
												
													
														|  | 
 |  | +data_test = basic_cal(data_test_ori)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[9]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def today_view_category(df):
 | 
											
												
													
														|  | 
 |  | +### 对当天的曝光量分三个级别,未来三天的曝光量分3个级别,添加Category feaure
 | 
											
												
													
														|  | 
 |  | +    data_test1_view1 =   df.loc[data_test['day1viewcount_rank'] > 10000]['day1viewcount'].mean()
 | 
											
												
													
														|  | 
 |  | +    data_test1_view2 =   df.loc[(data_test['day1viewcount_rank'] > 3000)&(data_test['day1viewcount_rank'] <= 10000)]['day1viewcount'].mean()
 | 
											
												
													
														|  | 
 |  | +    data_test1_view3 =   df.loc[(data_test['day1viewcount_rank'] > 1000)&(data_test['day1viewcount_rank'] <= 3000)]['day1viewcount'].mean()
 | 
											
												
													
														|  | 
 |  | +    data_test1_view4 =   df.loc[(data_test['day1viewcount_rank'] > 300)&(data_test['day1viewcount_rank'] <= 1000)]['day1viewcount'].mean()
 | 
											
												
													
														|  | 
 |  | +    data_test1_view5 =   df.loc[(data_test['day1viewcount_rank'] > 100)&(data_test['day1viewcount_rank'] <= 300)]['day1viewcount'].mean()
 | 
											
												
													
														|  | 
 |  | +    data_test1_view6 =   df.loc[(data_test['day1viewcount_rank'] > 30)&(data_test['day1viewcount_rank'] <= 100)]['day1viewcount'].mean()
 | 
											
												
													
														|  | 
 |  | +    data_test1_view7 =   df.loc[(data_test['day1viewcount_rank'] > 0)&(data_test['day1viewcount_rank'] <= 30)]['day1viewcount'].mean()
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    df.loc[df['day1viewcount_rank'] > 10000, 'todyviewcount'] = data_test1_view3
 | 
											
												
													
														|  | 
 |  | +    df.loc[(data_test['day1viewcount_rank'] > 3000)&(data_test['day1viewcount_rank'] <= 10000), 'todyviewcount'] = data_test1_view3
 | 
											
												
													
														|  | 
 |  | +    df.loc[(data_test['day1viewcount_rank'] > 1000)&(data_test['day1viewcount_rank'] <= 3000), 'todyviewcount'] = data_test1_view4
 | 
											
												
													
														|  | 
 |  | +    df.loc[(data_test['day1viewcount_rank'] > 300)&(data_test['day1viewcount_rank'] <= 1000), 'todyviewcount'] = data_test1_view5
 | 
											
												
													
														|  | 
 |  | +    df.loc[(data_test['day1viewcount_rank'] > 100)&(data_test['day1viewcount_rank'] <= 300), 'todyviewcount'] = data_test1_view6
 | 
											
												
													
														|  | 
 |  | +    df.loc[(data_test['day1viewcount_rank'] > 30)&(data_test['day1viewcount_rank'] <= 100), 'todyviewcount'] = data_test1_view7
 | 
											
												
													
														|  | 
 |  | +    df.loc[(data_test['day1viewcount_rank'] > 0)&(data_test['day1viewcount_rank'] <= 30), 'todyviewcount'] = data_test1_view7
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    return df
 | 
											
												
													
														|  | 
# Replace todyviewcount in the test frame with the rank-bucket means
# computed by today_view_category (the training frame is left untouched).
data_test =  today_view_category(data_test)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[10]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# 首页特征
 | 
											
												
													
														|  | 
 |  | +root_page_1day = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +root_page_3day = ['day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +root_page_7day = ['day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +root_page_14day = ['day14playcount', 'day14returncount', 'day14sharecount', 'day14viewcount']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +root_page_30day = ['day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +root_page_60day = ['day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +return_5day = ['day5returncount_1_stage', 'day5returncount_2_stage', 'day5returncount_3_stage',
 | 
											
												
													
														|  | 
 |  | +               'day5returncount_4_stage']
 | 
											
												
													
														|  | 
 |  | +cate_feat = ['videocategory1', 'videocategory10', 'videocategory11', 'videocategory12',
 | 
											
												
													
														|  | 
 |  | +             'videocategory2', 'videocategory3', 'videocategory4', 'videocategory45',
 | 
											
												
													
														|  | 
 |  | +             'videocategory49', 'videocategory5', 'videocategory6',
 | 
											
												
													
														|  | 
 |  | +             'videocategory7', 'videocategory8', 'videocategory85', 'videocategory9', 'videocategory555']
 | 
											
												
													
														|  | 
 |  | +one_hot_feature = ['videotags','words_without_tags','videoid']
 | 
											
												
													
														|  | 
 |  | +# cate_view_feat = [ 'todyview_low','todyview_median','todyview_high']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# cate_view_feat = ['todyview_1', 'todyview_2', 'todyview_3', 'todyview_4', 'todyview_5', 'todyview_6', 'todyview_7',
 | 
											
												
													
														|  | 
 |  | +#                   'todyview_8']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[11]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def cal_feature(df):
 | 
											
												
													
														|  | 
 |  | +    start = time.time()
 | 
											
												
													
														|  | 
 |  | +    for i in range(len(root_page_1day)):
 | 
											
												
													
														|  | 
 |  | +        
 | 
											
												
													
														|  | 
 |  | +        
 | 
											
												
													
														|  | 
 |  | +        newfeat_div = root_page_60day[i] + '_divide_' + root_page_30day[i]
 | 
											
												
													
														|  | 
 |  | +#         df[newfeat_div] = df.apply(lambda s: s[root_page_30day[i]] / s[root_page_60day[i]]\
 | 
											
												
													
														|  | 
 |  | +#                                           if s[root_page_60day[i]] != 0 else 0, axis=1) 
 | 
											
												
													
														|  | 
 |  | +        df[newfeat_div] = df[root_page_30day[i]]/ df[root_page_60day[i]]
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +        newfeat_diff = root_page_60day[i] + '_dif_' + root_page_30day[i]
 | 
											
												
													
														|  | 
 |  | +#         df[newfeat_diff] = df.apply(lambda s: s[root_page_60day[i]]-s[root_page_30day[i]],\
 | 
											
												
													
														|  | 
 |  | +#                                                 axis=1) 
 | 
											
												
													
														|  | 
 |  | +        df[newfeat_diff] = df[root_page_60day[i]] - df[root_page_30day[i]]
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    end = time.time()
 | 
											
												
													
														|  | 
 |  | +    running_time = end-start
 | 
											
												
													
														|  | 
 |  | +    print('stage 1: time cost : %.5f sec' %running_time)
 | 
											
												
													
														|  | 
 |  | +     
 | 
											
												
													
														|  | 
 |  | +  
 | 
											
												
													
														|  | 
 |  | +    start = time.time()
 | 
											
												
													
														|  | 
 |  | +    for i in range(len(root_page_1day)):
 | 
											
												
													
														|  | 
 |  | +        newfeat_div = root_page_30day[i] + '_divide_' + root_page_7day[i]
 | 
											
												
													
														|  | 
 |  | +#         df[newfeat_div] = df.apply(lambda s: s[root_page_7day[i]] / s[root_page_30day[i]]\
 | 
											
												
													
														|  | 
 |  | +#                                           if s[root_page_30day[i]] != 0 else 0, axis=1) 
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +        df[newfeat_div] = df[root_page_7day[i]]/df[root_page_30day[i]]
 | 
											
												
													
														|  | 
 |  | +        newfeat_diff = root_page_30day[i] + '_dif_' + root_page_7day[i]
 | 
											
												
													
														|  | 
 |  | +#         df[newfeat_diff] = df.apply(lambda s: s[root_page_7day[i]] - s[root_page_3day[i]],\
 | 
											
												
													
														|  | 
 |  | +#                                                 axis=1) 
 | 
											
												
													
														|  | 
 |  | +        df[newfeat_diff] = df[root_page_7day[i]] - df[root_page_3day[i]]
 | 
											
												
													
														|  | 
 |  | +    end = time.time()
 | 
											
												
													
														|  | 
 |  | +    running_time = end-start
 | 
											
												
													
														|  | 
 |  | +    print('stage 2: time cost : %.5f sec' %running_time) 
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    start = time.time()
 | 
											
												
													
														|  | 
 |  | +    for i in range(len(root_page_1day)):
 | 
											
												
													
														|  | 
 |  | +        newfeat_div = root_page_7day[i] + '_divide_' + root_page_3day[i]
 | 
											
												
													
														|  | 
 |  | +#         df[newfeat_div] = df.apply(lambda s: s[root_page_3day[i]] / s[root_page_7day[i]]\
 | 
											
												
													
														|  | 
 |  | +#                                           if s[root_page_7day[i]] != 0 else 0, axis=1)
 | 
											
												
													
														|  | 
 |  | +        df[newfeat_div] = df[root_page_3day[i]]/df[root_page_7day[i]]
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +        newfeat_diff = root_page_7day[i] + '_dif_' + root_page_3day[i]
 | 
											
												
													
														|  | 
 |  | +#         df[newfeat_diff] = df.apply(lambda s: s[root_page_7day[i]] - s[root_page_3day[i]],\
 | 
											
												
													
														|  | 
 |  | +#                                                 axis=1) 
 | 
											
												
													
														|  | 
 |  | +        df[newfeat_diff] = df[root_page_7day[i]] - df[root_page_3day[i]]
 | 
											
												
													
														|  | 
 |  | +    end = time.time()
 | 
											
												
													
														|  | 
 |  | +    running_time = end-start
 | 
											
												
													
														|  | 
 |  | +    print('stage 3: time cost : %.5f sec' %running_time)
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    start = time.time()
 | 
											
												
													
														|  | 
 |  | +    for i in range(len(root_page_1day)):
 | 
											
												
													
														|  | 
 |  | +        newfeat_div = root_page_3day[i] + '_divide_' + root_page_1day[i]
 | 
											
												
													
														|  | 
 |  | +#         df[newfeat_div] = df.apply(lambda s: s[root_page_1day[i]] / s[root_page_3day[i]]\
 | 
											
												
													
														|  | 
 |  | +#                                           if s[root_page_3day[i]] != 0 else 0, axis=1) 
 | 
											
												
													
														|  | 
 |  | +        df[newfeat_div] = df[root_page_1day[i]] / df[root_page_3day[i]]
 | 
											
												
													
														|  | 
 |  | +        newfeat_diff = root_page_3day[i] + '_dif_' + root_page_1day[i]
 | 
											
												
													
														|  | 
 |  | +#         df[newfeat_diff] = df.apply(lambda s: s[root_page_3day[i]] - s[root_page_1day[i]],\
 | 
											
												
													
														|  | 
 |  | +#                                                 axis=1) 
 | 
											
												
													
														|  | 
 |  | +        df[newfeat_diff] = df[root_page_3day[i]] - df[root_page_1day[i]]
 | 
											
												
													
														|  | 
 |  | +    end = time.time()
 | 
											
												
													
														|  | 
 |  | +    running_time = end-start
 | 
											
												
													
														|  | 
 |  | +    print('stage 4: time cost : %.5f sec' %running_time)
 | 
											
												
													
														|  | 
 |  | +    df = df.replace([np.inf, -np.inf], np.nan)
 | 
											
												
													
														|  | 
 |  | +    df = df.fillna(0)
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    return df
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[12]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +data_train = data_train.fillna(0)
 | 
											
												
													
														|  | 
 |  | +data_test = data_test.fillna(0)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +data_train = cal_feature(data_train)
 | 
											
												
													
														|  | 
 |  | +data_test = cal_feature(data_test)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[13]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
# Report the shape of both frames after feature engineering.
print('data_train shape:', data_train.shape)
print('data_test shape:', data_test.shape)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[14]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
# Ordered list of the numeric feature columns selected by dataprepare().
# The trailing '*_divide_*' / '*_dif_*' entries are the columns that
# cal_feature() adds; the order of this list determines the column order of
# the resulting feature frame.
features = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount', 'day30playcount', 'day30returncount',
            'day30sharecount', 'day30viewcount', 'day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount',
            'day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount', 'day7playcount', 'day7returncount',
            'day7sharecount', 'day7viewcount', 'usercategory1', 'usercategory2', 'usercategory3', 'usercategory4',
            'usercategory5', 'usercategory6', 'usercategory7', 'usercategory8', 'usercategory9', 'usercategory10',
            'usercategory11', 'usercategory12', 'usercategory45', 'usercategory49', 'usercategory85','usercategory555',
            'todyviewcount',
            'day5returncount_1_stage', 'day5returncount_2_stage', 'day5returncount_3_stage', 'day5returncount_4_stage',
            'stage_one_retrn', 'stage_two_retrn', 'stage_three_retrn', 'stage_four_retrn', 'all_return_day1_return_count',
            'all_return_day3_return_count', 'all_return_day7_return_count', 'all_return_day14_return_count',
            'three_return_day1_return_count', 'three_return_day3_return_count', 'three_return_day7_return_count',
            'three_return_day14_return_count', 'four_up_return_day1_return_count', 'four_up_return_day3_return_count',
            'four_up_return_day7_return_count', 'four_up_return_day14_return_count', 'one_return_day1_return_count',
            'one_return_day3_return_count', 'one_return_day7_return_count', 'one_return_day14_return_count',
            'four_up_return_div_three_return_day1', 'four_up_return_div_three_return_day3',
            'four_up_return_div_three_return_day7', 'four_up_return_div_three_return_day14',
            'all_return_day1_view_day1_return_count', 'all_return_day3_view_day3_return_count',
            'all_return_day7_view_day7_return_count', 'all_return_day14_view_day14_return_count',
            'three_return_day1_view_day1_return_count', 'three_return_day3_view_day3_return_count',
            'three_return_day7_view_day7_return_count', 'three_return_day14_view_day14_return_count',
            'four_up_return_day1_view_day1_return_count', 'four_up_return_day3_view_day3_return_count',
            'four_up_return_day7_view_day7_return_count', 'four_up_return_day14_view_day14_return_count',
            'one_return_day1_view_day1_return_count', 'one_return_day3_view_day3_return_count',
            'one_return_day7_view_day7_return_count', 'one_return_day14_view_day14_return_count',
            'all_return_day1_on_day1_return_count', 'all_return_day3_on_day1_return_count',
            'all_return_day7_on_day1_return_count', 'all_return_day14_on_day1_return_count',
            'four_up_return_day1_view_day1_return_div_three_d1', 'four_up_return_day3_view_day3_return_div_three_d3',
            'four_up_return_day7_view_day7_return_div_three_d7', 'four_up_return_day14_view_day14_return_div_three_d14',
            'day1ctr', 'day3ctr', 'day7ctr', 'day14ctr', 'day30ctr', 'day60ctr', 'day1sov', 'day3sov', 'day7sov',
            'day14sov', 'day30sov', 'day60sov', 'day1rov', 'day3rov', 'day7rov', 'day14rov', 'day1soc', 'day3soc',
            'day7soc', 'day14soc', 'day30soc', 'day60soc', 'day1roc', 'day3roc', 'day7roc', 'day14roc', 'oneday_day1rov',
            'oneday_day3rov', 'oneday_day7rov', 'oneday_day14rov',
            'day60playcount_divide_day30playcount', 'day60playcount_dif_day30playcount',
            'day60returncount_divide_day30returncount', 'day60returncount_dif_day30returncount',
            'day60sharecount_divide_day30sharecount', 'day60sharecount_dif_day30sharecount',
            'day60viewcount_divide_day30viewcount', 'day60viewcount_dif_day30viewcount',
            'day30playcount_divide_day7playcount', 'day30playcount_dif_day7playcount',
            'day30returncount_divide_day7returncount', 'day30returncount_dif_day7returncount',
            'day30sharecount_divide_day7sharecount', 'day30sharecount_dif_day7sharecount',
            'day30viewcount_divide_day7viewcount', 'day30viewcount_dif_day7viewcount',
            'day7playcount_divide_day3playcount', 'day7playcount_dif_day3playcount',
            'day7returncount_divide_day3returncount', 'day7returncount_dif_day3returncount',
            'day7sharecount_divide_day3sharecount', 'day7sharecount_dif_day3sharecount',
            'day7viewcount_divide_day3viewcount', 'day7viewcount_dif_day3viewcount', 'day3playcount_divide_day1playcount',
            'day3playcount_dif_day1playcount', 'day3returncount_divide_day1returncount',
            'day3returncount_dif_day1returncount', 'day3sharecount_divide_day1sharecount',
            'day3sharecount_dif_day1sharecount', 'day3viewcount_divide_day1viewcount',
            'day3viewcount_dif_day1viewcount']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[15]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def dataprepare(df_pre):
 | 
											
												
													
														|  | 
 |  | +    #  直接将特征送进去,不加交叉特征。
 | 
											
												
													
														|  | 
 |  | +    # 是否对数据补零
 | 
											
												
													
														|  | 
 |  | +    df_pre = df_pre.fillna(0)
 | 
											
												
													
														|  | 
 |  | +    df_new_feature = df_pre[features]
 | 
											
												
													
														|  | 
 |  | +#     df_onehot_feature = df_pre[one_hot_feature]
 | 
											
												
													
														|  | 
 |  | +#     df_new_feature = pd.concat([df_pre.loc[:, 'all_return_day14_on_day1_return_count':'day7viewcount'], \
 | 
											
												
													
														|  | 
 |  | +#                                 df_pre.loc[:, 'four_up_return_day14_return_count': \
 | 
											
												
													
														|  | 
 |  | +#                                               'four_up_return_div_three_return_day7'], \
 | 
											
												
													
														|  | 
 |  | +#                                 df_pre.loc[:, 'one_return_day14_return_count':'oneday_day7rov'],
 | 
											
												
													
														|  | 
 |  | +#                                 df_pre.loc[:,
 | 
											
												
													
														|  | 
 |  | +#                                 'three_return_day14_return_count':'three_return_day7_view_day7_return_count'],
 | 
											
												
													
														|  | 
 |  | +#                                 df_pre.loc[:, 'usercategory1':'usercategory9'], df_pre.loc[:,
 | 
											
												
													
														|  | 
 |  | +#                                                                                 'day60playcount_divide_day30playcount':'day3viewcount_dif_day1viewcount']],
 | 
											
												
													
														|  | 
 |  | +#                                axis=1)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    #     df_new_feature = pd.concat([df_pre.loc[:,'day1playcount':'day7viewcount'],\
 | 
											
												
													
														|  | 
 |  | +    #                                 df_pre.loc[:,'day60playcount_divide_day30playcount':\
 | 
											
												
													
														|  | 
 |  | +    #                                              'day5returncount_4_stage_dif_day5returncount_3_stage'], \
 | 
											
												
													
														|  | 
 |  | +    #                                df_pre.loc[:,'usercategory1':'usercategory9']], axis=1)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    df_target = df_pre['weighted_retrn_log']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    df_new_feature = pd.concat([df_new_feature, df_pre[cate_feat],df_pre[one_hot_feature]], axis=1)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    return df_new_feature, df_target
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[16]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +recall_video = pd.read_csv('/root/ROVtrain/readonlinetable/result/recall_' + input_day[-4:] + '.csv')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[17]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +ten_percent_thresh = recall_video['score'].min()
 | 
											
												
													
														|  | 
 |  | +if ten_percent_thresh < 0.4:
 | 
											
												
													
														|  | 
 |  | +    recall_video_stage_one = recall_video[recall_video['score'] > 0.4]
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +else:
 | 
											
												
													
														|  | 
 |  | +    recall_video_stage_one = recall_video[recall_video['score'] > ten_percent_thresh]
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[18]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +data_test['videoid'] = data_test['videoid'].astype('int')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +data_train = data_train[data_train['weighted_retrn'] > 0]
 | 
											
												
													
														|  | 
 |  | +print(data_train.shape, 'train shape')
 | 
											
												
													
														|  | 
 |  | +data_test = pd.merge(data_test, recall_video_stage_one, on=['videoid'], how='inner')
 | 
											
												
													
														|  | 
 |  | +print('score>0.5 video_count:', data_test.shape)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[19]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +df_new_feature,df_target= dataprepare(data_train)
 | 
											
												
													
														|  | 
 |  | +df_new_feature_test, df_target_test = dataprepare(data_test)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[20]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +#数值
 | 
											
												
													
														|  | 
 |  | +from scipy import sparse
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +df_new_feature_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature).loc[:,'day1playcount':'videocategory555']))
 | 
											
												
													
														|  | 
 |  | +df_new_feature_test_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature_test).loc[:,'day1playcount':'videocategory555']))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print('value feature generate successfully')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[21]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +#videoid
 | 
											
												
													
														|  | 
 |  | +train_videoid = pd.DataFrame(df_new_feature).loc[:,'videoid']
 | 
											
												
													
														|  | 
 |  | +test_videoid = pd.DataFrame(df_new_feature_test).loc[:,'videoid']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +train_videoid_list = pd.DataFrame(df_new_feature).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videoid']),1).tolist()
 | 
											
												
													
														|  | 
 |  | +test_videoid_list = pd.DataFrame(df_new_feature_test).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_test).loc[:,'videoid']),1).tolist()
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +allvideo_raw = list(set(np.array(pd.concat([train_videoid,test_videoid])).tolist()))
 | 
											
												
													
														|  | 
 |  | +allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw),1).tolist()
 | 
											
												
													
														|  | 
 |  | +from sklearn.preprocessing import MultiLabelBinarizer
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
 | 
											
												
													
														|  | 
 |  | +train_videoid = mlb_model_videoid.transform(train_videoid_list)
 | 
											
												
													
														|  | 
 |  | +test_videoid = mlb_model_videoid.transform(test_videoid_list)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print('videoid feature generate successfully')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[23]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +len(mlb_model_videoid.classes_)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[24]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def tag_preprocessing(filename):
 | 
											
												
													
														|  | 
 |  | +    #读取tag分词结果
 | 
											
												
													
														|  | 
 |  | +    tag_txt = open("/root/ROVtrain/tfidfCompution/"+ filename +".txt","r")   #设置文件对象
 | 
											
												
													
														|  | 
 |  | +    ftextlist = tag_txt.readlines() # 同上
 | 
											
												
													
														|  | 
 |  | +    tag_txt.close() #关闭文件
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    #转为corpus
 | 
											
												
													
														|  | 
 |  | +    tagList = str(ftextlist).replace('[','').replace(']','').replace("'","").replace("'","").split(',')
 | 
											
												
													
														|  | 
 |  | +    tag = np.array(tagList).reshape(len(tagList),1).tolist()
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    #将词特征转为list形式
 | 
											
												
													
														|  | 
 |  | +    train_tag_feature =  pd.DataFrame(df_new_feature).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videotags']),1).tolist()
 | 
											
												
													
														|  | 
 |  | +    test_tag_feature = pd.DataFrame(df_new_feature_test).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_test).loc[:,'videotags']),1).tolist()
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    #稀疏特征
 | 
											
												
													
														|  | 
 |  | +    mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
 | 
											
												
													
														|  | 
 |  | +    train_tag = mlb_model_tag.transform(train_tag_feature)
 | 
											
												
													
														|  | 
 |  | +    test_tag = mlb_model_tag.transform(test_tag_feature)
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    return mlb_model_tag.classes_,train_tag,test_tag
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[25]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +#读取tf,idf
 | 
											
												
													
														|  | 
 |  | +def get_tag_tfidf(dt, tfidf_table_name):
 | 
											
												
													
														|  | 
 |  | +    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'videoods',
 | 
											
												
													
														|  | 
 |  | +                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
 | 
											
												
													
														|  | 
 |  | +                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
 | 
											
												
													
														|  | 
 |  | +    tag_dict = {}
 | 
											
												
													
														|  | 
 |  | +    for record in odps.read_table(tfidf_table_name, partition='dt=%s' % dt):
 | 
											
												
													
														|  | 
 |  | +        tag_dict[record[0]] = record[1]
 | 
											
												
													
														|  | 
 |  | +    return tag_dict
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[26]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def ttfidf_list_generation(tag_corpus,tag_dict):
 | 
											
												
													
														|  | 
 |  | +    tag_tfidf_list = []
 | 
											
												
													
														|  | 
 |  | +    for i in tag_corpus:
 | 
											
												
													
														|  | 
 |  | +        try :
 | 
											
												
													
														|  | 
 |  | +            tag_tfidf_list.append(tag_dict[i])
 | 
											
												
													
														|  | 
 |  | +        except:
 | 
											
												
													
														|  | 
 |  | +            tag_tfidf_list.append(0)
 | 
											
												
													
														|  | 
 |  | +    return tag_tfidf_list
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[27]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +#获取tag-one-hot
 | 
											
												
													
														|  | 
 |  | +tags ,train_tag,test_tag = tag_preprocessing('tag')
 | 
											
												
													
														|  | 
 |  | +#获取tag tfidf
 | 
											
												
													
														|  | 
 |  | +tag_dict = get_tag_tfidf('20200305','video_tag_tf_idf')
 | 
											
												
													
														|  | 
 |  | +print('lenth tag_dict:',len(tag_dict))
 | 
											
												
													
														|  | 
 |  | +#获取tfidf_tag 稀疏矩阵
 | 
											
												
													
														|  | 
 |  | +tag_corpus = tags.tolist()  #corpus
 | 
											
												
													
														|  | 
 |  | +tag_tfidf_list = ttfidf_list_generation(tag_corpus,tag_dict )
 | 
											
												
													
														|  | 
 |  | +tag_tf_idf_matrix  = sparse.csr_matrix(np.array(tag_tfidf_list))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)  
 | 
											
												
													
														|  | 
 |  | +tag_feature_test = test_tag.multiply(tag_tf_idf_matrix)  
 | 
											
												
													
														|  | 
 |  | +print('tag tfidf feature generate successfully')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print('tag dimension:', len(tag_tfidf_list))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[28]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +#获取values without tag
 | 
											
												
													
														|  | 
 |  | +words ,train_words,test_words = tag_preprocessing('words_no_tag')
 | 
											
												
													
														|  | 
 |  | +#获取words tfidf
 | 
											
												
													
														|  | 
 |  | +words_dict = get_tag_tfidf('20200305','video_words_without_tags_tfidf')
 | 
											
												
													
														|  | 
 |  | +print('lenth words_dict:',len(words_dict))
 | 
											
												
													
														|  | 
 |  | +#获取tfidf_tag 稀疏矩阵
 | 
											
												
													
														|  | 
 |  | +words_corpus = words.tolist()  #corpus
 | 
											
												
													
														|  | 
 |  | +words_tfidf_list = ttfidf_list_generation(words_corpus,words_dict )
 | 
											
												
													
														|  | 
 |  | +words_tf_idf_matrix  = sparse.csr_matrix(np.array(words_tfidf_list))
 | 
											
												
													
														|  | 
 |  | +words_feature_train = train_words.multiply(words_tf_idf_matrix)  
 | 
											
												
													
														|  | 
 |  | +words_feature_test = test_words.multiply(words_tf_idf_matrix)  
 | 
											
												
													
														|  | 
 |  | +print('tag tfidf feature generate successfully')
 | 
											
												
													
														|  | 
 |  | +print('words dimension:', len(words_tfidf_list))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[32]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +#cancat 特征
 | 
											
												
													
														|  | 
 |  | +from scipy.sparse import hstack
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +#训练特征
 | 
											
												
													
														|  | 
 |  | +df_new_feature = hstack([df_new_feature_part_one,train_videoid,tag_feature_train, words_feature_train])
 | 
											
												
													
														|  | 
 |  | +df_new_feature_test = hstack([df_new_feature_test_part_one,test_videoid,tag_feature_test,words_feature_test])
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +#target
 | 
											
												
													
														|  | 
 |  | +df_target_test = sparse.csr_matrix(pd.DataFrame(df_target_test).values).toarray()
 | 
											
												
													
														|  | 
 |  | +df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[33]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +df_target.size
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[34]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +param = {'num_leaves': 18,
 | 
											
												
													
														|  | 
 |  | +         'min_data_in_leaf': 60,
 | 
											
												
													
														|  | 
 |  | +         'objective': 'regression',
 | 
											
												
													
														|  | 
 |  | +         'max_depth': -1,
 | 
											
												
													
														|  | 
 |  | +         'learning_rate': 0.01,
 | 
											
												
													
														|  | 
 |  | +         "min_child_samples": 30,
 | 
											
												
													
														|  | 
 |  | +         "boosting": "gbdt",
 | 
											
												
													
														|  | 
 |  | +         "feature_fraction": 0.8,
 | 
											
												
													
														|  | 
 |  | +         "bagging_freq": 1,
 | 
											
												
													
														|  | 
 |  | +         "bagging_fraction": 0.8,
 | 
											
												
													
														|  | 
 |  | +         "bagging_seed": 11,
 | 
											
												
													
														|  | 
 |  | +         "metric": 'rmse',
 | 
											
												
													
														|  | 
 |  | +         "lambda_l1": 0.1,
 | 
											
												
													
														|  | 
 |  | +         "verbosity": -1,
 | 
											
												
													
														|  | 
 |  | +         "nthread": 4,
 | 
											
												
													
														|  | 
 |  | +         #          'max_bin': 512,
 | 
											
												
													
														|  | 
 |  | +         "random_state": 4590}
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
 | 
											
												
													
														|  | 
 |  | +oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
 | 
											
												
													
														|  | 
 |  | +predictions = np.zeros(len(df_target_test))
 | 
											
												
													
														|  | 
 |  | +feature_importance_df = pd.DataFrame()
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[ ]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[46]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +values_lenth = len(features + cate_feat)
 | 
											
												
													
														|  | 
 |  | +video_id_lenth = len(mlb_model_videoid.classes_)
 | 
											
												
													
														|  | 
 |  | +tag_length = len(tag_tfidf_list)
 | 
											
												
													
														|  | 
 |  | +word_length = len(words_tfidf_list)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print(values_lenth)
 | 
											
												
													
														|  | 
 |  | +print(video_id_lenth)
 | 
											
												
													
														|  | 
 |  | +print(tag_length)
 | 
											
												
													
														|  | 
 |  | +print(word_length)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[36]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +change_view = pd.DataFrame(pd.DataFrame(df_new_feature_test.toarray()))
 | 
											
												
													
														|  | 
 |  | +change_view = change_view.sort_index()
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[64]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[67]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, data_train['return_back'].values)):
 | 
											
												
													
														|  | 
 |  | +    print("folds {}".format(fold_))
 | 
											
												
													
														|  | 
 |  | +    trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx,:], label=pd.DataFrame(df_target).iloc[trn_idx])
 | 
											
												
													
														|  | 
 |  | +    val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx,:], label=pd.DataFrame(df_target).iloc[val_idx])
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    num_round = 10000
 | 
											
												
													
														|  | 
 |  | +    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
 | 
											
												
													
														|  | 
 |  | +                    early_stopping_rounds=200)
 | 
											
												
													
														|  | 
 |  | +    oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx,:], num_iteration=clf.best_iteration)
 | 
											
												
													
														|  | 
 |  | +    predictions += clf.predict(df_new_feature_test, num_iteration=clf.best_iteration) / folds.n_splits
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    fold_importance_df = pd.DataFrame()
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    column = features+cate_feat+mlb_model_videoid.classes_.tolist()+ tag_corpus + words_corpus
 | 
											
												
													
														|  | 
 |  | +    fold_importance_df["Feature"] = np.array(column)
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +    fold_importance_df["importance"] = clf.feature_importance()
 | 
											
												
													
														|  | 
 |  | +    fold_importance_df["fold"] = fold_ + 1
 | 
											
												
													
														|  | 
 |  | +    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[72]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print(values_lenth)
 | 
											
												
													
														|  | 
 |  | +print(video_id_lenth)
 | 
											
												
													
														|  | 
 |  | +print(tag_length)
 | 
											
												
													
														|  | 
 |  | +print(word_length)
 | 
											
												
													
														|  | 
 |  | +fold1_df = feature_importance_df.loc[feature_importance_df['fold']==1]
 | 
											
												
													
														|  | 
 |  | +fold2_df = feature_importance_df.loc[feature_importance_df['fold']==2]
 | 
											
												
													
														|  | 
 |  | +fold3_df = feature_importance_df.loc[feature_importance_df['fold']==3]
 | 
											
												
													
														|  | 
 |  | +fold4_df = feature_importance_df.loc[feature_importance_df['fold']==4]
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[95]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def featureImportance(fold1_df,fold2_df,fold3_df,fold4_df,values_lenth,video_id_lenth,tag_length,word_length):
 | 
											
												
													
														|  | 
 |  | +    Feature_Data= pd.DataFrame()
 | 
											
												
													
														|  | 
 |  | +    for df in (fold1_df,fold2_df,fold3_df,fold4_df):
 | 
											
												
													
														|  | 
 |  | +        fold1_df1 = df.iloc[0:values_lenth,:]
 | 
											
												
													
														|  | 
 |  | +        videoid_fold1_importance = df.iloc[values_lenth:values_lenth+video_id_lenth,:]['importance'].sum()
 | 
											
												
													
														|  | 
 |  | +        fold1_df2 = pd.DataFrame([{'Feature':'videoid','importance':videoid_fold1_importance,'fold':1}])
 | 
											
												
													
														|  | 
 |  | +        tag_fold1_importance = df.iloc[values_lenth+video_id_lenth:values_lenth+video_id_lenth+tag_length,:]['importance'].sum()
 | 
											
												
													
														|  | 
 |  | +        fold1_df3 = pd.DataFrame([{'Feature':'tags','importance':tag_fold1_importance,'fold':1}])
 | 
											
												
													
														|  | 
 |  | +        words_fold1_importance = df.iloc[values_lenth+video_id_lenth+tag_length:values_lenth+video_id_lenth+tag_length+word_length,:]['importance'].sum()
 | 
											
												
													
														|  | 
 |  | +        fold1_df4 = pd.DataFrame([{'Feature':'words','importance':words_fold1_importance,'fold':1}])
 | 
											
												
													
														|  | 
 |  | +        
 | 
											
												
													
														|  | 
 |  | +        
 | 
											
												
													
														|  | 
 |  | +        Feature_Data = pd.concat([Feature_Data,fold1_df1,fold1_df2,fold1_df3,fold1_df4])
 | 
											
												
													
														|  | 
 |  | +        
 | 
											
												
													
														|  | 
 |  | +    return Feature_Data
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[96]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +feature_importance_df = featureImportance(fold1_df,fold2_df,fold3_df,fold4_df,values_lenth,video_id_lenth,tag_length,word_length)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[98]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
 | 
											
												
													
														|  | 
 |  | +print('oof_mse:', mean_squared_error(df_target, oof))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print('test_rmse:', np.sqrt(mean_squared_error(df_target_test, predictions)))
 | 
											
												
													
														|  | 
 |  | +print('test_mse:', mean_squared_error(df_target_test, predictions))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[99]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def MAPE(true, pred):
 | 
											
												
													
														|  | 
 |  | +    true = np.array(true)
 | 
											
												
													
														|  | 
 |  | +    sum_ = 0
 | 
											
												
													
														|  | 
 |  | +    count = 0
 | 
											
												
													
														|  | 
 |  | +    for i in range(len(true)):
 | 
											
												
													
														|  | 
 |  | +        if true[i] != 0:
 | 
											
												
													
														|  | 
 |  | +            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
 | 
											
												
													
														|  | 
 |  | +            count = count + 1
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +        else:
 | 
											
												
													
														|  | 
 |  | +            continue
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    return sum_ / count
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print('oof_mape:', MAPE(df_target, oof))
 | 
											
												
													
														|  | 
 |  | +print('test_mape:', MAPE(df_target_test, predictions))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[100]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# from sklearn.metrics import r2_score
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print('verification r2:', r2_score(df_target, oof))
 | 
											
												
													
														|  | 
 |  | +print('test r2:', r2_score(df_target_test, predictions))
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +sub_df_ = pd.DataFrame({"videoid": data_test["videoid"].values})
 | 
											
												
													
														|  | 
 |  | +sub_df_['score'] = predictions
 | 
											
												
													
														|  | 
 |  | +print('regre ranking shape', sub_df_.shape)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[101]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +if ten_percent_thresh < 0.4:
 | 
											
												
													
														|  | 
 |  | +    rest_video = recall_video[recall_video['score'] <= 0.35]
 | 
											
												
													
														|  | 
 |  | +else:
 | 
											
												
													
														|  | 
 |  | +    rest_video = recall_video[recall_video['score'] <= ten_percent_thresh]
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# recall_all = pd.concat([rest_video,sub_df_],axis=0).sort_values(by=['score'],ascending=False)
 | 
											
												
													
														|  | 
 |  | +# recall_all.columns = ['videoId', 'score']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +recall_all = sub_df_.sort_values(by=['score'], ascending=False)
 | 
											
												
													
														|  | 
 |  | +recall_all.columns = ['videoId', 'score']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +print('result score shape', recall_all.shape)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[102]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# recall_all.to_json('/root/ROVtrain/readonlinetable/video_score_add_newfeature'+ datetime.datetime.strftime(now_date, '%Y%m%d')[-4:] + '.json',orient='records')
 | 
											
												
													
														|  | 
 |  | +# print('save json success')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +recall_all.to_json(
 | 
											
												
													
														|  | 
 |  | +    '/root/ROVtrain/readonlinetable/result/video_score_' + datetime.datetime.strftime(now_date, '%Y%m%d')[
 | 
											
												
													
														|  | 
 |  | +                                                           -4:] + '.json', orient='records')
 | 
											
												
													
														|  | 
 |  | +print('save json success')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[103]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +sub_df_ = pd.DataFrame({"videoid": data_test["videoid"].values})
 | 
											
												
													
														|  | 
 |  | +sub_df_['score'] = predictions
 | 
											
												
													
														|  | 
 |  | +compare_col_ = data_test[
 | 
											
												
													
														|  | 
 |  | +    ['videoid', 'weighted_retrn_log', 'weighted_retrn', 'todyviewcount', 'day3viewcount', 'day1viewcount',
 | 
											
												
													
														|  | 
 |  | +     'day3returncount', 'day1returncount']]
 | 
											
												
													
														|  | 
 |  | +merge_ = pd.merge(compare_col_, sub_df_, on=['videoid'])
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# In[104]:
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +# merge_.shape
 | 
											
												
													
														|  | 
 |  | +merge_.to_csv('/root/ROVtrain/readonlinetable/video_metric_score/' + now_day[-4:] + '/' + 'video_metric' + '.csv',
 | 
											
												
													
														|  | 
 |  | +              index=False)
 | 
											
												
													
														|  | 
 |  | +feature_importance_df.to_csv(
 | 
											
												
													
														|  | 
 |  | +    '/root/ROVtrain/readonlinetable/video_metric_score/' + now_day[-4:] + '/' + 'feature_importance' + '.csv',
 | 
											
												
													
														|  | 
 |  | +    index=False)
 | 
											
												
													
														|  | 
 |  | +print('end')
 | 
											
												
													
														|  | 
 |  | +
 |