#!/usr/bin/env python
# coding: utf-8

# In[2]:

import warnings
warnings.filterwarnings("ignore")

import os
import gc
import math
import time
import pickle
import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pylab as plt

from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold

from odps import ODPS
from odps.df import DataFrame as odpsdf


# In[3]:

# Date setup: the test partition is yesterday; the training window is anchored 7 days back.
now_date = datetime.date.today()
# day = datetime.datetime.strftime(now_date, '%Y%m%d')
diff_1 = datetime.timedelta(days=1)
diff_7 = datetime.timedelta(days=7)
input_dt = now_date - diff_1
input_day = datetime.datetime.strftime(input_dt, '%Y%m%d')
now_day = datetime.datetime.strftime(now_date, '%Y%m%d')
train_dt = now_date - diff_7
train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')


# In[4]:

add_feature = [
    # total return within 1/3/7/14 days  #12
    'all_return_day1_return_count', 'all_return_day3_return_count',
    'all_return_day7_return_count', 'all_return_day14_return_count',
    # return from the first three share levels within 1/3/7/14 days  #14
    'three_return_day1_return_count', 'three_return_day3_return_count',
    'three_return_day7_return_count', 'three_return_day14_return_count',
    # return from share level 4+ within 1/3/7/14 days  #15
    'four_up_return_day1_return_count', 'four_up_return_day3_return_count',
    'four_up_return_day7_return_count', 'four_up_return_day14_return_count',
    # return from the first share level within 1/3/7/14 days  #13
    'one_return_day1_return_count', 'one_return_day3_return_count',
    'one_return_day7_return_count', 'one_return_day14_return_count',
    # level-4+ return / first-three-level return within 1/3/7/14 days  #23
    'four_up_return_div_three_return_day1', 'four_up_return_div_three_return_day3',
    'four_up_return_div_three_return_day7', 'four_up_return_div_three_return_day14',
    # return within 1/3/7/14 days from views in the last 1/3/7/14 days  #8
    'all_return_day1_view_day1_return_count', 'all_return_day3_view_day3_return_count',
    'all_return_day7_view_day7_return_count', 'all_return_day14_view_day14_return_count',
    # first-three-level return within 1/3/7/14 days from views in the last 1/3/7/14 days  #10
    'three_return_day1_view_day1_return_count', 'three_return_day3_view_day3_return_count',
    'three_return_day7_view_day7_return_count', 'three_return_day14_view_day14_return_count',
    # level-4+ return within 1/3/7/14 days from views in the last 1/3/7/14 days  #11
    'four_up_return_day1_view_day1_return_count', 'four_up_return_day3_view_day3_return_count',
    'four_up_return_day7_view_day7_return_count', 'four_up_return_day14_view_day14_return_count',
    # first-level return within 1/3/7/14 days from views in the last 1/3/7/14 days  #9
    'one_return_day1_view_day1_return_count', 'one_return_day3_view_day3_return_count',
    'one_return_day7_view_day7_return_count', 'one_return_day14_view_day14_return_count',
    # total return on day1 from views between day1+1/3/7/14 days ago and day1+1 days ago  #16
    'all_return_day1_on_day1_return_count', 'all_return_day3_on_day1_return_count',
    'all_return_day7_on_day1_return_count', 'all_return_day14_on_day1_return_count',
    # level-4+ return / first-three-level return, for views and return in the same 1/3/7/14-day window  #22
    'four_up_return_day1_view_day1_return_div_three_d1', 'four_up_return_day3_view_day3_return_div_three_d3',
    'four_up_return_day7_view_day7_return_div_three_d7', 'four_up_return_day14_view_day14_return_div_three_d14',
    # plays / views within 1/3/7/14/30/60 days  #17
    'day1ctr', 'day3ctr', 'day7ctr', 'day14ctr', 'day30ctr', 'day60ctr',
    # shares / views within 1/3/7/14/30/60 days  #18
    'day1sov', 'day3sov', 'day7sov', 'day14sov', 'day30sov', 'day60sov',
    # return / views within 1/3/7/14 days  #19
    'day1rov', 'day3rov', 'day7rov', 'day14rov',
    # shares / plays within 1/3/7/14/30/60 days  #20
    'day1soc', 'day3soc', 'day7soc', 'day14soc', 'day30soc', 'day60soc',
    # return / plays within 1/3/7/14 days  #21
    'day1roc', 'day3roc', 'day7roc', 'day14roc',
    # today's return from views in the last 1/3/7/14 days / views in the last 1/3/7/14 days  #24
    'oneday_day1rov', 'oneday_day3rov', 'oneday_day7rov', 'oneday_day14rov',
    'futre7dayreturn', 'todyviewcount_rank', 'day1viewcount_rank'
]

featurename = [
    'dt', 'videoid',
    'day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount',
    'day14playcount', 'day14returncount', 'day14sharecount', 'day14viewcount',
    'day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount',
    'day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount',
    'day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount',
    'day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount',
    'videocategory11', 'videocategory12', 'videocategory45', 'videocategory49',
    'videocategory1', 'videocategory2', 'videocategory3', 'videocategory4',
    'videocategory5', 'videocategory6', 'videocategory7', 'videocategory8',
    'videocategory9', 'videocategory85', 'videocategory10', 'videocategory555',
    'usercategory1', 'usercategory2', 'usercategory3', 'usercategory4',
    'usercategory5', 'usercategory6', 'usercategory7', 'usercategory8',
    'usercategory9', 'usercategory10', 'usercategory11', 'usercategory12',
    'usercategory45', 'usercategory49', 'usercategory85', 'usercategory555',
    'todyviewcount',
    'day5returncount_1_stage', 'day5returncount_2_stage',
    'day5returncount_3_stage', 'day5returncount_4_stage',
    'stage_one_retrn', 'stage_two_retrn', 'stage_three_retrn', 'stage_four_retrn'
]

words = ['videotags', 'words_without_tags']
featurename = featurename + add_feature + words
print(len(featurename))
# In[5]:

def getRovfeaturetable(dt):
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)

    featureArray = []
    for record in odps.read_table('rov_feature_add_v1', partition='dt=%s' % dt):
        valueFeature = {}
        for i in featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, 'feature table finish')
    return featureArray


def getRovtestable(dt):
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)

    featureArray = []
    for record in odps.read_table('rov_predict_table_add_v1', partition='dt=%s' % dt):
        valueFeature = {}
        for i in featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, 'test table finish')
    return featureArray


def getestingdata(date):
    # Test set: only the single partition for `date`.
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, 1):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    print(datelist)
    for tm in datelist:
        testlist.append(getRovtestable(tm))
    testdata = pd.concat(testlist)
    testdata.reset_index(inplace=True)
    testdata = testdata.drop(axis=1, columns='index')
    return testdata


def getrainingdata(date):
    # Training set: 30 daily partitions ending at `date`.
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    trainlist = []
    for i in range(0, 30):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    print(datelist)
    for tm in datelist:
        trainlist.append(getRovfeaturetable(tm))
    traindata = pd.concat(trainlist)
    traindata.reset_index(inplace=True)
    traindata = traindata.drop(axis=1, columns='index')
    return traindata


traindata = getrainingdata(train_day)
data_test_ori_rk = getestingdata(input_day)
# In[6]:

def select_recent_video(df):
    """Rank each video's rows by date and keep only the most recent day."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
    df = df[df['rk'] == 1]
    return df


data_test_ori = select_recent_video(data_test_ori_rk)
data_test_ori.loc[data_test_ori['dt'] != int(input_day), 'futre7dayreturn'] = 0
data_test_ori = data_test_ori.drop(axis=1, columns='rk')


# In[7]:

# Deduplicate so that each video has exactly one row per day.
traindata.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
data_test_ori.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
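# Illustrative sketch (not part of the pipeline): how the groupby + rank idiom in
# select_recent_video keeps only the latest partition per video. The toy frame and
# the _demo_* names below are hypothetical.
_demo_recent = pd.DataFrame({'videoid': [1, 1, 2], 'dt': [20200301, 20200302, 20200301]})
_demo_recent['rk'] = _demo_recent['dt'].groupby(_demo_recent['videoid']).rank(ascending=0, method='first')
print(_demo_recent[_demo_recent['rk'] == 1])  # one row per videoid, at its most recent dt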
# In[8]:

def basic_cal(df):
    # df['weighted_retrn'] = df['stage_one_retrn'].astype('int')*0.4 + \
    #     df['stage_two_retrn'].astype('int')*0.3 + \
    #     df['stage_three_retrn'].astype('int')*0.3
    # Regression target: 7-day future return, trained on a log(1 + x) scale.
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    # df['weighted_retrn'] = df['futr5returncount'].astype('int')

    df['weighted_retrn_log'] = df.apply(lambda x: np.log(x['weighted_retrn'] + 1), axis=1)

    # If day1viewcount is zero, rov and rov_log would fall back to zero.
    # df['rov'] = df.apply(lambda x: x['weighted_retrn'] / x['todyviewcount']
    #                      if x['todyviewcount'] != 0 else 0, axis=1)
    # df['rov_log'] = df.apply(lambda x: np.log(x['rov'] + 1), axis=1)
    # thresh = np.percentile(df[df['weighted_retrn'] > 0]['weighted_retrn'], 50)

    # Binary flag: 1 if the video got any return; videos with no share, or with shares
    # but zero return, are labelled 0. Used later to stratify the CV folds.
    df['return_back'] = df.apply(lambda x: 1 if x['weighted_retrn'] > 0 else 0, axis=1)
    return df


data_train = basic_cal(traindata)
data_test = basic_cal(data_test_ori)


# In[9]:

def today_view_category(df):
    # Bucket today's exposure by day1viewcount_rank and replace todyviewcount with
    # the mean day1viewcount of the bucket (a coarse categorical exposure level).
    data_test1_view1 = df.loc[df['day1viewcount_rank'] > 10000]['day1viewcount'].mean()
    data_test1_view2 = df.loc[(df['day1viewcount_rank'] > 3000) & (df['day1viewcount_rank'] <= 10000)]['day1viewcount'].mean()
    data_test1_view3 = df.loc[(df['day1viewcount_rank'] > 1000) & (df['day1viewcount_rank'] <= 3000)]['day1viewcount'].mean()
    data_test1_view4 = df.loc[(df['day1viewcount_rank'] > 300) & (df['day1viewcount_rank'] <= 1000)]['day1viewcount'].mean()
    data_test1_view5 = df.loc[(df['day1viewcount_rank'] > 100) & (df['day1viewcount_rank'] <= 300)]['day1viewcount'].mean()
    data_test1_view6 = df.loc[(df['day1viewcount_rank'] > 30) & (df['day1viewcount_rank'] <= 100)]['day1viewcount'].mean()
    data_test1_view7 = df.loc[(df['day1viewcount_rank'] > 0) & (df['day1viewcount_rank'] <= 30)]['day1viewcount'].mean()

    # Note: the two coarsest buckets both reuse data_test1_view3 and the two finest
    # both reuse data_test1_view7, so data_test1_view1/view2 are computed but unused.
    df.loc[df['day1viewcount_rank'] > 10000, 'todyviewcount'] = data_test1_view3
    df.loc[(df['day1viewcount_rank'] > 3000) & (df['day1viewcount_rank'] <= 10000), 'todyviewcount'] = data_test1_view3
    df.loc[(df['day1viewcount_rank'] > 1000) & (df['day1viewcount_rank'] <= 3000), 'todyviewcount'] = data_test1_view4
    df.loc[(df['day1viewcount_rank'] > 300) & (df['day1viewcount_rank'] <= 1000), 'todyviewcount'] = data_test1_view5
    df.loc[(df['day1viewcount_rank'] > 100) & (df['day1viewcount_rank'] <= 300), 'todyviewcount'] = data_test1_view6
    df.loc[(df['day1viewcount_rank'] > 30) & (df['day1viewcount_rank'] <= 100), 'todyviewcount'] = data_test1_view7
    df.loc[(df['day1viewcount_rank'] > 0) & (df['day1viewcount_rank'] <= 30), 'todyviewcount'] = data_test1_view7
    return df


data_test = today_view_category(data_test)


# In[10]:

# Home-page (root) features grouped by window length.
root_page_1day = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount']
root_page_3day = ['day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount']
root_page_7day = ['day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount']
root_page_14day = ['day14playcount', 'day14returncount', 'day14sharecount', 'day14viewcount']
root_page_30day = ['day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount']
root_page_60day = ['day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount']

return_5day = ['day5returncount_1_stage', 'day5returncount_2_stage',
               'day5returncount_3_stage', 'day5returncount_4_stage']

cate_feat = ['videocategory1', 'videocategory10', 'videocategory11', 'videocategory12',
             'videocategory2', 'videocategory3', 'videocategory4', 'videocategory45',
             'videocategory49', 'videocategory5', 'videocategory6', 'videocategory7',
             'videocategory8', 'videocategory85', 'videocategory9', 'videocategory555']

one_hot_feature = ['videotags', 'words_without_tags', 'videoid']

# cate_view_feat = ['todyview_low', 'todyview_median', 'todyview_high']
# cate_view_feat = ['todyview_1', 'todyview_2', 'todyview_3', 'todyview_4',
#                   'todyview_5', 'todyview_6', 'todyview_7', 'todyview_8']
# In[11]:

def cal_feature(df):
    # Ratio and difference features between adjacent time windows:
    # 60d vs 30d, 30d vs 7d, 7d vs 3d, 3d vs 1d.
    start = time.time()
    for i in range(len(root_page_1day)):
        newfeat_div = root_page_60day[i] + '_divide_' + root_page_30day[i]
        # df[newfeat_div] = df.apply(lambda s: s[root_page_30day[i]] / s[root_page_60day[i]]
        #                            if s[root_page_60day[i]] != 0 else 0, axis=1)
        df[newfeat_div] = df[root_page_30day[i]] / df[root_page_60day[i]]
        newfeat_diff = root_page_60day[i] + '_dif_' + root_page_30day[i]
        # df[newfeat_diff] = df.apply(lambda s: s[root_page_60day[i]] - s[root_page_30day[i]], axis=1)
        df[newfeat_diff] = df[root_page_60day[i]] - df[root_page_30day[i]]
    end = time.time()
    running_time = end - start
    print('stage 1: time cost : %.5f sec' % running_time)

    start = time.time()
    for i in range(len(root_page_1day)):
        newfeat_div = root_page_30day[i] + '_divide_' + root_page_7day[i]
        # df[newfeat_div] = df.apply(lambda s: s[root_page_7day[i]] / s[root_page_30day[i]]
        #                            if s[root_page_30day[i]] != 0 else 0, axis=1)
        df[newfeat_div] = df[root_page_7day[i]] / df[root_page_30day[i]]
        newfeat_diff = root_page_30day[i] + '_dif_' + root_page_7day[i]
        # day30 - day7, matching the feature name
        df[newfeat_diff] = df[root_page_30day[i]] - df[root_page_7day[i]]
    end = time.time()
    running_time = end - start
    print('stage 2: time cost : %.5f sec' % running_time)

    start = time.time()
    for i in range(len(root_page_1day)):
        newfeat_div = root_page_7day[i] + '_divide_' + root_page_3day[i]
        # df[newfeat_div] = df.apply(lambda s: s[root_page_3day[i]] / s[root_page_7day[i]]
        #                            if s[root_page_7day[i]] != 0 else 0, axis=1)
        df[newfeat_div] = df[root_page_3day[i]] / df[root_page_7day[i]]
        newfeat_diff = root_page_7day[i] + '_dif_' + root_page_3day[i]
        # df[newfeat_diff] = df.apply(lambda s: s[root_page_7day[i]] - s[root_page_3day[i]], axis=1)
        df[newfeat_diff] = df[root_page_7day[i]] - df[root_page_3day[i]]
    end = time.time()
    running_time = end - start
    print('stage 3: time cost : %.5f sec' % running_time)

    start = time.time()
    for i in range(len(root_page_1day)):
        newfeat_div = root_page_3day[i] + '_divide_' + root_page_1day[i]
        # df[newfeat_div] = df.apply(lambda s: s[root_page_1day[i]] / s[root_page_3day[i]]
        #                            if s[root_page_3day[i]] != 0 else 0, axis=1)
        df[newfeat_div] = df[root_page_1day[i]] / df[root_page_3day[i]]
        newfeat_diff = root_page_3day[i] + '_dif_' + root_page_1day[i]
        # df[newfeat_diff] = df.apply(lambda s: s[root_page_3day[i]] - s[root_page_1day[i]], axis=1)
        df[newfeat_diff] = df[root_page_3day[i]] - df[root_page_1day[i]]
    end = time.time()
    running_time = end - start
    print('stage 4: time cost : %.5f sec' % running_time)

    # Division by zero produces inf; map inf/NaN to 0.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    return df


# In[12]:

data_train = data_train.fillna(0)
data_test = data_test.fillna(0)

data_train = cal_feature(data_train)
data_test = cal_feature(data_test)


# In[13]:

print('data_train shape:', data_train.shape)
print('data_test shape:', data_test.shape)


# In[14]:

features = [
    'day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount',
    'day30playcount', 'day30returncount', 'day30sharecount', 'day30viewcount',
    'day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount',
    'day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount',
    'day7playcount', 'day7returncount', 'day7sharecount', 'day7viewcount',
    'usercategory1', 'usercategory2', 'usercategory3', 'usercategory4',
    'usercategory5', 'usercategory6', 'usercategory7', 'usercategory8',
    'usercategory9', 'usercategory10', 'usercategory11', 'usercategory12',
    'usercategory45', 'usercategory49', 'usercategory85', 'usercategory555',
    'todyviewcount',
    'day5returncount_1_stage', 'day5returncount_2_stage',
    'day5returncount_3_stage', 'day5returncount_4_stage',
    'stage_one_retrn', 'stage_two_retrn', 'stage_three_retrn', 'stage_four_retrn',
    'all_return_day1_return_count', 'all_return_day3_return_count',
    'all_return_day7_return_count', 'all_return_day14_return_count',
    'three_return_day1_return_count', 'three_return_day3_return_count',
    'three_return_day7_return_count', 'three_return_day14_return_count',
    'four_up_return_day1_return_count', 'four_up_return_day3_return_count',
    'four_up_return_day7_return_count', 'four_up_return_day14_return_count',
    'one_return_day1_return_count', 'one_return_day3_return_count',
    'one_return_day7_return_count', 'one_return_day14_return_count',
    'four_up_return_div_three_return_day1', 'four_up_return_div_three_return_day3',
    'four_up_return_div_three_return_day7', 'four_up_return_div_three_return_day14',
    'all_return_day1_view_day1_return_count', 'all_return_day3_view_day3_return_count',
    'all_return_day7_view_day7_return_count', 'all_return_day14_view_day14_return_count',
    'three_return_day1_view_day1_return_count', 'three_return_day3_view_day3_return_count',
    'three_return_day7_view_day7_return_count', 'three_return_day14_view_day14_return_count',
    'four_up_return_day1_view_day1_return_count', 'four_up_return_day3_view_day3_return_count',
    'four_up_return_day7_view_day7_return_count', 'four_up_return_day14_view_day14_return_count',
    'one_return_day1_view_day1_return_count', 'one_return_day3_view_day3_return_count',
    'one_return_day7_view_day7_return_count', 'one_return_day14_view_day14_return_count',
    'all_return_day1_on_day1_return_count', 'all_return_day3_on_day1_return_count',
    'all_return_day7_on_day1_return_count', 'all_return_day14_on_day1_return_count',
    'four_up_return_day1_view_day1_return_div_three_d1', 'four_up_return_day3_view_day3_return_div_three_d3',
    'four_up_return_day7_view_day7_return_div_three_d7', 'four_up_return_day14_view_day14_return_div_three_d14',
    'day1ctr', 'day3ctr', 'day7ctr', 'day14ctr', 'day30ctr', 'day60ctr',
    'day1sov', 'day3sov', 'day7sov', 'day14sov', 'day30sov', 'day60sov',
    'day1rov', 'day3rov', 'day7rov', 'day14rov',
    'day1soc', 'day3soc', 'day7soc', 'day14soc', 'day30soc', 'day60soc',
    'day1roc', 'day3roc', 'day7roc', 'day14roc',
    'oneday_day1rov', 'oneday_day3rov', 'oneday_day7rov', 'oneday_day14rov',
    'day60playcount_divide_day30playcount', 'day60playcount_dif_day30playcount',
    'day60returncount_divide_day30returncount', 'day60returncount_dif_day30returncount',
    'day60sharecount_divide_day30sharecount', 'day60sharecount_dif_day30sharecount',
    'day60viewcount_divide_day30viewcount', 'day60viewcount_dif_day30viewcount',
    'day30playcount_divide_day7playcount', 'day30playcount_dif_day7playcount',
    'day30returncount_divide_day7returncount', 'day30returncount_dif_day7returncount',
    'day30sharecount_divide_day7sharecount', 'day30sharecount_dif_day7sharecount',
    'day30viewcount_divide_day7viewcount', 'day30viewcount_dif_day7viewcount',
    'day7playcount_divide_day3playcount', 'day7playcount_dif_day3playcount',
    'day7returncount_divide_day3returncount', 'day7returncount_dif_day3returncount',
    'day7sharecount_divide_day3sharecount', 'day7sharecount_dif_day3sharecount',
    'day7viewcount_divide_day3viewcount', 'day7viewcount_dif_day3viewcount',
    'day3playcount_divide_day1playcount', 'day3playcount_dif_day1playcount',
    'day3returncount_divide_day1returncount', 'day3returncount_dif_day1returncount',
    'day3sharecount_divide_day1sharecount', 'day3sharecount_dif_day1sharecount',
    'day3viewcount_divide_day1viewcount', 'day3viewcount_dif_day1viewcount'
]
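# Illustrative sketch (not part of the pipeline): the adjacent-window ratio and
# difference features built by cal_feature above, shown on a hypothetical frame.
# Division by zero yields inf, which the pipeline maps to 0.
_demo_win = pd.DataFrame({'day3viewcount': [30.0, 5.0], 'day7viewcount': [70.0, 0.0]})
_demo_win['day7viewcount_divide_day3viewcount'] = _demo_win['day3viewcount'] / _demo_win['day7viewcount']
_demo_win['day7viewcount_dif_day3viewcount'] = _demo_win['day7viewcount'] - _demo_win['day3viewcount']
print(_demo_win.replace([np.inf, -np.inf], np.nan).fillna(0))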
# In[15]:

def dataprepare(df_pre):
    # Feed the raw features in directly (no crossed features); fill missing values with zero.
    df_pre = df_pre.fillna(0)
    df_new_feature = df_pre[features]
    # df_onehot_feature = df_pre[one_hot_feature]
    # df_new_feature = pd.concat([df_pre.loc[:, 'all_return_day14_on_day1_return_count':'day7viewcount'],
    #                             df_pre.loc[:, 'four_up_return_day14_return_count':'four_up_return_div_three_return_day7'],
    #                             df_pre.loc[:, 'one_return_day14_return_count':'oneday_day7rov'],
    #                             df_pre.loc[:, 'three_return_day14_return_count':'three_return_day7_view_day7_return_count'],
    #                             df_pre.loc[:, 'usercategory1':'usercategory9'],
    #                             df_pre.loc[:, 'day60playcount_divide_day30playcount':'day3viewcount_dif_day1viewcount']],
    #                            axis=1)
    # df_new_feature = pd.concat([df_pre.loc[:, 'day1playcount':'day7viewcount'],
    #                             df_pre.loc[:, 'day60playcount_divide_day30playcount':'day5returncount_4_stage_dif_day5returncount_3_stage'],
    #                             df_pre.loc[:, 'usercategory1':'usercategory9']], axis=1)

    df_target = df_pre['weighted_retrn_log']
    # Column order matters downstream: numeric features, then video categories, then
    # the raw text/id columns ('videotags', 'words_without_tags', 'videoid').
    df_new_feature = pd.concat([df_new_feature, df_pre[cate_feat], df_pre[one_hot_feature]], axis=1)
    return df_new_feature, df_target


# In[16]:

recall_video = pd.read_csv('/root/ROVtrain/readonlinetable/result/recall_' + input_day[-4:] + '.csv')


# In[17]:

# First-stage recall filter: keep videos whose recall score clears 0.4
# (or the minimum recall score, whichever is higher).
ten_percent_thresh = recall_video['score'].min()
if ten_percent_thresh < 0.4:
    recall_video_stage_one = recall_video[recall_video['score'] > 0.4]
else:
    recall_video_stage_one = recall_video[recall_video['score'] > ten_percent_thresh]


# In[18]:

data_test['videoid'] = data_test['videoid'].astype('int')

# Train only on videos that actually got some return.
data_train = data_train[data_train['weighted_retrn'] > 0]
print(data_train.shape, 'train shape')

data_test = pd.merge(data_test, recall_video_stage_one, on=['videoid'], how='inner')
print('recalled video count:', data_test.shape)


# In[19]:

df_new_feature, df_target = dataprepare(data_train)
df_new_feature_test, df_target_test = dataprepare(data_test)


# In[20]:

# Numeric block: everything from 'day1playcount' through 'videocategory555' as a sparse matrix.
from scipy import sparse

df_new_feature_part_one = sparse.csr_matrix(
    np.array(pd.DataFrame(df_new_feature).loc[:, 'day1playcount':'videocategory555']))
df_new_feature_test_part_one = sparse.csr_matrix(
    np.array(pd.DataFrame(df_new_feature_test).loc[:, 'day1playcount':'videocategory555']))

print('value feature generate successfully')
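# Illustrative sketch (not part of the pipeline): .loc with a label slice picks the
# contiguous block of columns in their current order, which is why dataprepare
# assembles [features, cate_feat, one_hot_feature] before slicing
# 'day1playcount':'videocategory555'. The toy frame below is hypothetical.
_demo_cols = pd.DataFrame([[1, 2, 3, 4]], columns=['a', 'b', 'c', 'd'])
print(_demo_cols.loc[:, 'b':'d'])  # columns b, c and d, inclusive of both endpoints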
# In[21]:

# videoid: one-hot encode train/test video ids over the union of both sets.
train_videoid = pd.DataFrame(df_new_feature).loc[:, 'videoid']
test_videoid = pd.DataFrame(df_new_feature_test).loc[:, 'videoid']

train_videoid_list = pd.DataFrame(df_new_feature).loc[:, 'videoid'].to_numpy().reshape(
    len(pd.DataFrame(df_new_feature).loc[:, 'videoid']), 1).tolist()
test_videoid_list = pd.DataFrame(df_new_feature_test).loc[:, 'videoid'].to_numpy().reshape(
    len(pd.DataFrame(df_new_feature_test).loc[:, 'videoid']), 1).tolist()

allvideo_raw = list(set(np.array(pd.concat([train_videoid, test_videoid])).tolist()))
allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw), 1).tolist()

from sklearn.preprocessing import MultiLabelBinarizer

mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
train_videoid = mlb_model_videoid.transform(train_videoid_list)
test_videoid = mlb_model_videoid.transform(test_videoid_list)

print('videoid feature generate successfully')


# In[23]:

len(mlb_model_videoid.classes_)


# In[24]:

def tag_preprocessing(filename, colname='videotags'):
    # Read the segmented vocabulary file and one-hot encode the given text column
    # of the train/test feature frames.
    tag_txt = open("/root/ROVtrain/tfidfCompution/" + filename + ".txt", "r")
    ftextlist = tag_txt.readlines()
    tag_txt.close()

    # Build the corpus: one token per entry.
    tagList = str(ftextlist).replace('[', '').replace(']', '').replace("'", "").split(',')
    tag = np.array(tagList).reshape(len(tagList), 1).tolist()

    # Convert the text column to list form.
    train_tag_feature = pd.DataFrame(df_new_feature).loc[:, colname].to_numpy().reshape(
        len(pd.DataFrame(df_new_feature).loc[:, colname]), 1).tolist()
    test_tag_feature = pd.DataFrame(df_new_feature_test).loc[:, colname].to_numpy().reshape(
        len(pd.DataFrame(df_new_feature_test).loc[:, colname]), 1).tolist()

    # Sparse one-hot features.
    mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
    train_tag = mlb_model_tag.transform(train_tag_feature)
    test_tag = mlb_model_tag.transform(test_tag_feature)

    return mlb_model_tag.classes_, train_tag, test_tag


# In[25]:

def get_tag_tfidf(dt, tfidf_table_name):
    # Read the tf-idf weight of every token from ODPS.
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'videoods',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)

    tag_dict = {}
    for record in odps.read_table(tfidf_table_name, partition='dt=%s' % dt):
        tag_dict[record[0]] = record[1]
    return tag_dict


# In[26]:

def ttfidf_list_generation(tag_corpus, tag_dict):
    # Look up the tf-idf weight of every token in the corpus; unknown tokens get 0.
    tag_tfidf_list = []
    for i in tag_corpus:
        try:
            tag_tfidf_list.append(tag_dict[i])
        except KeyError:
            tag_tfidf_list.append(0)
    return tag_tfidf_list


# In[27]:

# Tag one-hot features.
tags, train_tag, test_tag = tag_preprocessing('tag')

# Tag tf-idf weights.
tag_dict = get_tag_tfidf('20200305', 'video_tag_tf_idf')
print('length tag_dict:', len(tag_dict))

# Weight the sparse one-hot tag matrix column-wise by tf-idf.
tag_corpus = tags.tolist()
tag_tfidf_list = ttfidf_list_generation(tag_corpus, tag_dict)
tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
tag_feature_test = test_tag.multiply(tag_tf_idf_matrix)

print('tag tfidf feature generate successfully')
print('tag dimension:', len(tag_tfidf_list))


# In[28]:

# Word (non-tag) one-hot features, encoded from the words_without_tags column
# against the words_no_tag vocabulary.
words, train_words, test_words = tag_preprocessing('words_no_tag', 'words_without_tags')

# Word tf-idf weights.
words_dict = get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
print('length words_dict:', len(words_dict))

# Weight the sparse one-hot word matrix column-wise by tf-idf.
words_corpus = words.tolist()
words_tfidf_list = ttfidf_list_generation(words_corpus, words_dict)
words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
words_feature_train = train_words.multiply(words_tf_idf_matrix)
words_feature_test = test_words.multiply(words_tf_idf_matrix)

print('words tfidf feature generate successfully')
print('words dimension:', len(words_tfidf_list))
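# Illustrative sketch (not part of the pipeline): the tag/word features above are a
# sparse one-hot matrix weighted column-wise by tf-idf; .multiply() broadcasts the
# 1 x V weight row across the N x V indicator matrix. All values and _demo_* names
# below are hypothetical.
_demo_mlb = MultiLabelBinarizer(sparse_output=True).fit([['bird'], ['cat'], ['dog']])
_demo_onehot = _demo_mlb.transform([['cat', 'dog'], ['bird']])    # 2 x 3 indicator matrix
_demo_weights = sparse.csr_matrix(np.array([[0.2, 0.5, 0.9]]))    # tf-idf per class, in classes_ order
print(_demo_onehot.multiply(_demo_weights).toarray())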
# In[32]:

# Concatenate all feature blocks: numeric block, videoid one-hot, tag tf-idf, word tf-idf.
from scipy.sparse import hstack

# Training and test features.
df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
df_new_feature_test = hstack([df_new_feature_test_part_one, test_videoid, tag_feature_test, words_feature_test])

# Targets as dense arrays.
df_target_test = sparse.csr_matrix(pd.DataFrame(df_target_test).values).toarray()
df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()


# In[33]:

df_target.size


# In[34]:

param = {'num_leaves': 18,
         'min_data_in_leaf': 60,
         'objective': 'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         'min_child_samples': 30,
         'boosting': 'gbdt',
         'feature_fraction': 0.8,
         'bagging_freq': 1,
         'bagging_fraction': 0.8,
         'bagging_seed': 11,
         'metric': 'rmse',
         'lambda_l1': 0.1,
         'verbosity': -1,
         'nthread': 4,
         # 'max_bin': 512,
         'random_state': 4590}

folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
predictions = np.zeros(len(df_target_test))
feature_importance_df = pd.DataFrame()


# In[46]:

values_lenth = len(features + cate_feat)
video_id_lenth = len(mlb_model_videoid.classes_)
tag_length = len(tag_tfidf_list)
word_length = len(words_tfidf_list)

print(values_lenth)
print(video_id_lenth)
print(tag_length)
print(word_length)


# In[36]:

change_view = pd.DataFrame(pd.DataFrame(df_new_feature_test.toarray()))
change_view = change_view.sort_index()


# In[67]:

# 4-fold CV on the log-return regression target, stratified by the binary return_back flag.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, data_train['return_back'].values)):
    print("folds {}".format(fold_))
    trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
    val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds=200)

    oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
    predictions += clf.predict(df_new_feature_test, num_iteration=clf.best_iteration) / folds.n_splits

    fold_importance_df = pd.DataFrame()
    column = features + cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
    fold_importance_df["Feature"] = np.array(column)
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)


# In[72]:

print(values_lenth)
print(video_id_lenth)
print(tag_length)
print(word_length)

fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]


# In[95]:

def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length):
    # Collapse the per-column importances of the videoid / tag / word one-hot blocks
    # into a single row each, keeping the numeric + category features as-is.
    Feature_Data = pd.DataFrame()
    for df in (fold1_df, fold2_df, fold3_df, fold4_df):
        fold1_df1 = df.iloc[0:values_lenth, :]

        videoid_fold1_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
        fold1_df2 = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_fold1_importance, 'fold': 1}])

        tag_fold1_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
        fold1_df3 = pd.DataFrame([{'Feature': 'tags', 'importance': tag_fold1_importance, 'fold': 1}])

        words_fold1_importance = df.iloc[values_lenth + video_id_lenth + tag_length:values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
        fold1_df4 = pd.DataFrame([{'Feature': 'words', 'importance': words_fold1_importance, 'fold': 1}])

        Feature_Data = pd.concat([Feature_Data, fold1_df1, fold1_df2, fold1_df3, fold1_df4])

    return Feature_Data
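# Illustrative sketch (not part of the pipeline): the CV loop above passes the binary
# return_back flag to StratifiedKFold.split so each fold keeps the same 0/1 mix while
# the model is trained on the continuous log-return target. Toy arrays and _demo_*
# names below are hypothetical.
_demo_X = np.arange(16).reshape(8, 2)
_demo_flag = np.array([0, 0, 0, 0, 1, 1, 1, 1])
_demo_folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
for _demo_trn_idx, _demo_val_idx in _demo_folds.split(_demo_X, _demo_flag):
    print(_demo_flag[_demo_val_idx])  # every validation fold holds one 0 and one 1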
# In[96]:

feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
                                          values_lenth, video_id_lenth, tag_length, word_length)


# In[98]:

print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
print('oof_mse:', mean_squared_error(df_target, oof))
print('test_rmse:', np.sqrt(mean_squared_error(df_target_test, predictions)))
print('test_mse:', mean_squared_error(df_target_test, predictions))


# In[99]:

def MAPE(true, pred):
    # Mean absolute percentage error, computed only over non-zero true values.
    true = np.array(true)
    sum_ = 0
    count = 0
    for i in range(len(true)):
        if true[i] != 0:
            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
            count = count + 1
        else:
            continue
    return sum_ / count


print('oof_mape:', MAPE(df_target, oof))
print('test_mape:', MAPE(df_target_test, predictions))


# In[100]:

# from sklearn.metrics import r2_score
print('verification r2:', r2_score(df_target, oof))
print('test r2:', r2_score(df_target_test, predictions))

sub_df_ = pd.DataFrame({"videoid": data_test["videoid"].values})
sub_df_['score'] = predictions
print('regre ranking shape', sub_df_.shape)


# In[101]:

if ten_percent_thresh < 0.4:
    rest_video = recall_video[recall_video['score'] <= 0.35]
else:
    rest_video = recall_video[recall_video['score'] <= ten_percent_thresh]

# recall_all = pd.concat([rest_video, sub_df_], axis=0).sort_values(by=['score'], ascending=False)
# recall_all.columns = ['videoId', 'score']
recall_all = sub_df_.sort_values(by=['score'], ascending=False)
recall_all.columns = ['videoId', 'score']
print('result score shape', recall_all.shape)


# In[102]:

# recall_all.to_json('/root/ROVtrain/readonlinetable/video_score_add_newfeature'
#                    + datetime.datetime.strftime(now_date, '%Y%m%d')[-4:] + '.json', orient='records')
# print('save json success')
recall_all.to_json('/root/ROVtrain/readonlinetable/result/video_score_'
                   + datetime.datetime.strftime(now_date, '%Y%m%d')[-4:] + '.json', orient='records')
print('save json success')


# In[103]:

sub_df_ = pd.DataFrame({"videoid": data_test["videoid"].values})
sub_df_['score'] = predictions

compare_col_ = data_test[['videoid', 'weighted_retrn_log', 'weighted_retrn', 'todyviewcount',
                          'day3viewcount', 'day1viewcount', 'day3returncount', 'day1returncount']]
merge_ = pd.merge(compare_col_, sub_df_, on=['videoid'])


# In[104]:

# merge_.shape
merge_.to_csv('/root/ROVtrain/readonlinetable/video_metric_score/' + now_day[-4:] + '/' + 'video_metric' + '.csv',
              index=False)
feature_importance_df.to_csv('/root/ROVtrain/readonlinetable/video_metric_score/' + now_day[-4:] + '/' + 'feature_importance' + '.csv',
                             index=False)
print('end')