baichongyang, 3 years ago
commit 0d1f3c76a6
3 changed files with 148 additions and 160 deletions:
  1. process_feature.py  +3 -0
  2. process_tag.py      +9 -15
  3. rov_train.py        +136 -145

+ 3 - 0
process_feature.py

@@ -1,3 +1,6 @@
+import time
+import numpy as np
+
 add_feature = [
     'all_return_day1_return_count',  # -- total return count within 1/3/7/14 days  #12
     'all_return_day3_return_count',

+ 9 - 15
process_tag.py

@@ -1,33 +1,30 @@
-def tag_preprocessing(filename):
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import MultiLabelBinarizer
+from odps import ODPS
+
+def tag_preprocessing(filename, df_new_feature, df_new_feature_predict):
     # read the tag word-segmentation results
     tag_txt = open("/root/ROVtrain/tfidfCompution/"+ filename +".txt","r")   # open the file object
     ftextlist = tag_txt.readlines() # read all lines
     tag_txt.close() # close the file
-    
 
     # convert to corpus
     tagList = str(ftextlist).replace('[','').replace(']','').replace("'","").replace("'","").split(',')
     tag = np.array(tagList).reshape(len(tagList),1).tolist()
-    
-    
-
 
     # convert the word features to list form
     train_tag_feature =  pd.DataFrame(df_new_feature).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videotags']),1).tolist()
-    test_tag_feature = pd.DataFrame(df_new_feature_test).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_test).loc[:,'videotags']),1).tolist()
+    predict_tag_feature = pd.DataFrame(df_new_feature_predict).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_predict).loc[:,'videotags']),1).tolist()
 
     # sparse features
     mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
     train_tag = mlb_model_tag.transform(train_tag_feature)
-    test_tag = mlb_model_tag.transform(test_tag_feature)
+    predict_tag = mlb_model_tag.transform(predict_tag_feature)
 
-    return mlb_model_tag.classes_,train_tag,test_tag
+    return mlb_model_tag.classes_,train_tag,predict_tag
 
 
-# In[25]:
-
-
-# read tf,idf
 def get_tag_tfidf(dt, tfidf_table_name):
     odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'videoods',
                 endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
@@ -38,9 +35,6 @@ def get_tag_tfidf(dt, tfidf_table_name):
     return tag_dict
 
 
-# In[26]:
-
-
 def ttfidf_list_generation(tag_corpus,tag_dict):
     tag_tfidf_list = []
     for i in tag_corpus:

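The core trick in tag_preprocessing plus ttfidf_list_generation: binarize each video's tag list into a sparse multi-hot matrix, then scale every tag column by its global tf-idf weight via an elementwise multiply against a 1 x V weight row. A minimal self-contained sketch of that weighting step, with toy tags and made-up weights standing in for the production corpus and ODPS table:

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import MultiLabelBinarizer

    # Hypothetical stand-ins for the .txt tag corpus and the 'videotags' column.
    corpus = [['funny'], ['music'], ['pets']]       # defines the tag vocabulary
    train_tags = [['funny', 'pets'], ['music']]     # tags of two training videos

    mlb = MultiLabelBinarizer(sparse_output=True).fit(corpus)
    train_onehot = mlb.transform(train_tags)        # CSR matrix of 0/1, shape (2, 3)

    # Made-up tf-idf weight per tag, aligned with mlb.classes_
    # (classes_ is sorted: ['funny', 'music', 'pets']).
    tfidf_row = sparse.csr_matrix(np.array([[0.7, 0.2, 0.9]]))

    # multiply() broadcasts the 1 x V row over every video row, turning the
    # multi-hot matrix into tf-idf-weighted tag features, as in rov_train.py.
    weighted = train_onehot.multiply(tfidf_row)
    print(weighted.toarray())   # [[0.7 0.  0.9]
                                #  [0.  0.2 0. ]]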
+ 136 - 145
rov_train.py

@@ -13,6 +13,7 @@ from sklearn.linear_model import SGDClassifier
 import lightgbm as lgb
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import StratifiedKFold
+from sklearn.preprocessing import MultiLabelBinarizer
 from sklearn import metrics
 import pickle
 from sklearn.metrics import mean_squared_error
@@ -22,16 +23,12 @@ from odps import ODPS
 from odps.df import DataFrame as odpsdf
 from datetime import datetime as dt
 import datetime
+from scipy import sparse
+from scipy.sparse import hstack
+
+import process_feature
+import process_tag
 
-now_date = datetime.date.today() 
-# day = datetime.datetime.strftime(now_date, '%Y%m%d')
-diff_1 = datetime.timedelta(days=1)
-diff_5 = datetime.timedelta(days=7)
-input_dt = now_date - diff_1
-input_day = datetime.datetime.strftime(input_dt, '%Y%m%d')
-now_day = datetime.datetime.strftime(now_date, '%Y%m%d')
-train_dt = now_date - diff_5
-train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
 
 def getRovfeaturetable(dt, table):
     odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
@@ -41,7 +38,7 @@ def getRovfeaturetable(dt, table):
     featureArray = []
     for record in odps.read_table(table, partition='dt=%s' % dt):
         valueFeature = {}
-        for i in featurename:
+        for i in process_feature.featurename:
             if i == 'dt':
                 valueFeature[i] = dt
             else:
@@ -61,17 +58,12 @@ def getdatasample(date, max_range, table):
         datelist.append(tar_dt.strftime("%Y%m%d"))
     print(datelist)
     for tm in datelist:
-        testlist.append(getRovtestable(tm, table))
+        testlist.append(getRovfeaturetable(tm, table))
     testdata = pd.concat(testlist)
     testdata.reset_index(inplace=True)
     testdata = testdata.drop(axis=1, columns='index')
     return testdata
 
-
-traindata = getrainingdata(train_day, 30, 'rov_feature_add_v1')
-data_test_ori_rk = getestingdata(input_day, 1, 'rov_predict_table_add_v1')
-
-
 def select_recent_video(df):
     """Add a row number to each video, sort by date, and keep only the most recent day."""
     df['dt'] = df['dt'].astype(int)
@@ -79,15 +71,6 @@ def select_recent_video(df):
     df = df[df['rk'] == 1]
     return df
 
-
-data_test_ori = select_recent_video(data_test_ori_rk)
-data_test_ori.loc[data_test_ori['dt'] != int(input_day), 'futre7dayreturn'] = 0
-data_test_ori = data_test_ori.drop(axis=1, columns='rk')
-
-traindata.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
-data_test_ori.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
-
-
 def basic_cal(df):
     df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
     df['weighted_retrn_log'] = df.apply(lambda x: np.log(x['weighted_retrn'] + 1),axis=1)
@@ -95,111 +78,34 @@ def basic_cal(df):
     df['return_back'] = df.apply(lambda x:1 if x['weighted_retrn']> 0 else 0,axis=1)
     return df
 
-
-data_train = basic_cal(traindata)
-data_test = basic_cal(data_test_ori)
-
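basic_cal regresses on log(weighted_retrn + 1) so the heavy-tailed 7-day return count becomes a better-behaved target, and derives the binary return_back label that later stratifies the CV folds. A vectorized sketch of the same computation on a toy frame; np.log1p replaces the row-wise apply and is safe at 0:

    import numpy as np
    import pandas as pd

    # Toy stand-in for traindata; 'futre7dayreturn' keeps the production column name.
    df = pd.DataFrame({'futre7dayreturn': ['0', '3', '120']})

    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    # vectorized equivalent of df.apply(lambda x: np.log(x['weighted_retrn'] + 1), axis=1)
    df['weighted_retrn_log'] = np.log1p(df['weighted_retrn'])
    # binary label used later by StratifiedKFold
    df['return_back'] = (df['weighted_retrn'] > 0).astype(int)
    print(df)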
 def today_view_category(df):
 ### Bucket the current day's view count into levels (and the next three days' into 3 levels) and add a category feature
-    data_test1_view1 =   df.loc[data_test['day1viewcount_rank'] > 10000]['day1viewcount'].mean()
-    data_test1_view2 =   df.loc[(data_test['day1viewcount_rank'] > 3000)&(data_test['day1viewcount_rank'] <= 10000)]['day1viewcount'].mean()
-    data_test1_view3 =   df.loc[(data_test['day1viewcount_rank'] > 1000)&(data_test['day1viewcount_rank'] <= 3000)]['day1viewcount'].mean()
-    data_test1_view4 =   df.loc[(data_test['day1viewcount_rank'] > 300)&(data_test['day1viewcount_rank'] <= 1000)]['day1viewcount'].mean()
-    data_test1_view5 =   df.loc[(data_test['day1viewcount_rank'] > 100)&(data_test['day1viewcount_rank'] <= 300)]['day1viewcount'].mean()
-    data_test1_view6 =   df.loc[(data_test['day1viewcount_rank'] > 30)&(data_test['day1viewcount_rank'] <= 100)]['day1viewcount'].mean()
-    data_test1_view7 =   df.loc[(data_test['day1viewcount_rank'] > 0)&(data_test['day1viewcount_rank'] <= 30)]['day1viewcount'].mean()
+    data_test1_view1 =   df.loc[df['day1viewcount_rank'] > 10000]['day1viewcount'].mean()
+    data_test1_view2 =   df.loc[(df['day1viewcount_rank'] > 3000)&(df['day1viewcount_rank'] <= 10000)]['day1viewcount'].mean()
+    data_test1_view3 =   df.loc[(df['day1viewcount_rank'] > 1000)&(df['day1viewcount_rank'] <= 3000)]['day1viewcount'].mean()
+    data_test1_view4 =   df.loc[(df['day1viewcount_rank'] > 300)&(df['day1viewcount_rank'] <= 1000)]['day1viewcount'].mean()
+    data_test1_view5 =   df.loc[(df['day1viewcount_rank'] > 100)&(df['day1viewcount_rank'] <= 300)]['day1viewcount'].mean()
+    data_test1_view6 =   df.loc[(df['day1viewcount_rank'] > 30)&(df['day1viewcount_rank'] <= 100)]['day1viewcount'].mean()
+    data_test1_view7 =   df.loc[(df['day1viewcount_rank'] > 0)&(df['day1viewcount_rank'] <= 30)]['day1viewcount'].mean()
 
     df.loc[df['day1viewcount_rank'] > 10000, 'todyviewcount'] = data_test1_view1
-    df.loc[(data_test['day1viewcount_rank'] > 3000)&(data_test['day1viewcount_rank'] <= 10000), 'todyviewcount'] = data_test1_view2
-    df.loc[(data_test['day1viewcount_rank'] > 1000)&(data_test['day1viewcount_rank'] <= 3000), 'todyviewcount'] = data_test1_view3
-    df.loc[(data_test['day1viewcount_rank'] > 300)&(data_test['day1viewcount_rank'] <= 1000), 'todyviewcount'] = data_test1_view4
-    df.loc[(data_test['day1viewcount_rank'] > 100)&(data_test['day1viewcount_rank'] <= 300), 'todyviewcount'] = data_test1_view5
-    df.loc[(data_test['day1viewcount_rank'] > 30)&(data_test['day1viewcount_rank'] <= 100), 'todyviewcount'] = data_test1_view6
-    df.loc[(data_test['day1viewcount_rank'] > 0)&(data_test['day1viewcount_rank'] <= 30), 'todyviewcount'] = data_test1_view7
+    df.loc[(df['day1viewcount_rank'] > 3000)&(df['day1viewcount_rank'] <= 10000), 'todyviewcount'] = data_test1_view2
+    df.loc[(df['day1viewcount_rank'] > 1000)&(df['day1viewcount_rank'] <= 3000), 'todyviewcount'] = data_test1_view3
+    df.loc[(df['day1viewcount_rank'] > 300)&(df['day1viewcount_rank'] <= 1000), 'todyviewcount'] = data_test1_view4
+    df.loc[(df['day1viewcount_rank'] > 100)&(df['day1viewcount_rank'] <= 300), 'todyviewcount'] = data_test1_view5
+    df.loc[(df['day1viewcount_rank'] > 30)&(df['day1viewcount_rank'] <= 100), 'todyviewcount'] = data_test1_view6
+    df.loc[(df['day1viewcount_rank'] > 0)&(df['day1viewcount_rank'] <= 30), 'todyviewcount'] = data_test1_view7
     return df
 
-data_test =  today_view_category(data_test) 
-
-
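today_view_category is a bucketed mean imputation: each video's 'todyviewcount' becomes the mean day1viewcount of its rank bucket (0-30, 30-100, ..., >10000). A compact equivalent on a toy frame, using pd.cut plus a grouped transform instead of seven hand-written masks:

    import numpy as np
    import pandas as pd

    # Toy stand-in for predict_data.
    df = pd.DataFrame({
        'day1viewcount_rank': [5, 50, 200, 4000, 20000],
        'day1viewcount':      [900, 400, 150, 20, 3],
    })

    # Same right-inclusive bucket edges as the seven .loc masks above.
    edges = [0, 30, 100, 300, 1000, 3000, 10000, np.inf]
    bucket = pd.cut(df['day1viewcount_rank'], bins=edges)

    # Every row receives the mean view count of its rank bucket.
    df['todyviewcount'] = df.groupby(bucket)['day1viewcount'].transform('mean')
    print(df)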
 def dataprepare(df_pre):
     #  Feed the features in directly, without adding cross features.
     # zero-fill missing values in the data
     df_pre = df_pre.fillna(0)
-    df_new_feature = df_pre[features]
+    df_new_feature = df_pre[process_feature.features]
     df_target = df_pre['weighted_retrn_log']
-    df_new_feature = pd.concat([df_new_feature, df_pre[cate_feat],df_pre[one_hot_feature]], axis=1)
+    df_new_feature = pd.concat([df_new_feature, df_pre[process_feature.cate_feat],df_pre[process_feature.one_hot_feature]], axis=1)
     return df_new_feature, df_target
 
-data_test['videoid'] = data_test['videoid'].astype('int')
-
-data_train = data_train[data_train['weighted_retrn'] > 0]
-print(data_train.shape, 'train shape')
-data_test = pd.merge(data_test, recall_video_stage_one, on=['videoid'], how='inner')
-print('score>0.5 video_count:', data_test.shape)
-
-df_new_feature,df_target= dataprepare(data_train)
-df_new_feature_test, df_target_test = dataprepare(data_test)
-
-
-# numeric features
-from scipy import sparse
-
-df_new_feature_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature).loc[:,'day1playcount':'videocategory555']))
-df_new_feature_test_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature_test).loc[:,'day1playcount':'videocategory555']))
-
-print('value feature generate successfully')
-
-train_videoid = pd.DataFrame(df_new_feature).loc[:,'videoid']
-test_videoid = pd.DataFrame(df_new_feature_test).loc[:,'videoid']
-
-train_videoid_list = pd.DataFrame(df_new_feature).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videoid']),1).tolist()
-test_videoid_list = pd.DataFrame(df_new_feature_test).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_test).loc[:,'videoid']),1).tolist()
-
-
-allvideo_raw = list(set(np.array(pd.concat([train_videoid,test_videoid])).tolist()))
-allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw),1).tolist()
-from sklearn.preprocessing import MultiLabelBinarizer
-
-mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
-train_videoid = mlb_model_videoid.transform(train_videoid_list)
-test_videoid = mlb_model_videoid.transform(test_videoid_list)
-
-print('videoid feature generate successfully')
-
-# get tag one-hot
-tags ,train_tag,test_tag = tag_preprocessing('tag')
-# get tag tfidf
-tag_dict = get_tag_tfidf('20200305','video_tag_tf_idf')
-print('lenth tag_dict:',len(tag_dict))
-# get the tfidf_tag sparse matrix
-tag_corpus = tags.tolist()  #corpus
-tag_tfidf_list = ttfidf_list_generation(tag_corpus,tag_dict )
-tag_tf_idf_matrix  = sparse.csr_matrix(np.array(tag_tfidf_list))
-
-tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)  
-tag_feature_test = test_tag.multiply(tag_tf_idf_matrix)  
-print('tag tfidf feature generate successfully')
-
-print('tag dimension:', len(tag_tfidf_list))
-
-
-# In[28]:
-
-
-# get word values without tags
-words ,train_words,test_words = tag_preprocessing('words_no_tag')
-# get words tfidf
-words_dict = get_tag_tfidf('20200305','video_words_without_tags_tfidf')
-print('lenth words_dict:',len(words_dict))
-# get the tfidf_words sparse matrix
-words_corpus = words.tolist()  #corpus
-words_tfidf_list = ttfidf_list_generation(words_corpus,words_dict )
-words_tf_idf_matrix  = sparse.csr_matrix(np.array(words_tfidf_list))
-words_feature_train = train_words.multiply(words_tf_idf_matrix)  
-words_feature_test = test_words.multiply(words_tf_idf_matrix)  
-print('tag tfidf feature generate successfully')
-print('words dimension:', len(words_tfidf_list))
-
 
 def featureImportance(fold1_df,fold2_df,fold3_df,fold4_df,values_lenth,video_id_lenth,tag_length,word_length):
     Feature_Data= pd.DataFrame()
@@ -233,15 +139,101 @@ def MAPE(true, pred):
     return sum_ / count
 
 
+def process_train_predict_data():
+    now_date = datetime.date.today() 
+    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
+    diff_1 = datetime.timedelta(days=1)
+    diff_7 = datetime.timedelta(days=7)
+    predict_dt = now_date - diff_1
+    predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
+    train_dt = now_date - diff_7
+    train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
+
+    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
+    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
+    # TODO: save temp data
+    
+    train_data = basic_cal(train_data)
+    predict_data = basic_cal(predict_data)
+
+    predict_data = select_recent_video(predict_data)
+    predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
+    predict_data = predict_data.drop(axis=1, columns='rk')
+
+    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
+    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
 
-def do_train():
-    from scipy.sparse import hstack
+    train_data = train_data.fillna(0)
+    predict_data = predict_data.fillna(0)
+    train_data = process_feature.cal_feature(train_data)
+    predict_data = process_feature.cal_feature(predict_data)
+    predict_data =  today_view_category(predict_data) 
+
+    predict_data['videoid'] = predict_data['videoid'].astype('int')
+
+    df_new_feature,df_target= dataprepare(train_data)
+    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
+
+    df_new_feature_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature).loc[:,'day1playcount':'videocategory555']))
+    df_new_feature_predict_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature_predict).loc[:,'day1playcount':'videocategory555']))
+
+    print('value features generated successfully')
+
+    train_videoid = pd.DataFrame(df_new_feature).loc[:,'videoid']
+    predict_videoid = pd.DataFrame(df_new_feature_predict).loc[:,'videoid']
+
+    train_videoid_list = pd.DataFrame(df_new_feature).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videoid']),1).tolist()
+    predict_videoid_list = pd.DataFrame(df_new_feature_predict).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_predict).loc[:,'videoid']),1).tolist()
+
+
+    allvideo_raw = list(set(np.array(pd.concat([train_videoid,predict_videoid])).tolist()))
+    allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw),1).tolist()
+    
+
+    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
+    train_videoid = mlb_model_videoid.transform(train_videoid_list)
+    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
+
+    print('videoid features generated successfully')
+
+    # get tag one-hot
+    tags, train_tag, predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
+    # get tag tfidf
+    tag_dict = process_tag.get_tag_tfidf('20200305','video_tag_tf_idf')
+    print('length tag_dict:',len(tag_dict))
+    # get the tfidf_tag sparse matrix
+    tag_corpus = tags.tolist()  # corpus
+    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus, tag_dict)
+    tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
+
+    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
+    tag_feature_predict = predict_tag.multiply(tag_tf_idf_matrix)
+    print('tag tfidf features generated successfully')
+    print('tag dimension:', len(tag_tfidf_list))
+
+    # get word values without tags
+    words, train_words, predict_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
+    # get words tfidf
+    words_dict = process_tag.get_tag_tfidf('20200305','video_words_without_tags_tfidf')
+    print('length words_dict:',len(words_dict))
+    # get the tfidf_words sparse matrix
+    words_corpus = words.tolist()  # corpus
+    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus, words_dict)
+    words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
+    words_feature_train = train_words.multiply(words_tf_idf_matrix)
+    words_feature_predict = predict_words.multiply(words_tf_idf_matrix)
+    print('words tfidf features generated successfully')
+    print('words dimension:', len(words_tfidf_list))
 
     df_new_feature = hstack([df_new_feature_part_one,train_videoid,tag_feature_train, words_feature_train])
-    df_new_feature_test = hstack([df_new_feature_test_part_one,test_videoid,tag_feature_test,words_feature_test])
+    df_new_feature_predict = hstack([df_new_feature_predict_part_one,predict_videoid,tag_feature_predict,words_feature_predict])
+
+    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
 
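process_train_predict_data now hands its six outputs straight to do_train. A minimal driver sketch for the intended call sequence (a hypothetical entry point, not part of this commit):

    if __name__ == '__main__':
        (train_data, predict_data,
         df_target, df_target_predict,
         df_new_feature, df_new_feature_predict) = process_train_predict_data()
        do_train(train_data, predict_data,
                 df_target, df_target_predict,
                 df_new_feature, df_new_feature_predict)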
+def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
 
     #target
-    df_target_test = sparse.csr_matrix(pd.DataFrame(df_target_test).values).toarray()
+    df_target_predict = sparse.csr_matrix(pd.DataFrame(df_target_predict).values).toarray()
     df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()
 
 
@@ -265,21 +257,21 @@ def do_train():
 
     folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
     oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
-    predictions = np.zeros(len(df_target_test))
+    predictions = np.zeros(len(df_target_predict))
     feature_importance_df = pd.DataFrame()
 
 
-    values_lenth = len(features + cate_feat)
-    video_id_lenth = len(mlb_model_videoid.classes_)
-    tag_length = len(tag_tfidf_list)
-    word_length = len(words_tfidf_list)
+    # values_lenth = len(process_feature.features + process_feature.cate_feat)
+    # video_id_lenth = len(mlb_model_videoid.classes_)
+    # tag_length = len(tag_tfidf_list)
+    # word_length = len(words_tfidf_list)
 
 
-    change_view = pd.DataFrame(pd.DataFrame(df_new_feature_test.toarray()))
+    change_view = pd.DataFrame(pd.DataFrame(df_new_feature_predict.toarray()))
     change_view = change_view.sort_index()
 
 
-    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, data_train['return_back'].values)):
+    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
         print("folds {}".format(fold_))
         print("folds {}".format(fold_))
         trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx,:], label=pd.DataFrame(df_target).iloc[trn_idx])
         trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx,:], label=pd.DataFrame(df_target).iloc[trn_idx])
         val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx,:], label=pd.DataFrame(df_target).iloc[val_idx])
         val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx,:], label=pd.DataFrame(df_target).iloc[val_idx])
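Note the stratification trick here: the model regresses on the continuous log target, but the folds are split with StratifiedKFold over the binary return_back label, so every fold keeps the same zero/non-zero return mix. A self-contained sketch with toy data:

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    rng = np.random.default_rng(0)
    X = rng.normal(size=(20, 3))         # toy feature matrix
    y_strata = np.tile([0, 1], 10)       # binary label, like return_back

    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
    # split() only sees the binary label; the continuous target is used
    # for fitting inside each fold.
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y_strata)):
        print('fold', fold_, 'positive ratio:', y_strata[trn_idx].mean())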
@@ -288,41 +280,40 @@ def do_train():
         clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
                 early_stopping_rounds=200)
         oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx,:], num_iteration=clf.best_iteration)
-        predictions += clf.predict(df_new_feature_test, num_iteration=clf.best_iteration) / folds.n_splits
+        predictions += clf.predict(df_new_feature_predict, num_iteration=clf.best_iteration) / folds.n_splits
 
         fold_importance_df = pd.DataFrame()
 
-        column = features+cate_feat+mlb_model_videoid.classes_.tolist()+ tag_corpus + words_corpus
-        fold_importance_df["Feature"] = np.array(column)
+        # column = process_feature.features+process_feature.cate_feat+mlb_model_videoid.classes_.tolist()+ tag_corpus + words_corpus
+        # fold_importance_df["Feature"] = np.array(column)
 
-        fold_importance_df["importance"] = clf.feature_importance()
-        fold_importance_df["fold"] = fold_ + 1
-        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
-
+        # fold_importance_df["importance"] = clf.feature_importance()
+        # fold_importance_df["fold"] = fold_ + 1
+        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
 
-    fold1_df = feature_importance_df.loc[feature_importance_df['fold']==1]
-    fold2_df = feature_importance_df.loc[feature_importance_df['fold']==2]
-    fold3_df = feature_importance_df.loc[feature_importance_df['fold']==3]
-    fold4_df = feature_importance_df.loc[feature_importance_df['fold']==4]
 
+    # fold1_df = feature_importance_df.loc[feature_importance_df['fold']==1]
+    # fold2_df = feature_importance_df.loc[feature_importance_df['fold']==2]
+    # fold3_df = feature_importance_df.loc[feature_importance_df['fold']==3]
+    # fold4_df = feature_importance_df.loc[feature_importance_df['fold']==4]
 
 
-    feature_importance_df = featureImportance(fold1_df,fold2_df,fold3_df,fold4_df,values_lenth,video_id_lenth,tag_length,word_length)
+    # feature_importance_df = featureImportance(fold1_df,fold2_df,fold3_df,fold4_df,values_lenth,video_id_lenth,tag_length,word_length)
 
     print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
     print('oof_mse:', mean_squared_error(df_target, oof))
 
-    print('test_rmse:', np.sqrt(mean_squared_error(df_target_test, predictions)))
-    print('test_mse:', mean_squared_error(df_target_test, predictions))
+    print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
+    print('test_mse:', mean_squared_error(df_target_predict, predictions))
 
 
     print('oof_mape:', MAPE(df_target, oof))
-    print('test_mape:', MAPE(df_target_test, predictions))
+    print('test_mape:', MAPE(df_target_predict, predictions))
 
     print('verification r2:', r2_score(df_target, oof))
-    print('test r2:', r2_score(df_target_test, predictions))
+    print('test r2:', r2_score(df_target_predict, predictions))
 
-    sub_df_ = pd.DataFrame({"videoid": data_test["videoid"].values})
+    sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
     sub_df_['score'] = predictions
     print('regre ranking shape', sub_df_.shape)