baichongyang 3 лет назад
Родитель
Commit
8a5d6390c6
2 измененных файлов с 63 добавлено и 9 удалено
  1. 49 0
      process_feature.py
  2. 14 9
      rov_train.py

+ 49 - 0
process_feature.py

@@ -191,6 +191,55 @@ cate_feat = ['videocategory1', 'videocategory10', 'videocategory11', 'videocateg
 one_hot_feature = ['videotags','words_without_tags','videoid']
 #
 
+features = ['day1playcount', 'day1returncount', 'day1sharecount', 'day1viewcount', 'day30playcount', 'day30returncount', 
+            'day30sharecount', 'day30viewcount', 'day3playcount', 'day3returncount', 'day3sharecount', 'day3viewcount',
+            'day60playcount', 'day60returncount', 'day60sharecount', 'day60viewcount', 'day7playcount', 'day7returncount', 
+            'day7sharecount', 'day7viewcount', 'usercategory1', 'usercategory2', 'usercategory3', 'usercategory4',
+            'usercategory5', 'usercategory6', 'usercategory7', 'usercategory8', 'usercategory9', 'usercategory10',
+            'usercategory11', 'usercategory12', 'usercategory45', 'usercategory49', 'usercategory85','usercategory555', 
+            'todyviewcount', 
+            'day5returncount_1_stage', 'day5returncount_2_stage', 'day5returncount_3_stage', 'day5returncount_4_stage', 
+            'stage_one_retrn', 'stage_two_retrn', 'stage_three_retrn', 'stage_four_retrn', 'all_return_day1_return_count', 
+            'all_return_day3_return_count', 'all_return_day7_return_count', 'all_return_day14_return_count',
+            'three_return_day1_return_count', 'three_return_day3_return_count', 'three_return_day7_return_count', 
+            'three_return_day14_return_count', 'four_up_return_day1_return_count', 'four_up_return_day3_return_count', 
+            'four_up_return_day7_return_count', 'four_up_return_day14_return_count', 'one_return_day1_return_count', 
+            'one_return_day3_return_count', 'one_return_day7_return_count', 'one_return_day14_return_count', 
+            'four_up_return_div_three_return_day1', 'four_up_return_div_three_return_day3', 
+            'four_up_return_div_three_return_day7', 'four_up_return_div_three_return_day14', 
+            'all_return_day1_view_day1_return_count', 'all_return_day3_view_day3_return_count', 
+            'all_return_day7_view_day7_return_count', 'all_return_day14_view_day14_return_count',
+            'three_return_day1_view_day1_return_count', 'three_return_day3_view_day3_return_count', 
+            'three_return_day7_view_day7_return_count', 'three_return_day14_view_day14_return_count', 
+            'four_up_return_day1_view_day1_return_count', 'four_up_return_day3_view_day3_return_count', 
+            'four_up_return_day7_view_day7_return_count', 'four_up_return_day14_view_day14_return_count', 
+            'one_return_day1_view_day1_return_count', 'one_return_day3_view_day3_return_count', 
+            'one_return_day7_view_day7_return_count', 'one_return_day14_view_day14_return_count', 
+            'all_return_day1_on_day1_return_count', 'all_return_day3_on_day1_return_count', 
+            'all_return_day7_on_day1_return_count', 'all_return_day14_on_day1_return_count', 
+            'four_up_return_day1_view_day1_return_div_three_d1', 'four_up_return_day3_view_day3_return_div_three_d3', 
+            'four_up_return_day7_view_day7_return_div_three_d7', 'four_up_return_day14_view_day14_return_div_three_d14', 
+            'day1ctr', 'day3ctr', 'day7ctr', 'day14ctr', 'day30ctr', 'day60ctr', 'day1sov', 'day3sov', 'day7sov', 
+            'day14sov', 'day30sov', 'day60sov', 'day1rov', 'day3rov', 'day7rov', 'day14rov', 'day1soc', 'day3soc', 
+            'day7soc', 'day14soc', 'day30soc', 'day60soc', 'day1roc', 'day3roc', 'day7roc', 'day14roc', 'oneday_day1rov', 
+            'oneday_day3rov', 'oneday_day7rov', 'oneday_day14rov',
+            'day60playcount_divide_day30playcount', 'day60playcount_dif_day30playcount', 
+            'day60returncount_divide_day30returncount', 'day60returncount_dif_day30returncount', 
+            'day60sharecount_divide_day30sharecount', 'day60sharecount_dif_day30sharecount', 
+            'day60viewcount_divide_day30viewcount', 'day60viewcount_dif_day30viewcount',
+            'day30playcount_divide_day7playcount', 'day30playcount_dif_day7playcount', 
+            'day30returncount_divide_day7returncount', 'day30returncount_dif_day7returncount', 
+            'day30sharecount_divide_day7sharecount', 'day30sharecount_dif_day7sharecount', 
+            'day30viewcount_divide_day7viewcount', 'day30viewcount_dif_day7viewcount', 
+            'day7playcount_divide_day3playcount', 'day7playcount_dif_day3playcount', 
+            'day7returncount_divide_day3returncount', 'day7returncount_dif_day3returncount', 
+            'day7sharecount_divide_day3sharecount', 'day7sharecount_dif_day3sharecount', 
+            'day7viewcount_divide_day3viewcount', 'day7viewcount_dif_day3viewcount', 'day3playcount_divide_day1playcount', 
+            'day3playcount_dif_day1playcount', 'day3returncount_divide_day1returncount', 
+            'day3returncount_dif_day1returncount', 'day3sharecount_divide_day1sharecount',
+            'day3sharecount_dif_day1sharecount', 'day3viewcount_divide_day1viewcount', 
+            'day3viewcount_dif_day1viewcount']
+
 def cal_feature(df):
     start = time.time()
     for i in range(len(root_page_1day)):

+ 14 - 9
rov_train.py

@@ -148,21 +148,24 @@ def process_train_predict_data():
     predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
     train_dt = now_date - diff_5
     train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
-
-    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
-    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
-    #TODO save tempt
+    #read data from ali
+    #train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
+    #predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
+    #pickle for test
     import _pickle as cPickle
+    '''
     with open('train_data.pickle','wb') as output_file:
         cPickle.dump(train_data, output_file)
     with open('predict_data.pickle','wb') as output_file:
         cPickle.dump(predict_data, output_file) 
     exit()
+    '''
+    #with open(r"train_data.pickle", "rb") as input_file:
     with open(r"train_data.pickle", "rb") as input_file:
         train_data = cPickle.load(input_file)    
     with open(r"predict_data.pickle", "rb") as input_file:
         predict_data = cPickle.load(input_file)       
-
+    #end pickle
     train_data = basic_cal(train_data)
     predict_data = basic_cal(predict_data)
 
@@ -222,7 +225,7 @@ def process_train_predict_data():
     print('tag dimension:', len(tag_tfidf_list))
 
     #获取values without tag
-    words ,train_words,test_words = process_tag.tag_preprocessing('words_no_tag')
+    words ,train_words,test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
     #获取words tfidf
     words_dict = process_tag.get_tag_tfidf('20200305','video_words_without_tags_tfidf')
     print('lenth words_dict:',len(words_dict))
@@ -237,7 +240,7 @@ def process_train_predict_data():
 
     df_new_feature = hstack([df_new_feature_part_one,train_videoid,tag_feature_train, words_feature_train])
     df_new_feature_predict = hstack([df_new_feature_predict_part_one,predict_videoid,tag_feature_test,words_feature_test])
- 
+    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict 
 
 
 def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
@@ -266,7 +269,8 @@ def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feat
          "random_state": 4590}
 
     folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
-    oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
+    #oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
+    oof = np.zeros(len(df_target))
     predictions = np.zeros(len(df_target_predict))
     feature_importance_df = pd.DataFrame()
 
@@ -329,4 +333,5 @@ def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feat
 
 
 if __name__ == '__main__':
-    process_train_predict_data()
+    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
+    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)