baichongyang 3 years ago
parent
commit
dabbcf72fe
1 changed file with 7 additions and 64 deletions

rov_train2.py  +7 -64

@@ -73,7 +73,6 @@ def select_recent_video(df):
 def basic_cal(df):
     df['weighted_retrn'] = df['futre7dayreturn'].astype('int') 
     df['weighted_retrn_log'] = df.apply(lambda x: np.log(x['weighted_retrn'] + 1),axis=1)
-    df['return_back'] = df.apply(lambda x:1 if x['weighted_retrn']> 0 else 0,axis=1)
     return df 
 
 def dataprepare(df_pre):
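For reference, the row-wise apply in basic_cal is equivalent to a vectorized call; a minimal sketch (basic_cal_vectorized is an illustrative name, assuming numpy/pandas are imported as np/pd as in the rest of the file):

import numpy as np
import pandas as pd

def basic_cal_vectorized(df: pd.DataFrame) -> pd.DataFrame:
    # Same result as the apply/lambda above: log(x + 1) per row, computed column-wise.
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    df['weighted_retrn_log'] = np.log1p(df['weighted_retrn'])
    return df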
@@ -131,17 +130,15 @@ def process_train_predict_data():
     predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
     #pickle for test
     import _pickle as cPickle
-    with open('train_data.pickle','wb') as output_file:
-        cPickle.dump(train_data, output_file)
-    with open('predict_data.pickle','wb') as output_file:
-        cPickle.dump(predict_data, output_file) 
+    # with open('train_data.pickle','wb') as output_file:
+    #     cPickle.dump(train_data, output_file)
+    # with open('predict_data.pickle','wb') as output_file:
+    #     cPickle.dump(predict_data, output_file) 
     #with open(r"train_data.pickle", "rb") as input_file:
-    '''
     with open(r"train_data.pickle", "rb") as input_file:
         train_data = cPickle.load(input_file)    
     with open(r"predict_data.pickle", "rb") as input_file:
         predict_data = cPickle.load(input_file)       
-    '''
     #end pickle
     train_data = basic_cal(train_data)
     predict_data = basic_cal(predict_data)
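The dump/load toggle above amounts to an on-disk cache; a minimal sketch of that pattern (load_or_fetch is an illustrative helper, not part of the repo, and assumes the pickle files sit in the working directory):

import os
import _pickle as cPickle

def load_or_fetch(path, fetch):
    # Reuse a cached pickle when present; otherwise fetch the data and cache it.
    if os.path.exists(path):
        with open(path, 'rb') as input_file:
            return cPickle.load(input_file)
    data = fetch()
    with open(path, 'wb') as output_file:
        cPickle.dump(data, output_file)
    return data

# e.g. predict_data = load_or_fetch('predict_data.pickle',
#          lambda: getdatasample(predict_day, 1, 'rov_predict_table_add_v1'))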
@@ -163,59 +160,11 @@ def process_train_predict_data():
     df_new_feature,df_target= dataprepare(train_data)
     df_new_feature_predict, df_target_predict = dataprepare(predict_data)
 
-    df_new_feature_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature).loc[:,'day1playcount':'videocategory555']))
-    df_new_feature_predict_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature_predict).loc[:,'day1playcount':'videocategory555']))
-
-    print('value feature generate successfully')
-
     train_videoid = pd.DataFrame(df_new_feature).loc[:,'videoid']
     predict_videoid = pd.DataFrame(df_new_feature_predict).loc[:,'videoid']
 
     train_videoid_list = pd.DataFrame(df_new_feature).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videoid']),1).tolist()
     predict_videoid_list = pd.DataFrame(df_new_feature_predict).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_predict).loc[:,'videoid']),1).tolist()
-
-
-    allvideo_raw = list(set(np.array(pd.concat([train_videoid,predict_videoid])).tolist()))
-    allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw),1).tolist()
-    
-
-    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
-    train_videoid = mlb_model_videoid.transform(train_videoid_list)
-    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
-
-    print('videoid feature generate successfully')
-
-    # get tag one-hot encoding
-    tags ,train_tag,predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
-    # get tag tf-idf
-    tag_dict = process_tag.get_tag_tfidf('20200305','video_tag_tf_idf')
-    print('lenth tag_dict:',len(tag_dict))
-    # get tfidf_tag sparse matrix
-    tag_corpus = tags.tolist()  #corpus
-    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus,tag_dict )
-    tag_tf_idf_matrix  = sparse.csr_matrix(np.array(tag_tfidf_list))
-
-    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)  
-    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)  
-    print('tag tfidf feature generate successfully')
-    print('tag dimension:', len(tag_tfidf_list))
-
-    # get values without tag
-    words ,train_words,test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
-    # get words tf-idf
-    words_dict = process_tag.get_tag_tfidf('20200305','video_words_without_tags_tfidf')
-    print('lenth words_dict:',len(words_dict))
-    # get tfidf_tag sparse matrix
-    words_corpus = words.tolist()  #corpus
-    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus,words_dict )
-    words_tf_idf_matrix  = sparse.csr_matrix(np.array(words_tfidf_list))
-    words_feature_train = train_words.multiply(words_tf_idf_matrix)  
-    words_feature_test = test_words.multiply(words_tf_idf_matrix)  
-    print('tag tfidf feature generate successfully')
-    print('words dimension:', len(words_tfidf_list))
-
-    df_new_feature = hstack([df_new_feature_part_one,train_videoid,tag_feature_train, words_feature_train])
-    df_new_feature_predict = hstack([df_new_feature_predict_part_one,predict_videoid,tag_feature_test,words_feature_test])
     return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict 
 
 
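The removed block one-hot encodes video ids with a sparse MultiLabelBinarizer before stacking features; a standalone sketch of that step, with toy ids in place of the real videoid column:

from sklearn.preprocessing import MultiLabelBinarizer

# Each id is wrapped in a one-element list, mirroring the reshape(...).tolist() calls above.
all_videos = [['v1'], ['v2'], ['v3']]
mlb = MultiLabelBinarizer(sparse_output=True).fit(all_videos)

train_onehot = mlb.transform([['v1'], ['v3']])  # scipy CSR matrix, one column per video id
print(train_onehot.toarray())                   # [[1 0 0]
                                                #  [0 0 1]]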
@@ -250,16 +199,10 @@ def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feat
     predictions = np.zeros(len(df_target_predict))
     feature_importance_df = pd.DataFrame()
 
-
-    # values_lenth = len(process_feature.features + process_feature.cate_feat)
-    # video_id_lenth = len(mlb_model_videoid.classes_)
-    # tag_length = len(tag_tfidf_list)
-    # word_length = len(words_tfidf_list)
-
     change_view = pd.DataFrame(pd.DataFrame(df_new_feature_predict.toarray()))
     change_view = change_view.sort_index()  
 
-    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
+    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature)):
         print("folds {}".format(fold_))
         trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx,:], label=pd.DataFrame(df_target).iloc[trn_idx])
         val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx,:], label=pd.DataFrame(df_target).iloc[val_idx])
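The changed split call reflects that KFold ignores a label argument, whereas a stratified splitter needs one (such as the dropped return_back column); a small sketch, assuming folds is a scikit-learn splitter built elsewhere in the file:

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

X = np.zeros((10, 3))
y = np.array([0, 1] * 5)

# KFold splits on row indices only, so no label vector is required.
for trn_idx, val_idx in KFold(n_splits=5).split(X):
    pass

# StratifiedKFold, by contrast, needs labels such as return_back to balance the folds.
for trn_idx, val_idx in StratifiedKFold(n_splits=5).split(X, y):
    pass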
@@ -296,10 +239,10 @@ def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feat
 
 
     print('oof_mape:', MAPE(df_target, oof))
-    print('test_mape:', MAPE(df_target_predict, predictions))
+    # print('test_mape:', MAPE(df_target_predict, predictions))
 
     print('verification r2:', r2_score(df_target, oof))
-    print('test r2:', r2_score(df_target_predict, predictions))
+    # print('test r2:', r2_score(df_target_predict, predictions))
 
     sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
     sub_df_['score'] = predictions
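r2_score above comes from scikit-learn, while MAPE is presumably the file's own helper; an illustrative definition for context only (the actual implementation may differ):

import numpy as np

def MAPE(y_true, y_pred):
    # Mean absolute percentage error; the denominator floor avoids division by zero.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8))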