|
@@ -73,7 +73,6 @@ def select_recent_video(df):
|
|
|
def basic_cal(df):
    """Add the integer return count and its log transform to ``df``.

    Args:
        df: DataFrame with a ``futre7dayreturn`` column whose values can be
            cast to ``int``.

    Returns:
        The same DataFrame object, modified in place, with two columns
        added: ``weighted_retrn`` (``futre7dayreturn`` cast to int) and
        ``weighted_retrn_log`` (``log(weighted_retrn + 1)``).
    """
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    # One column-wise NumPy call instead of a per-row ``apply``: identical
    # values without running a Python lambda for every row.
    df['weighted_retrn_log'] = np.log(df['weighted_retrn'] + 1)
    return df
|
|
|
|
|
|
def dataprepare(df_pre):
|
|
@@ -131,17 +130,15 @@ def process_train_predict_data():
|
|
|
predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
|
|
|
#pickle for test
|
|
|
import _pickle as cPickle
|
|
|
- with open('train_data.pickle','wb') as output_file:
|
|
|
- cPickle.dump(train_data, output_file)
|
|
|
- with open('predict_data.pickle','wb') as output_file:
|
|
|
- cPickle.dump(predict_data, output_file)
|
|
|
+ # with open('train_data.pickle','wb') as output_file:
|
|
|
+ # cPickle.dump(train_data, output_file)
|
|
|
+ # with open('predict_data.pickle','wb') as output_file:
|
|
|
+ # cPickle.dump(predict_data, output_file)
|
|
|
#with open(r"train_data.pickle", "rb") as input_file:
|
|
|
- '''
|
|
|
with open(r"train_data.pickle", "rb") as input_file:
|
|
|
train_data = cPickle.load(input_file)
|
|
|
with open(r"predict_data.pickle", "rb") as input_file:
|
|
|
predict_data = cPickle.load(input_file)
|
|
|
- '''
|
|
|
#end pickle
|
|
|
train_data = basic_cal(train_data)
|
|
|
predict_data = basic_cal(predict_data)
|
|
@@ -163,59 +160,11 @@ def process_train_predict_data():
|
|
|
df_new_feature,df_target= dataprepare(train_data)
|
|
|
df_new_feature_predict, df_target_predict = dataprepare(predict_data)
|
|
|
|
|
|
- df_new_feature_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature).loc[:,'day1playcount':'videocategory555']))
|
|
|
- df_new_feature_predict_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature_predict).loc[:,'day1playcount':'videocategory555']))
|
|
|
-
|
|
|
- print('value feature generate successfully')
|
|
|
-
|
|
|
train_videoid = pd.DataFrame(df_new_feature).loc[:,'videoid']
|
|
|
predict_videoid = pd.DataFrame(df_new_feature_predict).loc[:,'videoid']
|
|
|
|
|
|
train_videoid_list = pd.DataFrame(df_new_feature).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videoid']),1).tolist()
|
|
|
predict_videoid_list = pd.DataFrame(df_new_feature_predict).loc[:,'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_predict).loc[:,'videoid']),1).tolist()
|
|
|
-
|
|
|
-
|
|
|
- allvideo_raw = list(set(np.array(pd.concat([train_videoid,predict_videoid])).tolist()))
|
|
|
- allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw),1).tolist()
|
|
|
-
|
|
|
-
|
|
|
- mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
|
|
|
- train_videoid = mlb_model_videoid.transform(train_videoid_list)
|
|
|
- predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
|
|
|
-
|
|
|
- print('videoid feature generate successfully')
|
|
|
-
|
|
|
- #获取tag-one-hot
|
|
|
- tags ,train_tag,predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
|
|
|
- #获取tag tfidf
|
|
|
- tag_dict = process_tag.get_tag_tfidf('20200305','video_tag_tf_idf')
|
|
|
- print('lenth tag_dict:',len(tag_dict))
|
|
|
- #获取tfidf_tag 稀疏矩阵
|
|
|
- tag_corpus = tags.tolist() #corpus
|
|
|
- tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus,tag_dict )
|
|
|
- tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
|
|
|
-
|
|
|
- tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
|
|
|
- tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
|
|
|
- print('tag tfidf feature generate successfully')
|
|
|
- print('tag dimension:', len(tag_tfidf_list))
|
|
|
-
|
|
|
- #获取values without tag
|
|
|
- words ,train_words,test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
|
|
|
- #获取words tfidf
|
|
|
- words_dict = process_tag.get_tag_tfidf('20200305','video_words_without_tags_tfidf')
|
|
|
- print('lenth words_dict:',len(words_dict))
|
|
|
- #获取tfidf_tag 稀疏矩阵
|
|
|
- words_corpus = words.tolist() #corpus
|
|
|
- words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus,words_dict )
|
|
|
- words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
|
|
|
- words_feature_train = train_words.multiply(words_tf_idf_matrix)
|
|
|
- words_feature_test = test_words.multiply(words_tf_idf_matrix)
|
|
|
- print('tag tfidf feature generate successfully')
|
|
|
- print('words dimension:', len(words_tfidf_list))
|
|
|
-
|
|
|
- df_new_feature = hstack([df_new_feature_part_one,train_videoid,tag_feature_train, words_feature_train])
|
|
|
- df_new_feature_predict = hstack([df_new_feature_predict_part_one,predict_videoid,tag_feature_test,words_feature_test])
|
|
|
return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
|
|
|
|
|
|
|
|
@@ -250,16 +199,10 @@ def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feat
|
|
|
predictions = np.zeros(len(df_target_predict))
|
|
|
feature_importance_df = pd.DataFrame()
|
|
|
|
|
|
-
|
|
|
- # values_lenth = len(process_feature.features + process_feature.cate_feat)
|
|
|
- # video_id_lenth = len(mlb_model_videoid.classes_)
|
|
|
- # tag_length = len(tag_tfidf_list)
|
|
|
- # word_length = len(words_tfidf_list)
|
|
|
-
|
|
|
change_view = pd.DataFrame(pd.DataFrame(df_new_feature_predict.toarray()))
|
|
|
change_view = change_view.sort_index()
|
|
|
|
|
|
- for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
|
|
|
+ for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature)):
|
|
|
print("folds {}".format(fold_))
|
|
|
trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx,:], label=pd.DataFrame(df_target).iloc[trn_idx])
|
|
|
val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx,:], label=pd.DataFrame(df_target).iloc[val_idx])
|
|
@@ -296,10 +239,10 @@ def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feat
|
|
|
|
|
|
|
|
|
print('oof_mape:', MAPE(df_target, oof))
|
|
|
- print('test_mape:', MAPE(df_target_predict, predictions))
|
|
|
+ # print('test_mape:', MAPE(df_target_predict, predictions))
|
|
|
|
|
|
print('verification r2:', r2_score(df_target, oof))
|
|
|
- print('test r2:', r2_score(df_target_predict, predictions))
|
|
|
+ # print('test r2:', r2_score(df_target_predict, predictions))
|
|
|
|
|
|
sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
|
|
|
sub_df_['score'] = predictions
|