@@ -0,0 +1,311 @@
+import warnings
+
+# Silence warnings before the heavier third-party imports below.
+warnings.filterwarnings("ignore")
+
+import datetime
+import gc
+import math
+import os
+import pickle
+import time
+from datetime import datetime as dt
+
+import lightgbm as lgb
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from odps import ODPS
+from odps.df import DataFrame as odpsdf
+from scipy import sparse
+from scipy.sparse import hstack
+from sklearn import metrics
+from sklearn.linear_model import SGDClassifier, SGDRegressor
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.model_selection import StratifiedKFold, train_test_split
+from sklearn.preprocessing import MultiLabelBinarizer
+
+import process_feature
+import process_tag
+
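+# Overall flow (a sketch inferred from the functions below): pull daily feature
+# partitions from ODPS, derive a log-transformed 7-day-return target, assemble a
+# sparse matrix of dense value features, videoid one-hot and tag/word TF-IDF
+# features, then train a 4-fold LightGBM regressor and score the latest day.
+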
+def getRovfeaturetable(dt, table):
+    # Read one daily partition of the feature table from ODPS. The credentials
+    # are read from the environment instead of being hardcoded; ODPS_ACCESS_ID
+    # and ODPS_ACCESS_KEY are assumed to be set by the caller.
+    odps = ODPS(os.environ['ODPS_ACCESS_ID'], os.environ['ODPS_ACCESS_KEY'], 'usercdm',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
+                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in odps.read_table(table, partition='dt=%s' % dt):
+        valueFeature = {}
+        for i in process_feature.featurename:
+            if i == 'dt':
+                valueFeature[i] = dt
+            else:
+                valueFeature[i] = record[i]
+        featureArray.append(valueFeature)
+    featureArray = pd.DataFrame(featureArray)
+    print(dt, table, 'feature table finished')
+    return featureArray
+
+def getdatasample(date, max_range, table):
+    # Concatenate `max_range` daily partitions, walking backwards from `date`.
+    new_date = dt.strptime(date, '%Y%m%d')
+    datelist = []
+    testlist = []
+    for i in range(max_range):
+        delta = datetime.timedelta(days=i)
+        tar_dt = new_date - delta
+        datelist.append(tar_dt.strftime("%Y%m%d"))
+    for tm in datelist:
+        testlist.append(getRovfeaturetable(tm, table))
+    testdata = pd.concat(testlist)
+    testdata.reset_index(drop=True, inplace=True)
+    return testdata
+
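+# For example (dates are illustrative), getdatasample('20200305', 3, 'rov_feature_add_v1')
+# would stack the partitions dt=20200305, dt=20200304 and dt=20200303 into one frame.
+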
+def select_recent_video(df):
+    """Rank each video's rows by date and keep only the most recent day."""
+    df['dt'] = df['dt'].astype(int)
+    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=False, method='first')
+    df = df[df['rk'] == 1]
+    return df
+
+def basic_cal(df):
+    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
+    # log1p-transform the return count; this is the regression target.
+    df['weighted_retrn_log'] = np.log1p(df['weighted_retrn'])
+    # Binary flag: did the video get any return at all (used for stratification).
+    df['return_back'] = (df['weighted_retrn'] > 0).astype(int)
+    return df
+
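+# Note: because the target is log1p(return), the RMSE/MAPE/R2 reported in
+# do_train() are on the log scale; np.expm1(prediction) recovers a count-scale
+# estimate if one is needed downstream.
+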
+def dataprepare(df_pre):
+    # Feed the features in directly, without adding crossed features.
+    df_pre = df_pre.fillna(0)
+    # df_new_feature = df_pre[process_feature.features]
+    df_new_feature = df_pre[process_feature.filter_recent_features()]
+    df_target = df_pre['weighted_retrn_log']
+    df_new_feature = pd.concat([df_new_feature, df_pre[process_feature.cate_feat],
+                                df_pre[process_feature.one_hot_feature]], axis=1)
+    return df_new_feature, df_target
+
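+# The returned frame is laid out as [recent value features | cate_feat |
+# one_hot_feature]. Downstream, process_train_predict_data() slices it by the
+# label range 'day1playcount':'videocategory555' to form the dense value block,
+# so that column span is assumed to cover the intended numeric features.
+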
+def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
+                      values_lenth, video_id_lenth, tag_length, word_length):
+    # Collapse the one-hot blocks (videoid / tags / words) of each fold's
+    # importance table into single aggregate rows.
+    Feature_Data = pd.DataFrame()
+    for fold_num, df in enumerate((fold1_df, fold2_df, fold3_df, fold4_df), start=1):
+        values_df = df.iloc[0:values_lenth, :]
+        videoid_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
+        videoid_df = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_importance, 'fold': fold_num}])
+        tag_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
+        tag_df = pd.DataFrame([{'Feature': 'tags', 'importance': tag_importance, 'fold': fold_num}])
+        words_importance = df.iloc[values_lenth + video_id_lenth + tag_length:values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
+        words_df = pd.DataFrame([{'Feature': 'words', 'importance': words_importance, 'fold': fold_num}])
+
+        Feature_Data = pd.concat([Feature_Data, values_df, videoid_df, tag_df, words_df])
+
+    return Feature_Data
+
+def MAPE(true, pred):
+    # Mean absolute percentage error, skipping rows whose true value is zero
+    # (the percentage error is undefined there).
+    true = np.array(true)
+    sum_ = 0
+    count = 0
+    for i in range(len(true)):
+        if true[i] != 0:
+            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
+            count = count + 1
+        else:
+            continue
+
+    return sum_ / count
+
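+# A vectorized equivalent (a sketch, not used below; true/pred as numpy arrays):
+#   mask = true != 0
+#   mape = np.mean(np.abs(true[mask] - pred[mask]) / true[mask])
+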
+def process_train_predict_data():
+    now_date = datetime.date.today()
+    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
+    diff_1 = datetime.timedelta(days=1)
+    diff_7 = datetime.timedelta(days=7)
+    predict_dt = now_date - diff_1
+    predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
+    train_dt = now_date - diff_7
+    train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
+
+    # Read data from Aliyun ODPS: 30 days of training features and the single
+    # day to be scored.
+    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
+    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
+
+    # Cache the raw pulls locally so test runs can reload them instead of
+    # hitting ODPS again (see the commented-out block below).
+    with open('train_data.pickle', 'wb') as output_file:
+        pickle.dump(train_data, output_file)
+    with open('predict_data.pickle', 'wb') as output_file:
+        pickle.dump(predict_data, output_file)
+    '''
+    with open(r"train_data.pickle", "rb") as input_file:
+        train_data = pickle.load(input_file)
+    with open(r"predict_data.pickle", "rb") as input_file:
+        predict_data = pickle.load(input_file)
+    '''
+    # end pickle
+    train_data = basic_cal(train_data)
+    predict_data = basic_cal(predict_data)
+
+    # Keep only the latest snapshot of each video on the predict side.
+    predict_data = select_recent_video(predict_data)
+    # predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
+    predict_data = predict_data.drop(axis=1, columns='rk')
+
+    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
+    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
+
+    train_data = train_data.fillna(0)
+    predict_data = predict_data.fillna(0)
+    train_data = process_feature.cal_feature(train_data)
+    predict_data = process_feature.cal_feature(predict_data)
+
+    predict_data['videoid'] = predict_data['videoid'].astype('int')
+
+    df_new_feature, df_target = dataprepare(train_data)
+    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
+
+    # Dense value-feature block, sliced by column range and converted to CSR.
+    df_new_feature_part_one = sparse.csr_matrix(
+        np.array(df_new_feature.loc[:, 'day1playcount':'videocategory555']))
+    df_new_feature_predict_part_one = sparse.csr_matrix(
+        np.array(df_new_feature_predict.loc[:, 'day1playcount':'videocategory555']))
+
+    print('value feature generated successfully')
+
+    train_videoid = df_new_feature.loc[:, 'videoid']
+    predict_videoid = df_new_feature_predict.loc[:, 'videoid']
+
+    # MultiLabelBinarizer expects an iterable of label collections, so each
+    # videoid is wrapped in a single-element list.
+    train_videoid_list = train_videoid.to_numpy().reshape(-1, 1).tolist()
+    predict_videoid_list = predict_videoid.to_numpy().reshape(-1, 1).tolist()
+
+    allvideo_raw = list(set(np.array(pd.concat([train_videoid, predict_videoid])).tolist()))
+    allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw), 1).tolist()
+
+    # Fit on the union of train and predict ids so both transform calls share
+    # one column space, then one-hot encode each side as a sparse matrix.
+    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
+    train_videoid = mlb_model_videoid.transform(train_videoid_list)
+    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
+
+    print('videoid feature generated successfully')
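+
+    # MultiLabelBinarizer sketch: fitting on [[1], [2], [3]] and transforming
+    # [[2]] yields the sparse row [0, 1, 0], i.e. one indicator column per
+    # known videoid.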
+
+    # Tag one-hot encodings
+    tags, train_tag, predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
+    # Tag TF-IDF weights
+    tag_dict = process_tag.get_tag_tfidf('20200305', 'video_tag_tf_idf')
+    print('length tag_dict:', len(tag_dict))
+    # Build the tag TF-IDF sparse matrix
+    tag_corpus = tags.tolist()  # corpus
+    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus, tag_dict)
+    tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
+
+    # Scale each one-hot tag column by its TF-IDF weight.
+    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
+    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
+    print('tag tfidf feature generated successfully')
+    print('tag dimension:', len(tag_tfidf_list))
+
+    # Word (non-tag) one-hot encodings
+    words, train_words, test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
+    # Word TF-IDF weights
+    words_dict = process_tag.get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
+    print('length words_dict:', len(words_dict))
+    # Build the words TF-IDF sparse matrix
+    words_corpus = words.tolist()  # corpus
+    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus, words_dict)
+    words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
+    words_feature_train = train_words.multiply(words_tf_idf_matrix)
+    words_feature_test = test_words.multiply(words_tf_idf_matrix)
+    print('words tfidf feature generated successfully')
+    print('words dimension:', len(words_tfidf_list))
+
+    df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
+    df_new_feature_predict = hstack([df_new_feature_predict_part_one, predict_videoid, tag_feature_test, words_feature_test])
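+    # Final sparse layout, in column order: [value features | videoid one-hot |
+    # tag TF-IDF | word TF-IDF]. The commented-out featureImportance() call in
+    # do_train() appears to assume exactly this block ordering when it slices
+    # by the values/videoid/tag/word lengths.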
+    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
+
+
+def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
+    # Densify the regression targets into (n, 1) arrays for the metrics below.
+    df_target_predict = pd.DataFrame(df_target_predict).values
+    df_target = pd.DataFrame(df_target).values
+
+    param = {'num_leaves': 18,
+             'min_data_in_leaf': 60,
+             'objective': 'regression',
+             'max_depth': -1,
+             'learning_rate': 0.01,
+             'min_child_samples': 30,
+             'boosting': 'gbdt',
+             'feature_fraction': 0.8,
+             'bagging_freq': 1,
+             'bagging_fraction': 0.8,
+             'bagging_seed': 11,
+             'metric': 'rmse',
+             'lambda_l1': 0.1,
+             'verbosity': -1,
+             'nthread': 4,
+             # 'max_bin': 512,
+             'random_state': 4590}
+
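+    # StratifiedKFold needs a class label, so the folds are stratified on the
+    # binary return_back flag even though the model is a regressor; this keeps
+    # the zero-return / nonzero-return mix similar across folds.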
+    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
+    # oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
+    oof = np.zeros(len(df_target))
+    predictions = np.zeros(len(df_target_predict))
+    feature_importance_df = pd.DataFrame()
+
+    # values_lenth = len(process_feature.features + process_feature.cate_feat)
+    # video_id_lenth = len(mlb_model_videoid.classes_)
+    # tag_length = len(tag_tfidf_list)
+    # word_length = len(words_tfidf_list)
+
+    # Unused debugging view; kept commented out because it materializes the
+    # full dense predict matrix in memory.
+    # change_view = pd.DataFrame(df_new_feature_predict.toarray()).sort_index()
+
+    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
+        print("fold {}".format(fold_))
+        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
+        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])
+
+        num_round = 10000
+        # verbose_eval / early_stopping_rounds are the pre-4.0 LightGBM train()
+        # arguments; newer versions expect the equivalent callbacks instead.
+        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
+                        early_stopping_rounds=200)
+        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
+        predictions += clf.predict(df_new_feature_predict, num_iteration=clf.best_iteration) / folds.n_splits
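+
+        # Each sample's out-of-fold (oof) prediction comes from the one model
+        # that did not train on it, while the test predictions are averaged
+        # over all four fold models.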
+
+        fold_importance_df = pd.DataFrame()
+
+        # column = process_feature.features + process_feature.cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
+        # fold_importance_df["Feature"] = np.array(column)
+        # fold_importance_df["importance"] = clf.feature_importance()
+        # fold_importance_df["fold"] = fold_ + 1
+        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
+
+    # fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
+    # fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
+    # fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
+    # fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
+
+    # feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length)
+
+
|
|
|
+ print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
|
|
|
+ print('oof_mse:', mean_squared_error(df_target, oof))
|
|
|
+
|
|
|
+ print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
|
|
|
+ print('test_mse:', mean_squared_error(df_target_predict, predictions))
|
|
|
+
|
|
|
+
|
|
|
+ print('oof_mape:', MAPE(df_target, oof))
|
|
|
+ print('test_mape:', MAPE(df_target_predict, predictions))
|
|
|
+
|
|
|
+ print('verification r2:', r2_score(df_target, oof))
|
|
|
+ print('test r2:', r2_score(df_target_predict, predictions))
|
|
|
+
|
|
|
+ sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
|
|
|
+ sub_df_['score'] = predictions
|
|
|
+ print('regre ranking shape', sub_df_.shape)
|
|
|
+ sub_df_.to_csv('result.csv')
|
|
|
+
+if __name__ == '__main__':
+    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
+    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)