import warnings
warnings.filterwarnings("ignore")

import os
import gc
import math
import time
import pickle
import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
import lightgbm as lgb
from scipy import sparse
from scipy.sparse import hstack
from sklearn import metrics
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from odps import ODPS
from odps.df import DataFrame as odpsdf

import process_feature
import process_tag
def getRovfeaturetable(dt, table):
    """Read one daily partition of a feature table from ODPS into a DataFrame."""
    # Credentials are read from the environment instead of being hardcoded;
    # the variable names below are illustrative.
    odps = ODPS(os.environ.get('ODPS_ACCESS_ID'), os.environ.get('ODPS_ACCESS_KEY'),
                'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray
def getdatasample(date, max_range, table):
    """Pull `max_range` consecutive daily partitions, walking backwards from
    `date` (inclusive), and concatenate them into one DataFrame."""
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    testdata = pd.concat(testlist)
    testdata.reset_index(drop=True, inplace=True)
    return testdata
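# Example (hypothetical date): getdatasample('20200305', 30, 'rov_feature_add_v1')
# reads the 30 daily partitions 20200305, 20200304, ..., 20200205 and stacks them.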
def select_recent_video(df):
    """For each video, rank its rows by date (descending) and keep only the
    most recent day's row."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
    df = df[df['rk'] == 1]
    return df
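# A minimal sanity check (toy frame, not pipeline data): only the most recent
# dt per videoid survives.
# >>> demo = pd.DataFrame({'videoid': [1, 1, 2], 'dt': [20200301, 20200302, 20200301]})
# >>> select_recent_video(demo)[['videoid', 'dt']].values.tolist()
# [[1, 20200302], [2, 20200301]]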
def basic_cal(df):
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    # log1p-transform the return count to compress its heavy tail.
    df['weighted_retrn_log'] = np.log1p(df['weighted_retrn'])
    # Binary label: 1 if the video brought back any return traffic; videos with
    # no shares, or with shares but zero returns, are labeled 0.
    df['return_back'] = (df['weighted_retrn'] > 0).astype(int)
    return df
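# Illustrative values: a video with futre7dayreturn == 9 gets
# weighted_retrn_log == log(10) ~= 2.303 and return_back == 1; one with zero
# returns gets weighted_retrn_log == 0.0 and return_back == 0.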
def today_view_category(predict_data):
    """Tier today's exposure as a category feature: bucket videos by their
    day-1 view-count rank and assign each video the mean day-1 view count of
    its bucket as 'todyviewcount' (sic)."""
    rank_buckets = [(10000, float('inf')), (3000, 10000), (1000, 3000),
                    (300, 1000), (100, 300), (30, 100), (0, 30)]
    for low, high in rank_buckets:
        mask = ((predict_data['day1viewcount_rank'] > low)
                & (predict_data['day1viewcount_rank'] <= high))
        predict_data.loc[mask, 'todyviewcount'] = predict_data.loc[mask, 'day1viewcount'].mean()
    return predict_data
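# Illustrative mapping: a video ranked 50 by day-1 views falls in the (30, 100]
# bucket, so its 'todyviewcount' becomes the mean 'day1viewcount' over the
# videos ranked 31-100 in the same frame.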
def dataprepare(df_pre):
    """Assemble the raw feature frame and the regression target.

    Features are fed in directly, with no crossed features; missing values are
    zero-filled."""
    df_pre = df_pre.fillna(0)
    df_new_feature = df_pre[process_feature.filter_recent_features()]
    print(df_new_feature.shape)
    df_target = df_pre['weighted_retrn_log']
    df_new_feature = pd.concat([df_new_feature,
                                df_pre[process_feature.cate_feat],
                                df_pre[process_feature.one_hot_feature]], axis=1)
    return df_new_feature, df_target
def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
                      values_lenth, video_id_lenth, tag_length, word_length):
    """Collapse the per-column importances of the one-hot blocks (videoid,
    tags, words) into single aggregate rows, per fold."""
    Feature_Data = pd.DataFrame()
    for fold_no, df in enumerate((fold1_df, fold2_df, fold3_df, fold4_df), start=1):
        value_part = df.iloc[0:values_lenth, :]
        videoid_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
        videoid_part = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_importance, 'fold': fold_no}])
        tag_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
        tag_part = pd.DataFrame([{'Feature': 'tags', 'importance': tag_importance, 'fold': fold_no}])
        word_importance = df.iloc[values_lenth + video_id_lenth + tag_length:
                                  values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
        word_part = pd.DataFrame([{'Feature': 'words', 'importance': word_importance, 'fold': fold_no}])
        Feature_Data = pd.concat([Feature_Data, value_part, videoid_part, tag_part, word_part])
    return Feature_Data
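# Note: the iloc slicing in featureImportance assumes the importance rows follow
# the column order of the stacked design matrix built in
# process_train_predict_data below: [values | videoid one-hot | tag tf-idf | word tf-idf].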
def MAPE(true, pred):
    """Mean absolute percentage error, computed over non-zero targets only."""
    true = np.asarray(true, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    mask = true != 0
    return float(np.mean(np.abs(true[mask] - pred[mask]) / true[mask]))
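# Quick check (toy values): zero targets are skipped, so only the first two
# points contribute.
# >>> MAPE([2, 4, 0], [1, 5, 3])
# 0.375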
def process_train_predict_data():
    now_date = datetime.date.today()
    predict_dt = now_date - datetime.timedelta(days=1)
    predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
    train_dt = now_date - datetime.timedelta(days=7)
    train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
    # Read data from the Aliyun (ODPS) tables.
    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
    # Pickle the raw pulls so test reruns can skip the slow table reads.
    import _pickle as cPickle
    with open('train_data.pickle', 'wb') as output_file:
        cPickle.dump(train_data, output_file)
    with open('predict_data.pickle', 'wb') as output_file:
        cPickle.dump(predict_data, output_file)
    '''
    with open(r"train_data.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open(r"predict_data.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)
    '''
    train_data = basic_cal(train_data)
    predict_data = basic_cal(predict_data)
    predict_data = select_recent_video(predict_data)
    # The prediction frame should only carry the target for predict_day itself.
    predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
    predict_data = predict_data.drop(axis=1, columns='rk')
    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    train_data = train_data.fillna(0)
    predict_data = predict_data.fillna(0)
    train_data = process_feature.cal_feature(train_data)
    predict_data = process_feature.cal_feature(predict_data)
    predict_data = today_view_category(predict_data)
    predict_data['videoid'] = predict_data['videoid'].astype('int')
    df_new_feature, df_target = dataprepare(train_data)
    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
    print(df_target_predict)
    # Dense numeric block as a sparse matrix (columns day1playcount..videocategory555).
    df_new_feature_part_one = sparse.csr_matrix(
        np.array(pd.DataFrame(df_new_feature).loc[:, 'day1playcount':'videocategory555']))
    df_new_feature_predict_part_one = sparse.csr_matrix(
        np.array(pd.DataFrame(df_new_feature_predict).loc[:, 'day1playcount':'videocategory555']))
    print('value feature generate successfully')
    # One-hot encode videoid over the union of train and predict ids.
    train_videoid = pd.DataFrame(df_new_feature).loc[:, 'videoid']
    predict_videoid = pd.DataFrame(df_new_feature_predict).loc[:, 'videoid']
    train_videoid_list = train_videoid.to_numpy().reshape(-1, 1).tolist()
    predict_videoid_list = predict_videoid.to_numpy().reshape(-1, 1).tolist()
    allvideo_raw = list(set(np.array(pd.concat([train_videoid, predict_videoid])).tolist()))
    allvideo = np.array(allvideo_raw).reshape(-1, 1).tolist()

    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
    train_videoid = mlb_model_videoid.transform(train_videoid_list)
    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
    print('videoid feature generate successfully')
    # Tag one-hot blocks for train/predict.
    tags, train_tag, predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
    # Tag tf-idf weights.
    tag_dict = process_tag.get_tag_tfidf('20200305', 'video_tag_tf_idf')
    print('length tag_dict:', len(tag_dict))
    # Sparse tf-idf-weighted tag matrix.
    tag_corpus = tags.tolist()  # corpus
    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus, tag_dict)
    tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
    print('tag tfidf feature generate successfully')
    print('tag dimension:', len(tag_tfidf_list))
    # Word (non-tag) one-hot blocks for train/predict.
    words, train_words, test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
    # Word tf-idf weights.
    words_dict = process_tag.get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
    print('length words_dict:', len(words_dict))
    # Sparse tf-idf-weighted word matrix.
    words_corpus = words.tolist()  # corpus
    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus, words_dict)
    words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
    words_feature_train = train_words.multiply(words_tf_idf_matrix)
    words_feature_test = test_words.multiply(words_tf_idf_matrix)
    print('words tfidf feature generate successfully')
    print('words dimension:', len(words_tfidf_list))
    # Final design matrices: [values | videoid one-hot | tag tf-idf | word tf-idf].
    df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
    df_new_feature_predict = hstack([df_new_feature_predict_part_one, predict_videoid, tag_feature_test, words_feature_test])
    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
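def _toy_feature_stack_demo():
    """A minimal, self-contained sketch (toy data, not the production tables;
    never called by the pipeline) of the assembly above: one-hot ids via
    MultiLabelBinarizer stacked next to a sparse value block with hstack."""
    mlb = MultiLabelBinarizer(sparse_output=True).fit([[101], [102], [103]])
    onehot = mlb.transform([[101], [103]])               # 2 x 3 sparse block
    values = sparse.csr_matrix(np.array([[0.5, 1.0],
                                         [0.2, 0.3]]))   # 2 x 2 sparse block
    return hstack([values, onehot])                      # 2 x 5 combined matrix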
def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
    # Targets as dense arrays.
    df_target_predict = sparse.csr_matrix(pd.DataFrame(df_target_predict).values).toarray()
    df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()
    param = {'num_leaves': 18,
             'min_data_in_leaf': 60,
             'objective': 'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             'min_child_samples': 30,
             'boosting': 'gbdt',
             'feature_fraction': 0.8,
             'bagging_freq': 1,
             'bagging_fraction': 0.8,
             'bagging_seed': 11,
             'metric': 'rmse',
             'lambda_l1': 0.1,
             'verbosity': -1,
             'nthread': 4,
             # 'max_bin': 512,
             'random_state': 4590}
    # Stratify the regression folds on the binary return_back label so each
    # fold sees a similar share of zero-return videos.
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
    oof = np.zeros(len(df_target))
    predictions = np.zeros(len(df_target_predict))
    feature_importance_df = pd.DataFrame()
    # values_lenth = len(process_feature.features + process_feature.cate_feat)
    # video_id_lenth = len(mlb_model_videoid.classes_)
    # tag_length = len(tag_tfidf_list)
    # word_length = len(words_tfidf_list)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])
        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
                        early_stopping_rounds=200)
        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
        predictions += clf.predict(df_new_feature_predict, num_iteration=clf.best_iteration) / folds.n_splits
        # Per-fold feature importances (disabled; see featureImportance above).
        # fold_importance_df = pd.DataFrame()
        # column = process_feature.features + process_feature.cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
        # fold_importance_df["Feature"] = np.array(column)
        # fold_importance_df["importance"] = clf.feature_importance()
        # fold_importance_df["fold"] = fold_ + 1
        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        # fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
        # fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
        # fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
        # fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
        # feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length)
    print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
    print('oof_mse:', mean_squared_error(df_target, oof))
    print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
    print('test_mse:', mean_squared_error(df_target_predict, predictions))
    print('oof_mape:', MAPE(df_target, oof))
    print('test_mape:', MAPE(df_target_predict, predictions))
    print('verification r2:', r2_score(df_target, oof))
    print('test r2:', r2_score(df_target_predict, predictions))
    # Write out the predicted scores per video.
    sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
    sub_df_['score'] = predictions
    print('regression ranking shape', sub_df_.shape)
    sub_df_.to_csv('result.csv')
if __name__ == '__main__':
    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)