import warnings
warnings.filterwarnings("ignore")

import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy import sparse
from scipy.sparse import hstack
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from odps import ODPS

import process_feature
import process_tag


def getRovfeaturetable(dt, table):
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)

    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray


def getdatasample(date, max_range, table):
    # Walk back `max_range` days from `date` and concatenate the daily
    # feature partitions into a single frame.
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))

    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    testdata = pd.concat(testlist)
    testdata.reset_index(inplace=True)
    testdata = testdata.drop(axis=1, columns='index')
    return testdata


def select_recent_video(df):
    """Rank each video's rows by date and keep only the most recent day."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
    df = df[df['rk'] == 1]
    return df


def basic_cal(df):
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    df['weighted_retrn_log'] = np.log(df['weighted_retrn'] + 1)
    # Label is 1 when the return count exceeds the threshold (here > 0);
    # videos with no shares, or shares but zero returns, are labeled 0.
    df['return_back'] = df.apply(lambda x: 1 if x['weighted_retrn'] > 0 else 0, axis=1)
    return df


def today_view_category(predict_data):
    # Bucket today's exposure by view-count rank (7 rank buckets) and add a
    # category feature: each video's 'todyviewcount' is set to the mean
    # 'day1viewcount' of its bucket.
    rank_buckets = [(10000, None), (3000, 10000), (1000, 3000), (300, 1000),
                    (100, 300), (30, 100), (0, 30)]
    for low, high in rank_buckets:
        mask = predict_data['day1viewcount_rank'] > low
        if high is not None:
            mask &= predict_data['day1viewcount_rank'] <= high
        predict_data.loc[mask, 'todyviewcount'] = \
            predict_data.loc[mask, 'day1viewcount'].mean()
    return predict_data
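
# --- Hedged sanity-check sketch (illustrative addition, not part of the
# production flow). `_demo_basic_helpers` and its toy values are hypothetical;
# it shows what basic_cal() and select_recent_video() produce on a tiny frame.
def _demo_basic_helpers():
    toy = pd.DataFrame({
        'videoid': [1, 1, 2],
        'dt': [20200301, 20200302, 20200302],
        'futre7dayreturn': [0, 5, 12],
    })
    recent = select_recent_video(basic_cal(toy))
    # Expect one row per video (the 20200302 rows), with
    # weighted_retrn_log = log(1 + futre7dayreturn) and
    # return_back = 1 wherever the return count is positive.
    print(recent[['videoid', 'dt', 'weighted_retrn_log', 'return_back']])
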
def dataprepare(df_pre):
    # Feed the features in directly; no crossed features.
    # Zero-fill missing values.
    df_pre = df_pre.fillna(0)
    # df_new_feature = df_pre[process_feature.features]
    df_new_feature = df_pre[process_feature.filter_recent_features()]
    print(df_new_feature.shape)
    df_target = df_pre['weighted_retrn_log']
    df_new_feature = pd.concat([df_new_feature,
                                df_pre[process_feature.cate_feat],
                                df_pre[process_feature.one_hot_feature]], axis=1)
    return df_new_feature, df_target


def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
                      values_lenth, video_id_lenth, tag_length, word_length):
    # Keep the value features as-is and collapse each one-hot block
    # (videoid, tags, words) into a single aggregate importance row per fold.
    Feature_Data = pd.DataFrame()
    for df in (fold1_df, fold2_df, fold3_df, fold4_df):
        fold1_df1 = df.iloc[0:values_lenth, :]
        videoid_fold1_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
        fold1_df2 = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_fold1_importance, 'fold': 1}])
        tag_fold1_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
        fold1_df3 = pd.DataFrame([{'Feature': 'tags', 'importance': tag_fold1_importance, 'fold': 1}])
        words_fold1_importance = df.iloc[values_lenth + video_id_lenth + tag_length:values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
        fold1_df4 = pd.DataFrame([{'Feature': 'words', 'importance': words_fold1_importance, 'fold': 1}])
        Feature_Data = pd.concat([Feature_Data, fold1_df1, fold1_df2, fold1_df3, fold1_df4])
    return Feature_Data


def MAPE(true, pred):
    # Mean absolute percentage error, skipping zero targets to avoid
    # division by zero.
    true = np.array(true)
    sum_ = 0
    count = 0
    for i in range(len(true)):
        if true[i] != 0:
            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
            count = count + 1
        else:
            continue
    return sum_ / count
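
# --- Hedged sanity check (illustrative addition): zero targets are skipped by
# MAPE(), so only the nonzero entries contribute. With true=[2, 0, 4] and
# pred=[1, 3, 5] the expected value is (|2-1|/2 + |4-5|/4) / 2 = 0.375.
def _demo_mape():
    assert abs(MAPE([2, 0, 4], [1, 3, 5]) - 0.375) < 1e-9
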
def process_train_predict_data():
    now_date = datetime.date.today()
    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
    diff_1 = datetime.timedelta(days=1)
    diff_7 = datetime.timedelta(days=7)
    predict_dt = now_date - diff_1
    predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
    train_dt = now_date - diff_7
    train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')

    # Read raw features from Aliyun ODPS.
    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')

    # Cache the raw frames so test runs can skip the ODPS pull.
    import _pickle as cPickle
    with open('train_data.pickle', 'wb') as output_file:
        cPickle.dump(train_data, output_file)
    with open('predict_data.pickle', 'wb') as output_file:
        cPickle.dump(predict_data, output_file)
    '''
    with open(r"train_data.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open(r"predict_data.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)
    '''
    # End of pickle cache block.

    train_data = basic_cal(train_data)
    predict_data = basic_cal(predict_data)

    predict_data = select_recent_video(predict_data)
    predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
    predict_data = predict_data.drop(axis=1, columns='rk')

    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)

    train_data = train_data.fillna(0)
    predict_data = predict_data.fillna(0)

    train_data = process_feature.cal_feature(train_data)
    predict_data = process_feature.cal_feature(predict_data)
    predict_data = today_view_category(predict_data)
    predict_data['videoid'] = predict_data['videoid'].astype('int')

    df_new_feature, df_target = dataprepare(train_data)
    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
    print(df_target_predict)

    df_new_feature_part_one = sparse.csr_matrix(
        np.array(pd.DataFrame(df_new_feature).loc[:, 'day1playcount':'videocategory555']))
    df_new_feature_predict_part_one = sparse.csr_matrix(
        np.array(pd.DataFrame(df_new_feature_predict).loc[:, 'day1playcount':'videocategory555']))
    print('value feature generate successfully')

    # One-hot encode videoid over the union of train and predict ids.
    train_videoid = pd.DataFrame(df_new_feature).loc[:, 'videoid']
    predict_videoid = pd.DataFrame(df_new_feature_predict).loc[:, 'videoid']
    train_videoid_list = train_videoid.to_numpy().reshape(len(train_videoid), 1).tolist()
    predict_videoid_list = predict_videoid.to_numpy().reshape(len(predict_videoid), 1).tolist()

    allvideo_raw = list(set(np.array(pd.concat([train_videoid, predict_videoid])).tolist()))
    allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw), 1).tolist()

    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
    train_videoid = mlb_model_videoid.transform(train_videoid_list)
    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
    print('videoid feature generate successfully')

    # Tag one-hot features.
    tags, train_tag, predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
    # Tag tf-idf weights.
    tag_dict = process_tag.get_tag_tfidf('20200305', 'video_tag_tf_idf')
    print('length tag_dict:', len(tag_dict))
    # Sparse tf-idf-weighted tag matrix.
    tag_corpus = tags.tolist()
    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus, tag_dict)
    tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
    print('tag tfidf feature generate successfully')
    print('tag dimension:', len(tag_tfidf_list))

    # Word features, excluding tags.
    words, train_words, test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
    # Word tf-idf weights.
    words_dict = process_tag.get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
    print('length words_dict:', len(words_dict))
    # Sparse tf-idf-weighted word matrix.
    words_corpus = words.tolist()
    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus, words_dict)
    words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
    words_feature_train = train_words.multiply(words_tf_idf_matrix)
    words_feature_test = test_words.multiply(words_tf_idf_matrix)
    print('words tfidf feature generate successfully')
    print('words dimension:', len(words_tfidf_list))

    df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
    df_new_feature_predict = hstack([df_new_feature_predict_part_one, predict_videoid, tag_feature_test, words_feature_test])

    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
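
# --- Hedged sketch (illustrative addition, toy tags and weights) of the
# encoding pattern used in process_train_predict_data(): MultiLabelBinarizer
# yields a sparse one-hot matrix, which .multiply() then re-weights
# column-wise against a (1 x n_classes) tf-idf row vector, mirroring
# train_tag.multiply(tag_tf_idf_matrix) above.
def _demo_sparse_onehot_tfidf():
    toy_tags = [['cat'], ['dog'], ['cat']]
    mlb = MultiLabelBinarizer(sparse_output=True).fit(toy_tags)
    onehot = mlb.transform(toy_tags)                      # shape (3, 2)
    weights = sparse.csr_matrix(np.array([[0.2, 0.8]]))   # one weight per class
    weighted = onehot.multiply(weights)                   # row-wise broadcast
    print(mlb.classes_)        # ['cat' 'dog']
    print(weighted.toarray())  # [[0.2 0. ] [0.  0.8] [0.2 0. ]]
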
def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
    # Targets as dense arrays.
    df_target_predict = sparse.csr_matrix(pd.DataFrame(df_target_predict).values).toarray()
    df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()

    param = {'num_leaves': 18,
             'min_data_in_leaf': 60,
             'objective': 'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 30,
             "boosting": "gbdt",
             "feature_fraction": 0.8,
             "bagging_freq": 1,
             "bagging_fraction": 0.8,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1,
             "nthread": 4,
             # 'max_bin': 512,
             "random_state": 4590}

    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
    # oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
    oof = np.zeros(len(df_target))
    predictions = np.zeros(len(df_target_predict))
    feature_importance_df = pd.DataFrame()

    # values_lenth = len(process_feature.features + process_feature.cate_feat)
    # video_id_lenth = len(mlb_model_videoid.classes_)
    # tag_length = len(tag_tfidf_list)
    # word_length = len(words_tfidf_list)

    change_view = pd.DataFrame(df_new_feature_predict.toarray())
    change_view = change_view.sort_index()

    # Stratify the folds on the binary return label so each fold sees a
    # similar share of videos with returns.
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
        print("folds {}".format(fold_))
        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])

        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                        verbose_eval=100, early_stopping_rounds=200)
        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
        predictions += clf.predict(df_new_feature_predict, num_iteration=clf.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        # column = process_feature.features + process_feature.cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
        # fold_importance_df["Feature"] = np.array(column)
        # fold_importance_df["importance"] = clf.feature_importance()
        # fold_importance_df["fold"] = fold_ + 1
        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
    # fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
    # fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
    # fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
    # feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length)

    print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
    print('oof_mse:', mean_squared_error(df_target, oof))
    print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
    print('test_mse:', mean_squared_error(df_target_predict, predictions))
    print('oof_mape:', MAPE(df_target, oof))
    print('test_mape:', MAPE(df_target_predict, predictions))
    print('verification r2:', r2_score(df_target, oof))
    print('test r2:', r2_score(df_target_predict, predictions))

    sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
    sub_df_['score'] = predictions
    print('regre ranking shape', sub_df_.shape)
    sub_df_.to_csv('result.csv')
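
# --- Optional offline-debug helper (an added sketch mirroring the
# commented-out cPickle block in process_train_predict_data): reload the
# cached frames written there instead of re-pulling from ODPS. The file
# names match the dump calls above.
def _load_cached_data():
    import _pickle as cPickle
    with open('train_data.pickle', 'rb') as input_file:
        train_data = cPickle.load(input_file)
    with open('predict_data.pickle', 'rb') as input_file:
        predict_data = cPickle.load(input_file)
    return train_data, predict_data
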
if __name__ == '__main__':
    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)
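
# Run end to end, this script pulls 30 days of training features
# ('rov_feature_add_v1') plus the latest 'rov_predict_table_add_v1' partition
# from ODPS, trains a 4-fold LightGBM regressor on log-returns, and writes
# per-video scores to result.csv in the working directory.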