import warnings
warnings.filterwarnings("ignore")

import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy import sparse
from scipy.sparse import hstack
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from odps import ODPS

import process_feature
import process_tag


def getRovfeaturetable(dt_str, table):
    """Read one daily partition (dt=dt_str) of an ODPS feature table into a DataFrame."""
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)

    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt_str):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt_str
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt_str, table, 'feature table finished')
    return featureArray


def getdatasample(date, max_range, table):
    """Concatenate `max_range` consecutive daily partitions, counting backwards from `date`."""
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime('%Y%m%d'))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    testdata = pd.concat(testlist)
    testdata.reset_index(inplace=True)
    testdata = testdata.drop(axis=1, columns='index')
    return testdata


def select_recent_video(df):
    """Rank each video's rows by date (most recent first) and keep only the latest day."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=False, method='first')
    df = df[df['rk'] == 1]
    return df


def basic_cal(df):
    """Derive the regression target (log of the 7-day return) and a binary proxy label."""
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    df['weighted_retrn_log'] = np.log(df['weighted_retrn'] + 1)
    df['return_back'] = (df['weighted_retrn'] > 0).astype(int)
    return df


def dataprepare(df_pre):
    # Feed the features in directly; no crossed features.
    df_pre = df_pre.fillna(0)
    df_new_feature = df_pre[process_feature.filter_recent_features()]
    df_target = df_pre['weighted_retrn_log']
    df_new_feature = pd.concat(
        [df_new_feature, df_pre[process_feature.cate_feat], df_pre[process_feature.one_hot_feature]],
        axis=1)
    return df_new_feature, df_target


def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
                      values_lenth, video_id_lenth, tag_length, word_length):
    """Collapse the one-hot videoid/tag/word importances of each fold into single rows."""
    Feature_Data = pd.DataFrame()
    for fold, df in enumerate((fold1_df, fold2_df, fold3_df, fold4_df), start=1):
        value_df = df.iloc[0:values_lenth, :]
        videoid_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
        videoid_df = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_importance, 'fold': fold}])
        tag_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
        tag_df = pd.DataFrame([{'Feature': 'tags', 'importance': tag_importance, 'fold': fold}])
        words_importance = df.iloc[values_lenth + video_id_lenth + tag_length:values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
        words_df = pd.DataFrame([{'Feature': 'words', 'importance': words_importance, 'fold': fold}])
        Feature_Data = pd.concat([Feature_Data, value_df, videoid_df, tag_df, words_df])
    return Feature_Data
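
# Illustrative sketch (an assumption, not part of the original pipeline):
# featureImportance relies on the LightGBM feature vector being laid out as
# [value features | videoid one-hots | tag tf-idf | word tf-idf], i.e. exactly the
# hstack order built in process_train_predict_data(). The helper below only makes
# those block boundaries explicit; the name _feature_block_slices is hypothetical.
def _feature_block_slices(values_lenth, video_id_lenth, tag_length, word_length):
    """Return {block name: (start, stop)} column slices, in hstack order."""
    bounds = np.cumsum([0, values_lenth, video_id_lenth, tag_length, word_length])
    names = ('values', 'videoid', 'tags', 'words')
    return {name: (int(bounds[i]), int(bounds[i + 1])) for i, name in enumerate(names)}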
def MAPE(true, pred):
    """Mean absolute percentage error, computed over the non-zero targets only."""
    true = np.asarray(true).ravel()
    pred = np.asarray(pred).ravel()
    mask = true != 0
    return np.mean(np.abs(true[mask] - pred[mask]) / true[mask])


def process_train_predict_data():
    now_date = datetime.date.today()
    diff_1 = datetime.timedelta(days=1)
    diff_7 = datetime.timedelta(days=7)
    predict_dt = now_date - diff_1
    predict_day = predict_dt.strftime('%Y%m%d')
    train_dt = now_date - diff_7
    train_day = train_dt.strftime('%Y%m%d')

    # Read data from ODPS: 30 days of training partitions, one day to score.
    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')

    # Cache the pulls locally so test runs can skip the ODPS round trip.
    import _pickle as cPickle
    with open('train_data.pickle', 'wb') as output_file:
        cPickle.dump(train_data, output_file)
    with open('predict_data.pickle', 'wb') as output_file:
        cPickle.dump(predict_data, output_file)
    # To reload the cached copies instead of pulling from ODPS:
    # with open('train_data.pickle', 'rb') as input_file:
    #     train_data = cPickle.load(input_file)
    # with open('predict_data.pickle', 'rb') as input_file:
    #     predict_data = cPickle.load(input_file)

    train_data = basic_cal(train_data)
    predict_data = basic_cal(predict_data)

    predict_data = select_recent_video(predict_data)
    # predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
    predict_data = predict_data.drop(axis=1, columns='rk')

    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)

    train_data = train_data.fillna(0)
    predict_data = predict_data.fillna(0)

    train_data = process_feature.cal_feature(train_data)
    predict_data = process_feature.cal_feature(predict_data)

    predict_data['videoid'] = predict_data['videoid'].astype('int')

    df_new_feature, df_target = dataprepare(train_data)
    df_new_feature_predict, df_target_predict = dataprepare(predict_data)

    # Dense value features: every column from day1playcount through videocategory555.
    df_new_feature_part_one = sparse.csr_matrix(
        np.array(pd.DataFrame(df_new_feature).loc[:, 'day1playcount':'videocategory555']))
    df_new_feature_predict_part_one = sparse.csr_matrix(
        np.array(pd.DataFrame(df_new_feature_predict).loc[:, 'day1playcount':'videocategory555']))
    print('value features generated successfully')

    # One-hot encode videoid over the union of train and predict ids, so both
    # matrices share the same column space.
    train_videoid = pd.DataFrame(df_new_feature).loc[:, 'videoid']
    predict_videoid = pd.DataFrame(df_new_feature_predict).loc[:, 'videoid']

    train_videoid_list = train_videoid.to_numpy().reshape(-1, 1).tolist()
    predict_videoid_list = predict_videoid.to_numpy().reshape(-1, 1).tolist()

    allvideo_raw = list(set(np.array(pd.concat([train_videoid, predict_videoid])).tolist()))
    allvideo = np.array(allvideo_raw).reshape(-1, 1).tolist()

    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
    train_videoid = mlb_model_videoid.transform(train_videoid_list)
    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
    print('videoid features generated successfully')

    # Tag one-hot matrices, then per-tag tf-idf weights.
    tags, train_tag, predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
    tag_dict = process_tag.get_tag_tfidf('20200305', 'video_tag_tf_idf')
    print('length of tag_dict:', len(tag_dict))

    # Weight the tag one-hots by their tf-idf scores (sparse, broadcast by column).
    tag_corpus = tags.tolist()
    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus, tag_dict)
    tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
    print('tag tf-idf features generated successfully')
    print('tag dimension:', len(tag_tfidf_list))

    # Same procedure for the non-tag words.
    words, train_words, test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
    words_dict = process_tag.get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
    print('length of words_dict:', len(words_dict))

    words_corpus = words.tolist()
    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus, words_dict)
    words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
    words_feature_train = train_words.multiply(words_tf_idf_matrix)
    words_feature_test = test_words.multiply(words_tf_idf_matrix)
    print('words tf-idf features generated successfully')
    print('words dimension:', len(words_tfidf_list))

    # Final design matrices: [values | videoid one-hots | tag tf-idf | word tf-idf].
    df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
    df_new_feature_predict = hstack([df_new_feature_predict_part_one, predict_videoid, tag_feature_test, words_feature_test])

    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
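
# Minimal sketch (toy numbers, not part of the pipeline) of the tf-idf weighting
# used above: multiplying an (n_rows x n_tags) one-hot indicator matrix elementwise
# by a (1 x n_tags) weight row broadcasts each weight down its column, turning the
# 0/1 indicators into tf-idf values. The name _demo_tfidf_weighting is hypothetical.
def _demo_tfidf_weighting():
    one_hot = sparse.csr_matrix(np.array([[1, 0, 1],
                                          [0, 1, 0]]))        # two rows, three tags
    weights = sparse.csr_matrix(np.array([[0.5, 2.0, 1.5]]))  # one tf-idf weight per tag
    weighted = one_hot.multiply(weights)                      # same shape as one_hot
    return weighted.toarray()  # [[0.5, 0. , 1.5], [0. , 2. , 0. ]]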
def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
    # Targets as dense column arrays.
    df_target_predict = sparse.csr_matrix(pd.DataFrame(df_target_predict).values).toarray()
    df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()

    param = {'num_leaves': 18,
             'min_data_in_leaf': 60,
             'objective': 'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             'min_child_samples': 30,
             'boosting': 'gbdt',
             'feature_fraction': 0.8,
             'bagging_freq': 1,
             'bagging_fraction': 0.8,
             'bagging_seed': 11,
             'metric': 'rmse',
             'lambda_l1': 0.1,
             'verbosity': -1,
             'nthread': 4,
             # 'max_bin': 512,
             'random_state': 4590}

    # Stratify the regression folds on the binary return_back proxy so every fold
    # keeps a similar share of zero-return videos.
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
    oof = np.zeros(len(df_target))
    predictions = np.zeros(len(df_target_predict))
    feature_importance_df = pd.DataFrame()

    # values_lenth = len(process_feature.features + process_feature.cate_feat)
    # video_id_lenth = len(mlb_model_videoid.classes_)
    # tag_length = len(tag_tfidf_list)
    # word_length = len(words_tfidf_list)

    # Debug view of the dense prediction matrix (unused, and expensive to materialize):
    # change_view = pd.DataFrame(df_new_feature_predict.toarray()).sort_index()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
        print('fold {}'.format(fold_))
        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])

        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                        verbose_eval=100, early_stopping_rounds=200)
        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
        predictions += clf.predict(df_new_feature_predict, num_iteration=clf.best_iteration) / folds.n_splits

        # Per-fold feature importance, currently disabled. The column layout must
        # match the hstack order: value features, videoid one-hots, tags, words.
        # fold_importance_df = pd.DataFrame()
        # column = process_feature.features + process_feature.cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
        # fold_importance_df["Feature"] = np.array(column)
        # fold_importance_df["importance"] = clf.feature_importance()
        # fold_importance_df["fold"] = fold_ + 1
        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
    # fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
    # fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
    # fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
    # feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
    #                                           values_lenth, video_id_lenth, tag_length, word_length)

    print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
    print('oof_mse:', mean_squared_error(df_target, oof))
    print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
    print('test_mse:', mean_squared_error(df_target_predict, predictions))
    print('oof_mape:', MAPE(df_target, oof))
    print('test_mape:', MAPE(df_target_predict, predictions))
    print('verification r2:', r2_score(df_target, oof))
    print('test r2:', r2_score(df_target_predict, predictions))

    sub_df_ = pd.DataFrame({'videoid': predict_data['videoid'].values})
    sub_df_['score'] = predictions
    print('regression ranking shape', sub_df_.shape)
    sub_df_.to_csv('result.csv')
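
# Minimal sketch (toy data, hypothetical helper) of why do_train can use
# StratifiedKFold on a regression problem: it splits on the binary return_back
# proxy, so each validation fold keeps the same zero/non-zero return balance.
def _demo_stratified_split():
    X = np.arange(16).reshape(8, 2)               # eight dummy samples
    proxy = np.array([0, 0, 0, 0, 1, 1, 1, 1])    # stand-in for return_back
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, proxy)):
        # each validation fold holds one zero-return and one positive-return row
        print(fold_, proxy[val_idx])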

if __name__ == '__main__':
    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)