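"""Training pipeline for a LightGBM regressor that predicts a video's 7-day
return count (`futre7dayreturn`) from daily ODPS feature-table partitions.

Flow: load pickled feature frames -> clean_data() -> train() -> score the
prediction frame and write pred.csv.
"""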
import warnings
warnings.filterwarnings("ignore")

import datetime
from datetime import datetime as dt
import pickle

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from odps import ODPS

import process_feature
def getRovfeaturetable(dt, table):
    """Read one daily partition (dt=YYYYMMDD) of an ODPS feature table into a DataFrame."""
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray
def getdatasample(date, max_range, table):
    """Concatenate the feature partitions for `max_range` consecutive days ending at `date`."""
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    data = pd.concat(testlist)
    data.reset_index(drop=True, inplace=True)
    return data
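# Example usage (the table name here is illustrative, not from this repo):
#   train_data = getdatasample('20220101', 7, 'rov_feature_table')  # partitions 20211226..20220101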
def clean_data(df):
    """Split the raw frame into (features, target, video ids, feature names)."""
    # Floor non-positive targets at 1 so the MAPE metric used downstream stays defined.
    df.loc[df['futre7dayreturn'] <= 0, 'futre7dayreturn'] = 1
    y = df['futre7dayreturn']
    df_vids = df['videoid']
    # Drop string columns and the label itself.
    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
    x = x.drop(['futre7dayreturn'], axis=1)
    # Stage-to-stage return deltas and ratios.
    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn']) / x['stage_four_retrn']
    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn']) / x['stage_three_retrn']
    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn']) / x['stage_two_retrn']
    # Drop 30/60-day aggregate features.
    features = list(x)
    drop_features = [f for f in features if ('day30' in f or 'day60' in f)]
    x = x.drop(drop_features, axis=1)
    x = x.fillna(0)
    x = x.astype('float64')
    # clip() is not in-place, so assign the result; this also caps any inf produced
    # by the zero-denominator ratio divisions above.
    x = x.clip(0, 2000000)

    features = [f for f in features if f not in drop_features]
    return x, y, df_vids, features
def feature_selection(X, y):
    # Note: LogisticRegression is a classifier, so this only makes sense for a
    # discrete target; the helper is currently unused (its call in train() is
    # commented out).
    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
    return selector
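# If selection were re-enabled for this continuous target, an L1-regularized
# regressor would be the matching estimator (a sketch, not wired in):
#   from sklearn.linear_model import Lasso
#   selector = SelectFromModel(estimator=Lasso(alpha=0.1)).fit(X, y)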
def auto_train(X_train, y_train):
    """Baseline AutoML fit with FLAML; note it scores on the training set (no hold-out)."""
    from flaml import AutoML
    automl = AutoML()
    automl_settings = {
        "time_budget": 10,  # in seconds
        "metric": 'r2',
        "task": 'regression',
        "log_file_name": "test/auto.log",
        "estimator_list": ["lgbm"]
    }
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    # Predictions on the training data itself, so this r2 is optimistic.
    pred_test_y = automl.predict(X_train)
    y_test = y_train.values
    r2 = r2_score(y_test, pred_test_y)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'autoval.csv')
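# Sketch of scoring FLAML on a held-out split instead of the training data
# (variable names are illustrative):
#   X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2)
#   automl.fit(X_train=X_tr, y_train=y_tr, **automl_settings)
#   print('held-out r2', r2_score(y_val, automl.predict(X_val)))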
def train(x, y, features):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
    '''
    selector = feature_selection(X_train, y_train)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)
    selected_features = []
    _supported = selector.get_support()
    for i in range(0, len(_supported)):
        if _supported[i]:
            selected_features.append(features[i])
    features = selected_features
    '''
    print(len(X_train), len(X_test))
    params = {
        "objective": "regression",
        "reg_sqrt": True,
        "metric": "mape",
        "max_depth": 6,
        "num_leaves": 30,
        "learning_rate": 0.05,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 8,
        "bagging_seed": 2018,
        "lambda_l1": 0.1,
        "boosting": "gbdt",
        "nthread": 4,
        "verbosity": -1
    }
    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_test, label=y_test)
    evals_result = {}
    # Early stopping, logging, and eval recording via callbacks; the
    # early_stopping_rounds / verbose_eval / evals_result keyword arguments
    # were removed in LightGBM 4.x.
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval],
                      callbacks=[lgb.early_stopping(200),
                                 lgb.log_evaluation(20),
                                 lgb.record_evaluation(evals_result)])
    pack_result(model.feature_importance(), features, [], 'importance.csv')

    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
    y_test = y_test.values
    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
    r2 = r2_score(y_test, pred_test_y)
    print('err_mape', err_mape)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'val.csv')
    return pred_test_y, model, evals_result
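# The recorded eval history can be inspected after training, e.g. (requires matplotlib):
#   import matplotlib.pyplot as plt
#   lgb.plot_metric(evals_result, metric='mape')
#   plt.show()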
def pack_result(y_, y, vid, fp):
    """Write scores (plus optional vids and ground truth) to CSV, sorted by score descending."""
    y_ = np.asarray(y_).reshape(-1)  # flatten to 1-D for the DataFrame column
    df = pd.DataFrame(data=y_, columns=['score'])
    if len(vid) > 0:
        df['vid'] = vid
    df['y'] = y
    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)

if __name__ == '__main__':
    with open(r"train_data_x.pickle", "rb") as input_file:
        train_data = pickle.load(input_file)
    with open(r"predict_data_x.pickle", "rb") as input_file:
        predict_data = pickle.load(input_file)
    x, y, _, features = clean_data(train_data)
    # auto train
    # auto_train(x, y)
    # train
    _, model, _ = train(x, y, features)
    with open('model.pickle', 'wb') as output_file:
        pickle.dump(model, output_file)
    '''
    with open(r"model.pickle", "rb") as input_file:
        model = pickle.load(input_file)
    '''
    x, y, vid, _ = clean_data(predict_data)
    y_ = model.predict(x, num_iteration=model.best_iteration)
    pack_result(y_, y, vid, 'pred.csv')