@@ -0,0 +1,103 @@
+import warnings
+
+warnings.filterwarnings("ignore")
+import os
+import pandas as pd
+import gc
+import math
+import numpy as np
+import time
+import lightgbm as lgb
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+from sklearn.metrics import mean_absolute_percentage_error, r2_score
+from sklearn import metrics
+import pickle
+from sklearn.metrics import mean_squared_error
+import seaborn as sns
+import matplotlib.pylab as plt
+from odps import ODPS
+from odps.df import DataFrame as odpsdf
+from datetime import datetime as dt
+import datetime
+import process_feature
+import _pickle as cPickle
+
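+# Training script: pulls daily feature partitions from ODPS (MaxCompute) and fits a
+# LightGBM regression model to predict the 7-day return count ('futre7dayreturn').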
+def getRovfeaturetable(dt, table):
+    # Read one daily partition (dt=YYYYMMDD) of the given ODPS table into a DataFrame.
+    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
+                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in odps.read_table(table, partition='dt=%s' % dt):
+        valueFeature = {}
+        # Keep only the columns listed in process_feature.featurename.
+        for i in process_feature.featurename:
+            if i == 'dt':
+                valueFeature[i] = dt
+            else:
+                valueFeature[i] = record[i]
+        featureArray.append(valueFeature)
+    featureArray = pd.DataFrame(featureArray)
+    print(dt, table, 'feature table finish')
+    return featureArray
+
+def getdatasample(date, max_range, table):
+    # Collect max_range consecutive daily partitions ending at `date` and concatenate them.
+    new_date = dt.strptime(date, '%Y%m%d')
+    datelist = []
+    testlist = []
+    for i in range(0, max_range):
+        delta = datetime.timedelta(days=i)
+        tar_dt = new_date - delta
+        datelist.append(tar_dt.strftime("%Y%m%d"))
+    for tm in datelist:
+        testlist.append(getRovfeaturetable(tm, table))
+    data = pd.concat(testlist)
+    data.reset_index(inplace=True)
+    data = data.drop(axis=1, columns='index')
+    return data
+
+def clean_data(df):
+    # Target: return count over the next 7 days (column name as spelled in the source table).
+    y = df['futre7dayreturn']
+    df_vids = df['videoid']
+    # drop string columns
+    x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title',
+                 'videoallwords', 'words_without_tags'], axis=1)
+    # drop future (label-leaking) columns
+    x = x.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    return x, y, df_vids
+
+def train(x, y):
+    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
+    params = {
+        "objective": "regression",
+        "metric": "rmse",
+        "num_leaves": 30,
+        "learning_rate": 0.1,
+        "bagging_fraction": 0.7,
+        "feature_fraction": 0.7,
+        "bagging_freq": 5,
+        "bagging_seed": 2018,
+        "verbosity": -1
+    }
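+    # Note: passing early_stopping_rounds/verbose_eval straight to lgb.train() below assumes an
+    # older LightGBM release; on 4.x these keyword arguments are gone and the rough equivalent
+    # (an adaptation, not part of the original script) would be
+    #   callbacks=[lgb.early_stopping(100), lgb.log_evaluation(20),
+    #              lgb.record_evaluation(evals_result)]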
+    lgtrain = lgb.Dataset(X_train, label=y_train)
+    lgval = lgb.Dataset(X_test, label=y_test)
+    evals_result = {}
+    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
+                      evals_result=evals_result)
+
+    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
+    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    r2 = r2_score(y_test, pred_test_y)
+    print('err_mape', err_mape)
+    print('r2', r2)
+
+    return pred_test_y, model, evals_result
+
+if __name__ == '__main__':
+    with open(r"train_data.pickle", "rb") as input_file:
+        train_data = cPickle.load(input_file)
+    with open(r"predict_data.pickle", "rb") as input_file:
+        predict_data = cPickle.load(input_file)
+
+    # train
+    x, y, _ = clean_data(train_data)
+    train(x, y)