rov_train_new.py

import warnings
warnings.filterwarnings("ignore")

import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from odps import ODPS

import process_feature
import _pickle as cPickle
def getRovfeaturetable(dt_str, table):
    """Read one dt=<dt_str> partition of an ODPS feature table into a DataFrame."""
    # NOTE: credentials are hardcoded here; consider loading them from the environment.
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt_str):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt_str
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt_str, table, 'feature table finish')
    return featureArray
def getdatasample(date, max_range, table):
    """Concatenate the feature partitions for `date` and the max_range - 1 days before it."""
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    data = pd.concat(testlist)
    data.reset_index(drop=True, inplace=True)
    return data
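
# Usage sketch (hypothetical date and table name, for illustration only):
#
#   df = getdatasample('20220601', 7, 'some_rov_feature_table')
#
# would read the partitions dt=20220601 back through dt=20220526 and
# concatenate them into one DataFrame with a fresh 0..n-1 index.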
def clean_data(df):
    """Split a raw feature DataFrame into (X, y, video ids, feature names)."""
    #y = df['futre7dayreturn'].apply(lambda x: np.log(x + 1))
    y = df['futre7dayreturn']
    df_vids = df['videoid']
    # drop string columns
    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
    # drop future (label) columns
    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
    x = x.drop(['futre7dayreturn'], axis=1)
    features = list(x)
    # drop the 30-day / 60-day aggregate features
    drop_features = [f for f in features if 'day30' in f or 'day60' in f]
    x = x.drop(drop_features, axis=1)
    features = [f for f in features if f not in drop_features]
    return x, y, df_vids, features
def train(x, y, features):
    """Train a LightGBM regressor; dump feature importances and validation scores to CSV."""
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    params = {
        "objective": "regression",
        "metric": "mape",
        "max_depth": 5,
        "num_leaves": 30,
        "learning_rate": 0.1,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": 2018,
        "lambda_l1": 0.1,
        "boosting": "gbdt",
        "nthread": 4,
        "verbosity": -1,
    }
    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_test, label=y_test)
    evals_result = {}
    # early_stopping_rounds / verbose_eval / evals_result are the pre-4.0 LightGBM API
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval],
                      early_stopping_rounds=100, verbose_eval=20,
                      evals_result=evals_result)
    pack_result(model.feature_importance(), features, [], 'importance.csv')
    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
    y_test = y_test.values
    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
    r2 = r2_score(y_test, pred_test_y)
    #print('err_mape', err_mape)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'val.csv')
    return pred_test_y, model, evals_result
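
# The lgb.train call above uses the pre-4.0 LightGBM keyword API. If this script
# is run against lightgbm >= 4.0 (an assumption about your install), those
# keywords were removed and must be passed as callbacks instead; an equivalent
# sketch:
#
#   model = lgb.train(
#       params, lgtrain, num_boost_round=10000, valid_sets=[lgval],
#       callbacks=[
#           lgb.early_stopping(stopping_rounds=100),
#           lgb.log_evaluation(period=20),
#           lgb.record_evaluation(evals_result),
#       ],
#   )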
def pack_result(y_, y, vid, fp):
    """Write predictions (and optionally video ids) to a CSV sorted by descending score."""
    #y_ = y_.astype(int)
    y_ = np.asarray(y_).reshape(-1)  # flatten to 1-D so it maps to a single column
    df = pd.DataFrame(data=y_, columns=['score'])
    if len(vid) > 0:
        df['vid'] = vid
    df['y'] = y
    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)
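
# Illustrative call (made-up values): writes demo.csv sorted by descending
# score, with columns score, vid, y:
#
#   pack_result(np.array([0.2, 0.9]), [1, 3], ['v001', 'v002'], 'demo.csv')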
if __name__ == '__main__':
    with open("train_data.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open("predict_data.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)

    # train
    x, y, _, features = clean_data(train_data)
    _, model, _ = train(x, y, features)
    with open('model.pickle', 'wb') as output_file:
        cPickle.dump(model, output_file)
    '''
    with open("model.pickle", "rb") as input_file:
        model = cPickle.load(input_file)
    '''

    # predict
    x, y, vid, _ = clean_data(predict_data)
    y_ = model.predict(x, num_iteration=model.best_iteration)
    pack_result(y_, y, vid, 'pred.csv')