import warnings

warnings.filterwarnings("ignore")

import os
import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, top_k_accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from odps import ODPS

import process_feature
import _pickle as cPickle

def getRovfeaturetable(dt, table):
    # Credentials come from the environment; access keys must not be hardcoded.
    odps = ODPS(os.environ['ODPS_ACCESS_ID'], os.environ['ODPS_ACCESS_KEY'], 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)

    # Read one day's partition, keeping only the columns listed in
    # process_feature.featurename and overriding 'dt' with the requested date.
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray
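
# ODPS_ACCESS_ID / ODPS_ACCESS_KEY are this file's assumed variable names,
# exported before launch, e.g.:
#   export ODPS_ACCESS_ID=<your-access-id>
#   export ODPS_ACCESS_KEY=<your-access-key>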

def getdatasample(date, max_range, table):
    # Pull max_range daily partitions, walking backwards from `date`
    # (inclusive), and stack them into a single frame.
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    data = pd.concat(testlist)
    data.reset_index(drop=True, inplace=True)
    return data
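
# Usage sketch; the table name here is hypothetical, not from the original:
def _demo_getdatasample():
    # pulls partitions dt=20211010, 20211009 and 20211008 into one frame
    return getdatasample('20211010', 3, 'some_rov_feature_table')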

def discrete_y(y):
    # Bucket the raw 7-day return count into 8 ordinal classes (0..7) so the
    # target can be trained as a multiclass problem.
    y = float(y)
    if y > 1000000:
        return 7
    elif y > 500000:
        return 6
    elif y > 100000:
        return 5
    elif y > 50000:
        return 4
    elif y > 10000:
        return 3
    elif y > 5000:
        return 2
    elif y > 1000:
        return 1
    else:
        return 0
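
# Quick sanity check of the thresholds above (illustrative values only):
def _demo_discrete_y():
    for v in [0, 999, 1001, 5001, 10001, 50001, 100001, 500001, 1000001]:
        print(v, '->', discrete_y(v))  # 0, 0, 1, 2, 3, 4, 5, 6, 7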

def clean_data(df):
    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
    df['futre7dayreturn'] = df['futre7dayreturn'].apply(discrete_y)
    y = df['futre7dayreturn']
    print(y.value_counts())  # class balance after bucketing
    df_vids = df['videoid']
    # drop string columns
    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
    # drop future (label-leaking) columns
    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
    x = x.drop(['futre7dayreturn'], axis=1)

    # stage-to-stage return deltas and growth ratios
    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']

    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn']) / x['stage_four_retrn']
    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn']) / x['stage_three_retrn']
    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn']) / x['stage_two_retrn']

    # drop the long-window (day30/day60) aggregates
    features = list(x)
    drop_features = [f for f in features if (f.find('day30') != -1 or f.find('day60') != -1)]
    x = x.drop(drop_features, axis=1)
    # zero-division in the ratios yields inf/NaN; zero both out
    x = x.replace([np.inf, -np.inf], np.nan).fillna(0)
    x = x.astype('float64')
    x = x.clip(0, 2000000)  # clip() returns a copy, so the result must be assigned

    features = [f for f in features if f not in drop_features]
    return x, y, df_vids, features
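
# Minimal sketch (toy numbers) of the stage delta/ratio features built above:
# cumulative returns 10 -> 25 give a delta of 15 and a growth ratio of 0.6.
def _demo_stage_features():
    toy = pd.DataFrame({'stage_one_retrn': [10], 'stage_two_retrn': [25]})
    toy['stage_two_retrn_added'] = toy['stage_two_retrn'] - toy['stage_one_retrn']  # 15
    toy['stage_two_retrn_ratio'] = toy['stage_two_retrn_added'] / toy['stage_two_retrn']  # 0.6
    print(toy)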

def feature_selection(X, y):
    # Cheap filter: fit a logistic regression and keep features whose
    # coefficient magnitude clears SelectFromModel's default threshold.
    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
    return selector
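
# Illustrative use (mirrors the block currently disabled inside train()):
def _demo_feature_selection(X_train, y_train, features):
    selector = feature_selection(X_train, y_train)
    kept = [f for f, keep in zip(features, selector.get_support()) if keep]
    return selector.transform(X_train), kept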

def auto_train(X_train, y_train):
    from flaml import AutoML  # optional dependency, imported lazily
    automl = AutoML()
    automl_settings = {
        "time_budget": 10,  # in seconds
        "metric": 'r2',
        "task": 'regression',
        "log_file_name": "test/auto.log",
        "estimator_list": ["lgbm"]
    }
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

    # Evaluated in-sample on the training data, so r2 measures fit, not
    # generalization.
    pred_train_y = automl.predict(X_train)
    y_true = y_train.values

    #err_mape = mean_absolute_percentage_error(y_true, pred_train_y)
    r2 = r2_score(y_true, pred_train_y)
    #print('err_mape', err_mape)
    print('r2', r2)

    pack_result(pred_train_y, y_true, [], 'autoval.csv')
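
# Note: auto_train() is configured as a regression ('r2') task, while
# clean_data() now buckets the target via discrete_y; re-enabling the
# auto_train call in __main__ would fit a regressor on ordinal class ids
# rather than raw return counts.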

def train(x, y, features):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify=y)

    '''
    selector = feature_selection(X_train, y_train)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)
    selected_features = []
    _supported = selector.get_support()
    for i in range(0, len(_supported)):
        if _supported[i]:
            selected_features.append(features[i])
    features = selected_features
    '''

    print(len(X_train), len(X_test))
    params = {
        "objective": "multiclass",
        "num_classes": 8,
        "max_depth": 6,
        "num_leaves": 30,
        "learning_rate": 0.05,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 8,
        "bagging_seed": 2018,
        "lambda_l1": 0.1,
        "boosting": "gbdt",
        "nthread": 4,
        "verbosity": -1
    }
    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_test, label=y_test)
    evals_result = {}
    # Callback-style API; the old early_stopping_rounds/verbose_eval keyword
    # arguments were removed in LightGBM 4.x.
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval],
                      callbacks=[lgb.early_stopping(200),
                                 lgb.log_evaluation(20),
                                 lgb.record_evaluation(evals_result)])

    pack_result(model.feature_importance(), features, [], 'importance.csv')

    # predict() returns an (n_samples, 8) probability matrix for multiclass.
    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)

    top1 = top_k_accuracy_score(y_test, pred_test_y, k=1, labels=np.arange(8))
    print('top_k_accuracy_score', top1)

    pack_result(pred_test_y, y_test, [], 'val.csv')

    return pred_test_y, model, evals_result
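
# evals_result returned by train() is populated by lgb.record_evaluation;
# e.g. evals_result['valid_0']['multi_logloss'] holds the per-iteration
# validation loss (key names follow LightGBM's defaults for this setup).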

def pack_result(y_, y, vid, fp):
    #y_ = y_.astype(int)
    # Collapse a multiclass probability matrix to hard labels; 1D inputs
    # (regression scores, feature importances) pass through unchanged.
    y_ = np.asarray(y_)
    if y_.ndim > 1:
        y_ = y_.argmax(axis=1)
    df = pd.DataFrame(data=y_, columns=['score'])
    if len(vid) > 0:
        df['vid'] = vid
    df['y'] = y
    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)
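
# Toy usage sketch (synthetic values; 'demo.csv' is a placeholder path):
def _demo_pack_result():
    pack_result(np.array([0.2, 0.9]), [0, 1], ['vidA', 'vidB'], 'demo.csv')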

if __name__ == '__main__':
    with open(r"train_data_x.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open(r"predict_data_x.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)

    x, y, _, features = clean_data(train_data)

    # auto train
    #auto_train(x, y)

    # train
    _, model, _ = train(x, y, features)
    with open('model.pickle', 'wb') as output_file:
        cPickle.dump(model, output_file)
    '''
    with open(r"model.pickle", "rb") as input_file:
        model = cPickle.load(input_file)
    '''
    x, y, vid, _ = clean_data(predict_data)
    y_ = model.predict(x, num_iteration=model.best_iteration)

    pack_result(y_, y, vid, 'pred.csv')