baichongyang 3 years ago
parent
commit
aa2a4dacf3

+ 44 - 0
data_desc.py

@@ -0,0 +1,44 @@
+import _pickle as cPickle
+import pandas as pd
+
+def clean_data(df):
+    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
+    df.loc[df['futre7dayreturn'] <= 0, 'futre7dayreturn'] = 1  # single .loc write; avoids chained assignment
+    y = df['futre7dayreturn']
+    df_vids = df['videoid']
+    #drop string
+    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
+    #drop future
+    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+
+    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
+    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
+    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
+
+    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn'])/x['stage_four_retrn']
+    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn'])/x['stage_three_retrn']
+    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn'])/x['stage_two_retrn']
+
+    features = list(x)
+    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
+    x = x.drop(drop_features, axis=1)
+    x = x.fillna(0)
+    x = x.astype('float64')
+    x = x.clip(0,2000000)
+ 
+    features = [f for f in features if f not in drop_features]
+    return x, y , df_vids, features
+
+if __name__ == '__main__':
+    with open(r"train_data_x.pickle", "rb") as input_file:
+        train_data = cPickle.load(input_file)    
+    with open(r"predict_data_x.pickle", "rb") as input_file:
+        predict_data = cPickle.load(input_file) 
+
+    X,Y,_,_ = clean_data(train_data)
+    x,y,_,_ = clean_data(predict_data)
+    print(X.describe())
+    print(x.describe())
+    #Y.describe()
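One caveat worth a sketch before moving on: the stage_*_retrn_ratio columns divide by a raw stage count that can be zero, which yields inf values that fillna(0) does not catch (the final clip merely pins them at 2000000). A minimal zero-safe variant, assuming only numpy on top of the pandas already imported; safe_ratio is a hypothetical helper, not part of the commit:

import numpy as np

def safe_ratio(numer, denom):
    # elementwise ratio, 0.0 wherever the denominator is zero
    return np.where(denom != 0, numer / denom, 0.0)

# e.g. x['stage_four_retrn_ratio'] = safe_ratio(
#          x['stage_four_retrn'] - x['stage_three_retrn'], x['stage_four_retrn'])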

+ 63 - 0
get_data.py

@@ -0,0 +1,63 @@
+import datetime
+from datetime import datetime as dt
+from odps import ODPS
+import process_feature
+import pandas as pd
+
+def getRovfeaturetable(dt, table):
+    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
+                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in odps.read_table(table, partition='dt=%s' % dt):
+        valueFeature = {}
+        for i in process_feature.featurename:
+            if i == 'dt':
+                valueFeature[i] = dt
+            else:
+                valueFeature[i] = record[i]
+        featureArray.append(valueFeature)
+    featureArray = pd.DataFrame(featureArray)
+    print(dt, table, 'feature table finish')
+    return featureArray
+
+def getdatasample(date, max_range, table):
+    new_date = dt.strptime(date, '%Y%m%d')
+    datelist = []
+    testlist = []
+    for i in range(0, max_range):
+        delta = datetime.timedelta(days=i)
+        tar_dt = new_date - delta
+        datelist.append(tar_dt.strftime("%Y%m%d"))
+    print(datelist)
+    for tm in datelist:
+        testlist.append(getRovfeaturetable(tm, table))
+    testdata = pd.concat(testlist)
+    testdata.reset_index(inplace=True)
+    testdata = testdata.drop(axis=1, columns='index')
+    return testdata
+
+def process_train_predict_data():
+    now_date = datetime.date.today() 
+    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
+    DIFF1 = 1
+    DIFF7 = 7
+    diff_1 = datetime.timedelta(days=DIFF1)
+    diff_7 = datetime.timedelta(days=DIFF7)
+    predict_dt = now_date - diff_1
+    predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
+    train_dt = now_date - diff_7
+    train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
+    #read data from ali
+    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
+    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
+    #pickle for test
+    import _pickle as cPickle
+    with open('train_data_all.pickle','wb') as output_file:
+        cPickle.dump(train_data, output_file)
+    with open('predict_data_all.pickle','wb') as output_file:
+        cPickle.dump(predict_data, output_file) 
+
+if __name__ == '__main__':
+    process_train_predict_data()
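Since getdatasample reads 30 daily partitions in sequence, a single missing dt partition aborts the whole pull. A small hardening sketch, assuming PyODPS's Table.exist_partition API; get_existing_days is a hypothetical helper, not part of the commit:

def get_existing_days(odps, table, datelist):
    # keep only the days whose dt=<day> partition has actually landed
    t = odps.get_table(table)
    return [d for d in datelist if t.exist_partition('dt=%s' % d)]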

The file diff is too large
+ 8 - 0
nohup.out


+ 222 - 0
rov_train_classify.py

@@ -0,0 +1,222 @@
+import warnings
+
+warnings.filterwarnings("ignore")
+import os
+import pandas as pd
+import gc
+import math
+import numpy as np
+import time
+import lightgbm as lgb
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+from sklearn import metrics
+import pickle
+from sklearn.metrics import top_k_accuracy_score
+import seaborn as sns
+import matplotlib.pylab as plt
+from odps import ODPS
+from odps.df import DataFrame as odpsdf
+from datetime import datetime as dt
+import datetime
+import process_feature
+import _pickle as cPickle
+from sklearn.feature_selection import SelectFromModel
+from sklearn.linear_model import LogisticRegression
+
+def getRovfeaturetable(dt, table):
+    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
+                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in odps.read_table(table, partition='dt=%s' % dt):
+        valueFeature = {}
+        for i in process_feature.featurename:
+            if i == 'dt':
+                valueFeature[i] = dt
+            else:
+                valueFeature[i] = record[i]
+        featureArray.append(valueFeature)
+    featureArray = pd.DataFrame(featureArray)
+    print(dt, table, 'feature table finish')
+    return featureArray
+
+def getdatasample(date, max_range, table):
+    new_date = dt.strptime(date, '%Y%m%d')
+    datelist = []
+    testlist = []
+    for i in range(0, max_range):
+        delta = datetime.timedelta(days=i)
+        tar_dt = new_date - delta
+        datelist.append(tar_dt.strftime("%Y%m%d"))
+    for tm in datelist:
+        testlist.append(getRovfeaturetable(tm, table))
+    data = pd.concat(testlist)
+    data.reset_index(inplace=True)
+    data = data.drop(axis=1, columns='index')
+    return data
+
+def discrete_y(y):
+    y = float(y)
+    if y>1000000:
+        return 7
+    elif y>500000:
+        return 6
+    elif y>100000:
+        return 5
+    elif y>50000:
+        return 4
+    elif y>10000:
+        return 3
+    elif y>5000:
+        return 2
+    elif y>1000:
+        return 1
+    else:
+        return 0
+
+def clean_data(df):
+    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
+    df['futre7dayreturn'] = df['futre7dayreturn'].apply(discrete_y)
+    y = df['futre7dayreturn']
+    print(y)
+    df_vids = df['videoid']
+    #drop string
+    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
+    #drop future
+    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+
+    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
+    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
+    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
+
+    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn'])/x['stage_four_retrn']
+    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn'])/x['stage_three_retrn']
+    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn'])/x['stage_two_retrn']
+
+    features = list(x)
+    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
+    x = x.drop(drop_features, axis=1)
+    x = x.fillna(0)
+    x = x.astype('float64')
+    # DataFrame.clip returns a copy, so assign the result
+    x = x.clip(0, 2000000)
+ 
+    features = [f for f in features if f not in drop_features]
+    return x, y , df_vids, features
+
+def feature_selection(X, y):
+    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
+    return selector
+
+def auto_train(X_train, y_train):
+    from flaml import AutoML
+    automl = AutoML()
+    automl_settings = {
+    "time_budget": 10,  # in seconds
+    "metric": 'r2',
+    "task": 'regression',
+    "log_file_name": "test/auto.log",
+    "estimator_list": ["lgbm"]
+    }
+    automl.fit(X_train=X_train, y_train=y_train,
+           **automl_settings) 
+
+    pred_test_y = automl.predict(X_train)
+    y_test = y_train.values
+
+    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    r2 = metrics.r2_score(y_test, pred_test_y)  # via sklearn.metrics; r2_score is not imported directly in this file
+    #print('err_mape', err_mape)
+    print('r2', r2)
+
+    pack_result(pred_test_y, y_test,[],'autoval.csv')
+
+    
+
+def train(x,y,features):
+    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify=y )
+
+    '''
+    selector = feature_selection(X_train, y_train) 
+    X_train = selector.transform(X_train)
+    X_test = selector.transform(X_test)
+    selected_features = []
+    _supported = selector.get_support()
+    for i in range(0, len(_supported)):
+        if _supported[i]:
+            selected_features.append(features[i])
+    features = selected_features 
+    '''
+
+    print(len(X_train), len(X_test))
+    params = {
+        "objective": "multiclass",
+        "num_classes": 8,
+        "max_depth": 6,
+        "num_leaves": 30,
+        "learning_rate": 0.05,
+        "bagging_fraction": 0.7,
+        "feature_fraction": 0.7,
+        "bagging_freq": 8,
+        "bagging_seed": 2018,
+        "lambda_l1": 0.1,
+        "boosting": "gbdt",
+        "nthread": 4,
+        "verbosity": -1
+    }
+    lgtrain = lgb.Dataset(X_train, label=y_train)
+    lgval = lgb.Dataset(X_test, label=y_test)
+    evals_result = {}
+    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=200, verbose_eval=20,
+                      evals_result=evals_result)
+
+    pack_result(model.feature_importance(), features, [], 'importance.csv')
+    
+    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
+    
+    top_k = top_k_accuracy_score(y_test, pred_test_y, k=1)
+    print('top_k_accuracy_score', top_k)
+
+    pack_result(pred_test_y.argmax(axis=1), y_test, [], 'val.csv')  # argmax: class probabilities -> labels
+
+    return pred_test_y, model, evals_result
+
+
+def pack_result(y_, y, vid, fp):
+    #y_ = y_.astype(int)
+    y_ = y_.reshape(len(y_), 1)  # assign the result; reshape is not in-place
+    df = pd.DataFrame(data=y_, columns=['score'])
+    if len(vid) >0:
+        df['vid'] = vid
+    df['y'] = y
+    df = df.sort_values(by=['score'], ascending=False)
+    df.to_csv(fp, index=False)
+
+    
+if __name__ == '__main__':
+    with open(r"train_data_x.pickle", "rb") as input_file:
+        train_data = cPickle.load(input_file)    
+    with open(r"predict_data_x.pickle", "rb") as input_file:
+        predict_data = cPickle.load(input_file)   
+
+    x,y,_,features = clean_data(train_data)
+
+    #auto train
+    #auto_train(x,y)
+
+    #train
+    _, model, _ = train(x, y, features)
+    with open('model.pickle','wb') as output_file:
+        cPickle.dump(model, output_file)
+    '''
+    with open(r"model.pickle", "rb") as input_file:
+        model = cPickle.load(input_file)    
+    ''' 
+    x,y,vid,_ = clean_data(predict_data)
+    y_ = model.predict(x, num_iteration=model.best_iteration)
+
+    pack_result(y_.argmax(axis=1), y, vid, 'pred.csv')  # pack hard class labels, not the probability matrix
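Because the params above set objective=multiclass with num_classes=8, model.predict returns an (n_samples, 8) probability matrix rather than a label vector; top_k_accuracy_score consumes that matrix directly, while the argmax calls above reduce it to hard buckets. A short sketch of the quantities one might pack, reusing the model and x names from the script:

import numpy as np

proba = model.predict(x, num_iteration=model.best_iteration)  # shape (n, 8)
labels = proba.argmax(axis=1)        # hard bucket per video
confidence = proba.max(axis=1)       # a natural ranking score for pack_result
soft_bucket = (proba * np.arange(8)).sum(axis=1)  # expected bucket index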

+ 0 - 0
rov_train_describe.py


+ 77 - 14
rov_train_new.py

@@ -10,10 +10,10 @@ import time
 import lightgbm as lgb
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import StratifiedKFold
-from sklearn.metrics import mean_absolute_percentage_error, r2_score
+from sklearn.metrics import r2_score
 from sklearn import metrics
 import pickle
-from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
 import seaborn as sns
 import matplotlib.pylab as plt
 from odps import ODPS
@@ -22,7 +22,8 @@ from datetime import datetime as dt
 import datetime
 import process_feature
 import _pickle as cPickle
-
+from sklearn.feature_selection import SelectFromModel
+from sklearn.linear_model import LogisticRegression
 
 def getRovfeaturetable(dt, table):
     odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
@@ -60,6 +61,7 @@ def getdatasample(date, max_range, table):
 
 def clean_data(df):
     #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
+    df.loc[df['futre7dayreturn'] <= 0, 'futre7dayreturn'] = 1  # single .loc write; avoids chained assignment
     y = df['futre7dayreturn']
     df_vids = df['videoid']
     #drop string
@@ -68,24 +70,81 @@ def clean_data(df):
     #drop future
     #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
     x = x.drop(['futre7dayreturn'], axis=1)
+
+    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
+    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
+    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
+
+    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn'])/x['stage_four_retrn']
+    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn'])/x['stage_three_retrn']
+    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn'])/x['stage_two_retrn']
+
     features = list(x)
     drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
     x = x.drop(drop_features, axis=1)
-
+    x = x.fillna(0)
+    x = x.astype('float64')
+    # DataFrame.clip returns a copy, so assign the result
+    x = x.clip(0, 2000000)
+ 
     features = [f for f in features if f not in drop_features]
     return x, y , df_vids, features
 
+def feature_selection(X, y):
+    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
+    return selector
+
+def auto_train(X_train, y_train):
+    from flaml import AutoML
+    automl = AutoML()
+    automl_settings = {
+    "time_budget": 10,  # in seconds
+    "metric": 'r2',
+    "task": 'regression',
+    "log_file_name": "test/auto.log",
+    "estimator_list": ["lgbm"]
+    }
+    automl.fit(X_train=X_train, y_train=y_train,
+           **automl_settings) 
+
+    pred_test_y = automl.predict(X_train)
+    y_test = y_train.values
+
+    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    r2 = r2_score(y_test, pred_test_y)
+    #print('err_mape', err_mape)
+    print('r2', r2)
+
+    pack_result(pred_test_y, y_test,[],'autoval.csv')
+
+    
+
 def train(x,y,features):
-    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
+    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
+
+    '''
+    selector = feature_selection(X_train, y_train) 
+    X_train = selector.transform(X_train)
+    X_test = selector.transform(X_test)
+    selected_features = []
+    _supported = selector.get_support()
+    for i in range(0, len(_supported)):
+        if _supported[i]:
+            selected_features.append(features[i])
+    features = selected_features 
+    '''
+
+    print(len(X_train), len(X_test))
     params = {
         "objective": "regression",
+        "reg_sqrt":True,
         "metric": "mape",
-        "max_depth": 5,
+        "max_depth": 6,
         "num_leaves": 30,
-        "learning_rate": 0.1,
+        "learning_rate": 0.05,
         "bagging_fraction": 0.7,
         "feature_fraction": 0.7,
-        "bagging_freq": 5,
+        "bagging_freq": 8,
         "bagging_seed": 2018,
         "lambda_l1": 0.1,
         "boosting": "gbdt",
@@ -95,7 +154,7 @@ def train(x,y,features):
     lgtrain = lgb.Dataset(X_train, label=y_train)
     lgval = lgb.Dataset(X_test, label=y_test)
     evals_result = {}
-    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
+    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=200, verbose_eval=20,
                       evals_result=evals_result)
 
     pack_result(model.feature_importance(), features, [], 'importance.csv')
@@ -103,9 +162,9 @@ def train(x,y,features):
     pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
     y_test = y_test.values
 
-    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
     r2 = r2_score(y_test, pred_test_y)
-    #print('err_mape', err_mape)
+    print('err_mape', err_mape)
     print('r2', r2)
 
     pack_result(pred_test_y, y_test,[],'val.csv')
@@ -125,13 +184,17 @@ def pack_result(y_, y, vid, fp):
 
     
 if __name__ == '__main__':
-    with open(r"train_data.pickle", "rb") as input_file:
+    with open(r"train_data_x.pickle", "rb") as input_file:
         train_data = cPickle.load(input_file)    
-    with open(r"predict_data.pickle", "rb") as input_file:
+    with open(r"predict_data_x.pickle", "rb") as input_file:
         predict_data = cPickle.load(input_file)   
 
-    #train
     x,y,_,features = clean_data(train_data)
+
+    #auto train
+    #auto_train(x,y)
+
+    #train
     _, model, _ = train(x, y, features)
     with open('model.pickle','wb') as output_file:
         cPickle.dump(model, output_file)
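One compatibility note on the lgb.train calls throughout this commit: in LightGBM 4.x the early_stopping_rounds and verbose_eval keyword arguments were removed in favor of callbacks. If the environment is ever upgraded, the equivalent call is roughly this sketch (not part of the commit):

model = lgb.train(
    params, lgtrain, num_boost_round=10000, valid_sets=[lgval],
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(20)],
)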

+ 14 - 9
rov_train_paddle.py

@@ -8,6 +8,7 @@ class RovDataset(Dataset):
     def __init__(self, path):
         super(RovDataset, self).__init__()
         self.path = path
+        _,_,_,self.features = self._parse_dataset()
 
     def _parse_dataset(self):
         self.data = []
@@ -21,15 +22,20 @@ class RovDataset(Dataset):
             features = list(x)
             drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
             x = x.drop(drop_features, axis=1)
-            x = x.apply(lambda x: (x - np.min())/ (np.max(x) - np.min(x)))
-            #features = [f for f in features if f not in drop_features]
+            x = x.fillna(0)  # assign the result; fillna is not in-place
+            x = x.apply(lambda col: (col - np.min(col)) / (np.max(col) - np.min(col)))
+            features = [f for f in features if f not in drop_features]
             self.data = x
             self.labels = y
-            #return x, y , df_vids, features
+
+        return x, y , df_vids, features
 
     def __getitem__(self, idx):
-        data, label = self.data.iloc[idx], self.labels.iloc[idx]
-        return data.astype('float21'), label.astype('float32')
+        try:
+            data, label = np.array(self.data.iloc[idx]), np.array(self.labels.iloc[idx])
+            return data.astype('float32'), label.astype('float32')
+        except Exception as e:
+            print(e); raise  # re-raise so bad rows fail loudly instead of silently yielding None
 
     def __len__(self):
         return len(self.labels)
@@ -42,6 +48,7 @@ def train():
 
     train_dataset = RovDataset("train_data.pickle")
     test_dataset = RovDataset("predict_data.pickle")
+    feature_dim = len(train_dataset.features)
 
     linear=paddle.nn.Sequential(
             paddle.nn.Linear(feature_dim, 4096),
@@ -51,7 +58,7 @@ def train():
             paddle.nn.Dropout(0.2),
             paddle.nn.Linear(1024,19),
             paddle.nn.ReLU(),
-            paddle.nn.Linear(19,1)
+            paddle.nn.Linear(19,result_dim)
             )
 
     model=paddle.Model(linear)
@@ -74,6 +81,4 @@ def pack_result(y_, y, vid, fp):
 
     
 if __name__ == '__main__':
-
-    #train
-    pass
+    train()
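The per-column min-max normalization in _parse_dataset still divides by zero for constant columns. A constant-safe variant, assuming the numpy/pandas already imported in the file; minmax is a hypothetical helper, not part of the commit:

def minmax(col):
    # leave constant columns at zero instead of dividing by a zero range
    rng = np.max(col) - np.min(col)
    return (col - np.min(col)) / rng if rng > 0 else col * 0.0

x = x.apply(minmax)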

+ 198 - 0
rov_train_regression.py

@@ -0,0 +1,198 @@
+import warnings
+
+warnings.filterwarnings("ignore")
+import os
+import pandas as pd
+import gc
+import math
+import numpy as np
+import time
+import lightgbm as lgb
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+from sklearn.metrics import r2_score
+from sklearn import metrics
+import pickle
+from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
+import seaborn as sns
+import matplotlib.pylab as plt
+from odps import ODPS
+from odps.df import DataFrame as odpsdf
+from datetime import datetime as dt
+import datetime
+import process_feature
+import _pickle as cPickle
+from sklearn.feature_selection import SelectFromModel
+from sklearn.linear_model import LogisticRegression
+
+def getRovfeaturetable(dt, table):
+    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
+                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in odps.read_table(table, partition='dt=%s' % dt):
+        valueFeature = {}
+        for i in process_feature.featurename:
+            if i == 'dt':
+                valueFeature[i] = dt
+            else:
+                valueFeature[i] = record[i]
+        featureArray.append(valueFeature)
+    featureArray = pd.DataFrame(featureArray)
+    print(dt, table, 'feature table finish')
+    return featureArray
+
+def getdatasample(date, max_range, table):
+    new_date = dt.strptime(date, '%Y%m%d')
+    datelist = []
+    testlist = []
+    for i in range(0, max_range):
+        delta = datetime.timedelta(days=i)
+        tar_dt = new_date - delta
+        datelist.append(tar_dt.strftime("%Y%m%d"))
+    for tm in datelist:
+        testlist.append(getRovfeaturetable(tm, table))
+    data = pd.concat(testlist)
+    data.reset_index(inplace=True)
+    data = data.drop(axis=1, columns='index')
+    return data
+
+
+def clean_data(df):
+    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
+    df.loc[df['futre7dayreturn'] <= 0, 'futre7dayreturn'] = 1  # single .loc write; avoids chained assignment
+    y = df['futre7dayreturn']
+    df_vids = df['videoid']
+    #drop string
+    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
+    #drop future
+    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+
+    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
+    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
+    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
+
+    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn'])/x['stage_four_retrn']
+    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn'])/x['stage_three_retrn']
+    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn'])/x['stage_two_retrn']
+
+    features = list(x)
+    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
+    x = x.drop(drop_features, axis=1)
+    x = x.fillna(0)  # assign the result; fillna is not in-place
+    x = x.astype('float64')
+    x = x.clip(1,2000000)
+ 
+    #features = [f for f in features if f not in drop_features]
+    features = list(x)
+    return x, y , df_vids, features
+
+
+def std_data(df, features):
+    for f in features:
+        if df[f].max()>1:
+            df[f] = (df[f] - df[f].min()) / (df[f].max() - df[f].min() + 1)  # min-max scale; the +1 guards constant columns
+    return df
+
+def feature_selection(X, y):
+    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
+    return selector
+
+def train(x,y,features):
+    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
+
+    '''
+    selector = feature_selection(X_train, y_train) 
+    X_train = selector.transform(X_train)
+    X_test = selector.transform(X_test)
+    selected_features = []
+    _supported = selector.get_support()
+    for i in range(0, len(_supported)):
+        if _supported[i]:
+            selected_features.append(features[i])
+    features = selected_features 
+    '''
+
+    print(len(X_train), len(X_test))
+    params = {
+        "objective": "regression",
+        "reg_sqrt":True,
+        "metric": "mape",
+        "max_depth": -1,
+        "num_leaves": 50,
+        "learning_rate": 0.1,
+        "bagging_fraction": 0.7,
+        "feature_fraction": 0.7,
+        "bagging_freq": 8,
+        "bagging_seed": 2018,
+        "lambda_l1": 0.11,
+        "boosting": "dart",
+        "nthread": 4,
+        "verbosity": -1
+    }
+    lgtrain = lgb.Dataset(X_train, label=y_train)
+    lgval = lgb.Dataset(X_test, label=y_test)
+    evals_result = {}
+
+    #model = lgb.train(params, lgtrain, 5000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100,evals_result=evals_result, init_model='lgb_regression.txt')
+     
+    model = lgb.train(params, lgtrain, 5000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100,
+                      evals_result=evals_result)
+
+    #model.save_model('lgb_regression.txt', num_iteration=model.best_iteration)
+
+    pack_result(model.feature_importance(), features, [], 'importance.csv')
+    
+    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
+    y_test = y_test.values
+
+    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    r2 = r2_score(y_test, pred_test_y)
+    print('err_mape', err_mape)
+    print('r2', r2)
+
+    pack_result(pred_test_y, y_test,[],'val.csv')
+
+    return pred_test_y, model, evals_result
+
+
+def pack_result(y_, y, vid, fp):
+    #y_ = y_.astype(int)
+    y_ = y_.reshape(len(y_), 1)  # assign the result; reshape is not in-place
+    df = pd.DataFrame(data=y_, columns=['score'])
+    if len(vid) >0:
+        df['vid'] = vid
+    df['y'] = y
+    df = df.sort_values(by=['score'], ascending=False)
+    df.to_csv(fp, index=False)
+
+    
+if __name__ == '__main__':
+    with open(r"train_data_x.pickle", "rb") as input_file:
+        train_data = cPickle.load(input_file)    
+    with open(r"predict_data_x.pickle", "rb") as input_file:
+        predict_data = cPickle.load(input_file)   
+
+    x,y,_,features = clean_data(train_data)
+    #x = std_data(x, features)
+    #print(x.describe())
+
+    #auto train
+    #auto_train(x,y)
+
+    #train
+    _, model, _ = train(x, y, features)
+    with open('model.pickle','wb') as output_file:
+        cPickle.dump(model, output_file)
+    '''
+    with open(r"model.pickle", "rb") as input_file:
+        model = cPickle.load(input_file)    
+    ''' 
+    x,y,vid,_ = clean_data(predict_data)
+    #x = std_data(x, features)
+    y_ = model.predict(x, num_iteration=model.best_iteration)
+
+    pack_result(y_, y, vid, 'pred.csv')

+ 205 - 0
rov_train_regression_auto.py

@@ -0,0 +1,205 @@
+import warnings
+
+warnings.filterwarnings("ignore")
+import os
+import pandas as pd
+import gc
+import math
+import numpy as np
+import time
+import lightgbm as lgb
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+from sklearn.metrics import r2_score
+from sklearn import metrics
+import pickle
+from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
+import seaborn as sns
+import matplotlib.pylab as plt
+from odps import ODPS
+from odps.df import DataFrame as odpsdf
+from datetime import datetime as dt
+import datetime
+import process_feature
+import _pickle as cPickle
+from sklearn.feature_selection import SelectFromModel
+from sklearn.linear_model import LogisticRegression
+
+def getRovfeaturetable(dt, table):
+    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
+                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in odps.read_table(table, partition='dt=%s' % dt):
+        valueFeature = {}
+        for i in process_feature.featurename:
+            if i == 'dt':
+                valueFeature[i] = dt
+            else:
+                valueFeature[i] = record[i]
+        featureArray.append(valueFeature)
+    featureArray = pd.DataFrame(featureArray)
+    print(dt, table, 'feature table finish')
+    return featureArray
+
+def getdatasample(date, max_range, table):
+    new_date = dt.strptime(date, '%Y%m%d')
+    datelist = []
+    testlist = []
+    for i in range(0, max_range):
+        delta = datetime.timedelta(days=i)
+        tar_dt = new_date - delta
+        datelist.append(tar_dt.strftime("%Y%m%d"))
+    for tm in datelist:
+        testlist.append(getRovfeaturetable(tm, table))
+    data = pd.concat(testlist)
+    data.reset_index(inplace=True)
+    data = data.drop(axis=1, columns='index')
+    return data
+
+
+def clean_data(df):
+    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
+    df.loc[df['futre7dayreturn'] <= 0, 'futre7dayreturn'] = 1  # single .loc write; avoids chained assignment
+    y = df['futre7dayreturn']
+    df_vids = df['videoid']
+    #drop string
+    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
+    #drop future
+    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+
+    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
+    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
+    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
+
+    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn'])/x['stage_four_retrn']
+    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn'])/x['stage_three_retrn']
+    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn'])/x['stage_two_retrn']
+
+    features = list(x)
+    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
+    x = x.drop(drop_features, axis=1)
+    x = x.fillna(0)  # assign the result; fillna is not in-place
+    x = x.astype('float64')
+    x = x.clip(1,2000000)
+ 
+    #features = [f for f in features if f not in drop_features]
+    features = list(x)
+    return x, y , df_vids, features
+
+
+def std_data(df, features):
+    for f in features:
+        if df[f].max()>1:
+            df[f] = (df[f] - df[f].min()) / (df[f].max() - df[f].min() + 1)  # min-max scale; the +1 guards constant columns
+    return df
+
+def feature_selection(X, y):
+    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
+    return selector
+
+def auto_train(X_train, y_train):
+    from flaml import AutoML
+    automl = AutoML()
+    automl_settings = {
+    "time_budget": 8000,  # in seconds
+    "metric": 'mae',
+    "task": 'regression',
+    "log_file_name": "auto.log",
+    "estimator_list": ["lgbm"]
+    }
+    automl.fit(X_train=X_train, y_train=y_train,
+           **automl_settings) 
+
+    pred_test_y = automl.predict(X_train)
+    y_test = y_train.values
+
+    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    r2 = r2_score(y_test, pred_test_y)
+    print('err_mape', err_mape)
+    print('r2', r2)
+
+    pack_result(pred_test_y, y_test,[],'autoval.csv')
+
+    
+
+def train(x,y,features):
+    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
+
+    '''
+    selector = feature_selection(X_train, y_train) 
+    X_train = selector.transform(X_train)
+    X_test = selector.transform(X_test)
+    selected_features = []
+    _supported = selector.get_support()
+    for i in range(0, len(_supported)):
+        if _supported[i]:
+            selected_features.append(features[i])
+    features = selected_features 
+    '''
+
+    print(len(X_train), len(X_test))
+    params = {
+        "objective": "regression",
+        "reg_sqrt":True,
+        "metric": "mape",
+        "max_depth": -1,
+        "num_leaves": 50,
+        "learning_rate": 0.1,
+        "bagging_fraction": 0.7,
+        "feature_fraction": 0.7,
+        "bagging_freq": 8,
+        "bagging_seed": 2018,
+        "lambda_l1": 0.11,
+        "boosting": "gbdt",
+        "nthread": 4,
+        "verbosity": -1
+    }
+    lgtrain = lgb.Dataset(X_train, label=y_train)
+    lgval = lgb.Dataset(X_test, label=y_test)
+    evals_result = {}
+    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=200, verbose_eval=20,
+                      evals_result=evals_result)
+
+    pack_result(model.feature_importance(), features, [], 'importance.csv')
+    
+    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
+    y_test = y_test.values
+
+    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    r2 = r2_score(y_test, pred_test_y)
+    print('err_mape', err_mape)
+    print('r2', r2)
+
+    pack_result(pred_test_y, y_test,[],'val.csv')
+
+    return pred_test_y, model, evals_result
+
+
+def pack_result(y_, y, vid, fp):
+    #y_ = y_.astype(int)
+    y_ = y_.reshape(len(y_), 1)  # assign the result; reshape is not in-place
+    df = pd.DataFrame(data=y_, columns=['score'])
+    if len(vid) >0:
+        df['vid'] = vid
+    df['y'] = y
+    df = df.sort_values(by=['score'], ascending=False)
+    df.to_csv(fp, index=False)
+
+    
+if __name__ == '__main__':
+    with open(r"train_data_x.pickle", "rb") as input_file:
+        train_data = cPickle.load(input_file)    
+    with open(r"predict_data_x.pickle", "rb") as input_file:
+        predict_data = cPickle.load(input_file)   
+
+    x,y,_,features = clean_data(train_data)
+    #x = std_data(x, features)
+    #print(x.describe())
+
+    #auto train
+    auto_train(x,y)
+

+ 14 - 0
test/parse.py

@@ -0,0 +1,14 @@
+import json
+
+fp = 'video_score_0927.json'
+a = 0
+t = 0
+
+with open(fp) as f:
+    for line in f:
+        j = json.loads(line)
+        for i in j:
+            if i['score'] > 0:
+                t += 1
+            a += 1
+print(t, a)

+ 7 - 0
test/parse2.py

@@ -0,0 +1,7 @@
+import pandas as pd
+
+fp = 'video_metric.csv'
+
+df = pd.read_csv(fp)
+df = df.sort_values(['score'], ascending=False)
+df.to_csv('df.csv')

+ 9 - 0
test/parse3.py

@@ -0,0 +1,9 @@
+import pandas as pd
+
+fp = 'feature_importance.csv'
+
+df = pd.read_csv(fp)
+df = df.sort_values(['importance'], ascending=False)
+print(df.iloc[0].values)
+df.to_csv('df_importance.csv')
+

The file diff is too large
+ 0 - 0
test/video_score_0927.json


Some files were not shown because too many files changed in this diff