baichongyang 3 lat temu
rodzic
commit
e9c84888a5
3 zmienionych plików z 107 dodań i 11 usunięć
  1. rov_train_new.py (+52, -10)
  2. rov_train_paddle.py (+54, -0)
  3. sort.py (+1, -1)

+ 52 - 10
rov_train_new.py

@@ -23,6 +23,7 @@ import datetime
 import process_feature
 import _pickle as cPickle
 
+
 def getRovfeaturetable(dt, table):
     odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                 endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
@@ -56,26 +57,39 @@ def getdatasample(date, max_range, table):
     data = data.drop(axis=1, columns='index')
     return data
 
+
 def clean_data(df):
+    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
     y = df['futre7dayreturn']
     df_vids = df['videoid']
     #drop string
-    x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'])
+    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
     #drop future
-    x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'])
-    return x, y , df_vids
+    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+    features = list(x)
+    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
+    x = x.drop(drop_features, axis=1)
+
+    features = [f for f in features if f not in drop_features]
+    return x, y , df_vids, features
 
-def train(x,y):
+def train(x,y,features):
     X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
     params = {
         "objective": "regression",
-        "metric": "rmse",
+        "metric": "mape",
+        "max_depth": 5,
         "num_leaves": 30,
         "learning_rate": 0.1,
         "bagging_fraction": 0.7,
         "feature_fraction": 0.7,
-        "bagging_frequency": 5,
+        "bagging_freq": 5,
         "bagging_seed": 2018,
+        "lambda_l1": 0.1,
+        "boosting": "gbdt",
+        "nthread": 4,
         "verbosity": -1
     }
     lgtrain = lgb.Dataset(X_train, label=y_train)
@@ -84,13 +98,31 @@ def train(x,y):
     model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
                       evals_result=evals_result)
 
+    pack_result(model.feature_importance(), features, [], 'importance.csv')
+    
     pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
-    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    y_test = y_test.values
+
+    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
     r2 = r2_score(y_test, pred_test_y)
-    print('err_mape', err_mape)
+    #print('err_mape', err_mape)
     print('r2', r2)
 
+    pack_result(pred_test_y, y_test,[],'val.csv')
+
     return pred_test_y, model, evals_result
+
+
+def pack_result(y_, y, vid, fp):
+    #y_ = y_.astype(int)
+    y_.reshape(len(y_),1) 
+    df = pd.DataFrame(data=y_, columns=['score'])
+    if len(vid) >0:
+        df['vid'] = vid
+    df['y'] = y
+    df = df.sort_values(by=['score'], ascending=False)
+    df.to_csv(fp, index=False)
+
     
 if __name__ == '__main__':
     with open(r"train_data.pickle", "rb") as input_file:
@@ -99,5 +131,15 @@ if __name__ == '__main__':
         predict_data = cPickle.load(input_file)   
 
     #train
-    x,y,_ = clean_data(train_data)
-    train(x, y)
+    x,y,_,features = clean_data(train_data)
+    _, model, _ = train(x, y, features)
+    with open('model.pickle','wb') as output_file:
+        cPickle.dump(model, output_file)
+    '''
+    with open(r"model.pickle", "rb") as input_file:
+        model = cPickle.load(input_file)    
+    ''' 
+    x,y,vid,_ = clean_data(predict_data)
+    y_ = model.predict(x, num_iteration=model.best_iteration)
+
+    pack_result(y_, y, vid, 'pred.csv')

+ 54 - 0
rov_train_paddle.py

@@ -0,0 +1,54 @@
+import pandas as pd
+import numpy as np
+from paddle.io import Dataset
+
+class RovDataset(Dataset):
+
+def clean_data(df):
+    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
+    y = df['futre7dayreturn']
+    df_vids = df['videoid']
+    #drop string
+    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
+    #drop future
+    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+    features = list(x)
+    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
+    x = x.drop(drop_features, axis=1)
+
+    features = [f for f in features if f not in drop_features]
+    return x, y , df_vids, features
+
+
+def pack_result(y_, y, vid, fp):
+    #y_ = y_.astype(int)
+    y_.reshape(len(y_),1) 
+    df = pd.DataFrame(data=y_, columns=['score'])
+    if len(vid) >0:
+        df['vid'] = vid
+    df['y'] = y
+    df = df.sort_values(by=['score'], ascending=False)
+    df.to_csv(fp, index=False)
+
+    
+if __name__ == '__main__':
+    with open(r"train_data.pickle", "rb") as input_file:
+        train_data = cPickle.load(input_file)    
+    with open(r"predict_data.pickle", "rb") as input_file:
+        predict_data = cPickle.load(input_file)   
+
+    #train
+    x,y,_,features = clean_data(train_data)
+    _, model, _ = train(x, y, features)
+    with open('model.pickle','wb') as output_file:
+        cPickle.dump(model, output_file)
+    '''
+    with open(r"model.pickle", "rb") as input_file:
+        model = cPickle.load(input_file)    
+    ''' 
+    x,y,vid,_ = clean_data(predict_data)
+    y_ = model.predict(x, num_iteration=model.best_iteration)
+
+    pack_result(y_, y, vid, 'pred.csv')

+ 1 - 1
sort.py

@@ -3,4 +3,4 @@ import pandas as pd
 df = pd.read_csv('result.csv')
 df2 = df[df['score']>0.001]
 df2 = df2.sort_values('score', axis=0, ascending=False)
-df2.to_csv('result2.csv')
+df2.to_csv('result3.csv')