baichongyang 3 lat temu
rodzic
commit
e9c84888a5
3 zmienionych plików z 107 dodań i 11 usunięć
  1. rov_train_new.py (+52, -10)
  2. rov_train_paddle.py (+54, -0)
  3. sort.py (+1, -1)

+ 52 - 10
rov_train_new.py

@@ -23,6 +23,7 @@ import datetime
 import process_feature
 import _pickle as cPickle
 
+
 def getRovfeaturetable(dt, table):
     odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                 endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
@@ -56,26 +57,39 @@ def getdatasample(date, max_range, table):
     data = data.drop(axis=1, columns='index')
     return data
 
+
 def clean_data(df):
+    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
     y = df['futre7dayreturn']
     df_vids = df['videoid']
     #drop string
-    x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'])
+    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
     #drop future
-    x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'])
-    return x, y , df_vids
+    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+    features = list(x)
+    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
+    x = x.drop(drop_features, axis=1)
+
+    features = [f for f in features if f not in drop_features]
+    return x, y , df_vids, features
 
-def train(x,y):
+def train(x,y,features):
     X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
     params = {
         "objective": "regression",
-        "metric": "rmse",
+        "metric": "mape",
+        "max_depth": 5,
         "num_leaves": 30,
         "learning_rate": 0.1,
         "bagging_fraction": 0.7,
         "feature_fraction": 0.7,
-        "bagging_frequency": 5,
+        "bagging_freq": 5,
         "bagging_seed": 2018,
+        "lambda_l1": 0.1,
+        "boosting": "gbdt",
+        "nthread": 4,
         "verbosity": -1
     }
     lgtrain = lgb.Dataset(X_train, label=y_train)
@@ -84,13 +98,31 @@ def train(x,y):
     model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
                       evals_result=evals_result)
 
+    pack_result(model.feature_importance(), features, [], 'importance.csv')
+    
     pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
-    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    y_test = y_test.values
+
+    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
     r2 = r2_score(y_test, pred_test_y)
-    print('err_mape', err_mape)
+    #print('err_mape', err_mape)
     print('r2', r2)
 
+    pack_result(pred_test_y, y_test,[],'val.csv')
+
     return pred_test_y, model, evals_result
+
+
+def pack_result(y_, y, vid, fp):
+    #y_ = y_.astype(int)
+    y_.reshape(len(y_),1) 
+    df = pd.DataFrame(data=y_, columns=['score'])
+    if len(vid) >0:
+        df['vid'] = vid
+    df['y'] = y
+    df = df.sort_values(by=['score'], ascending=False)
+    df.to_csv(fp, index=False)
+
     
 if __name__ == '__main__':
     with open(r"train_data.pickle", "rb") as input_file:
@@ -99,5 +131,15 @@ if __name__ == '__main__':
         predict_data = cPickle.load(input_file)   
 
     #train
-    x,y,_ = clean_data(train_data)
-    train(x, y)
+    x,y,_,features = clean_data(train_data)
+    _, model, _ = train(x, y, features)
+    with open('model.pickle','wb') as output_file:
+        cPickle.dump(model, output_file)
+    '''
+    with open(r"model.pickle", "rb") as input_file:
+        model = cPickle.load(input_file)    
+    ''' 
+    x,y,vid,_ = clean_data(predict_data)
+    y_ = model.predict(x, num_iteration=model.best_iteration)
+
+    pack_result(y_, y, vid, 'pred.csv')

+ 54 - 0
rov_train_paddle.py

@@ -0,0 +1,54 @@
+import pandas as pd
+import numpy as np
+from paddle.io import Dataset
+
+class RovDataset(Dataset):
+
+def clean_data(df):
+    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
+    y = df['futre7dayreturn']
+    df_vids = df['videoid']
+    #drop string
+    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
+    #drop future
+    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+    features = list(x)
+    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
+    x = x.drop(drop_features, axis=1)
+
+    features = [f for f in features if f not in drop_features]
+    return x, y , df_vids, features
+
+
+def pack_result(y_, y, vid, fp):
+    #y_ = y_.astype(int)
+    y_.reshape(len(y_),1) 
+    df = pd.DataFrame(data=y_, columns=['score'])
+    if len(vid) >0:
+        df['vid'] = vid
+    df['y'] = y
+    df = df.sort_values(by=['score'], ascending=False)
+    df.to_csv(fp, index=False)
+
+    
+if __name__ == '__main__':
+    with open(r"train_data.pickle", "rb") as input_file:
+        train_data = cPickle.load(input_file)    
+    with open(r"predict_data.pickle", "rb") as input_file:
+        predict_data = cPickle.load(input_file)   
+
+    #train
+    x,y,_,features = clean_data(train_data)
+    _, model, _ = train(x, y, features)
+    with open('model.pickle','wb') as output_file:
+        cPickle.dump(model, output_file)
+    '''
+    with open(r"model.pickle", "rb") as input_file:
+        model = cPickle.load(input_file)    
+    ''' 
+    x,y,vid,_ = clean_data(predict_data)
+    y_ = model.predict(x, num_iteration=model.best_iteration)
+
+    pack_result(y_, y, vid, 'pred.csv')

+ 1 - 1
sort.py

@@ -3,4 +3,4 @@ import pandas as pd
 df = pd.read_csv('result.csv')
 df2 = df[df['score']>0.001]
 df2 = df2.sort_values('score', axis=0, ascending=False)
-df2.to_csv('result2.csv')
+df2.to_csv('result3.csv')