|
@@ -7,7 +7,10 @@ import pandas as pd
|
|
|
import lightgbm as lgb
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
from sklearn.datasets import make_classification
|
|
|
+from sklearn.preprocessing import LabelEncoder
|
|
|
from sklearn.metrics import accuracy_score
|
|
|
+
|
|
|
+label_encoder = LabelEncoder()
|
|
|
my_c = [
|
|
|
"uid",
|
|
|
"type",
|
|
@@ -25,18 +28,34 @@ my_c = [
|
|
|
"out_share_cnt",
|
|
|
"out_collection_cnt"
|
|
|
]
|
|
|
+
|
|
|
# Identifier-like columns that are label-encoded to integer codes.
str_cols = ["uid", "type", "channel", "mode"]

# Columns converted with pd.to_numeric; unparseable entries become NaN.
float_cols = [
    "fans",
    "view_count_user_30days",
    "share_count_user_30days",
    "return_count_user_30days",
    "rov_user",
    "str_user",
    "out_user_id",
    "out_play_cnt",
    "out_like_cnt",
    "out_share_cnt",
    "out_collection_cnt",
]

# Read the raw feature rows, then release the file handle before the
# (potentially slow) DataFrame work starts.
with open("whole_data/x_data.json", encoding="utf-8") as f1:
    x_list = json.load(f1)

# The first 10000 rows are the training split; the remainder is the test split.
X_train = pd.DataFrame(x_list[:10000], columns=my_c)
X_test = pd.DataFrame(x_list[10000:], columns=my_c)

for key in str_cols:
    # Fit the vocabulary on the training rows only, one encoder per column.
    # Refitting on the test rows would assign different integers to the same
    # label, so the test features would not match what the model was trained on.
    encoder = LabelEncoder().fit(X_train[key])
    X_train[key] = encoder.transform(X_train[key])

    # Reuse the training mapping for the test rows; labels that never
    # appeared in training get the sentinel code -1 instead of raising.
    codes = {label: code for code, label in enumerate(encoder.classes_)}
    X_test[key] = X_test[key].map(codes).fillna(-1).astype(int)

for key in float_cols:
    # errors='coerce' turns values that cannot be parsed as numbers into NaN.
    X_train[key] = pd.to_numeric(X_train[key], errors='coerce')
    X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
|
|
|
+
|
|
|
|
|
|
with open("whole_data/y_data.json") as f2:
|
|
|
y_list = json.loads(f2.read())
|