|
@@ -45,49 +45,48 @@ float_cols = [
|
|
|
]
|
|
|
with open("whole_data/x_data.json") as f1:
|
|
|
x_list = json.loads(f1.read())
|
|
|
- print(len(x_list))
|
|
|
-# X_train = pd.DataFrame(x_list[:15000], columns=my_c)
|
|
|
-# for key in str_cols:
|
|
|
-# X_train[key] = label_encoder.fit_transform(X_train[key])
|
|
|
-# for key in float_cols:
|
|
|
-# X_train[key] = pd.to_numeric(X_train[key], errors='coerce')
|
|
|
-# X_test = pd.DataFrame(x_list[15000:], columns=my_c)
|
|
|
-# for key in str_cols:
|
|
|
-# X_test[key] = label_encoder.fit_transform(X_test[key])
|
|
|
-# for key in float_cols:
|
|
|
-# X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
|
|
|
-#
|
|
|
-#
|
|
|
-# with open("whole_data/y_data.json") as f2:
|
|
|
-# y_list = json.loads(f2.read())
|
|
|
-# y__list = [0 if i <= 25 else 1 for i in y_list]
|
|
|
-# y_train = np.array(y__list[:15000])
|
|
|
-# y_test = np.array(y__list[15000:])
|
|
|
-#
|
|
|
-# # 创建LightGBM数据集
|
|
|
-# train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode'])
|
|
|
-# test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
|
|
|
-#
|
|
|
-# # 设置模型的参数
|
|
|
-# params = {
|
|
|
-# 'objective': 'binary', # 指定二分类任务
|
|
|
-# 'metric': 'binary_logloss', # 评估指标为二分类的log损失
|
|
|
-# 'num_leaves': 31, # 叶子节点数
|
|
|
-# 'learning_rate': 0.05, # 学习率
|
|
|
-# 'bagging_fraction': 0.9, # 建树的样本采样比例
|
|
|
-# 'feature_fraction': 0.8, # 建树的特征选择比例
|
|
|
-# 'bagging_freq': 5, # k 意味着每 k 次迭代执行bagging
|
|
|
-# }
|
|
|
+    # Encode the complete table once and split afterwards. Re-fitting the
|
|
|
+    # encoder on the test rows alone (assuming label_encoder is a scikit-learn
|
|
|
+    # LabelEncoder) can give one category different codes in train and test.
|
|
|
+    X_all = pd.DataFrame(x_list, columns=my_c)
|
|
|
+    for key in str_cols:
|
|
|
+        X_all[key] = label_encoder.fit_transform(X_all[key])
|
|
|
+    for key in float_cols:
|
|
|
+        # Values that cannot be parsed as numbers become NaN instead of raising.
|
|
|
+        X_all[key] = pd.to_numeric(X_all[key], errors='coerce')
|
|
|
+    # Rows before index 86434 form the training split, the rest the test split;
|
|
|
+    # the index is reset so both frames start at 0 like freshly built frames.
|
|
|
+    X_train = X_all.iloc[:86434].reset_index(drop=True)
|
|
|
+    X_test = X_all.iloc[86434:].reset_index(drop=True)
|
|
|
+
|
|
|
+
|
|
|
+# Load the raw targets from disk.
|
|
|
+with open("whole_data/y_data.json") as f2:
|
|
|
+    y_list = json.load(f2)
|
|
|
+# Binarise the targets: values <= 25 become class 0, everything else class 1.
|
|
|
+y__list = [0 if i <= 25 else 1 for i in y_list]
|
|
|
+# Split the labels at row 86434 into a training part and a test part.
|
|
|
+y_train = np.asarray(y__list[:86434])
|
|
|
+y_test = np.asarray(y__list[86434:])
|
|
|
+
|
|
|
+# Create the LightGBM datasets; the test set uses the training set as reference.
|
|
|
+categorical_cols = ['uid', 'type', 'channel', 'mode']
|
|
|
+train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols)
|
|
|
+test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)
|
|
|
+
|
|
|
+# Model parameters.
|
|
|
+params = dict(
|
|
|
+    objective='binary',  # binary classification task
|
|
|
+    metric='binary_logloss',  # evaluation metric: binary log loss
|
|
|
+    num_leaves=31,  # number of leaf nodes
|
|
|
+    learning_rate=0.05,  # learning rate
|
|
|
+    bagging_fraction=0.9,  # fraction of samples drawn when building trees
|
|
|
+    feature_fraction=0.8,  # fraction of features selected when building trees
|
|
|
+    bagging_freq=5,  # k means bagging is performed every k iterations
|
|
|
+)
|
|
|
|
|
|
# 训练模型
|
|
|
-# num_round = 100
|
|
|
-# bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
|
|
|
-#
|
|
|
-# # 预测
|
|
|
-# y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
|
|
|
-# # 转换为二进制输出
|
|
|
-# y_pred_binary = np.where(y_pred > 0.5, 1, 0)
|
|
|
-#
|
|
|
-# # 评估模型
|
|
|
-# accuracy = accuracy_score(y_test, y_pred_binary)
|
|
|
-# print(f'Accuracy: {accuracy}')
|
|
|
+# Number of boosting iterations; the test dataset is passed as validation set.
|
|
|
+num_round = 100
|
|
|
+bst = lgb.train(params, train_data, num_boost_round=num_round, valid_sets=[test_data])
|
|
|
+
|
|
|
+# Predict on the test features.
|
|
|
+y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
|
|
|
+# Convert to binary output: predictions above 0.5 become 1, the rest 0.
|
|
|
+y_pred_binary = (y_pred > 0.5).astype(int)
|
|
|
+
|
|
|
+# Evaluate the model by comparing the binary predictions with the test labels.
|
|
|
+accuracy = accuracy_score(y_test, y_pred_binary)
|
|
|
+print(f'Accuracy: {accuracy}')
|