罗俊辉 1 year ago
Parent
Commit 41a33bc97b
1 changed file with 44 additions and 43 deletions

+ 44 - 43
main.py

@@ -45,48 +45,49 @@ float_cols = [
     ]
 with open("whole_data/x_data.json") as f1:
     x_list = json.loads(f1.read())
-    X_train = pd.DataFrame(x_list[:15000], columns=my_c)
-    for key in str_cols:
-        X_train[key] = label_encoder.fit_transform(X_train[key])
-    for key in float_cols:
-        X_train[key] = pd.to_numeric(X_train[key], errors='coerce')
-    X_test = pd.DataFrame(x_list[15000:], columns=my_c)
-    for key in str_cols:
-        X_test[key] = label_encoder.fit_transform(X_test[key])
-    for key in float_cols:
-        X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
-
-
-with open("whole_data/y_data.json") as f2:
-    y_list = json.loads(f2.read())
-    y__list = [0 if i <= 25 else 1 for i in y_list]
-    y_train = np.array(y__list[:15000])
-    y_test = np.array(y__list[15000:])
-
-# Create the LightGBM datasets
-train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode'])
-test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
-
-# Set the model parameters
-params = {
-    'objective': 'binary',  # binary classification task
-    'metric': 'binary_logloss',  # evaluation metric: binary log loss
-    'num_leaves': 31,  # number of leaves per tree
-    'learning_rate': 0.05,  # learning rate
-    'bagging_fraction': 0.9,  # row (sample) subsampling ratio for building trees
-    'feature_fraction': 0.8,  # feature subsampling ratio for building trees
-    'bagging_freq': 5,  # k: perform bagging every k iterations
-}
+    print(len(x_list))
+#     X_train = pd.DataFrame(x_list[:15000], columns=my_c)
+#     for key in str_cols:
+#         X_train[key] = label_encoder.fit_transform(X_train[key])
+#     for key in float_cols:
+#         X_train[key] = pd.to_numeric(X_train[key], errors='coerce')
+#     X_test = pd.DataFrame(x_list[15000:], columns=my_c)
+#     for key in str_cols:
+#         X_test[key] = label_encoder.fit_transform(X_test[key])
+#     for key in float_cols:
+#         X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
+#
+#
+# with open("whole_data/y_data.json") as f2:
+#     y_list = json.loads(f2.read())
+#     y__list = [0 if i <= 25 else 1 for i in y_list]
+#     y_train = np.array(y__list[:15000])
+#     y_test = np.array(y__list[15000:])
+#
+# # Create the LightGBM datasets
+# train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode'])
+# test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
+#
+# # Set the model parameters
+# params = {
+#     'objective': 'binary',  # binary classification task
+#     'metric': 'binary_logloss',  # evaluation metric: binary log loss
+#     'num_leaves': 31,  # number of leaves per tree
+#     'learning_rate': 0.05,  # learning rate
+#     'bagging_fraction': 0.9,  # row (sample) subsampling ratio for building trees
+#     'feature_fraction': 0.8,  # feature subsampling ratio for building trees
+#     'bagging_freq': 5,  # k: perform bagging every k iterations
+# }
 
 # Train the model
-num_round = 100
-bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
-
-# Predict
-y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
-# Convert probabilities to binary output
-y_pred_binary = np.where(y_pred > 0.5, 1, 0)
-
-# Evaluate the model
-accuracy = accuracy_score(y_test, y_pred_binary)
-print(f'Accuracy: {accuracy}')
+# num_round = 100
+# bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
+#
+# # Predict
+# y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
+# # Convert probabilities to binary output
+# y_pred_binary = np.where(y_pred > 0.5, 1, 0)
+#
+# # Evaluate the model
+# accuracy = accuracy_score(y_test, y_pred_binary)
+# print(f'Accuracy: {accuracy}')
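
For reference, the block this commit comments out amounts to a full LightGBM binary-classification pipeline. Below is a minimal sketch of that pipeline, assuming the whole_data/x_data.json and whole_data/y_data.json files and the my_c / str_cols / float_cols lists defined earlier in main.py (the placeholder lists here are hypothetical, only so the sketch stands alone); unlike the original, it fits each LabelEncoder once on the full frame rather than separately per split, so train and test share the same category codes.

import json

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Hypothetical placeholders: the real lists are defined earlier in main.py.
my_c = ['uid', 'type', 'channel', 'mode', 'duration']
str_cols = ['uid', 'type', 'channel', 'mode']
float_cols = ['duration']

SPLIT = 15000  # first 15000 rows train, the rest test (as in the original)

with open("whole_data/x_data.json") as f1:
    x_list = json.loads(f1.read())
X = pd.DataFrame(x_list, columns=my_c)
for key in str_cols:
    # one encoder per column, fitted on the full frame so both splits share codes
    X[key] = LabelEncoder().fit_transform(X[key].astype(str))
for key in float_cols:
    X[key] = pd.to_numeric(X[key], errors='coerce')
X_train, X_test = X.iloc[:SPLIT], X.iloc[SPLIT:]

with open("whole_data/y_data.json") as f2:
    y_list = json.loads(f2.read())
y_bin = np.array([0 if i <= 25 else 1 for i in y_list])  # binarise the target at 25
y_train, y_test = y_bin[:SPLIT], y_bin[SPLIT:]

# LightGBM datasets with the same categorical columns as the original
train_data = lgb.Dataset(X_train, label=y_train,
                         categorical_feature=['uid', 'type', 'channel', 'mode'])
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    'objective': 'binary',       # binary classification task
    'metric': 'binary_logloss',  # evaluation metric: binary log loss
    'num_leaves': 31,
    'learning_rate': 0.05,
    'bagging_fraction': 0.9,     # row subsampling ratio
    'feature_fraction': 0.8,     # feature subsampling ratio
    'bagging_freq': 5,           # perform bagging every 5 iterations
}

bst = lgb.train(params, train_data, num_boost_round=100, valid_sets=[test_data])
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
y_pred_binary = np.where(y_pred > 0.5, 1, 0)
print(f'Accuracy: {accuracy_score(y_test, y_pred_binary)}')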