
Fetch the January and February labels

罗俊辉 1 year ago
commit e9eccf387a
1 changed file with 43 additions and 44 deletions

main.py  +43 -44

@@ -45,49 +45,48 @@ float_cols = [
     ]
 with open("whole_data/x_data.json") as f1:
     x_list = json.loads(f1.read())
-    print(len(x_list))
-#     X_train = pd.DataFrame(x_list[:15000], columns=my_c)
-#     for key in str_cols:
-#         X_train[key] = label_encoder.fit_transform(X_train[key])
-#     for key in float_cols:
-#         X_train[key] = pd.to_numeric(X_train[key], errors='coerce')
-#     X_test = pd.DataFrame(x_list[15000:], columns=my_c)
-#     for key in str_cols:
-#         X_test[key] = label_encoder.fit_transform(X_test[key])
-#     for key in float_cols:
-#         X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
-#
-#
-# with open("whole_data/y_data.json") as f2:
-#     y_list = json.loads(f2.read())
-#     y__list = [0 if i <= 25 else 1 for i in y_list]
-#     y_train = np.array(y__list[:15000])
-#     y_test = np.array(y__list[15000:])
-#
-# # Create the LightGBM datasets
-# train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode'])
-# test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
-#
-# # Set the model parameters
-# params = {
-#     'objective': 'binary',  # binary classification task
-#     'metric': 'binary_logloss',  # evaluation metric: binary log loss
-#     'num_leaves': 31,  # number of leaves per tree
-#     'learning_rate': 0.05,  # learning rate
-#     'bagging_fraction': 0.9,  # fraction of samples used when building each tree
-#     'feature_fraction': 0.8,  # fraction of features used when building each tree
-#     'bagging_freq': 5,  # k means bagging is performed every k iterations
-# }
+    X_train = pd.DataFrame(x_list[:86434], columns=my_c)
+    for key in str_cols:
+        X_train[key] = label_encoder.fit_transform(X_train[key])
+    for key in float_cols:
+        X_train[key] = pd.to_numeric(X_train[key], errors='coerce')
+    X_test = pd.DataFrame(x_list[86434:], columns=my_c)
+    for key in str_cols:
+        X_test[key] = label_encoder.fit_transform(X_test[key])
+    for key in float_cols:
+        X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
+
+
+with open("whole_data/y_data.json") as f2:
+    y_list = json.loads(f2.read())
+    y__list = [0 if i <= 25 else 1 for i in y_list]
+    y_train = np.array(y__list[:86434])
+    y_test = np.array(y__list[86434:])
+
+# Create the LightGBM datasets
+train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode'])
+test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
+
+# Set the model parameters
+params = {
+    'objective': 'binary',  # binary classification task
+    'metric': 'binary_logloss',  # evaluation metric: binary log loss
+    'num_leaves': 31,  # number of leaves per tree
+    'learning_rate': 0.05,  # learning rate
+    'bagging_fraction': 0.9,  # fraction of samples used when building each tree
+    'feature_fraction': 0.8,  # fraction of features used when building each tree
+    'bagging_freq': 5,  # k means bagging is performed every k iterations
+}
 
 # Train the model
-# num_round = 100
-# bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
-#
-# # 预测
-# y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
-# # 转换为二进制输出
-# y_pred_binary = np.where(y_pred > 0.5, 1, 0)
-#
-# # 评估模型
-# accuracy = accuracy_score(y_test, y_pred_binary)
-# print(f'Accuracy: {accuracy}')
+num_round = 100
+bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
+
+# Predict
+y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
+# Convert probabilities to binary output
+y_pred_binary = np.where(y_pred > 0.5, 1, 0)
+
+# Evaluate the model
+accuracy = accuracy_score(y_test, y_pred_binary)
+print(f'Accuracy: {accuracy}')
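
Note on the encoding step above: `label_encoder.fit_transform` is called separately on `X_train` and `X_test`, so the same category value can receive different integer codes in the two splits. A minimal sketch of one way to keep the mapping consistent is shown below; the `encode_columns` helper and its arguments are hypothetical names, not part of this commit, and the sketch assumes the `str_cols` list defined earlier in main.py.

# Hypothetical alternative (not part of the commit): fit one LabelEncoder per
# string column on the training split and reuse its mapping for the test split,
# sending categories unseen during training to -1.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_columns(train_df: pd.DataFrame, test_df: pd.DataFrame, cols):
    for col in cols:
        enc = LabelEncoder()
        # Learn the category -> integer mapping from the training data only.
        train_df[col] = enc.fit_transform(train_df[col].astype(str))
        mapping = {label: code for code, label in enumerate(enc.classes_)}
        # Apply the same mapping to the test data; unknown categories become -1.
        test_df[col] = test_df[col].astype(str).map(mapping).fillna(-1).astype(int)
    return train_df, test_df

# Example usage with the frames built above:
# X_train, X_test = encode_columns(X_train, X_test, str_cols)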