main.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. import os
  2. import sys
  3. import json
  4. sys.path.append(os.getcwd())
  5. import numpy as np
  6. import pandas as pd
  7. import lightgbm as lgb
  8. from sklearn.model_selection import train_test_split
  9. from sklearn.datasets import make_classification
  10. from sklearn.metrics import accuracy_score
  11. my_c = [
  12. "uid",
  13. "type",
  14. "channel",
  15. "fans",
  16. "view_count_user_30days",
  17. "share_count_user_30days",
  18. "return_count_user_30days",
  19. "rov_user",
  20. "str_user",
  21. "out_user_id",
  22. "mode",
  23. "out_play_cnt",
  24. "out_like_cnt",
  25. "out_share_cnt",
  26. "out_collection_cnt"
  27. ]
  28. with open("whole_data/x_data.json") as f1:
  29. x_list = json.loads(f1.read())
  30. X_train = pd.DataFrame(x_list[:10000], columns=my_c)
  31. X_train['uid'] = X_train['uid'].astype(str)
  32. X_train['type'] = X_train['type'].astype(str)
  33. X_train['channel'] = X_train['channel'].astype(str)
  34. X_train['mode'] = X_train['mode'].astype(str)
  35. X_test = pd.DataFrame(x_list[10000:], columns=my_c)
  36. X_test['uid'] = X_test['uid'].astype(str)
  37. X_test['type'] = X_test['type'].astype(str)
  38. X_test['channel'] = X_test['channel'].astype(str)
  39. X_test['mode'] = X_test['mode'].astype(str)
  40. with open("whole_data/y_data.json") as f2:
  41. y_list = json.loads(f2.read())
  42. y_train = np.array(y_list[:10000])
  43. y_test = np.array(y_list[10000:])
  44. # 创建LightGBM数据集
  45. train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode'])
  46. test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
  47. # 设置模型的参数
  48. params = {
  49. 'objective': 'binary', # 指定二分类任务
  50. 'metric': 'binary_logloss', # 评估指标为二分类的log损失
  51. 'num_leaves': 31, # 叶子节点数
  52. 'learning_rate': 0.05, # 学习率
  53. 'bagging_fraction': 0.9, # 建树的样本采样比例
  54. 'feature_fraction': 0.8, # 建树的特征选择比例
  55. 'bagging_freq': 5, # k 意味着每 k 次迭代执行bagging
  56. }
  57. # 训练模型
  58. num_round = 100
  59. bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
  60. # 预测
  61. y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
  62. # 转换为二进制输出
  63. y_pred_binary = np.where(y_pred > 0.5, 1, 0)
  64. # 评估模型
  65. accuracy = accuracy_score(y_test, y_pred_binary)
  66. print(f'Accuracy: {accuracy}')