# main.py
  1. import os
  2. import sys
  3. import json
  4. sys.path.append(os.getcwd())
  5. import numpy as np
  6. import pandas as pd
  7. import lightgbm as lgb
  8. from sklearn.preprocessing import LabelEncoder
  9. from sklearn.metrics import accuracy_score
# Shared LabelEncoder instance.
# NOTE(review): fit_transform is called on it once per column below, so each
# call overwrites the previous column's mapping — only safe because transform
# happens immediately after each fit.
label_encoder = LabelEncoder()
  11. my_c = [
  12. "uid",
  13. "type",
  14. "channel",
  15. "fans",
  16. "view_count_user_30days",
  17. "share_count_user_30days",
  18. "return_count_user_30days",
  19. "rov_user",
  20. "str_user",
  21. "out_user_id",
  22. "mode",
  23. "out_play_cnt",
  24. "out_like_cnt",
  25. "out_share_cnt",
  26. "out_collection_cnt"
  27. ]
  28. str_cols = ["uid", "type", "channel", "mode", "out_user_id"]
  29. float_cols = [
  30. "fans",
  31. "view_count_user_30days",
  32. "share_count_user_30days",
  33. "return_count_user_30days",
  34. "rov_user",
  35. "str_user",
  36. "out_play_cnt",
  37. "out_like_cnt",
  38. "out_share_cnt",
  39. "out_collection_cnt"
  40. ]
  41. with open("whole_data/x_data_total_return.json") as f1:
  42. x_list = json.loads(f1.read())[30000:230000]
  43. print(len(x_list))
  44. index_t = int(len(x_list) * 0.7)
  45. X_train = pd.DataFrame(x_list[:index_t], columns=my_c)
  46. for key in str_cols:
  47. X_train[key] = label_encoder.fit_transform(X_train[key])
  48. for key in float_cols:
  49. X_train[key] = pd.to_numeric(X_train[key], errors='coerce')
  50. X_test = pd.DataFrame(x_list[index_t:], columns=my_c)
  51. for key in str_cols:
  52. X_test[key] = label_encoder.fit_transform(X_test[key])
  53. for key in float_cols:
  54. X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
  55. print("读取X数据成功!")
  56. with open("whole_data/y_data_total_return.json") as f2:
  57. y_list = json.loads(f2.read())[30000:230000]
  58. print(len(y_list))
  59. index_t = int(len(y_list) * 0.7)
  60. temp = sorted(y_list)
  61. yuzhi = temp[int(len(temp) * 0.8)-1]
  62. y__list = [0 if i <= yuzhi else 1 for i in y_list]
  63. y_train = np.array(y__list[:index_t])
  64. y_test = np.array(y__list[index_t:])
  65. print("读取Y数据成功!")
  66. # 创建LightGBM数据集
  67. train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode', 'out_user_id'])
  68. test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
  69. print("数据集创建成功")
  70. # 设置模型的参数
  71. # params = {
  72. # 'objective': 'binary', # 指定二分类任务
  73. # 'metric': 'binary_logloss', # 评估指标为二分类的log损失
  74. # 'num_leaves': 31, # 叶子节点数
  75. # 'learning_rate': 0.05, # 学习率
  76. # 'bagging_fraction': 0.9, # 建树的样本采样比例
  77. # 'feature_fraction': 0.8, # 建树的特征选择比例
  78. # 'bagging_freq': 5, # k 意味着每 k 次迭代执行bagging
  79. # }
  80. params = {
  81. 'objective': 'binary',
  82. 'metric': 'binary_logloss',
  83. 'num_leaves': 20, # 减少叶子节点数
  84. 'learning_rate': 0.1, # 增大学习率
  85. 'bagging_fraction': 0.95, # 略微增加抽样比例
  86. 'feature_fraction': 0.95, # 略微增加特征抽样比例
  87. 'bagging_freq': 0, # 减少bagging频率
  88. # 'boosting_type': 'hist', # 使用基于直方图的快速方法
  89. }
  90. # 训练模型
  91. num_round = 100
  92. print("开始训练......")
  93. bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
  94. print("训练完成! , 开始预测......")
  95. # 预测
  96. y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
  97. # 转换为二进制输出
  98. y_pred_binary = np.where(y_pred > 0.7, 1, 0)
  99. # 评估模型
  100. accuracy = accuracy_score(y_test, y_pred_binary)
  101. print(f'Accuracy: {accuracy}')