# main.py — LightGBM binary classifier: load JSON features/labels, encode, train, evaluate.
  1. import os
  2. import sys
  3. import json
  4. sys.path.append(os.getcwd())
  5. import numpy as np
  6. import pandas as pd
  7. import lightgbm as lgb
  8. from sklearn.preprocessing import LabelEncoder
  9. from sklearn.metrics import accuracy_score
  10. label_encoder = LabelEncoder()
  11. my_c = [
  12. "uid",
  13. "type",
  14. "channel",
  15. "fans",
  16. "view_count_user_30days",
  17. "share_count_user_30days",
  18. "return_count_user_30days",
  19. "rov_user",
  20. "str_user",
  21. "out_user_id",
  22. "mode",
  23. "out_play_cnt",
  24. "out_like_cnt",
  25. "out_share_cnt",
  26. "out_collection_cnt"
  27. ]
  28. str_cols = ["uid", "type", "channel", "mode", "out_user_id"]
  29. float_cols = [
  30. "fans",
  31. "view_count_user_30days",
  32. "share_count_user_30days",
  33. "return_count_user_30days",
  34. "rov_user",
  35. "str_user",
  36. "out_play_cnt",
  37. "out_like_cnt",
  38. "out_share_cnt",
  39. "out_collection_cnt"
  40. ]
  41. with open("whole_data/x_data_3day_up_level.json") as f1:
  42. x_list = json.loads(f1.read())
  43. index_t = int(len(x_list) * 0.7)
  44. X_train = pd.DataFrame(x_list[:index_t], columns=my_c)
  45. for key in str_cols:
  46. X_train[key] = label_encoder.fit_transform(X_train[key])
  47. for key in float_cols:
  48. X_train[key] = pd.to_numeric(X_train[key], errors='coerce')
  49. X_test = pd.DataFrame(x_list[index_t:], columns=my_c)
  50. for key in str_cols:
  51. X_test[key] = label_encoder.fit_transform(X_test[key])
  52. for key in float_cols:
  53. X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
  54. with open("whole_data/y_data_3day_up_level.json") as f2:
  55. y_list = json.loads(f2.read())
  56. index_t = int(len(y_list) * 0.7)
  57. # temp = sorted(y_list)
  58. # yuzhi = temp[int(len(temp) * 0.8)-1]
  59. # y__list = [0 if i <= yuzhi else 1 for i in y_list]
  60. y_train = np.array(y_list[:index_t])
  61. y_test = np.array(y_list[index_t:])
  62. # 创建LightGBM数据集
  63. train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode', 'out_user_id'])
  64. test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
  65. # 设置模型的参数
  66. params = {
  67. 'objective': 'binary', # 指定二分类任务
  68. 'metric': 'binary_logloss', # 评估指标为二分类的log损失
  69. 'num_leaves': 31, # 叶子节点数
  70. 'learning_rate': 0.05, # 学习率
  71. 'bagging_fraction': 0.9, # 建树的样本采样比例
  72. 'feature_fraction': 0.8, # 建树的特征选择比例
  73. 'bagging_freq': 5, # k 意味着每 k 次迭代执行bagging
  74. }
  75. # 训练模型
  76. num_round = 1000
  77. bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
  78. # 预测
  79. y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
  80. # 转换为二进制输出
  81. y_pred_binary = np.where(y_pred > 0.7, 1, 0)
  82. # 评估模型
  83. accuracy = accuracy_score(y_test, y_pred_binary)
  84. print(f'Accuracy: {accuracy}')