# main.py (2.2 KB)

  1. import os
  2. import sys
  3. import json
  4. sys.path.append(os.getcwd())
  5. import numpy as np
  6. import pandas as pd
  7. import lightgbm as lgb
  8. from sklearn.model_selection import train_test_split
  9. from sklearn.datasets import make_classification
  10. from sklearn.metrics import accuracy_score
  11. my_c = [
  12. "uid",
  13. "type",
  14. "channel",
  15. "fans",
  16. "view_count_user_30days",
  17. "share_count_user_30days",
  18. "return_count_user_30days",
  19. "rov_user",
  20. "str_user",
  21. "out_user_id",
  22. "mode",
  23. "out_play_cnt",
  24. "out_like_cnt",
  25. "out_share_cnt",
  26. "out_collection_cnt"
  27. ]
  28. with open("whole_data/x_data.json") as f1:
  29. x_list = json.loads(f1.read())
  30. X_train = pd.DataFrame(x_list[:10000], columns=my_c)
  31. X_train['uid'] = X_train['uid'].astype(str)
  32. X_train['type'] = X_train['type'].astype(str)
  33. X_train['channel'] = X_train['channel'].astype(str)
  34. X_test = pd.DataFrame(x_list[10000:], columns=my_c)
  35. X_test['uid'] = X_test['uid'].astype(str)
  36. X_test['type'] = X_test['type'].astype(str)
  37. X_test['channel'] = X_test['channel'].astype(str)
  38. with open("whole_data/y_data.json") as f2:
  39. y_list = json.loads(f2.read())
  40. y_train = np.array(y_list[:10000])
  41. y_test = np.array(y_list[10000:])
  42. # 创建LightGBM数据集
  43. train_data = lgb.Dataset(X_train, label=y_train, categorical_features=['uid', 'type', 'channel'])
  44. test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
  45. # 设置模型的参数
  46. params = {
  47. 'objective': 'binary', # 指定二分类任务
  48. 'metric': 'binary_logloss', # 评估指标为二分类的log损失
  49. 'num_leaves': 31, # 叶子节点数
  50. 'learning_rate': 0.05, # 学习率
  51. 'bagging_fraction': 0.9, # 建树的样本采样比例
  52. 'feature_fraction': 0.8, # 建树的特征选择比例
  53. 'bagging_freq': 5, # k 意味着每 k 次迭代执行bagging
  54. }
  55. # 训练模型
  56. num_round = 100
  57. bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
  58. # 预测
  59. y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
  60. # 转换为二进制输出
  61. y_pred_binary = np.where(y_pred > 0.5, 1, 0)
  62. # 评估模型
  63. accuracy = accuracy_score(y_test, y_pred_binary)
  64. print(f'Accuracy: {accuracy}')