Browse Source

获取 rov 数据

罗俊辉 1 year ago
parent
commit
3067f9d9f9
1 changed file with 14 additions and 20 deletions
  1. 14 20
      main_spider.py

+ 14 - 20
main_spider.py

@@ -3,10 +3,6 @@
 """
 import os
 import sys
-import json
-import optuna
-import numpy as np
-from odps import DataFrame
 
 from sklearn.preprocessing import LabelEncoder
 
@@ -51,7 +47,7 @@ class LightGBM(object):
         ]
         self.split_c = 0.7
         self.yc = 0.8
-        self.model = "models/lightgbm_0408_spider.bin"
+        self.model = "models/lightgbm_0409_spider.bin"
         self.flag = flag
         self.dt = dt
         # self.label_mapping = {}
@@ -64,11 +60,11 @@ class LightGBM(object):
         df = pd.read_json(path)
         df = df.dropna(subset=['label'])  # 把 label 为空的删掉
         labels = df['label']
-        # if not yc:
-        #     temp = sorted(labels)
-        #     yc = temp[int(len(temp) * 0.7)]
-        # print("阈值", yc)
-        # labels = [0 if i < yc else 1 for i in labels]
+        if not yc:
+            temp = sorted(labels)
+            yc = temp[int(len(temp) * 0.7)]
+        print("阈值", yc)
+        labels = [0 if i < yc else 1 for i in labels]
         features = df.drop(['video_id', 'label', 'video_title'], axis=1)
         for key in self.float_columns:
             features[key] = pd.to_numeric(features[key], errors="coerce")
@@ -81,8 +77,8 @@ class LightGBM(object):
         """
         find best params for lightgbm
         """
-        path = "data/train_data/spider_train_20240408.json"
-        X, y, ori_df = self.read_data(path)
+        path = "data/train_data/spider_train_20240409.json"
+        X, y, ori_df = self.read_data(path, 0.05)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
         lgb_ = lgb.LGBMClassifier(objective='binary')
@@ -126,7 +122,7 @@ class LightGBM(object):
         :return:
         """
         path = "data/train_data/spider_train_20240408.json"
-        x, y, ori_df = self.read_data(path)
+        x, y, ori_df = self.read_data(path, 0.05)
         train_size = int(len(x) * self.split_c)
         X_train, X_test = x[:train_size], x[train_size:]
         Y_train, Y_test = y[:train_size], y[train_size:]
@@ -159,23 +155,21 @@ class LightGBM(object):
         :return:
         """
         fw = open("result/summary_{}.txt".format(dt), "a+", encoding="utf-8")
-        path = 'data/predict_data/predict_{}.json'.format(dt)
-        x, y, ori_df = self.read_data(path, yc=6)
+        path = 'data/predict_data/predict_spider_{}.json'.format(dt)
+        x, y, ori_df = self.read_data(path, yc=0.05)
         true_label_df = pd.DataFrame(list(y), columns=['ture_label'])
         bst = lgb.Booster(model_file=self.model)
         y_pred = bst.predict(x, num_iteration=bst.best_iteration)
         pred_score_df = pd.DataFrame(list(y_pred), columns=['pred_score'])
-        # temp = sorted(list(y_pred))
-        # yuzhi = temp[int(len(temp) * 0.9) - 1]
-        y_pred_binary = [0 if i <= 0.169541 else 1 for i in list(y_pred)]
+        temp = sorted(list(y_pred))
+        yuzhi = temp[int(len(temp) * 0.75) - 1]
+        y_pred_binary = [0 if i <= yuzhi else 1 for i in list(y_pred)]
         pred_label_df = pd.DataFrame(list(y_pred_binary), columns=['pred_label'])
         score_list = []
         for index, item in enumerate(list(y_pred)):
             real_label = y[index]
             score = item
             prid_label = y_pred_binary[index]
-            if score < 0.169541:
-                print(real_label, "\t", prid_label, "\t", score)
             fw.write("{}\t{}\t{}\n".format(real_label, prid_label, score))
             score_list.append(score)
         print("预测样本总量: {}".format(len(score_list)))