Browse Source

更新 label 数据

罗俊辉 1 year ago
parent
commit
1beb98a29f
1 changed files with 12 additions and 9 deletions
  1. 12 9
      main_spider.py

+ 12 - 9
main_spider.py

@@ -64,11 +64,11 @@ class LightGBM(object):
         df = pd.read_json(path)
         df = df.dropna(subset=['label'])  # 把 label 为空的删掉
         labels = df['label']
-        if not yc:
-            temp = sorted(labels)
-            yc = temp[int(len(temp) * 0.7)]
-        print("阈值", yc)
-        labels = [0 if i < yc else 1 for i in labels]
+        # if not yc:
+        #     temp = sorted(labels)
+        #     yc = temp[int(len(temp) * 0.7)]
+        # print("阈值", yc)
+        # labels = [0 if i < yc else 1 for i in labels]
         features = df.drop(['video_id', 'label', 'video_title'], axis=1)
         for key in self.float_columns:
             features[key] = pd.to_numeric(features[key], errors="coerce")
@@ -78,11 +78,14 @@ class LightGBM(object):
         return features, labels, df
 
     def best_params(self):
-        path = "data/train_data/spider_data_240401.json"
+        """
+        find best params for lightgbm
+        """
+        path = "data/train_data/spider_train_20240408.json"
         X, y, ori_df = self.read_data(path)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
-        lgbm = lgb.LGBMClassifier(objective='binary')
+        lgb_ = lgb.LGBMClassifier(objective='binary')
 
         # 设置搜索的参数范围
         param_dist = {
@@ -96,7 +99,7 @@ class LightGBM(object):
 
         # 定义 RandomizedSearchCV
         rsearch = RandomizedSearchCV(
-            estimator=lgbm,
+            estimator=lgb_,
             param_distributions=param_dist,
             n_iter=100,
             cv=3,
@@ -122,7 +125,7 @@ class LightGBM(object):
         Load dataset
         :return:
         """
-        path = "data/train_data/spider_train_20240402.json"
+        path = "data/train_data/spider_train_20240408.json"
         x, y, ori_df = self.read_data(path)
         train_size = int(len(x) * self.split_c)
         X_train, X_test = x[:train_size], x[train_size:]