|
@@ -64,11 +64,11 @@ class LightGBM(object):
|
|
|
df = pd.read_json(path)
|
|
|
df = df.dropna(subset=['label']) # 把 label 为空的删掉
|
|
|
labels = df['label']
|
|
|
- if not yc:
|
|
|
- temp = sorted(labels)
|
|
|
- yc = temp[int(len(temp) * 0.7)]
|
|
|
- print("阈值", yc)
|
|
|
- labels = [0 if i < yc else 1 for i in labels]
|
|
|
+ # if not yc:
|
|
|
+ # temp = sorted(labels)
|
|
|
+ # yc = temp[int(len(temp) * 0.7)]
|
|
|
+ # print("阈值", yc)
|
|
|
+ # labels = [0 if i < yc else 1 for i in labels]
|
|
|
features = df.drop(['video_id', 'label', 'video_title'], axis=1)
|
|
|
for key in self.float_columns:
|
|
|
features[key] = pd.to_numeric(features[key], errors="coerce")
|
|
@@ -78,11 +78,14 @@ class LightGBM(object):
|
|
|
return features, labels, df
|
|
|
|
|
|
def best_params(self):
|
|
|
- path = "data/train_data/spider_data_240401.json"
|
|
|
+ """
|
|
|
+ Find the best hyperparameters for the LightGBM classifier using randomized search.
|
|
|
+ """
|
|
|
+ path = "data/train_data/spider_train_20240408.json"
|
|
|
X, y, ori_df = self.read_data(path)
|
|
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
|
|
|
|
- lgbm = lgb.LGBMClassifier(objective='binary')
|
|
|
+ lgb_ = lgb.LGBMClassifier(objective='binary')
|
|
|
|
|
|
# Define the parameter ranges to search over
|
|
|
param_dist = {
|
|
@@ -96,7 +99,7 @@ class LightGBM(object):
|
|
|
|
|
|
# Set up the RandomizedSearchCV instance
|
|
|
rsearch = RandomizedSearchCV(
|
|
|
- estimator=lgbm,
|
|
|
+ estimator=lgb_,
|
|
|
param_distributions=param_dist,
|
|
|
n_iter=100,
|
|
|
cv=3,
|
|
@@ -122,7 +125,7 @@ class LightGBM(object):
|
|
|
Load dataset
|
|
|
:return:
|
|
|
"""
|
|
|
- path = "data/train_data/spider_train_20240402.json"
|
|
|
+ path = "data/train_data/spider_train_20240408.json"
|
|
|
x, y, ori_df = self.read_data(path)
|
|
|
train_size = int(len(x) * self.split_c)
|
|
|
X_train, X_test = x[:train_size], x[train_size:]
|