|
@@ -61,23 +61,25 @@ class LightGBM(object):
|
|
|
:return:
|
|
|
"""
|
|
|
df = pd.read_json(path)
|
|
|
- df = df.dropna(subset=['label'])
|
|
|
+ df = df.dropna(subset=['label']) # drop rows whose label is missing
|
|
|
labels = df['label']
|
|
|
+ video_ids = df['video_id']
|
|
|
+ video_titles = df['video_title']
|
|
|
if not yc:
|
|
|
temp = sorted(labels)
|
|
|
yc = temp[int(len(temp) * 0.7)]
|
|
|
print("阈值", yc)
|
|
|
labels = [0 if i < yc else 1 for i in labels]
|
|
|
- features = df.drop("label", axis=1)
|
|
|
+ features = df.drop(['video_id', 'label', 'video_title'], axis=1)
|
|
|
for key in self.float_columns:
|
|
|
features[key] = pd.to_numeric(features[key], errors="coerce")
|
|
|
for key in self.str_columns:
|
|
|
features[key] = self.label_encoder.fit_transform(features[key])
|
|
|
- return features, labels
|
|
|
+ return features, labels, video_ids, video_titles
|
|
|
|
|
|
def best_params(self):
|
|
|
path = "data/train_data/spider_data_240401.json"
|
|
|
- X, y = self.read_data(path)
|
|
|
+ X, y, ids, titles = self.read_data(path)
|
|
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
|
|
|
|
lgbm = lgb.LGBMClassifier(objective='binary')
|
|
@@ -121,7 +123,7 @@ class LightGBM(object):
|
|
|
:return:
|
|
|
"""
|
|
|
path = "data/train_data/spider_train_20240402.json"
|
|
|
- x, y = self.read_data(path)
|
|
|
+ x, y, ids, titles = self.read_data(path)
|
|
|
train_size = int(len(x) * self.split_c)
|
|
|
X_train, X_test = x[:train_size], x[train_size:]
|
|
|
Y_train, Y_test = y[:train_size], y[train_size:]
|
|
@@ -155,9 +157,7 @@ class LightGBM(object):
|
|
|
"""
|
|
|
fw = open("result/summary_{}.txt".format(dt), "a+", encoding="utf-8")
|
|
|
path = 'data/predict_data/predict_{}.json'.format(dt)
|
|
|
- x, y = self.read_data(path, yc=6)
|
|
|
- print(type(x))
|
|
|
- print(type(y))
|
|
|
+ x, y, ids, titles = self.read_data(path, yc=6)
|
|
|
true_label_df = pd.DataFrame(list(y), columns=['ture_label'])
|
|
|
bst = lgb.Booster(model_file=self.model)
|
|
|
y_pred = bst.predict(x, num_iteration=bst.best_iteration)
|
|
@@ -183,7 +183,7 @@ class LightGBM(object):
|
|
|
print(f"Accuracy: {accuracy}")
|
|
|
fw.close()
|
|
|
# 水平合并
|
|
|
- df_concatenated = pd.concat([x, true_label_df,pred_score_df, pred_label_df], axis=1)
|
|
|
+ df_concatenated = pd.concat([ids, titles, x, true_label_df, pred_score_df, pred_label_df], axis=1)
|
|
|
df_concatenated.to_excel("data/predict_data/spider_predict_result_{}.xlsx".format(dt), index=False)
|
|
|
|
|
|
def feature_importance(self):
|