|
@@ -3,10 +3,6 @@
|
|
|
"""
|
|
|
import os
|
|
|
import sys
|
|
|
-import json
|
|
|
-import optuna
|
|
|
-import numpy as np
|
|
|
-from odps import DataFrame
|
|
|
|
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
|
|
@@ -51,7 +47,7 @@ class LightGBM(object):
|
|
|
]
|
|
|
self.split_c = 0.7
|
|
|
self.yc = 0.8
|
|
|
- self.model = "models/lightgbm_0408_spider.bin"
|
|
|
+ self.model = "models/lightgbm_0409_spider.bin"
|
|
|
self.flag = flag
|
|
|
self.dt = dt
|
|
|
# self.label_mapping = {}
|
|
@@ -64,11 +60,11 @@ class LightGBM(object):
|
|
|
df = pd.read_json(path)
|
|
|
df = df.dropna(subset=['label']) # 把 label 为空的删掉
|
|
|
labels = df['label']
|
|
|
- # if not yc:
|
|
|
- # temp = sorted(labels)
|
|
|
- # yc = temp[int(len(temp) * 0.7)]
|
|
|
- # print("阈值", yc)
|
|
|
- # labels = [0 if i < yc else 1 for i in labels]
|
|
|
+ if not yc:
|
|
|
+ temp = sorted(labels)
|
|
|
+ yc = temp[int(len(temp) * 0.7)]
|
|
|
+ print("阈值", yc)
|
|
|
+ labels = [0 if i < yc else 1 for i in labels]
|
|
|
features = df.drop(['video_id', 'label', 'video_title'], axis=1)
|
|
|
for key in self.float_columns:
|
|
|
features[key] = pd.to_numeric(features[key], errors="coerce")
|
|
@@ -81,8 +77,8 @@ class LightGBM(object):
|
|
|
"""
|
|
|
find best params for lightgbm
|
|
|
"""
|
|
|
- path = "data/train_data/spider_train_20240408.json"
|
|
|
- X, y, ori_df = self.read_data(path)
|
|
|
+ path = "data/train_data/spider_train_20240409.json"
|
|
|
+ X, y, ori_df = self.read_data(path, 0.05)
|
|
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
|
|
|
|
lgb_ = lgb.LGBMClassifier(objective='binary')
|
|
@@ -126,7 +122,7 @@ class LightGBM(object):
|
|
|
:return:
|
|
|
"""
|
|
|
path = "data/train_data/spider_train_20240408.json"
|
|
|
- x, y, ori_df = self.read_data(path)
|
|
|
+ x, y, ori_df = self.read_data(path, 0.05)
|
|
|
train_size = int(len(x) * self.split_c)
|
|
|
X_train, X_test = x[:train_size], x[train_size:]
|
|
|
Y_train, Y_test = y[:train_size], y[train_size:]
|
|
@@ -159,23 +155,21 @@ class LightGBM(object):
|
|
|
:return:
|
|
|
"""
|
|
|
fw = open("result/summary_{}.txt".format(dt), "a+", encoding="utf-8")
|
|
|
- path = 'data/predict_data/predict_{}.json'.format(dt)
|
|
|
- x, y, ori_df = self.read_data(path, yc=6)
|
|
|
+ path = 'data/predict_data/predict_spider_{}.json'.format(dt)
|
|
|
+ x, y, ori_df = self.read_data(path, yc=0.05)
|
|
|
true_label_df = pd.DataFrame(list(y), columns=['ture_label'])
|
|
|
bst = lgb.Booster(model_file=self.model)
|
|
|
y_pred = bst.predict(x, num_iteration=bst.best_iteration)
|
|
|
pred_score_df = pd.DataFrame(list(y_pred), columns=['pred_score'])
|
|
|
- # temp = sorted(list(y_pred))
|
|
|
- # yuzhi = temp[int(len(temp) * 0.9) - 1]
|
|
|
- y_pred_binary = [0 if i <= 0.169541 else 1 for i in list(y_pred)]
|
|
|
+ temp = sorted(list(y_pred))
|
|
|
+ yuzhi = temp[int(len(temp) * 0.75) - 1]
|
|
|
+ y_pred_binary = [0 if i <= yuzhi else 1 for i in list(y_pred)]
|
|
|
pred_label_df = pd.DataFrame(list(y_pred_binary), columns=['pred_label'])
|
|
|
score_list = []
|
|
|
for index, item in enumerate(list(y_pred)):
|
|
|
real_label = y[index]
|
|
|
score = item
|
|
|
prid_label = y_pred_binary[index]
|
|
|
- if score < 0.169541:
|
|
|
- print(real_label, "\t", prid_label, "\t", score)
|
|
|
fw.write("{}\t{}\t{}\n".format(real_label, prid_label, score))
|
|
|
score_list.append(score)
|
|
|
print("预测样本总量: {}".format(len(score_list)))
|