소스 검색

data-to-local

罗俊辉 1 년 전
부모
커밋
dfe5b10881
2개의 변경된 파일에 18개의 추가와 16개의 삭제
  1. +9 -7
      data_process/process_data_for_lightgbm.py
  2. +9 -9
      main_spider.py

+ 9 - 7
data_process/process_data_for_lightgbm.py

@@ -82,12 +82,13 @@ class SpiderProcess(object):
                     temp.append(None)
                     temp.append(None)
                     temp.append(None)
-
-                df.append(temp[2:])
+                df.append(temp)
             except:
                 continue
-        df = pd.DataFrame(df, columns=['label', 'channel', 'out_user_id', 'mode', 'out_play_cnt', 'out_like_cnt',
-                                       'out_share_cnt', 'lop', 'duration', 'tag1', 'tag2', 'tag3'])
+        df = pd.DataFrame(df,
+                          columns=['video_id', 'video_title', 'label', 'channel', 'out_user_id', 'mode', 'out_play_cnt',
+                                   'out_like_cnt',
+                                   'out_share_cnt', 'lop', 'duration', 'tag1', 'tag2', 'tag3'])
 
         df.to_json(des_path, orient='records')
 
@@ -130,8 +131,9 @@ class UserProcess(object):
         temp_time = three_date_before.strftime("%Y%m%d")
         if flag == "train":
             sql = f"""select video_title, label, user_id, channel, user_fans, user_view_30, user_share_30, user_return_30, user_rov, user_str, user_return_videos_30, user_return_videos_3, user_return_3, user_view_3, user_share_3, address from lightgbm_data where type = 'userupload' and daily_dt_str <= '{temp_time}';"""
-            des_path = "/root/luojunhui/alg/data/train_data/user_train_{}.json".format(datetime.datetime.today().strftime("%Y%m%d"))
-            
+            des_path = "/root/luojunhui/alg/data/train_data/user_train_{}.json".format(
+                datetime.datetime.today().strftime("%Y%m%d"))
+
         elif flag == "predict":
             sql = f"""select video_title, label, user_id, channel, user_fans, user_view_30, user_share_30, user_return_30, user_rov, user_str, user_return_videos_30, user_return_videos_3, user_return_3, user_view_3, user_share_3, address from lightgbm_data where type = 'userupload' and daily_dt_str = '{temp_time}';"""
             des_path = "/root/luojunhui/alg/data/predict_data/user_predict_{}.json".format(dt_time.strftime("%Y%m%d"))
@@ -178,4 +180,4 @@ if __name__ == '__main__':
             S.spider_data_produce(flag=flag, dt_time=dt)
         case "user_info":
             U = UserProcess()
-            U.generate_user_data(flag=flag, dt_time=dt)
+            U.generate_user_data(flag=flag, dt_time=dt)

+ 9 - 9
main_spider.py

@@ -61,23 +61,25 @@ class LightGBM(object):
         :return:
         """
         df = pd.read_json(path)
-        df = df.dropna(subset=['label'])
+        df = df.dropna(subset=['label'])  # 把 label 为空的删掉
         labels = df['label']
+        video_ids = df['video_id']
+        video_titles = df['video_title']
         if not yc:
             temp = sorted(labels)
             yc = temp[int(len(temp) * 0.7)]
         print("阈值", yc)
         labels = [0 if i < yc else 1 for i in labels]
-        features = df.drop("label", axis=1)
+        features = df.drop(['video_id', 'label', 'video_title'], axis=1)
         for key in self.float_columns:
             features[key] = pd.to_numeric(features[key], errors="coerce")
         for key in self.str_columns:
             features[key] = self.label_encoder.fit_transform(features[key])
-        return features, labels
+        return features, labels, video_ids, video_titles
 
     def best_params(self):
         path = "data/train_data/spider_data_240401.json"
-        X, y = self.read_data(path)
+        X, y, ids, titles = self.read_data(path)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
         lgbm = lgb.LGBMClassifier(objective='binary')
@@ -121,7 +123,7 @@ class LightGBM(object):
         :return:
         """
         path = "data/train_data/spider_train_20240402.json"
-        x, y = self.read_data(path)
+        x, y, ids, titles = self.read_data(path)
         train_size = int(len(x) * self.split_c)
         X_train, X_test = x[:train_size], x[train_size:]
         Y_train, Y_test = y[:train_size], y[train_size:]
@@ -155,9 +157,7 @@ class LightGBM(object):
         """
         fw = open("result/summary_{}.txt".format(dt), "a+", encoding="utf-8")
         path = 'data/predict_data/predict_{}.json'.format(dt)
-        x, y = self.read_data(path, yc=6)
-        print(type(x))
-        print(type(y))
+        x, y, ids, titles = self.read_data(path, yc=6)
         true_label_df = pd.DataFrame(list(y), columns=['ture_label'])
         bst = lgb.Booster(model_file=self.model)
         y_pred = bst.predict(x, num_iteration=bst.best_iteration)
@@ -183,7 +183,7 @@ class LightGBM(object):
         print(f"Accuracy: {accuracy}")
         fw.close()
         # 水平合并
-        df_concatenated = pd.concat([x, true_label_df,pred_score_df, pred_label_df], axis=1)
+        df_concatenated = pd.concat([ids, titles, x, true_label_df, pred_score_df, pred_label_df], axis=1)
         df_concatenated.to_excel("data/predict_data/spider_predict_result_{}.xlsx".format(dt), index=False)
 
     def feature_importance(self):