소스 검색

优化数据处理代码

罗俊辉 1년 전
부모
커밋
c60d90ef14
2개의 변경된 파일, 3개의 추가 및 3개의 삭제
  1. 2 2
      data_process/process_data_for_lightgbm.py
  2. 1 1
      main_spider.py

+ 2 - 2
data_process/process_data_for_lightgbm.py

@@ -10,7 +10,7 @@ import pandas as pd
 
 sys.path.append(os.getcwd())
 
-from functions import generate_label_date, MysqlClient, MySQLClientSpider
+from functions import MySQLClientSpider
 
 
 class SpiderProcess(object):
@@ -54,7 +54,7 @@ class SpiderProcess(object):
         temp_time = three_date_before.strftime("%Y%m%d")
         if flag == "train":
             select_sql = f"""SELECT video_id, video_title, label, channel, out_user_id, spider_mode, out_play_cnt, out_like_cnt, out_share_cnt FROM lightgbm_data WHERE type = 'spider' and daily_dt_str <= '{temp_time}' order by daily_dt_str;"""
-            des_path = "/root/luojunhui/alg/data/train_data/spider_train_{}".format(
+            des_path = "/root/luojunhui/alg/data/train_data/spider_train_{}.json".format(
                 datetime.datetime.today().strftime("%Y%m%d"))
         elif flag == "predict":
             select_sql = f"""SELECT video_id, video_title, label, channel, out_user_id, spider_mode, out_play_cnt, out_like_cnt, out_share_cnt FROM lightgbm_data WHERE type = 'spider' and daily_dt_str = '{temp_time}';"""

+ 1 - 1
main_spider.py

@@ -118,7 +118,7 @@ class LightGBM(object):
         Load dataset
         :return:
         """
-        path = "data/train_data/spider_data_240401.json"
+        path = "data/train_data/spider_data_20240402"
         x, y = self.read_data(path)
         train_size = int(len(x) * self.split_c)
         X_train, X_test = x[:train_size], x[train_size:]