Browse Source

generate label for mysql

罗俊辉 1 year ago
parent
commit
a7a5bb9b13
1 changed file with 6 additions and 4 deletions
  1. process_data.py (+6 -4)

+ 6 - 4
process_data.py

@@ -7,6 +7,8 @@ import os
 import json
 import asyncio
 import argparse
+import time
+
 from tqdm import tqdm
 import jieba.analyse
 from concurrent.futures.thread import ThreadPoolExecutor
@@ -154,17 +156,17 @@ class DataProcessor(object):
             title = read_title(client=self.client, video_id=video_id)
             label, dt_daily = generate_label(video_id, hour_dt, label_info)
 
-            insert_sql = f"""UPDATE lightgbm_data 
-            set video_title = '{title}',  label = '{label}', daily_dt_str = '{dt_daily}' where video_id = '{video_id}';"""
+            insert_sql = f"""UPDATE lightgbm_data set video_title = '{title}',  label = '{label}', daily_dt_str = '{dt_daily}' where video_id = '{video_id}';"""
             print(insert_sql)
             self.client_spider.update(insert_sql)
 
-        select_sql = "SELECT video_id, hour_dt_str FROM lightgbm_data where label is NULL and hour_dt_str < '20240327';"
+        select_sql = "SELECT video_id, hour_dt_str FROM lightgbm_data where label = 0 and hour_dt_str < '20240327';"
         init_data_tuple = self.client_spider.select(select_sql)
         init_list = list(init_data_tuple)
-        for item in tqdm(init_list):
+        for item in tqdm(init_list[:100]):
             # print(item)
             process_info(item)
+            time.sleep(0.5)
         # with ThreadPoolExecutor(max_workers=10) as Pool:
         #     Pool.map(process_info, init_list)