Explorar o código

Merge branch '2025-02-12-nlp-task-improve' of luojunhui/LongArticlesJob into master

luojunhui hai 2 meses
pai
achega
5526fa764e

+ 31 - 18
coldStartTasks/filter/title_similarity_task.py

@@ -1,7 +1,9 @@
 """
 @author: luojunhui
 """
+import datetime
 import numpy as np
+import traceback
 
 from pymysql.cursors import DictCursor
 
@@ -52,14 +54,19 @@ class ColdStartTitleSimilarityTask(object):
         title_list = [i[0] for i in mysql_response]
         return title_list
 
-    def get_title_from_meta_base(self):
+    def get_title_from_meta_base(self, limit):
         """
         获取meta_base表中文章标题列表
         status: 1 表示文章初始化状态
         """
-        sql = f"""
-            select article_id, title from crawler_meta_article where status = 1 and score is null;
-        """
+        if limit:
+            sql = f"""
+                select article_id, title from crawler_meta_article where status = 1 and score is null limit {limit};
+            """
+        else:
+            sql = f"""
+                select article_id, title from crawler_meta_article where status = 1 and score is null;
+            """
         mysql_response = self.db_client.fetch(query=sql, cursor_type=DictCursor)
         return mysql_response
 
@@ -72,7 +79,7 @@ class ColdStartTitleSimilarityTask(object):
             set score = case article_id
                 {}
             end
-            where article_id in %s;
+            where article_id in %s and score is null;
         """
         case_statement = []
         article_id_list = []
@@ -88,11 +95,11 @@ class ColdStartTitleSimilarityTask(object):
         affected_rows = self.db_client.save(formatted_sql, params)
         return affected_rows
 
-    def run(self):
+    def run(self, limit=None):
         """
         执行任务
         """
-        target_article_list = self.get_title_from_meta_base()
+        target_article_list = self.get_title_from_meta_base(limit=limit)
         if not target_article_list:
             print("No more articles to process.")
             return
@@ -102,14 +109,20 @@ class ColdStartTitleSimilarityTask(object):
         batch_task_list = chunks(target_article_list, ARTICLE_BATCH)
 
         for batch_task in batch_task_list:
-            batch_target_title_list = [i['title'] for i in batch_task]
-            similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
-
-            update_data_list = []
-            for index, score_list in enumerate(similarity_array):
-                sorted_score_list = sorted(score_list)
-                percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
-                update_data_list.append((percent_threshold_score, batch_task[index]['article_id']))
-
-            affected_rows = self.update_meta_article_batch(update_data_list)
-            print(affected_rows)
+            try:
+                batch_target_title_list = [i['title'] for i in batch_task]
+                similarity_array = similarity_between_title_list(batch_target_title_list, base_title_list)
+
+                update_data_list = []
+                for index, score_list in enumerate(similarity_array):
+                    sorted_score_list = sorted(score_list)
+                    percent_threshold_score = np.percentile(sorted_score_list, PERCENT_THRESHOLD)
+                    update_data_list.append((percent_threshold_score, batch_task[index]['article_id']))
+
+                affected_rows = self.update_meta_article_batch(update_data_list)
+
+                print("{}: \t本次任务处理数量: {}".format(datetime.datetime.today().__str__(), affected_rows))
+            except Exception as e:
+                print("{}: \t本次任务处理失败: {}".format(datetime.datetime.today().__str__(), e))
+                print(traceback.format_exc())
+                continue

+ 7 - 2
requirements.txt

@@ -5,7 +5,7 @@ aliyun-log-python-sdk
 async-timeout
 elastic-transport
 elasticsearch
-numpy
+numpy~=1.26.4
 odps~=3.5.1
 pandas~=2.2.2
 pip
@@ -15,4 +15,9 @@ requests~=2.32.3
 schedule~=1.2.2
 setuptools
 tqdm~=4.66.4
-pyapollos~=0.1.5
+pyapollos~=0.1.5
+protobuf~=3.20.3
+openai~=1.17.0
+oss2~=2.19.1
+fake-useragent~=1.5.1
+playwright~=1.49.1

+ 26 - 0
sh/run_title_similarity_task.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+# Start title_similarity_score_task.py in the background unless it is already running.
+# Current date as YYYY-MM-DD, used to name the daily log file.
+CURRENT_DATE=$(date +%F)
+
+# Log file path; the date is part of the file name.
+LOG_FILE="/root/luojunhui/logs/title_similarity_log_$CURRENT_DATE.txt"
+
+# Append all output of this script (stdout and stderr) to the dated log file.
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 title_similarity_score_task.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - title_similarity_score_task.py is already running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart title_similarity_score_task.py"
+    # Enter the project directory; abort rather than launch python3 from the wrong place.
+    cd /root/luojunhui/LongArticlesJob || { echo "$(date '+%Y-%m-%d %H:%M:%S') - cd to project directory failed, aborting"; exit 1; }
+
+    # Activate the Conda environment used by the task.
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # Run the Python script in the background, appending its output to the same log file.
+    nohup python3 title_similarity_score_task.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - started title_similarity_score_task.py in background (pid $!)"
+fi

+ 2 - 1
title_similarity_score_task.py

@@ -5,6 +5,7 @@ from coldStartTasks.filter.title_similarity_task import ColdStartTitleSimilarity
 
 
 if __name__ == '__main__':
+    batch_size = 3000
     task = ColdStartTitleSimilarityTask()
     task.init_database()
-    task.run()
+    task.run(limit=batch_size)