Browse Source

Merge branch '2025-02-21-LLM-sensitity-improve' of luojunhui/LongArticlesJob into master

luojunhui 8 tháng trước cách đây
mục cha
commit
fb9e58eb36

+ 3 - 3
applications/llm_sensitivity.py

@@ -8,8 +8,8 @@ from openai import OpenAI
 
 def request_llm_api(prompt, text):
     client = OpenAI(
-        api_key='sk-c1b18099dadc4dd1b48239bdde184f6c',
-        base_url="https://api.deepseek.com"
+        api_key='5e275c38-44fd-415f-abcf-4b59f6377f72',
+        base_url="https://ark.cn-beijing.volces.com/api/v3"
     )
     chat_completion = client.chat.completions.create(
         messages=[
@@ -18,7 +18,7 @@ def request_llm_api(prompt, text):
                 "content": prompt + text,
             }
         ],
-        model="deepseek-chat",
+        model="ep-20250213194558-rrmr2", # deepseek-v3
         temperature=0.2,
         response_format={"type": "json_object"}
     )

+ 1 - 25
coldStartTasks/crawler/weixinCategoryCrawler.py

@@ -8,7 +8,7 @@ import time
 from tqdm import tqdm
 from pymysql.cursors import DictCursor
 
-from applications import WeixinSpider, Functions, llm_sensitivity, log
+from applications import WeixinSpider, Functions, log
 from coldStartTasks.filter import article_crawler_duplicate_filter
 from config import apolloConfig
 
@@ -158,18 +158,6 @@ class weixinCategory(object):
                     print(e)
         return success_records
 
-    def update_article_sensitive_status(self, category, unique_index, status):
-        """
-        更新文章敏感状态
-        :return:
-        """
-        update_sql = f"""
-            update crawler_meta_article
-            set llm_sensitivity = %s
-            where category = %s and unique_index = %s;
-        """
-        self.db_client_lam.update(sql=update_sql, params=(status, category, unique_index))
-
     def update_latest_account_timestamp(self, gh_id):
         """
         更新账号的最新时间戳
@@ -242,18 +230,6 @@ class weixinCategory(object):
                 print("success")
             except Exception as e:
                 print("fail because of {}".format(e))
-        success_titles = [x['title'] for x in success_records]
-        if success_titles:
-            try:
-                sensitive_results = llm_sensitivity.check_titles(success_titles)
-                for record, sensitive_result in zip(success_records, sensitive_results):
-                    self.update_article_sensitive_status(
-                        category=category,
-                        unique_index=record['unique_index'],
-                        status=sensitive_result['hit_rule']
-                    )
-            except Exception as e:
-                print("failed to update sensitive status: {}".format(e))
 
     def deal(self, category_list, date_str):
         """

+ 29 - 1
coldStartTasks/publish/publishCategoryArticles.py

@@ -9,7 +9,7 @@ import traceback
 
 from pandas import DataFrame
 
-from applications import aiditApi, log, bot
+from applications import aiditApi, log, bot, llm_sensitivity
 from config import apolloConfig
 
 apollo = apolloConfig()
@@ -295,6 +295,18 @@ class CategoryColdStartTask(object):
         )
         return zero_level_funnel_df
 
+    def update_article_sensitive_status(self, article_id, status):
+        """
+        Record the LLM sensitivity result for a single article.
+
+        Sets the ``llm_sensitivity`` column of the ``crawler_meta_article``
+        row whose ``article_id`` matches.
+
+        :param article_id: value matched against ``crawler_meta_article.article_id``
+        :param status: value written to ``llm_sensitivity``; the caller visible
+            in this change passes the ``hit_rule`` field of a
+            ``llm_sensitivity.check_titles`` result
+        :return: None
+        """
+        # NOTE(review): the f-prefix is redundant, the string has no {} fields;
+        # the values are passed separately through ``params``.
+        update_sql = f"""
+            update crawler_meta_article
+            set llm_sensitivity = %s
+            where article_id = %s;
+        """
+        self.db_client.update(sql=update_sql, params=(status, article_id))
+
     def publish_filter_articles(self, category, articles_df, article_source):
         """
         过滤文章
@@ -313,6 +325,22 @@ class CategoryColdStartTask(object):
             case _:
                 return
 
+        success_titles = filtered_articles_df['title'].values.tolist()
+        article_id_list = filtered_articles_df['article_id'].values.tolist()
+        if success_titles:
+            try:
+                sensitive_results = llm_sensitivity.check_titles(success_titles)
+                for article_id, sensitive_result in zip(article_id_list, sensitive_results):
+                    self.update_article_sensitive_status(
+                        article_id=article_id,
+                        status=sensitive_result['hit_rule']
+                    )
+                    if sensitive_result['hit_rule'] > TITLE_NOT_SENSITIVE:
+                        filtered_articles_df = filtered_articles_df[filtered_articles_df['article_id'] != article_id]
+
+            except Exception as e:
+                print("failed to update sensitive status: {}".format(e))
+
         url_list = filtered_articles_df['link'].values.tolist()
         if url_list:
             # create_crawler_plan