Browse Source

Merge branch 'feature/luojunhui/2025-08-04-auto-add-toutiao-accounts' of Server/LongArticleTaskServer into master

luojunhui 1 tháng trước cách đây
mục cha
commit
2b183307c4

+ 1 - 1
applications/api/deep_seek_official_api.py

@@ -18,7 +18,7 @@ def fetch_deepseek_completion(
     output_type: str = "text",
     tool_calls: bool = False,
     tools: List[Dict] = None,
-) -> Optional[Dict]:
+) -> Optional[Dict, List]:
     messages = [{"role": "user", "content": prompt}]
     kwargs = {
         "model": deep_seek_official_model.get(model, "deepseek-chat"),

+ 20 - 1
applications/tasks/llm_tasks/candidate_account_process.py

@@ -4,6 +4,7 @@ from typing import List, Dict
 from tqdm import tqdm
 
 from applications.api import fetch_deepseek_completion
+from applications.utils import ci_lower
 
 
 class CandidateAccountProcessConst:
@@ -20,6 +21,8 @@ class CandidateAccountProcessConst:
     ARTICLE_COUNT_THRESHOLD = 13
     AVG_TITLE_LENGTH_THRESHOLD = 45
 
+    ACCOUNT_GOOD_STATUS = 1
+
     @staticmethod
     def generate_title_match_score_prompt(title_list):
         title_list_string = "\n".join(title_list)
@@ -71,7 +74,7 @@ class CandidateAccountQualityScoreRecognizer(CandidateAccountProcessConst):
         get account tasks from the database
         """
         fetch_query = f"""
-            select id, title_list, platform 
+            select id, title_list, platform, account_id, account_name
             from crawler_candidate_account_pool
             where avg_score is null and status = {self.INIT_STATUS} and title_list is not null;
         """
@@ -93,6 +96,20 @@ class CandidateAccountQualityScoreRecognizer(CandidateAccountProcessConst):
             update_query, (new_status, account_id, ori_status)
         )
 
+    async def insert_account_into_crawler_queue(self, score_list: List[int], account: dict) -> None:
+        """
+        Compute the lower confidence bound of the account's scores; if it exceeds AVG_SCORE_THRESHOLD the account is considered good and is inserted into article_meta_accounts.
+        """
+        if ci_lower(score_list) > self.AVG_SCORE_THRESHOLD:  # strict comparison: a bound equal to the threshold is not inserted
+            query = f"""
+                insert into article_meta_accounts (platform, account_id, account_name, account_source, status)
+                values (%s, %s, %s, %s, %s);
+            """
+            await self.pool.async_save(
+                query=query,
+                params=(account["platform"], account["account_id"], account["account_name"], 'ai_recognize', self.ACCOUNT_GOOD_STATUS)
+            )
+
     async def score_for_each_account_by_llm(self, account):
         account_id = account["id"]
         # lock
@@ -141,6 +158,8 @@ class CandidateAccountQualityScoreRecognizer(CandidateAccountProcessConst):
                     self.SUCCESS_STATUS,
                 ),
             )
+            # 判断置信区间下限, 并且插入账号
+            await self.insert_account_into_crawler_queue(score_list=completion, account=account)
 
         except Exception as e:
             await self.log_client.log(

+ 20 - 0
applications/utils/common.py

@@ -5,8 +5,13 @@
 import random
 import string
 import hashlib
+import math
+import statistics
+from scipy.stats import t
 
 from datetime import datetime, timezone, date, timedelta
+from typing import List
+
 from requests import RequestException
 from urllib.parse import urlparse, parse_qs
 from tenacity import (
@@ -198,3 +203,18 @@ def days_remaining_in_month():
 def generate_task_trace_id():
     random_str = "".join(random.choices(string.ascii_lowercase + string.digits, k=16))  # 16 random characters drawn from a-z and 0-9
     return f"Task-{datetime.now().strftime('%Y%m%d%H%M%S')}-{random_str}"  # Task-<naive local timestamp>-<random suffix>
+
+
+def ci_lower(data: List[int], conf: float = 0.95) -> float:
+    """
+    Return the lower bound of the two-sided Student's t confidence interval (level conf) for the mean of data.
+    """
+    if len(data) < 2:  # statistics.stdev needs at least two data points
+        raise ValueError("Sample length less than 2")
+
+    n = len(data)
+    mean = statistics.mean(data)
+    std = statistics.stdev(data) / math.sqrt(n)  # standard error of the mean, despite the name
+    # Left-tail t quantile: ppf returns a negative value for tail probabilities below 0.5
+    t_left = t.ppf((1 - conf) / 2, df=n - 1)
+    return mean + t_left * std