3 months ago · 2b183307c4
--- a/applications/api/deep_seek_official_api.py
+++ b/applications/api/deep_seek_official_api.py
@@ -18,7 +18,7 @@ def fetch_deepseek_completion(
 
				     output_type: str = "text",
			
 
				     tool_calls: bool = False,
			
 
				     tools: List[Dict] = None,
			
 
				-) -> Optional[Dict]:
			
 
				+) -> Optional[Dict, List]:
			
 
				     messages = [{"role": "user", "content": prompt}]
			
 
				     kwargs = {
			
 
				         "model": deep_seek_official_model.get(model, "deepseek-chat"),
			
--- a/applications/tasks/llm_tasks/candidate_account_process.py
+++ b/applications/tasks/llm_tasks/candidate_account_process.py
@@ -4,6 +4,7 @@ from typing import List, Dict
 
				 from tqdm import tqdm
			
 
				 
			
 
				 from applications.api import fetch_deepseek_completion
			
 
				+from applications.utils import ci_lower
			
 
				 
			
 
				 
			
 
				 class CandidateAccountProcessConst:
			
@@ -20,6 +21,8 @@ class CandidateAccountProcessConst:
 
				     ARTICLE_COUNT_THRESHOLD = 13
			
 
				     AVG_TITLE_LENGTH_THRESHOLD = 45
			
 
				 
			
 
				+    ACCOUNT_GOOD_STATUS = 1
			
 
				+
			
 
				     @staticmethod
			
 
				     def generate_title_match_score_prompt(title_list):
			
 
				         title_list_string = "\n".join(title_list)
			
@@ -71,7 +74,7 @@ class CandidateAccountQualityScoreRecognizer(CandidateAccountProcessConst):
 
				         get account tasks from the database
			
 
				         """
			
 
				         fetch_query = f"""
			
 
				-            select id, title_list, platform 
			
 
				+            select id, title_list, platform, account_id, account_name
			
 
				             from crawler_candidate_account_pool
			
 
				             where avg_score is null and status = {self.INIT_STATUS} and title_list is not null;
			
 
				         """
			
@@ -93,6 +96,20 @@ class CandidateAccountQualityScoreRecognizer(CandidateAccountProcessConst):
 
				             update_query, (new_status, account_id, ori_status)
			
 
				         )
			
 
				 
			
 
    async def insert_account_into_crawler_queue(self, score_list: List[int], account: dict) -> None:
        """
        Insert the account into article_meta_accounts when the lower bound of the
        confidence interval of its scores is greater than AVG_SCORE_THRESHOLD.

        :param score_list: per-title scores for the account
            (presumably the list returned by the LLM; verify against caller).
        :param account: row providing the "platform", "account_id" and
            "account_name" keys.
        :return: None. Nothing is inserted when the confidence interval cannot
            be computed or its lower bound does not exceed the threshold.
        """
        # ci_lower raises ValueError for fewer than two samples. Without enough
        # scores there is no evidence the account is good, so skip it instead of
        # letting the error escape to the caller.
        if not score_list or len(score_list) < 2:
            return

        if ci_lower(score_list) > self.AVG_SCORE_THRESHOLD:
            # Plain string: the only placeholders are the driver's %s parameters.
            query = """
                insert into article_meta_accounts (platform, account_id, account_name, account_source, status)
                values (%s, %s, %s, %s, %s);
            """
            await self.pool.async_save(
                query=query,
                params=(
                    account["platform"],
                    account["account_id"],
                    account["account_name"],
                    "ai_recognize",
                    self.ACCOUNT_GOOD_STATUS,
                ),
            )
			
 
				+
			
 
				     async def score_for_each_account_by_llm(self, account):
			
 
				         account_id = account["id"]
			
 
				         # lock
			
@@ -141,6 +158,8 @@ class CandidateAccountQualityScoreRecognizer(CandidateAccountProcessConst):
 
				                     self.SUCCESS_STATUS,
			
 
				                 ),
			
 
				             )
			
 
				+            # 判断置信区间下限, 并且插入账号
			
 
				+            await self.insert_account_into_crawler_queue(score_list=completion, account=account)
			
 
				 
			
 
				         except Exception as e:
			
 
				             await self.log_client.log(
			
--- a/applications/utils/common.py
+++ b/applications/utils/common.py
@@ -5,8 +5,13 @@
 
				 import random
			
 
				 import string
			
 
				 import hashlib
			
 
				+import math
			
 
				+import statistics
			
 
				+from scipy.stats import t
			
 
				 
			
 
				 from datetime import datetime, timezone, date, timedelta
			
 
				+from typing import List
			
 
				+
			
 
				 from requests import RequestException
			
 
				 from urllib.parse import urlparse, parse_qs
			
 
				 from tenacity import (
			
@@ -198,3 +203,18 @@ def days_remaining_in_month():
 
				 def generate_task_trace_id():
			
 
				     random_str = "".join(random.choices(string.ascii_lowercase + string.digits, k=16))
			
 
				     return f"Task-{datetime.now().strftime('%Y%m%d%H%M%S')}-{random_str}"
			
 
				+
			
 
				+
			
 
				+def ci_lower(data: List[int], conf: float = 0.95) -> float:
			
 
				+    """
			
 
				+    计算data的置信区间下限
			
 
				+    """
			
 
				+    if len(data) < 2:
			
 
				+        raise ValueError("Sample length less than 2")
			
 
				+
			
 
				+    n = len(data)
			
 
				+    mean = statistics.mean(data)
			
 
				+    std = statistics.stdev(data) / math.sqrt(n)
			
 
				+    # t 分位点（左侧）：ppf 返回负值
			
 
				+    t_left = t.ppf((1 - conf) / 2, df=n - 1)
			
 
				+    return mean + t_left * std