пре 3 недеља · 39082fc06c
--- a/account_quality_analysis.py
+++ b/account_quality_analysis.py
@@ -1,12 +1,16 @@
 
															-from tasks.ai_tasks.account_recognize_by_llm import AccountRecognizer
														
 
															+from tasks.ai_tasks.account_recognize_by_llm import CandidateAccountCategoryRecognizer
														
 
															+from tasks.ai_tasks.account_recognize_by_llm import CandidateAccountQualityScoreRecognizer
														
 
															 def main():
														
 
															     """
														
 
															     main function
														
 
															     """
														
 
															-    account_recognizer = AccountRecognizer()
														
 
															-    account_recognizer.deal()
														
 
															+    account_category_task = CandidateAccountCategoryRecognizer()
														
 
															+    account_category_task.deal()
														
 
															+
														
 
															+    account_quality_task = CandidateAccountQualityScoreRecognizer()
														
 
															+    account_quality_task.deal()
														
 
															 if __name__ == "__main__":
														
--- a/applications/api/__init__.py
+++ b/applications/api/__init__.py
@@ -4,6 +4,7 @@
 
															 from .aigc_system_api import AigcSystemApi
														
 
															 from .apollo_api import ApolloApi
														
 
															 from .deep_seek_api_by_volcanoengine import fetch_deepseek_response
														
 
															+from .deep_seek_api_official import fetch_deepseek_completion
														
 
															 from .moon_shot_api import fetch_moon_shot_response
														
 
															 from .nlp_api import similarity_between_title_list
														
 
															 from .gewe_api import WechatChannelAPI
														
--- a/tasks/ai_tasks/account_recognize_by_llm.py
+++ b/tasks/ai_tasks/account_recognize_by_llm.py
@@ -9,54 +9,15 @@ from threading import local
 
															 import concurrent
														
 
															 from concurrent.futures import ThreadPoolExecutor
														
 
															-from applications.api import fetch_deepseek_response
														
 
															+from applications.api import fetch_deepseek_completion
														
 
															 from applications.db import DatabaseConnector
														
 
															 from config import long_articles_config
														
 
															-thread_local = local()
														
 
															+from tasks.ai_tasks.prompts import category_generation_for_each_account
														
 
															+from tasks.ai_tasks.prompts import get_title_match_score_list
														
 
															-def generate_prompt(account_title_list):
														
 
															-    """
														
 
															-    生成prompt
														
 
															-    :param account_title_list:
														
 
															-    """
														
 
															-    title_list = "\n".join(account_title_list)
														
 
															-    g_prompt = f"""
														
 
															-    ** 任务指令 **
														
 
															-        你是一名资深中文新闻编辑，需根据以下标准对一批标题进行主题匹配度评分（0-100分）
														
 
															-        
														
 
															-    ** 评估维度及权重 **
														
 
															-        1. 受众精准度（50%）
														
 
															-            正向匹配：存款/养老/健康/饮食/疾病警示/家庭伦理/近现代战争历史/老知青/奇闻异事
														
 
															-            负向排除：影视解说/文学解读/个人收藏（钱币/邮票）/机械科普/数码测评/电子游戏/时尚潮流/明星八卦/极限运动/学术研究/网络热梗/宠物饲养/音乐/棋牌
														
 
															-            
														
 
															-        2. 标题技法（40%）
														
 
															-            悬念设计：疑问句/省略号/反转结构（例："打开后瞬间愣住..."）
														
 
															-            情感强度：使用"痛心！""寒心！"等情绪词
														
 
															-            数据冲击：具体数字增强可信度（例："存款180万消失"）
														
 
															-            口语化表达：使用"涨知识了""别不当回事"等日常用语
														
 
															-            
														
 
															-        3. 内容调性（10%）
														
 
															-            煽情猎奇：家庭悲剧/离奇事件（例："棺材板挖出金条"）
														
 
															-            警示价值：健康建议/法律案例（例："三种食物禁止二次加热"）
														
 
															-            历史揭秘：人物秘闻/老照片故事
														
 
															-            爱国情怀：军事突破/资源发现（例："南极发现巨型粮仓"）
														
 
															-
														
 
															-    ** 评分规则 **
														
 
															-        90-100分：同时满足3个维度且要素齐全，无负向内容
														
 
															-        70-89分：满足2个核心维度，无负向内容
														
 
															-        50-69分：仅满足受众群体正向匹配，无负向内容
														
 
															-        30-49分：存在轻微关联但要素缺失
														
 
															-        0-29分：完全无关或包含任意负向品类内容
														
 
															-        
														
 
															-    ** 待评估标题 **
														
 
															-        {title_list}
														
 
															-        
														
 
															-    ** 输出要求 **
														
 
															-        仅输出这一批标题的评分，用数组 List 返回 [score1, score2, score3,...] 不要包含任何解释或说明。
														
 
															-    """
														
 
															-    return g_prompt
														
 
															+thread_local = local()
														
 
															 def get_db_client():
														
@@ -76,12 +37,24 @@ def update_task_status(thread_db_client, task_id, ori_status, new_status):
 
															     update_query = f"""
														
 
															         update crawler_candidate_account_pool
														
 
															         set status = %s
														
 
															-        where id = %s and status = %s;  
														
 
															+        where id = %s and status = %s;
														
 
															+    """
														
 
															+    thread_db_client.save(update_query, (new_status, task_id, ori_status))
														
 
															+
														
 
															+
														
 
															+def update_task_category_status(thread_db_client, task_id, ori_status, new_status):
														
 
															+    """
														
 
															+    update task status
														
 
															+    """
														
 
															+    update_query = f"""
														
 
															+        update crawler_candidate_account_pool
														
 
															+        set category_status = %s
														
 
															+        where id = %s and category_status = %s;
														
 
															     """
														
 
															     thread_db_client.save(update_query, (new_status, task_id, ori_status))
														
 
															-def recognize_each_account(thread_db_client, account):
														
 
															+def get_account_score(thread_db_client, account):
														
 
															     """
														
 
															     recognize each account
														
 
															     """
														
@@ -91,7 +64,7 @@ def recognize_each_account(thread_db_client, account):
 
															     # process
														
 
															     title_list = json.loads(account["title_list"])
														
 
															-    if len(title_list) < 15 and account['platform'] == 'toutiao':
														
 
															+    if len(title_list) < 15 and account["platform"] == "toutiao":
														
 
															         # 账号数量不足，直接跳过
														
 
															         print("bad account, skip")
														
 
															         update_task_status(thread_db_client, task_id, 1, 11)
														
@@ -105,8 +78,8 @@ def recognize_each_account(thread_db_client, account):
 
															         update_task_status(thread_db_client, task_id, 1, 14)
														
 
															         return
														
 
															-    prompt = generate_prompt(title_list)
														
 
															-    response = fetch_deepseek_response(model="DeepSeek-V3", prompt=prompt)
														
 
															+    prompt = get_title_match_score_list(title_list)
														
 
															+    response = fetch_deepseek_completion(model="DeepSeek-V3", prompt=prompt)
														
 
															     response_score_str = response.strip()
														
 
															     try:
														
 
															         score_list = json.loads(response_score_str)
														
@@ -129,35 +102,86 @@ def recognize_each_account(thread_db_client, account):
 
															         update_task_status(thread_db_client, task_id, 1, 12)
														
 
															-def recognize_task_thread(task):
														
 
															+def get_account_category(thread_db_client, account):
														
 
															     """
														
 
															-    recognize thread
														
 
															+    recognize each account
														
 
															     """
														
 
															-    thread_db_client = get_db_client()
														
 
															-    try:
														
 
															-        recognize_each_account(thread_db_client, task)
														
 
															-    except Exception as e:
														
 
															-        print(e)
														
 
															-        update_task_status(
														
 
															-            thread_db_client=thread_db_client,
														
 
															-            task_id=["id"],
														
 
															-            ori_status=1,
														
 
															-            new_status=13,
														
 
															-        )
														
 
															+    task_id = account["id"]
														
 
															+    title_list = json.loads(account["title_list"])
														
 
															+
														
 
															+    # lock task
														
 
															+    update_task_category_status(thread_db_client, task_id, 0, 1)
														
 
															+
														
 
															+    prompt = category_generation_for_each_account(title_list)
														
 
															+    response = fetch_deepseek_completion(model="DeepSeek-V3", prompt=prompt)
														
 
															+    print(response)
														
 
															+    response_category = response.strip()
														
 
															+    if response_category:
														
 
															+        update_query = f"""
														
 
															+                update crawler_candidate_account_pool
														
 
															+                set category = %s, category_status = %s
														
 
															+                where id = %s and category_status = %s;
														
 
															+            """
														
 
															+        thread_db_client.save(update_query, (response_category, 2, task_id, 1))
														
 
															+    else:
														
 
															+        update_task_category_status(thread_db_client, task_id, 1, 99)
														
 
															-class AccountRecognizer:
														
 
															+def recognize_account_thread(account, task):
														
 
															+    """
														
 
															+    recognize thread
														
 
															+    """
														
 
															+    match task:
														
 
															+        case "score":
														
 
															+            thread_db_client = get_db_client()
														
 
															+            try:
														
 
															+                get_account_score(thread_db_client, account)
														
 
															+            except Exception as e:
														
 
															+                update_task_status(
														
 
															+                    thread_db_client=thread_db_client,
														
 
															+                    task_id=account["id"],
														
 
															+                    ori_status=1,
														
 
															+                    new_status=13,
														
 
															+                )
														
 
															+        case "category":
														
 
															+            thread_db_client = get_db_client()
														
 
															+            try:
														
 
															+                get_account_category(thread_db_client, account)
														
 
															+            except Exception as e:
														
 
															+                update_task_category_status(
														
 
															+                    thread_db_client=thread_db_client,
														
 
															+                    task_id=account["id"],
														
 
															+                    ori_status=1,
														
 
															+                    new_status=99,
														
 
															+                )
														
 
															+        case "_":
														
 
															+            return
														
 
															+
														
 
															+
														
 
															+class CandidateAccountRecognizer:
														
 
															+
														
 
															+    INIT_STATUS = 0
														
 
															+    PROCESSING_STATUS = 1
														
 
															+    SUCCESS_STATUS = 2
														
 
															+    FAILED_STATUS = 99
														
 
															+
														
 
															+    AVG_SCORE_THRESHOLD = 65
														
 
															+
														
 
															     def __init__(self):
														
 
															         self.db_client = DatabaseConnector(long_articles_config)
														
 
															         self.db_client.connect()
														
 
															+
														
 
															+class CandidateAccountQualityScoreRecognizer(CandidateAccountRecognizer):
														
 
															+
														
 
															     def get_task_list(self):
														
 
															         """
														
 
															-        get account task from database
														
 
															+        get account tasks from the database
														
 
															         """
														
 
															         fetch_query = f"""
														
 
															-            select id, title_list, platform from crawler_candidate_account_pool
														
 
															-            where avg_score is null and status = 0 and title_list is not null;
														
 
															+            select id, title_list, platform 
														
 
															+            from crawler_candidate_account_pool
														
 
															+            where avg_score is null and status = {self.INIT_STATUS} and title_list is not null;
														
 
															         """
														
 
															         fetch_response = self.db_client.fetch(fetch_query, cursor_type=DictCursor)
														
 
															         return fetch_response
														
@@ -167,7 +191,38 @@ class AccountRecognizer:
 
															         with ThreadPoolExecutor(max_workers=8) as executor:
														
 
															             futures = [
														
 
															-                executor.submit(recognize_task_thread, task) for task in task_list
														
 
															+                executor.submit(recognize_account_thread, task, "score")
														
 
															+                for task in task_list
														
 
															+            ]
														
 
															+            for future in tqdm(
														
 
															+                concurrent.futures.as_completed(futures),
														
 
															+                total=len(task_list),
														
 
															+                desc="处理进度",
														
 
															+            ):
														
 
															+                future.result()
														
 
															+
														
 
															+
														
 
															+class CandidateAccountCategoryRecognizer(CandidateAccountRecognizer):
														
 
															+
														
 
															+    def get_task_list(self):
														
 
															+        fetch_query = f"""
														
 
															+            select id, title_list 
														
 
															+            from crawler_candidate_account_pool
														
 
															+            where category_status = %s and avg_score >= %s;
														
 
															+        """
														
 
															+        fetch_response = self.db_client.fetch(
														
 
															+            fetch_query,
														
 
															+            cursor_type=DictCursor,
														
 
															+            params=(self.INIT_STATUS, self.AVG_SCORE_THRESHOLD),
														
 
															+        )
														
 
															+        return fetch_response
														
 
															+
														
 
															+    def deal(self):
														
 
															+        task_list = self.get_task_list()
														
 
															+        with ThreadPoolExecutor(max_workers=8) as executor:
														
 
															+            futures = [
														
 
															+                executor.submit(recognize_account_thread, task, "category")
														
 
															+                for task in task_list
														
 
															             ]
														
 
															             for future in tqdm(
														
 
															                 concurrent.futures.as_completed(futures),
														
--- a/tasks/ai_tasks/prompts.py
+++ b/tasks/ai_tasks/prompts.py
@@ -2,6 +2,7 @@
 
															 ai tasks prompt
														
 
															 """
														
 
															+
														
 
															 def category_generation_from_title(title_list):
														
 
															     """
														
 
															     generate prompt category for given title
														
@@ -124,5 +125,66 @@ def category_generation_from_title(title_list):
 
															     return prompt
														
 
															+def category_generation_for_each_account(title_list):
														
 
															+    title_list_str = "\n".join(title_list)
														
 
															+    prompt = f"""
														
 
															+        你是一个账号分类大师，我会给你一个账号的一批标题，需要你给出这个账号的品类，品类需要是唯一的，以字符输出，输出品类需要是以下15个品类之一
														
 
															+        品类定义如下：
														
 
															+            知识科普:以通俗易懂的方式普及科学、技术、健康、安全、生活常识、财产保护、医保政策、为人处事方式等内容，旨在提高公众的知识水平和认知能力。内容通常具有教育性和实用性，涵盖自然、社会、文化等多个领域。
														
 
															+            军事历史:聚焦于历史上的军事事件、战争故事、军事策略、英雄人物等内容，旨在还原战争场景、探讨军事决策、揭示历史真相，并展现战争中的人物命运与历史影响。内容通常以叙事、分析或回忆的形式呈现，兼具历史深度和故事性。
														
 
															+            家长里短:围绕家庭成员之间的关系、矛盾、情感、道德、等展开的故事或讨论，内容常涉及婚姻、亲子、婆媳、兄弟姐妹等关系，或是人情往来、金钱纠纷、情感变化等内容，反映家庭生活中的温情、冲突与人性。
														
 
															+            社会法治:聚焦社会事件、法律纠纷、法院判决、社会现象等内容，通常涉及道德、法律、公平正义等议题，旨在揭示社会问题、探讨法律规则或反映人性与社会现实。
														
 
															+            奇闻趣事:以猎奇、娱乐为主，涵盖罕见、奇特、有趣的事件、发现或故事，内容通常具有趣味性和话题性，能够引发读者的好奇心和讨论。
														
 
															+            名人八卦:围绕名人的生活、言论、事件、八卦等内容展开，通常涉及娱乐圈、政界、历史人物等，旨在满足公众对名人隐私和动态的好奇心。
														
 
															+            健康养生:关注健康、养生、疾病预防、生活习惯等方面的知识和建议，内容通常具有实用性和指导性，旨在帮助读者改善生活质量、提升健康水平。
														
 
															+            情感故事:以人与人之间的情感交流、感人故事、情感经历为主题，内容通常充满温情、感动或反思，旨在引发读者的情感共鸣和思考。
														
 
															+            国家大事:涉及国家实力、科技发展、资源发现、国际合作等内容，通常以宏观视角展现国家的综合实力、科技成就或国际影响力，体现国家的崛起与发展。
														
 
															+            现代人物:聚焦活跃在21世纪后具有传奇色彩或巨大贡献的人物、事迹、成就等，内容通常充满戏剧性和启发性，旨在展现人物的非凡经历或历史贡献。
														
 
															+            怀旧时光:以回忆和怀旧为主题，涉及过去的历史、文化、生活、照片等内容，旨在唤起读者对过去时光的情感共鸣和怀念。
														
 
															+            政治新闻:聚焦政治事件、领导人动态、国际关系等内容，通常以新闻或分析的形式呈现，旨在揭示政治局势、政策变化或国际关系的动态。
														
 
															+            历史人物:聚焦于21世纪前具有重要影响的人物，包括他们的生平、事迹、成就、性格、趣事及其对历史进程的贡献。内容通常以传记、回忆录或历史分析的形式呈现，旨在还原人物的真实面貌并探讨其历史意义。
														
 
															+            社会现象:关注社会中出现的普遍现象、趋势或问题，通常涉及文化、经济、教育、民生等领域。内容以观察、分析或评论为主，旨在揭示现象背后的原因、影响及社会意义，引发公众的思考和讨论。
														
 
															+            财经科技:聚焦于经济、金融、投资及行业发展的分析与预测，涵盖未来经济趋势、资产价值变化、行业变革及个人理财策略等内容。可以提供前瞻性的财经视角和实用的理财建议，帮助其把握经济动态、优化财务规划并应对行业变化。
														
 
															+        输入的标题list是：{title_list_str}
														
 
															+        只需要输出品类的字符信息
														
 
															+    """
														
 
															+    return prompt
														
 
															+
														
 
															+
														
 
															+def get_title_match_score_list(title_list):
														
 
															+    title_list_str = "\n".join(title_list)
														
 
															+    g_prompt = f"""
														
 
															+    ** 任务指令 **
														
 
															+        你是一名资深中文新闻编辑，需根据以下标准对一批标题进行主题匹配度评分（0-100分）
														
 
															+    ** 评估维度及权重 **
														
 
															+        1. 受众精准度（50%）
														
 
															+            正向匹配：存款/养老/健康/饮食/疾病警示/家庭伦理/近现代战争历史/老知青/奇闻异事
														
 
															+            负向排除：影视解说/文学解读/个人收藏（钱币/邮票）/机械科普/数码测评/电子游戏/时尚潮流/明星八卦/极限运动/学术研究/网络热梗/宠物饲养/音乐/棋牌
														
 
															+        2. 标题技法（40%）
														
 
															+            悬念设计：疑问句/省略号/反转结构（例："打开后瞬间愣住..."）
														
 
															+            情感强度：使用"痛心！""寒心！"等情绪词
														
 
															+            数据冲击：具体数字增强可信度（例："存款180万消失"）
														
 
															+            口语化表达：使用"涨知识了""别不当回事"等日常用语
														
 
															+
														
 
															+        3. 内容调性（10%）
														
 
															+            煽情猎奇：家庭悲剧/离奇事件（例："棺材板挖出金条"）
														
 
															+            警示价值：健康建议/法律案例（例："三种食物禁止二次加热"）
														
 
															+            历史揭秘：人物秘闻/老照片故事
														
 
															+            爱国情怀：军事突破/资源发现（例："南极发现巨型粮仓"）
														
 
															+
														
 
															+    ** 评分规则 **
														
 
															+        90-100分：同时满足3个维度且要素齐全，无负向内容
														
 
															+        70-89分：满足2个核心维度，无负向内容
														
 
															+        50-69分：仅满足受众群体正向匹配，无负向内容
														
 
															+        30-49分：存在轻微关联但要素缺失
														
 
															+        0-29分：完全无关或包含任意负向品类内容
														
 
															+
														
 
															+    ** 待评估标题 **
														
 
															+        {title_list_str}
														
 
															+
														
 
															+    ** 输出要求 **
														
 
															+        仅输出这一批标题的评分，用数组 List 返回 [score1, score2, score3,...] 不要包含任何解释或说明。
														
 
															+    """
														
 
															+    return g_prompt