|
@@ -9,54 +9,15 @@ from threading import local
|
|
import concurrent
|
|
import concurrent
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
-from applications.api import fetch_deepseek_response
|
|
|
|
|
|
+from applications.api import fetch_deepseek_completion
|
|
from applications.db import DatabaseConnector
|
|
from applications.db import DatabaseConnector
|
|
from config import long_articles_config
|
|
from config import long_articles_config
|
|
|
|
|
|
-thread_local = local()
|
|
|
|
|
|
+from tasks.ai_tasks.prompts import category_generation_for_each_account
|
|
|
|
+from tasks.ai_tasks.prompts import get_title_match_score_list
|
|
|
|
|
|
|
|
|
|
-def generate_prompt(account_title_list):
|
|
|
|
- """
|
|
|
|
- 生成prompt
|
|
|
|
- :param account_title_list:
|
|
|
|
- """
|
|
|
|
- title_list = "\n".join(account_title_list)
|
|
|
|
- g_prompt = f"""
|
|
|
|
- ** 任务指令 **
|
|
|
|
- 你是一名资深中文新闻编辑,需根据以下标准对一批标题进行主题匹配度评分(0-100分)
|
|
|
|
-
|
|
|
|
- ** 评估维度及权重 **
|
|
|
|
- 1. 受众精准度(50%)
|
|
|
|
- 正向匹配:存款/养老/健康/饮食/疾病警示/家庭伦理/近现代战争历史/老知青/奇闻异事
|
|
|
|
- 负向排除:影视解说/文学解读/个人收藏(钱币/邮票)/机械科普/数码测评/电子游戏/时尚潮流/明星八卦/极限运动/学术研究/网络热梗/宠物饲养/音乐/棋牌
|
|
|
|
-
|
|
|
|
- 2. 标题技法(40%)
|
|
|
|
- 悬念设计:疑问句/省略号/反转结构(例:"打开后瞬间愣住...")
|
|
|
|
- 情感强度:使用"痛心!""寒心!"等情绪词
|
|
|
|
- 数据冲击:具体数字增强可信度(例:"存款180万消失")
|
|
|
|
- 口语化表达:使用"涨知识了""别不当回事"等日常用语
|
|
|
|
-
|
|
|
|
- 3. 内容调性(10%)
|
|
|
|
- 煽情猎奇:家庭悲剧/离奇事件(例:"棺材板挖出金条")
|
|
|
|
- 警示价值:健康建议/法律案例(例:"三种食物禁止二次加热")
|
|
|
|
- 历史揭秘:人物秘闻/老照片故事
|
|
|
|
- 爱国情怀:军事突破/资源发现(例:"南极发现巨型粮仓")
|
|
|
|
-
|
|
|
|
- ** 评分规则 **
|
|
|
|
- 90-100分:同时满足3个维度且要素齐全,无负向内容
|
|
|
|
- 70-89分:满足2个核心维度,无负向内容
|
|
|
|
- 50-69分:仅满足受众群体正向匹配,无负向内容
|
|
|
|
- 30-49分:存在轻微关联但要素缺失
|
|
|
|
- 0-29分:完全无关或包含任意负向品类内容
|
|
|
|
-
|
|
|
|
- ** 待评估标题 **
|
|
|
|
- {title_list}
|
|
|
|
-
|
|
|
|
- ** 输出要求 **
|
|
|
|
- 仅输出这一批标题的评分,用数组 List 返回 [score1, score2, score3,...] 不要包含任何解释或说明。
|
|
|
|
- """
|
|
|
|
- return g_prompt
|
|
|
|
|
|
+thread_local = local()
|
|
|
|
|
|
|
|
|
|
def get_db_client():
|
|
def get_db_client():
|
|
@@ -76,12 +37,24 @@ def update_task_status(thread_db_client, task_id, ori_status, new_status):
|
|
update_query = f"""
|
|
update_query = f"""
|
|
update crawler_candidate_account_pool
|
|
update crawler_candidate_account_pool
|
|
set status = %s
|
|
set status = %s
|
|
- where id = %s and status = %s;
|
|
|
|
|
|
+ where id = %s and status = %s;
|
|
|
|
+ """
|
|
|
|
+ thread_db_client.save(update_query, (new_status, task_id, ori_status))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def update_task_category_status(thread_db_client, task_id, ori_status, new_status):
|
|
|
|
+ """
|
|
|
|
+ update task status
|
|
|
|
+ """
|
|
|
|
+ update_query = f"""
|
|
|
|
+ update crawler_candidate_account_pool
|
|
|
|
+ set category_status = %s
|
|
|
|
+ where id = %s and category_status = %s;
|
|
"""
|
|
"""
|
|
thread_db_client.save(update_query, (new_status, task_id, ori_status))
|
|
thread_db_client.save(update_query, (new_status, task_id, ori_status))
|
|
|
|
|
|
|
|
|
|
-def recognize_each_account(thread_db_client, account):
|
|
|
|
|
|
+def get_account_score(thread_db_client, account):
|
|
"""
|
|
"""
|
|
recognize each account
|
|
recognize each account
|
|
"""
|
|
"""
|
|
@@ -91,7 +64,7 @@ def recognize_each_account(thread_db_client, account):
|
|
|
|
|
|
# process
|
|
# process
|
|
title_list = json.loads(account["title_list"])
|
|
title_list = json.loads(account["title_list"])
|
|
- if len(title_list) < 15 and account['platform'] == 'toutiao':
|
|
|
|
|
|
+ if len(title_list) < 15 and account["platform"] == "toutiao":
|
|
# 账号数量不足,直接跳过
|
|
# 账号数量不足,直接跳过
|
|
print("bad account, skip")
|
|
print("bad account, skip")
|
|
update_task_status(thread_db_client, task_id, 1, 11)
|
|
update_task_status(thread_db_client, task_id, 1, 11)
|
|
@@ -105,8 +78,8 @@ def recognize_each_account(thread_db_client, account):
|
|
update_task_status(thread_db_client, task_id, 1, 14)
|
|
update_task_status(thread_db_client, task_id, 1, 14)
|
|
return
|
|
return
|
|
|
|
|
|
- prompt = generate_prompt(title_list)
|
|
|
|
- response = fetch_deepseek_response(model="DeepSeek-V3", prompt=prompt)
|
|
|
|
|
|
+ prompt = get_title_match_score_list(title_list)
|
|
|
|
+ response = fetch_deepseek_completion(model="DeepSeek-V3", prompt=prompt)
|
|
response_score_str = response.strip()
|
|
response_score_str = response.strip()
|
|
try:
|
|
try:
|
|
score_list = json.loads(response_score_str)
|
|
score_list = json.loads(response_score_str)
|
|
@@ -129,35 +102,86 @@ def recognize_each_account(thread_db_client, account):
|
|
update_task_status(thread_db_client, task_id, 1, 12)
|
|
update_task_status(thread_db_client, task_id, 1, 12)
|
|
|
|
|
|
|
|
|
|
-def recognize_task_thread(task):
|
|
|
|
|
|
+def get_account_category(thread_db_client, account):
|
|
"""
|
|
"""
|
|
- recognize thread
|
|
|
|
|
|
+ recognize each account
|
|
"""
|
|
"""
|
|
- thread_db_client = get_db_client()
|
|
|
|
- try:
|
|
|
|
- recognize_each_account(thread_db_client, task)
|
|
|
|
- except Exception as e:
|
|
|
|
- print(e)
|
|
|
|
- update_task_status(
|
|
|
|
- thread_db_client=thread_db_client,
|
|
|
|
- task_id=["id"],
|
|
|
|
- ori_status=1,
|
|
|
|
- new_status=13,
|
|
|
|
- )
|
|
|
|
|
|
+ task_id = account["id"]
|
|
|
|
+ title_list = json.loads(account["title_list"])
|
|
|
|
+
|
|
|
|
+ # lock task
|
|
|
|
+ update_task_category_status(thread_db_client, task_id, 0, 1)
|
|
|
|
+
|
|
|
|
+ prompt = category_generation_for_each_account(title_list)
|
|
|
|
+ response = fetch_deepseek_completion(model="DeepSeek-V3", prompt=prompt)
|
|
|
|
+ print(response)
|
|
|
|
+ response_category = response.strip()
|
|
|
|
+ if response_category:
|
|
|
|
+ update_query = f"""
|
|
|
|
+ update crawler_candidate_account_pool
|
|
|
|
+ set category = %s, category_status = %s
|
|
|
|
+ where id = %s and category_status = %s;
|
|
|
|
+ """
|
|
|
|
+ thread_db_client.save(update_query, (response_category, 2, task_id, 1))
|
|
|
|
+ else:
|
|
|
|
+ update_task_category_status(thread_db_client, task_id, 1, 99)
|
|
|
|
|
|
|
|
|
|
-class AccountRecognizer:
|
|
|
|
|
|
+def recognize_account_thread(account, task):
|
|
|
|
+ """
|
|
|
|
+ recognize thread
|
|
|
|
+ """
|
|
|
|
+ match task:
|
|
|
|
+ case "score":
|
|
|
|
+ thread_db_client = get_db_client()
|
|
|
|
+ try:
|
|
|
|
+ get_account_score(thread_db_client, account)
|
|
|
|
+ except Exception as e:
|
|
|
|
+ update_task_status(
|
|
|
|
+ thread_db_client=thread_db_client,
|
|
|
|
+ task_id=account["id"],
|
|
|
|
+ ori_status=1,
|
|
|
|
+ new_status=13,
|
|
|
|
+ )
|
|
|
|
+ case "category":
|
|
|
|
+ thread_db_client = get_db_client()
|
|
|
|
+ try:
|
|
|
|
+ get_account_category(thread_db_client, account)
|
|
|
|
+ except Exception as e:
|
|
|
|
+ update_task_category_status(
|
|
|
|
+ thread_db_client=thread_db_client,
|
|
|
|
+ task_id=account["id"],
|
|
|
|
+ ori_status=1,
|
|
|
|
+ new_status=99,
|
|
|
|
+ )
|
|
|
|
+ case "_":
|
|
|
|
+ return
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+class CandidateAccountRecognizer:
|
|
|
|
+
|
|
|
|
+ INIT_STATUS = 0
|
|
|
|
+ PROCESSING_STATUS = 1
|
|
|
|
+ SUCCESS_STATUS = 2
|
|
|
|
+ FAILED_STATUS = 99
|
|
|
|
+
|
|
|
|
+ AVG_SCORE_THRESHOLD = 65
|
|
|
|
+
|
|
def __init__(self):
|
|
def __init__(self):
|
|
self.db_client = DatabaseConnector(long_articles_config)
|
|
self.db_client = DatabaseConnector(long_articles_config)
|
|
self.db_client.connect()
|
|
self.db_client.connect()
|
|
|
|
|
|
|
|
+
|
|
|
|
+class CandidateAccountQualityScoreRecognizer(CandidateAccountRecognizer):
|
|
|
|
+
|
|
def get_task_list(self):
|
|
def get_task_list(self):
|
|
"""
|
|
"""
|
|
- get account task from database
|
|
|
|
|
|
+ get account tasks from the database
|
|
"""
|
|
"""
|
|
fetch_query = f"""
|
|
fetch_query = f"""
|
|
- select id, title_list, platform from crawler_candidate_account_pool
|
|
|
|
- where avg_score is null and status = 0 and title_list is not null;
|
|
|
|
|
|
+ select id, title_list, platform
|
|
|
|
+ from crawler_candidate_account_pool
|
|
|
|
+ where avg_score is null and status = {self.INIT_STATUS} and title_list is not null;
|
|
"""
|
|
"""
|
|
fetch_response = self.db_client.fetch(fetch_query, cursor_type=DictCursor)
|
|
fetch_response = self.db_client.fetch(fetch_query, cursor_type=DictCursor)
|
|
return fetch_response
|
|
return fetch_response
|
|
@@ -167,7 +191,38 @@ class AccountRecognizer:
|
|
|
|
|
|
with ThreadPoolExecutor(max_workers=8) as executor:
|
|
with ThreadPoolExecutor(max_workers=8) as executor:
|
|
futures = [
|
|
futures = [
|
|
- executor.submit(recognize_task_thread, task) for task in task_list
|
|
|
|
|
|
+ executor.submit(recognize_account_thread, task, "score")
|
|
|
|
+ for task in task_list
|
|
|
|
+ ]
|
|
|
|
+ for future in tqdm(
|
|
|
|
+ concurrent.futures.as_completed(futures),
|
|
|
|
+ total=len(task_list),
|
|
|
|
+ desc="处理进度",
|
|
|
|
+ ):
|
|
|
|
+ future.result()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+class CandidateAccountCategoryRecognizer(CandidateAccountRecognizer):
|
|
|
|
+
|
|
|
|
+ def get_task_list(self):
|
|
|
|
+ fetch_query = f"""
|
|
|
|
+ select id, title_list
|
|
|
|
+ from crawler_candidate_account_pool
|
|
|
|
+ where category_status = %s and avg_score >= %s;
|
|
|
|
+ """
|
|
|
|
+ fetch_response = self.db_client.fetch(
|
|
|
|
+ fetch_query,
|
|
|
|
+ cursor_type=DictCursor,
|
|
|
|
+ params=(self.INIT_STATUS, self.AVG_SCORE_THRESHOLD),
|
|
|
|
+ )
|
|
|
|
+ return fetch_response
|
|
|
|
+
|
|
|
|
+ def deal(self):
|
|
|
|
+ task_list = self.get_task_list()
|
|
|
|
+ with ThreadPoolExecutor(max_workers=8) as executor:
|
|
|
|
+ futures = [
|
|
|
|
+ executor.submit(recognize_account_thread, task, "category")
|
|
|
|
+ for task in task_list
|
|
]
|
|
]
|
|
for future in tqdm(
|
|
for future in tqdm(
|
|
concurrent.futures.as_completed(futures),
|
|
concurrent.futures.as_completed(futures),
|