فهرست منبع

account crawler

luojunhui 2 ماه پیش
والد
کامیت
cb59f250e3
1 فایل تغییر یافته به همراه 11 افزوده شده و 3 حذف شده
  1. 11 3
      tasks/account_recognize_by_llm.py

+ 11 - 3
tasks/account_recognize_by_llm.py

@@ -29,7 +29,7 @@ def generate_prompt(account_title_list):
     ** 评估维度及权重 **
         1. 受众精准度(50%)
             正向匹配:存款/养老/健康/饮食/疾病警示/家庭伦理/近现代战争历史/老知青/奇闻异事
-            负向排除:影视解说/文学解读/个人收藏(钱币/邮票)/机械科普/数码测评/电子游戏/时尚潮流/明星八卦/极限运动/学术研究/网络热梗/宠物饲养
+            负向排除:影视解说/文学解读/个人收藏(钱币/邮票)/机械科普/数码测评/电子游戏/时尚潮流/明星八卦/极限运动/学术研究/网络热梗/宠物饲养/音乐/棋牌
             
         2. 标题技法(40%)
             悬念设计:疑问句/省略号/反转结构(例:"打开后瞬间愣住...")
@@ -91,12 +91,20 @@ def recognize_each_account(thread_db_client, account):
 
     # process
     title_list = json.loads(account["title_list"])
-    if len(title_list) < 15:
+    if len(title_list) < 15 and account['platform'] == 'toutiao':
         # 账号数量不足,直接跳过
         print("bad account, skip")
         update_task_status(thread_db_client, task_id, 1, 11)
         return
 
+    # 标题长度过长,需要过滤
+    title_total_length = sum(len(title) for title in title_list)
+    avg_title_length = title_total_length / len(title_list)
+    if avg_title_length > 30:
+        print("title too long, skip")
+        update_task_status(thread_db_client, task_id, 1, 14)
+        return
+
     prompt = generate_prompt(title_list)
     response = fetch_deepseek_response(model="DeepSeek-R1", prompt=prompt)
     response_score_str = response.strip()
@@ -148,7 +156,7 @@ class AccountRecognizer:
         get account task from database
         """
         fetch_query = f"""
-            select id, title_list from crawler_candidate_account_pool
+            select id, title_list, platform from crawler_candidate_account_pool
             where avg_score is null and status = 0 and title_list is not null;
         """
         fetch_response = self.db_client.fetch(fetch_query, cursor_type=DictCursor)