
Merge branch 'feature/luojunhui/2025-08-25-recycle-daily-task-improve' of Server/LongArticleTaskServer into master

luojunhui 2 weeks ago
parent
commit
6b70234f9e

+ 0 - 43
applications/crawler/wechat/gzh_spider.py

@@ -37,27 +37,6 @@ async def get_article_detail(
         response = await http_client.post(target_url, headers=headers, data=payload)
 
     return response
-    # try:
-    #     response = requests.post(
-    #         url=target_url, headers=headers, data=payload, timeout=120
-    #     )
-    #     response.raise_for_status()
-    #     return response.json()
-    # except requests.exceptions.RequestException as e:
-    #     log(
-    #         task="get_official_article_detail",
-    #         function="get_official_article_detail",
-    #         message=f"API请求失败: {e}",
-    #         data={"link": article_link},
-    #     )
-    # except json.JSONDecodeError as e:
-    #     log(
-    #         task="get_official_article_detail",
-    #         function="get_official_article_detail",
-    #         message=f"响应解析失败: {e}",
-    #         data={"link": article_link},
-    #     )
-    # return None
 
 
 @retry(**retry_desc)
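Note: the deleted lines were a commented-out synchronous requests fallback, including its request and JSON error logging. If equivalent handling is ever wanted around the new async path, a minimal sketch could look like the following, assuming an aiohttp-style client (AsyncHttpClient's exception surface is not shown in this diff, and logging stands in for the project's log()):

import json
import logging

import aiohttp

async def get_article_detail_guarded(
    target_url: str, headers: dict, payload: str, article_link: str
) -> dict | None:
    try:
        timeout = aiohttp.ClientTimeout(total=120)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(target_url, headers=headers, data=payload) as resp:
                resp.raise_for_status()
                return await resp.json()
    except aiohttp.ClientError as e:
        # Mirrors the deleted "API请求失败" ("API request failed") log entry.
        logging.warning("API request failed: %s (link=%s)", e, article_link)
    except json.JSONDecodeError as e:
        # Mirrors the deleted "响应解析失败" ("response parsing failed") entry.
        logging.warning("Response parsing failed: %s (link=%s)", e, article_link)
    return None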
@@ -67,27 +46,6 @@ async def get_article_list_from_account(account_id: str, index=None) -> dict | None
     async with AsyncHttpClient(timeout=120) as http_client:
         response = await http_client.post(target_url, headers=headers, data=payload)
     return response
-    # try:
-    #     response = requests.post(
-    #         url=target_url, headers=headers, data=payload, timeout=120
-    #     )
-    #     response.raise_for_status()
-    #     return response.json()
-    # except requests.exceptions.RequestException as e:
-    #     log(
-    #         task="get_official_account_article_list",
-    #         function="get_official_account_article_list",
-    #         message=f"API请求失败: {e}",
-    #         data={"gh_id": account_id},
-    #     )
-    # except json.JSONDecodeError as e:
-    #     log(
-    #         task="get_official_account_article_list",
-    #         function="get_official_account_article_list",
-    #         message=f"响应解析失败: {e}",
-    #         data={"gh_id": account_id},
-    #     )
-    # return None
 
 
 @retry(**retry_desc)
@@ -135,7 +93,6 @@ def get_source_account_from_article(article_link) -> dict | None:
 async def weixin_search(keyword: str, page="1") -> dict | None:
     url = "{}/keyword".format(base_url)
     payload = json.dumps({"keyword": keyword, "cursor": page})
-    # response = requests.request("POST", url, headers=headers, data=payload, timeout=120)
     async with AsyncHttpClient(timeout=120) as http_client:
         response = await http_client.post(url=url, headers=headers, data=payload)
 
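Note: both refactored functions keep the @retry(**retry_desc) decorator around the new coroutines. retry_desc is defined outside this diff; assuming it is a tenacity-style keyword dict, the composition works because tenacity's @retry supports async def functions natively, re-awaiting the call on each failure:

import asyncio
import random

from tenacity import retry, stop_after_attempt, wait_fixed

# Assumed shape of retry_desc; the real definition is not part of this commit.
retry_desc = {"stop": stop_after_attempt(3), "wait": wait_fixed(2)}

@retry(**retry_desc)
async def flaky_fetch() -> dict:
    # Stand-in for the HTTP POST; fails randomly to exercise the retry policy.
    if random.random() < 0.5:
        raise ConnectionError("transient failure")
    return {"code": 0}

print(asyncio.run(flaky_fetch()))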

+ 19 - 5
applications/tasks/data_recycle_tasks/recycle_daily_publish_articles.py

@@ -48,6 +48,9 @@ class Const:
     ARTICLE_SUCCESS_CODE = 0
     ARTICLE_UNKNOWN_CODE = 10000
 
+    ACCOUNT_FORBIDDEN_CODE = 25013
+    CRAWL_CRASH_CODE = 20000
+
     STAT_PERIOD = 3 * 24 * 3600
 
     INIT_STATUS = 0
@@ -116,7 +119,7 @@ class RecycleDailyPublishArticlesTask(Const):
             )
             response_code = response["code"]
             match response_code:
-                case 25013:
+                case self.ACCOUNT_FORBIDDEN_CODE:
                     await feishu_robot.bot(
                         title="发布账号封禁",
                         detail={
@@ -125,7 +128,7 @@ class RecycleDailyPublishArticlesTask(Const):
                         },
                     )
                     return
-                case 0:
+                case self.ARTICLE_SUCCESS_CODE:
                     msg_list = response.get("data", {}).get("data", [])
                     if not msg_list:
                         return
@@ -145,6 +148,17 @@ class RecycleDailyPublishArticlesTask(Const):
                     cursor = response["data"].get("next_cursor")
                     if not cursor:
                         return
+                case self.CRAWL_CRASH_CODE:
+                    await self.log_client.log(
+                        contexts={
+                            "task": "recycle_daily_publish_articles",
+                            "data": {
+                                "gh_id": account["gh_id"],
+                            },
+                            "message": "爬虫挂掉",
+                            "status": "fail",
+                        }
+                    )
                 case _:
                     return
 
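Note: replacing the literal 25013 and 0 with class constants works inside match because a dotted name such as self.ACCOUNT_FORBIDDEN_CODE is a value pattern, compared by equality, whereas a bare name in the same position would be a capture pattern that matches anything. The new CRAWL_CRASH_CODE branch logs "爬虫挂掉" ("the crawler crashed") and, unlike the other branches, does not return. A standalone illustration of the value-pattern rule:

class Const:
    ARTICLE_SUCCESS_CODE = 0
    ACCOUNT_FORBIDDEN_CODE = 25013
    CRAWL_CRASH_CODE = 20000

def describe(code: int) -> str:
    match code:
        # Dotted names are value patterns: compared with ==, never rebound.
        case Const.ACCOUNT_FORBIDDEN_CODE:
            return "account forbidden"
        case Const.ARTICLE_SUCCESS_CODE:
            return "success"
        case Const.CRAWL_CRASH_CODE:
            return "crawler crashed"
        case _:
            return "unknown"

assert describe(25013) == "account forbidden"
assert describe(10000) == "unknown"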
@@ -279,7 +293,7 @@ class UpdateRootSourceIdAndUpdateTimeTask(Const):
         self.pool = pool
         self.log_client = log_client
 
-    async def get_article_list(self):
+    async def get_article_list(self) -> list[dict]:
         query = """select ContentUrl, wx_sn from official_articles_v2 where publish_timestamp in %s;"""
         article_list = await self.pool.async_fetch(
             query=query, db_name="piaoquan_crawler", params=(tuple([0, -1]),)
@@ -374,13 +388,13 @@ class UpdateRootSourceIdAndUpdateTimeTask(Const):
             set publish_timestamp = updateTime
             where publish_timestamp < %s;
         """
-        affected_rows_2 = await self.pool.async_save(query=update_sql_2, params=0)
+        affected_rows_2 = await self.pool.async_save(query=update_sql_2, params=(0,), db_name="piaoquan_crawler")
         if affected_rows_1 or affected_rows_2:
             await feishu_robot.bot(
                 title="执行兜底修改发布时间戳",
                 detail={
                     "通过msgId修改": affected_rows_1,
-                    "通过update_timestamp修改": affected_rows_2,
+                    "通过create_timestamp修改": affected_rows_2,
                 },
                 mention=False,
             )
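Note: the final hunk makes the fallback update pass its parameter as the one-element tuple (0,) instead of a bare 0 and pins db_name="piaoquan_crawler" explicitly; it also renames the Feishu report key 通过update_timestamp修改 ("fixed via update_timestamp") to 通过create_timestamp修改 ("fixed via create_timestamp"). The tuple matters because DB-API drivers expect a sequence of parameters. A stdlib sqlite3 demonstration of the same rule (sqlite3 uses ? placeholders rather than %s, but the sequence requirement is identical):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("create table articles (publish_timestamp integer)")
conn.execute("insert into articles values (-1)")

# Correct: the single parameter is wrapped in a one-element sequence.
rows = conn.execute(
    "select * from articles where publish_timestamp < ?", (0,)
).fetchall()
print(rows)  # [(-1,)]

# A bare scalar is rejected by the driver.
try:
    conn.execute("select * from articles where publish_timestamp < ?", 0)
except (TypeError, ValueError, sqlite3.ProgrammingError) as e:
    print("bare scalar rejected:", e)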