Browse Source

视频号抓取任务

luojunhui 7 months ago
parent
commit
c82fd24a51
1 changed file with 29 additions and 28 deletions
  1. 29 28
      tasks/crawler_channel_account_videos.py

+ 29 - 28
tasks/crawler_channel_account_videos.py

@@ -130,48 +130,49 @@ class CrawlerChannelAccountVideos:
 
 
     def crawler_each_account(self, channel_account: dict, last_buffer: str = "") -> None:
         """
         Crawl one channel account's videos page by page, using a loop instead of recursion.

         Each page is fetched with ``get_channel_account_videos`` and every video
         on it is passed to ``self.crawler_each_video``. Paging stops when:

         * the response's ``ret`` is not 200 (the failure is logged),
         * the page contains no videos,
         * the first video on the page has a ``createtime`` below the account's
           ``max_cursor``, or
         * the response's ``continueFlag`` is falsy.

         :param channel_account: account record; ``account_id`` is required,
             ``max_cursor`` is optional and falls back to ``const.DEFAULT_CURSOR``
             when missing or falsy.
         :param last_buffer: paging cursor to start from; empty string by default.
         """
         channel_account_id = channel_account["account_id"]
         max_cursor = channel_account.get("max_cursor") or const.DEFAULT_CURSOR
         current_last_buffer = last_buffer

         while True:
             response = get_channel_account_videos(
                 channel_account_id, last_buffer=current_last_buffer
             )
             if response["ret"] != 200:
                 log(
                     task="crawler_channel_account_videos",
                     # Name the method that actually emits this log entry.
                     function="crawler_each_account",
                     message="get_channel_account_videos failed",
                     data={
                         "response": response,
                         "channel_account_id": channel_account_id,
                         "max_cursor": max_cursor,
                     },
                 )
                 break

             response_data = response["data"]
             current_last_buffer = response_data["lastBuffer"]  # cursor for the next page
             has_more = response_data["continueFlag"]  # whether another page exists
             video_list = response_data["object"]

             if not video_list:
                 break

             # Only the first video of the page is compared with the cursor.
             # NOTE(review): presumably pages are ordered newest-first - confirm.
             create_timestamp = video_list[0]["createtime"]
             if create_timestamp < max_cursor:
                 break

             crawl_video_list_bar = tqdm(video_list, desc="crawl videos")
             for video in crawl_video_list_bar:
                 crawl_video_list_bar.set_postfix({"video_id": video["id"]})
                 self.crawler_each_video(video)

             if not has_more:
                 break

             # Pause between consecutive page requests.
             time.sleep(const.SLEEP_SECOND)
 
 
     def update_account_max_cursor(self, account_id: str) -> None:
     def update_account_max_cursor(self, account_id: str) -> None:
         """
         """