WeChat Channels (视频号) video crawler task

luojunhui · 1 month ago · commit 66c16e10ad
3 changed files with 118 additions and 47 deletions
  1. applications/const/__init__.py (+27 -0)
  2. crawler_sph_video.py (+3 -28)
  3. tasks/crawler_channel_account_videos.py (+88 -19)

+ 27 - 0
applications/const/__init__.py

@@ -268,6 +268,33 @@ class BaiduVideoCrawlerConst:
     LOCAL_PATH_DIR = "static"
 
 
+class ChannelVideoCrawlerConst:
+    """
+    const for channel video crawler
+    """
+    # account status
+    CHANNEL_ACCOUNT_GOOD_STATUS = 1
+    CHANNEL_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2024-01-01 00:00:00
+    DEFAULT_CURSOR = 1704038400
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+    # local path dir
+    LOCAL_PATH_DIR = "static"
+
+    # minimum title length
+    MIN_TITLE_LENGTH = 10
+
+    # max video length (seconds)
+    MAX_VIDEO_LENGTH = 240
+
+    # sleep interval (seconds)
+    SLEEP_SECOND = 2
+
+
 
 
 
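A quick sanity check on DEFAULT_CURSOR (a sketch; it assumes the project reads timestamps in UTC+8, which is how the inline comment is phrased):

from datetime import datetime, timedelta, timezone

# 1704038400 is 2024-01-01 00:00:00 in UTC+8, matching the DEFAULT_CURSOR comment.
cst = timezone(timedelta(hours=8))
print(datetime.fromtimestamp(1704038400, tz=cst))  # 2024-01-01 00:00:00+08:00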

+ 3 - 28
crawler_sph_video.py

@@ -1,34 +1,9 @@
 """
 @author: luojunhui
 """
+
 from tasks.crawler_channel_account_videos import CrawlerChannelAccountVideos
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     crawler_channel_account_videos = CrawlerChannelAccountVideos()
-    account_id_list = [
-        "v2_060000231003b20faec8c4eb8019c6d5cf06e83cb07794095553fa65d33438ab96e611936d56@finder",
-        "v2_060000231003b20faec8c6e58d1fcad0cc00eb33b0770dba35360f0ed62c88d0d350c27774c2@finder",
-        "v2_060000231003b20faec8c4e08e1cc4d6c607ec31b077680c6f14d55b527766e41cf9be586e4d@finder",
-        "v2_060000231003b20faec8c4e58b1ac3d3cf02eb31b077400c101a45e79401bcb5798ad570737d@finder",
-        "v2_060000231003b20faec8c5e18c18c2d6ca05e43db077bca0f7123717c7596f0fd88052d9eab9@finder",
-        "v2_060000231003b20faec8c4e0811cc3dcc805e83cb077667e01df91de36a6306008913f0fa791@finder",
-        "v2_060000231003b20faec8c4e5881ecad4cc03e537b077218547d9428ce776f68aecea9daf6e0b@finder",
-        "v2_060000231003b20faec8c4e1891acbd5c907ed34b0777b93708eca16641e6026fa1673abc031@finder",
-        "v2_060000231003b20faec8c6e5891acad4ce05ed36b077a565ac908007215df7a1fc7b83555742@finder",
-        "v2_060000231003b20faec8c7e58a10c4d4ce0dec34b0776ab62f75818c8e8434cd92ae68711d51@finder",
-        "v2_060000231003b20faec8c7e28f1fcbddce02eb33b0777cabc2c0135c4e65c9d0ecdf1b6eee4a@finder",
-        "v2_060000231003b20faec8c7e38f1bc3dccb01e432b0774a6d9f6dcdee627b0cc2c20a4c0c2c31@finder",
-        "v2_060000231003b20faec8c5e18a1bc1d7c804ea33b0778d39f109d2d4ab142d0a6f797a8cf82e@finder",
-        "v2_060000231003b20faec8c7e28819c0d1cc04ea36b077e0d091a104419b7643051529e3bdd44e@finder",
-        "v2_060000231003b20faec8c7e28910c1d0cb04ed34b077a39e2b87d05a120d27058262e08c4b07@finder",
-        "v2_060000231003b20faec8c5ea8f1dc3d6c801e934b0770b7ff832d113bda97a48eb93da5ac745@finder",
-        "v2_060000231003b20faec8c7e58919cbddcc0def32b077358968830a932033408fd4c03e76035e@finder",
-        "v2_060000231003b20faec8c7ea8f1dc4d1cb0de531b077a58ea59f1982d0865313a1ebba09ba7a@finder",
-        "v2_060000231003b20faec8c4e1891ac4dccc01ef31b077fb07460c62de09e1d8ab86923248623f@finder"
-    ]
-
-    for account_id in account_id_list:
-        try:
-            crawler_channel_account_videos.crawler_each_account(channel_account_id=account_id, channel_account_name="")
-        except Exception as e:
-            continue
+    crawler_channel_account_videos.deal()
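The entry point no longer hard-codes account IDs: deal() now pulls accounts from the sph_account_for_videos table. A minimal sketch of re-registering one of the previously hard-coded accounts (assuming only the columns implied by the queries in tasks/crawler_channel_account_videos.py; any other columns in that table are not shown in this commit):

from applications.db import DatabaseConnector
from config import long_articles_config

db_client = DatabaseConnector(db_config=long_articles_config)
db_client.connect()
# status = 1 marks the account as crawlable (CHANNEL_ACCOUNT_GOOD_STATUS);
# max_cursor is left unset so the first crawl falls back to DEFAULT_CURSOR.
db_client.save(
    query="insert into sph_account_for_videos (account_id, status) values (%s, %s);",
    params=(
        "v2_060000231003b20faec8c4eb8019c6d5cf06e83cb07794095553fa65d33438ab96e611936d56@finder",
        1,
    ),
)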

+ 88 - 19
tasks/crawler_channel_account_videos.py

@@ -2,14 +2,17 @@
 @author: luojunhui
 @tool: pycharm && deepseek
 """
+
 import re
 import os
 import traceback
 import time
 
+from pymysql.cursors import DictCursor
 from tqdm import tqdm
 
 from applications import log
+from applications.const import ChannelVideoCrawlerConst
 from applications.db import DatabaseConnector
 from applications.utils import download_sph_video
 from applications.utils import insert_into_single_video_source_table
@@ -19,7 +22,7 @@ from applications.utils import upload_to_oss
 from config import long_articles_config
 from coldStartTasks.crawler.channels import get_channel_account_videos
 
-NO_SOURCE_ACCOUNT = 0
+const = ChannelVideoCrawlerConst()
 
 
 class CrawlerChannelAccountVideos:
@@ -30,7 +33,6 @@ class CrawlerChannelAccountVideos:
     def __init__(self):
         self.db_client = DatabaseConnector(db_config=long_articles_config)
         self.db_client.connect()
-        self.success_crawler_video_count = 0
 
     def whether_video_exists(self, title: str) -> bool:
         """
@@ -51,7 +53,9 @@ class CrawlerChannelAccountVideos:
         """
         get channel account list from database
         """
-        return
+        sql = f"""select account_id, max_cursor from sph_account_for_videos where status = {const.CHANNEL_ACCOUNT_GOOD_STATUS};"""
+        account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return account_list
 
     def crawler_each_video(self, video: dict):
         """
@@ -66,23 +70,28 @@ class CrawlerChannelAccountVideos:
                 task="crawler_channel_account_videos",
                 function="crawler_each_video",
                 message="video title exists",
-                data={"video_id": video["id"], "title": title}
+                data={"video_id": video["id"], "title": title},
             )
             return
 
-        cleaned_title = re.sub(r'[^\u4e00-\u9fff]', '', title)
-        if len(cleaned_title) < 10:
+        cleaned_title = re.sub(r"[^\u4e00-\u9fff]", "", title)
+        if len(cleaned_title) < const.MIN_TITLE_LENGTH:
             log(
                 task="crawler_channel_account_videos",
                 function="crawler_each_video",
                 message="video title is too short",
-                data={"video_id": video["id"], "title": title}
+                data={"video_id": video["id"], "title": title},
             )
             return
 
-        video_length = video['objectDesc']['media'][0]['VideoPlayLen']
-        if video_length and int(video_length) > 240:
-            print("video to large")
+        video_length = video["objectDesc"]["media"][0]["VideoPlayLen"]
+        if video_length and int(video_length) > const.MAX_VIDEO_LENGTH:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video length is too long",
+                data={"video_id": video["id"], "title": title},
+            )
             return
 
         video_item = Item()
@@ -93,7 +102,7 @@ class CrawlerChannelAccountVideos:
         video_item.add("out_account_id", video["username"])
         video_item.add("out_account_name", video["nickname"])
         video_item.add("publish_timestamp", video["createtime"])
-        video_item.add("platform", 'sph')
+        video_item.add("platform", "sph")
         media = object_desc["media"][0]
         url = media["Url"]
         decode_key = media["decodeKey"]
@@ -103,7 +112,7 @@ class CrawlerChannelAccountVideos:
             decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
             oss_path = upload_to_oss(decrypt_path)
             video_item.add("video_oss_path", oss_path)
-            video_item.add("source_account", NO_SOURCE_ACCOUNT)
+            video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
             video_item.check(source="video")
             insert_into_single_video_source_table(self.db_client, video_item.item)
             os.remove(decrypt_path)
@@ -119,18 +128,25 @@ class CrawlerChannelAccountVideos:
                 },
             )
 
-    def crawler_each_account(self, channel_account_id: str, channel_account_name: str, last_buffer: str = ""):
+    def crawler_each_account(self, channel_account: dict, last_buffer: str = ""):
         """
         get channel account videos
         """
-        response = get_channel_account_videos(channel_account_id, last_buffer=last_buffer)
+        channel_account_id = channel_account["account_id"]
+        max_cursor = channel_account["max_cursor"]
+        if not max_cursor:
+            max_cursor = const.DEFAULT_CURSOR
+
+        response = get_channel_account_videos(
+            channel_account_id, last_buffer=last_buffer
+        )
         if response["ret"] == 200:
             response_data = response["data"]
             last_buffer = response_data["lastBuffer"]
             continue_flag = response_data["continueFlag"]
             video_list = response_data["object"]
-            create_timestamp = video_list[0]['createtime']
-            if create_timestamp < 1704038400:
+            create_timestamp = video_list[0]["createtime"]
+            if create_timestamp < max_cursor:
                 return
 
             crawl_video_list_bar = tqdm(video_list, desc="crawl videos")
@@ -139,11 +155,64 @@ class CrawlerChannelAccountVideos:
                 self.crawler_each_video(video)
 
             if continue_flag:
-                time.sleep(1)
-                return self.crawler_each_account(channel_account_id, channel_account_name, last_buffer)
+                time.sleep(const.SLEEP_SECOND)
+                return self.crawler_each_account(channel_account, last_buffer)
             else:
                 return
 
         else:
-            print(f"crawler channel account {channel_account_name} videos failed")
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="get_channel_account_videos failed",
+                data={
+                    "response": response,
+                    "channel_account_id": channel_account_id,
+                    "max_cursor": max_cursor,
+                },
+            )
             return
+
+    def update_account_max_cursor(self, account_id):
+        """
+        update account max cursor
+        """
+        select_sql = f"""
+            select max(publish_timestamp) as max_cursor from publish_single_video_source where out_account_id = '{account_id}';
+        """
+        response_mysql = self.db_client.fetch(query=select_sql)
+        max_publish_timestamp = response_mysql[0][0]
+
+        if max_publish_timestamp:
+            update_sql = f"""
+                        update sph_account_for_videos
+                        set max_cursor = %s
+                        where account_id = %s;
+                    """
+            self.db_client.save(
+                query=update_sql, params=(max_publish_timestamp, account_id)
+            )
+
+    def deal(self):
+        """
+        deal channel account videos
+        """
+        account_list = self.get_channel_account_list()
+        account_crawler_bar = tqdm(account_list, desc="crawler channel account videos")
+        for account in account_crawler_bar:
+            try:
+                account_crawler_bar.set_postfix({"account_id": account["account_id"]})
+                self.crawler_each_account(channel_account=account)
+                self.update_account_max_cursor(account["account_id"])
+
+            except Exception as e:
+                log(
+                    task="crawler_channel_account_videos",
+                    function="deal",
+                    message="crawler channel account videos failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "account_id": account["account_id"],
+                    },
+                )
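Design note: crawler_each_account pages through an account by recursing on continueFlag, which grows the call stack by one frame per page. An iterative equivalent (a sketch; it assumes the same response shape of ret, data.lastBuffer, data.continueFlag, and data.object, plus the module's existing const and get_channel_account_videos imports) avoids that:

import time

def crawler_each_account_iterative(self, channel_account: dict):
    """Drop-in iterative variant of crawler_each_account (a sketch)."""
    channel_account_id = channel_account["account_id"]
    max_cursor = channel_account["max_cursor"] or const.DEFAULT_CURSOR
    last_buffer = ""
    while True:
        response = get_channel_account_videos(channel_account_id, last_buffer=last_buffer)
        if response["ret"] != 200:
            return
        response_data = response["data"]
        last_buffer = response_data["lastBuffer"]
        video_list = response_data["object"]
        # stop once the page starts before the account's max_cursor
        if video_list[0]["createtime"] < max_cursor:
            return
        for video in video_list:
            self.crawler_each_video(video)
        if not response_data["continueFlag"]:
            return
        time.sleep(const.SLEEP_SECOND)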