WeChat Channels (视频号) video crawler task

luojunhui · 1 month ago · commit 66c16e10ad
3 changed files with 118 additions and 47 deletions
  1. applications/const/__init__.py (+27 -0)
  2. crawler_sph_video.py (+3 -28)
  3. tasks/crawler_channel_account_videos.py (+88 -19)

+ 27 - 0
applications/const/__init__.py

@@ -268,6 +268,33 @@ class BaiduVideoCrawlerConst:
     LOCAL_PATH_DIR = "static"
 
 
+class ChannelVideoCrawlerConst:
+    """
+    const for channel video crawler
+    """
+    # account status
+    CHANNEL_ACCOUNT_GOOD_STATUS = 1
+    CHANNEL_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2024-01-01 00:00:00
+    DEFAULT_CURSOR = 1704038400
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+    # local path dir
+    LOCAL_PATH_DIR = "static"
+
+    # minimum title length
+    MIN_TITLE_LENGTH = 10
+
+    # max video length (seconds)
+    MAX_VIDEO_LENGTH = 240
+
+    # sleep interval (seconds)
+    SLEEP_SECOND = 2
+
+
 
 
 
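A quick sanity check on DEFAULT_CURSOR (a sketch; it assumes the project reads timestamps in UTC+8, which is how the inline comment is phrased):

from datetime import datetime, timedelta, timezone

# 1704038400 is 2024-01-01 00:00:00 in UTC+8, matching the DEFAULT_CURSOR comment.
cst = timezone(timedelta(hours=8))
print(datetime.fromtimestamp(1704038400, tz=cst))  # 2024-01-01 00:00:00+08:00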

+ 3 - 28
crawler_sph_video.py

@@ -1,34 +1,9 @@
 """
 @author: luojunhui
 """
+
 from tasks.crawler_channel_account_videos import CrawlerChannelAccountVideos
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     crawler_channel_account_videos = CrawlerChannelAccountVideos()
-    account_id_list = [
-        "v2_060000231003b20faec8c4eb8019c6d5cf06e83cb07794095553fa65d33438ab96e611936d56@finder",
-        "v2_060000231003b20faec8c6e58d1fcad0cc00eb33b0770dba35360f0ed62c88d0d350c27774c2@finder",
-        "v2_060000231003b20faec8c4e08e1cc4d6c607ec31b077680c6f14d55b527766e41cf9be586e4d@finder",
-        "v2_060000231003b20faec8c4e58b1ac3d3cf02eb31b077400c101a45e79401bcb5798ad570737d@finder",
-        "v2_060000231003b20faec8c5e18c18c2d6ca05e43db077bca0f7123717c7596f0fd88052d9eab9@finder",
-        "v2_060000231003b20faec8c4e0811cc3dcc805e83cb077667e01df91de36a6306008913f0fa791@finder",
-        "v2_060000231003b20faec8c4e5881ecad4cc03e537b077218547d9428ce776f68aecea9daf6e0b@finder",
-        "v2_060000231003b20faec8c4e1891acbd5c907ed34b0777b93708eca16641e6026fa1673abc031@finder",
-        "v2_060000231003b20faec8c6e5891acad4ce05ed36b077a565ac908007215df7a1fc7b83555742@finder",
-        "v2_060000231003b20faec8c7e58a10c4d4ce0dec34b0776ab62f75818c8e8434cd92ae68711d51@finder",
-        "v2_060000231003b20faec8c7e28f1fcbddce02eb33b0777cabc2c0135c4e65c9d0ecdf1b6eee4a@finder",
-        "v2_060000231003b20faec8c7e38f1bc3dccb01e432b0774a6d9f6dcdee627b0cc2c20a4c0c2c31@finder",
-        "v2_060000231003b20faec8c5e18a1bc1d7c804ea33b0778d39f109d2d4ab142d0a6f797a8cf82e@finder",
-        "v2_060000231003b20faec8c7e28819c0d1cc04ea36b077e0d091a104419b7643051529e3bdd44e@finder",
-        "v2_060000231003b20faec8c7e28910c1d0cb04ed34b077a39e2b87d05a120d27058262e08c4b07@finder",
-        "v2_060000231003b20faec8c5ea8f1dc3d6c801e934b0770b7ff832d113bda97a48eb93da5ac745@finder",
-        "v2_060000231003b20faec8c7e58919cbddcc0def32b077358968830a932033408fd4c03e76035e@finder",
-        "v2_060000231003b20faec8c7ea8f1dc4d1cb0de531b077a58ea59f1982d0865313a1ebba09ba7a@finder",
-        "v2_060000231003b20faec8c4e1891ac4dccc01ef31b077fb07460c62de09e1d8ab86923248623f@finder"
-    ]
-
-    for account_id in account_id_list:
-        try:
-            crawler_channel_account_videos.crawler_each_account(channel_account_id=account_id, channel_account_name="")
-        except Exception as e:
-            continue
+    crawler_channel_account_videos.deal()
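The entry point no longer hard-codes account IDs: deal() now pulls accounts from the sph_account_for_videos table. A minimal sketch of re-registering one of the previously hard-coded accounts (assuming only the columns implied by the queries in tasks/crawler_channel_account_videos.py; any other columns in that table are not shown in this commit):

from applications.db import DatabaseConnector
from config import long_articles_config

db_client = DatabaseConnector(db_config=long_articles_config)
db_client.connect()
# status = 1 marks the account as crawlable (CHANNEL_ACCOUNT_GOOD_STATUS);
# max_cursor is left unset so the first crawl falls back to DEFAULT_CURSOR.
db_client.save(
    query="insert into sph_account_for_videos (account_id, status) values (%s, %s);",
    params=(
        "v2_060000231003b20faec8c4eb8019c6d5cf06e83cb07794095553fa65d33438ab96e611936d56@finder",
        1,
    ),
)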

+ 88 - 19
tasks/crawler_channel_account_videos.py

@@ -2,14 +2,17 @@
 @author: luojunhui
 @tool: pycharm && deepseek
 """
+
 import re
 import os
 import traceback
 import time
 
+from pymysql.cursors import DictCursor
 from tqdm import tqdm
 
 from applications import log
+from applications.const import ChannelVideoCrawlerConst
 from applications.db import DatabaseConnector
 from applications.utils import download_sph_video
 from applications.utils import insert_into_single_video_source_table
@@ -19,7 +22,7 @@ from applications.utils import upload_to_oss
 from config import long_articles_config
 from coldStartTasks.crawler.channels import get_channel_account_videos
 
-NO_SOURCE_ACCOUNT = 0
+const = ChannelVideoCrawlerConst()
 
 
 class CrawlerChannelAccountVideos:
@@ -30,7 +33,6 @@ class CrawlerChannelAccountVideos:
     def __init__(self):
         self.db_client = DatabaseConnector(db_config=long_articles_config)
         self.db_client.connect()
-        self.success_crawler_video_count = 0
 
     def whether_video_exists(self, title: str) -> bool:
         """
@@ -51,7 +53,9 @@ class CrawlerChannelAccountVideos:
         """
         get channel account list from database
         """
-        return
+        sql = f"""select account_id, max_cursor from sph_account_for_videos where status = {const.CHANNEL_ACCOUNT_GOOD_STATUS};"""
+        account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return account_list
 
     def crawler_each_video(self, video: dict):
         """
@@ -66,23 +70,28 @@ class CrawlerChannelAccountVideos:
                 task="crawler_channel_account_videos",
                 function="crawler_each_video",
                 message="video title exists",
-                data={"video_id": video["id"], "title": title}
+                data={"video_id": video["id"], "title": title},
             )
             return
 
-        cleaned_title = re.sub(r'[^\u4e00-\u9fff]', '', title)
-        if len(cleaned_title) < 10:
+        cleaned_title = re.sub(r"[^\u4e00-\u9fff]", "", title)
+        if len(cleaned_title) < const.MIN_TITLE_LENGTH:
             log(
                 task="crawler_channel_account_videos",
                 function="crawler_each_video",
                 message="video title is too short",
-                data={"video_id": video["id"], "title": title}
+                data={"video_id": video["id"], "title": title},
             )
             return
 
-        video_length = video['objectDesc']['media'][0]['VideoPlayLen']
-        if video_length and int(video_length) > 240:
-            print("video to large")
+        video_length = video["objectDesc"]["media"][0]["VideoPlayLen"]
+        if video_length and int(video_length) > const.MAX_VIDEO_LENGTH:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video length is too long",
+                data={"video_id": video["id"], "title": title},
+            )
             return
 
         video_item = Item()
@@ -93,7 +102,7 @@ class CrawlerChannelAccountVideos:
         video_item.add("out_account_id", video["username"])
         video_item.add("out_account_name", video["nickname"])
         video_item.add("publish_timestamp", video["createtime"])
-        video_item.add("platform", 'sph')
+        video_item.add("platform", "sph")
         media = object_desc["media"][0]
         url = media["Url"]
         decode_key = media["decodeKey"]
@@ -103,7 +112,7 @@ class CrawlerChannelAccountVideos:
             decrypt_path = download_sph_video(download_url=download_url, key=decode_key)
             oss_path = upload_to_oss(decrypt_path)
             video_item.add("video_oss_path", oss_path)
-            video_item.add("source_account", NO_SOURCE_ACCOUNT)
+            video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
             video_item.check(source="video")
             insert_into_single_video_source_table(self.db_client, video_item.item)
             os.remove(decrypt_path)
@@ -119,18 +128,25 @@ class CrawlerChannelAccountVideos:
                 },
             )
 
-    def crawler_each_account(self, channel_account_id: str, channel_account_name: str, last_buffer: str = ""):
+    def crawler_each_account(self, channel_account: dict, last_buffer: str = ""):
         """
         get channel account videos
         """
-        response = get_channel_account_videos(channel_account_id, last_buffer=last_buffer)
+        channel_account_id = channel_account["account_id"]
+        max_cursor = channel_account["max_cursor"]
+        if not max_cursor:
+            max_cursor = const.DEFAULT_CURSOR
+
+        response = get_channel_account_videos(
+            channel_account_id, last_buffer=last_buffer
+        )
         if response["ret"] == 200:
             response_data = response["data"]
             last_buffer = response_data["lastBuffer"]
             continue_flag = response_data["continueFlag"]
             video_list = response_data["object"]
-            create_timestamp = video_list[0]['createtime']
-            if create_timestamp < 1704038400:
+            create_timestamp = video_list[0]["createtime"]
+            if create_timestamp < max_cursor:
                 return
 
             crawl_video_list_bar = tqdm(video_list, desc="crawl videos")
@@ -139,11 +155,64 @@ class CrawlerChannelAccountVideos:
                 self.crawler_each_video(video)
 
             if continue_flag:
-                time.sleep(1)
-                return self.crawler_each_account(channel_account_id, channel_account_name, last_buffer)
+                time.sleep(const.SLEEP_SECOND)
+                return self.crawler_each_account(channel_account, last_buffer)
             else:
                 return
 
         else:
-            print(f"crawler channel account {channel_account_name} videos failed")
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="get_channel_account_videos failed",
+                data={
+                    "response": response,
+                    "channel_account_id": channel_account_id,
+                    "max_cursor": max_cursor,
+                },
+            )
             return
+
+    def update_account_max_cursor(self, account_id):
+        """
+        update account max cursor
+        """
+        select_sql = f"""
+            select max(publish_timestamp) as max_cursor from publish_single_video_source where out_account_id = '{account_id}';
+        """
+        response_mysql = self.db_client.fetch(query=select_sql)
+        max_publish_timestamp = response_mysql[0][0]
+
+        if max_publish_timestamp:
+            update_sql = f"""
+                        update sph_account_for_videos
+                        set max_cursor = %s
+                        where account_id = %s;
+                    """
+            self.db_client.save(
+                query=update_sql, params=(max_publish_timestamp, account_id)
+            )
+
+    def deal(self):
+        """
+        deal channel account videos
+        """
+        account_list = self.get_channel_account_list()
+        account_crawler_bar = tqdm(account_list, desc="crawler channel account videos")
+        for account in account_crawler_bar:
+            try:
+                account_crawler_bar.set_postfix({"account_id": account["account_id"]})
+                self.crawler_each_account(channel_account=account)
+                self.update_account_max_cursor(account["account_id"])
+
+            except Exception as e:
+                log(
+                    task="crawler_channel_account_videos",
+                    function="deal",
+                    message="crawler channel account videos failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "account_id": account["account_id"],
+                    },
+                )
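Design note: crawler_each_account pages through an account by recursing on continueFlag, which grows the call stack by one frame per page. An iterative equivalent (a sketch; it assumes the same response shape of ret, data.lastBuffer, data.continueFlag, and data.object, plus the module's existing const and get_channel_account_videos imports) avoids that:

import time

def crawler_each_account_iterative(self, channel_account: dict):
    """Drop-in iterative variant of crawler_each_account (a sketch)."""
    channel_account_id = channel_account["account_id"]
    max_cursor = channel_account["max_cursor"] or const.DEFAULT_CURSOR
    last_buffer = ""
    while True:
        response = get_channel_account_videos(channel_account_id, last_buffer=last_buffer)
        if response["ret"] != 200:
            return
        response_data = response["data"]
        last_buffer = response_data["lastBuffer"]
        video_list = response_data["object"]
        # stop once the page starts before the account's max_cursor
        if video_list[0]["createtime"] < max_cursor:
            return
        for video in video_list:
            self.crawler_each_video(video)
        if not response_data["continueFlag"]:
            return
        time.sleep(const.SLEEP_SECOND)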