luojunhui committed 1 month ago
Parent commit: 514149837b
4 changed files with 99 additions and 36 deletions
  1. applications/const/__init__.py (+19 -0)
  2. coldStartTasks/crawler/baidu/video_crawler.py (+52 -35)
  3. requirements.txt (+2 -1)
  4. sh/run_baidu_video_crawler.sh (+26 -0)

+ 19 - 0
applications/const/__init__.py

@@ -205,3 +205,22 @@ class ArticleCollectorConst:
     ARTICLE_SUCCESS_CODE = 0
     ARTICLE_UNKNOWN_CODE = 10000
 
+
+class BaiduVideoCrawlerConst:
+    """
+    const for baidu video crawler
+    """
+    # account status
+    BAIDU_ACCOUNT_GOOD_STATUS = 1
+    BAIDU_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2024-01-01 00:00:00
+    DEFAULT_CURSOR = 17040384000000
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+
+
+
+
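Note on DEFAULT_CURSOR: the 14-digit value only matches the documented date if the cursor is a Unix timestamp in seconds (Beijing time) scaled by 10^4, which appears to be the resolution of Baidu's ctime cursors; the scale factor is an assumption, not something this commit confirms. A quick sanity check:

    from datetime import datetime, timezone, timedelta

    cursor = 17040384000000
    seconds = cursor / 10_000          # assumption: cursor = unix seconds * 10**4
    beijing = timezone(timedelta(hours=8))
    print(datetime.fromtimestamp(seconds, tz=beijing))  # 2024-01-01 00:00:00+08:00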

+ 52 - 35
coldStartTasks/crawler/baidu/video_crawler.py

@@ -4,20 +4,24 @@
 """
 import json
 import time
+import traceback
+from typing import List, Dict
 
 from pymysql.cursors import DictCursor
 from tqdm import tqdm
 
 from applications import Functions
+from applications import bot, log
+from applications.const import BaiduVideoCrawlerConst
 from applications.db import DatabaseConnector
 from applications.exception import SpiderError
 from config import long_articles_config
 from coldStartTasks.crawler.baidu.baidu_spider import baidu_account_video_crawler
 from coldStartTasks.crawler.baidu.baidu_spider import baidu_single_video_crawler
 
+const = BaiduVideoCrawlerConst()
 empty_list = []
 functions = Functions()
-DEFAULT_CURSOR = 17040384000000  # earliest time: 2024-01-01 00:00:00
 
 
 class BaiduVideoCrawler(object):
@@ -27,54 +31,44 @@ class BaiduVideoCrawler(object):
 
     def __init__(self):
         self.db = None
+        self.success_crawler_video_count = 0
 
-    def connect_db(self):
+    def connect_db(self) -> None:
         """
         connect db
         """
         self.db = DatabaseConnector(db_config=long_articles_config)
         self.db.connect()
 
-    def get_account_list(self):
+    def get_account_list(self) -> List[Dict]:
         """
         get account list
-        status = 1 means the account is crawled normally
         """
         sql = f"""
             select account_id, account_name, max_cursor 
             from baidu_account_for_videos
-            where status = 1 and priority = 0;
+            where status = {const.BAIDU_ACCOUNT_GOOD_STATUS};
         """
         account_list = self.db.fetch(query=sql, cursor_type=DictCursor)
         return account_list
 
-    def whether_video_exists(self, video_id, title):
+    def whether_video_exists(self, title: str) -> bool:
         """
-        whether video exists, use video_id && title
+        whether video exists, check by title only
         """
-        # first check video_id
-        sql_1 = f"""
-            select id from publish_single_video_source
-            where url_unique_md5 = '{video_id}';
-        """
-        count_1 = self.db.fetch(query=sql_1)
-        if count_1:
-            print(video_id + " video exists")
-            return True
-
         # check title
         sql_2 = f"""
             select id from publish_single_video_source
             where article_title = '{title}';
         """
-        count_2 = self.db.fetch(query=sql_2)
-        if count_2:
+        duplicate_id = self.db.fetch(query=sql_2)
+        if duplicate_id:
             print(title + " video exists")
             return True
 
         return False
 
-    def save_each_video(self, video, account_id, account_name):
+    def save_each_video(self, video: Dict, account_id: str, account_name: str) -> None:
         """
         download and save each video
         """
@@ -83,7 +77,7 @@ class BaiduVideoCrawler(object):
         title = video['title']
 
         # judge whether video exists
-        if self.whether_video_exists(video_id, title):
+        if self.whether_video_exists(title):
             return
 
         read_cnt = video.get('playcnt', 0)
@@ -133,15 +127,16 @@ class BaiduVideoCrawler(object):
                         video_category,
                         json.dumps(manual_tags, ensure_ascii=False) if manual_tags else None,
                         "baidu",
-                        0
+                        const.NO_SOURCE_ACCOUNT_STATUS
                     )
                 )
+                self.success_crawler_video_count += 1
             except Exception as e:
                 print(e)
         else:
             print(f"download video failed, video_id: {video_id}")
 
-    def save_video_list(self, account_id, account_name, video_list):
+    def save_video_list(self, account_id: str, account_name: str, video_list: List[Dict]) -> None:
         """
         save video list
         """
@@ -153,12 +148,12 @@ class BaiduVideoCrawler(object):
                     video_detail = baidu_single_video_crawler(video_id)
                     self.save_each_video(video_detail, account_id=account_id, account_name=account_name)
                 except SpiderError as e:
-                    print(e)
+                    print("save single video fail", e)
                     continue
             else:
                 continue
 
-    def crawler_each_account(self, account, cursor=None):
+    def crawler_each_account(self, account: Dict, cursor=None) -> None:
         """
         crawler each account
         response_strategy
@@ -166,7 +161,7 @@ class BaiduVideoCrawler(object):
         account_id = account['account_id']
         max_cursor = account['max_cursor']
         if not max_cursor:
-            max_cursor = DEFAULT_CURSOR
+            max_cursor = const.DEFAULT_CURSOR
         account_name = account['account_name']
         try:
             response_json = baidu_account_video_crawler(account_id, cursor=cursor)
@@ -181,7 +176,7 @@ class BaiduVideoCrawler(object):
             # check next page
             has_next_page = response_json.get("has_more", False)
             if has_next_page:
-                next_cursor = response_json.get("ctime", DEFAULT_CURSOR)
+                next_cursor = response_json.get("ctime", const.DEFAULT_CURSOR)
                 if next_cursor < max_cursor:
                     print("No more videos after 2024-01-01")
                     return
@@ -191,7 +186,7 @@ class BaiduVideoCrawler(object):
             print(e)
             return
 
-    def update_cursor(self, account_id):
+    def update_cursor(self, account_id: str) -> None:
         """
         update cursor for each account
         """
@@ -212,16 +207,38 @@ class BaiduVideoCrawler(object):
                 params=(max_cursor, account_id)
             )
 
-    def deal(self):
+    def deal(self) -> None:
         """
         deal
         """
         account_list = self.get_account_list()
+        success_cnt = 0
+        fail_cnt = 0
         for account in account_list:
-            self.crawler_each_account(account)
-            self.update_cursor(account['account_id'])
-
-
-b = BaiduVideoCrawler()
-b.connect_db()
-b.deal()
+            try:
+                self.crawler_each_account(account)
+                self.update_cursor(account['account_id'])
+                success_cnt += 1
+            except Exception as e:
+                fail_cnt += 1
+                log(
+                    task="baidu_video_crawler",
+                    function="deal",
+                    message="crawler each account failed",
+                    data={
+                        "account_id": account['account_id'],
+                        "account_name": account['account_name'],
+                        "error": str(e),
+                        "trace_back": traceback.format_exc()
+                    }
+                )
+        bot(
+            title="baidu video crawler task finished",
+            detail={
+                "success_crawl_account_num": success_cnt,
+                "fail_crawl_account_num": fail_cnt,
+                "success_crawl_video_num": self.success_crawler_video_count,
+                "success_crawl_account_rate": success_cnt / (success_cnt + fail_cnt)
+            },
+            mention=False
+        )

+ 2 - 1
requirements.txt

@@ -20,4 +20,5 @@ protobuf~=3.20.3
 openai~=1.17.0
 oss2~=2.19.1
 fake-useragent~=1.5.1
-playwright~=1.49.1
+playwright~=1.49.1
+volcengine-python-sdk[ark]

+ 26 - 0
sh/run_baidu_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Get the current date in YYYY-MM-DD format
+CURRENT_DATE=$(date +%F)
+
+# Log file path, with the date included
+LOG_FILE="/root/luojunhui/logs/baidu_video_crawler_log_$CURRENT_DATE.txt"
+
+# Redirect all of this script's output to the dated log file
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 run_baidu_video_crawler.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - run_baidu_video_crawler.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart run_baidu_video_crawler.py"
+    # Change to the project directory
+    cd /root/luojunhui/LongArticlesJob
+
+    # Activate the Conda environment
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # Run the Python script in the background and append its output to the log
+    nohup python3 run_baidu_video_crawler.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_baidu_video_crawler.py"
+fi
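Since the script exits immediately when the crawler is already running, it is presumably invoked on a schedule, e.g. via cron (the interval below is an illustrative assumption, not part of this commit):

    */10 * * * * /bin/bash /root/luojunhui/LongArticlesJob/sh/run_baidu_video_crawler.sh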