luojunhui committed 1 month ago
Parent commit: 514149837b
4 changed files with 99 additions and 36 deletions
  1. applications/const/__init__.py (+19 -0)
  2. coldStartTasks/crawler/baidu/video_crawler.py (+52 -35)
  3. requirements.txt (+2 -1)
  4. sh/run_baidu_video_crawler.sh (+26 -0)

+ 19 - 0
applications/const/__init__.py

@@ -205,3 +205,22 @@ class ArticleCollectorConst:
     ARTICLE_SUCCESS_CODE = 0
     ARTICLE_UNKNOWN_CODE = 10000
 
+
+class BaiduVideoCrawlerConst:
+    """
+    const for baidu video crawler
+    """
+    # account status
+    BAIDU_ACCOUNT_GOOD_STATUS = 1
+    BAIDU_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor, 2024-01-01 00:00:00
+    DEFAULT_CURSOR = 17040384000000
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+
+
+
+
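Note on DEFAULT_CURSOR: the 14-digit value only matches the documented date if the cursor is a Unix timestamp in seconds (Beijing time) scaled by 10^4, which appears to be the resolution of Baidu's ctime cursors; the scale factor is an assumption, not something this commit confirms. A quick sanity check:

    from datetime import datetime, timezone, timedelta

    cursor = 17040384000000
    seconds = cursor / 10_000          # assumption: cursor = unix seconds * 10**4
    beijing = timezone(timedelta(hours=8))
    print(datetime.fromtimestamp(seconds, tz=beijing))  # 2024-01-01 00:00:00+08:00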

+ 52 - 35
coldStartTasks/crawler/baidu/video_crawler.py

@@ -4,20 +4,24 @@
 """
 import json
 import time
+import traceback
+from typing import List, Dict
 
 from pymysql.cursors import DictCursor
 from tqdm import tqdm
 
 from applications import Functions
+from applications import bot, log
+from applications.const import BaiduVideoCrawlerConst
 from applications.db import DatabaseConnector
 from applications.exception import SpiderError
 from config import long_articles_config
 from coldStartTasks.crawler.baidu.baidu_spider import baidu_account_video_crawler
 from coldStartTasks.crawler.baidu.baidu_spider import baidu_single_video_crawler
 
+const = BaiduVideoCrawlerConst()
 empty_list = []
 functions = Functions()
-DEFAULT_CURSOR = 17040384000000  # earliest time: 2024-01-01 00:00:00
 
 
 class BaiduVideoCrawler(object):
@@ -27,54 +31,44 @@ class BaiduVideoCrawler(object):
 
     def __init__(self):
         self.db = None
+        self.success_crawler_video_count = 0
 
-    def connect_db(self):
+    def connect_db(self) -> None:
         """
         connect db
         """
         self.db = DatabaseConnector(db_config=long_articles_config)
         self.db.connect()
 
-    def get_account_list(self):
+    def get_account_list(self) -> List[Dict]:
         """
         get account list
-        status = 1 means the account is crawled normally
         """
         sql = f"""
             select account_id, account_name, max_cursor 
             from baidu_account_for_videos
-            where status = 1 and priority = 0;
+            where status = {const.BAIDU_ACCOUNT_GOOD_STATUS};
         """
         account_list = self.db.fetch(query=sql, cursor_type=DictCursor)
         return account_list
 
-    def whether_video_exists(self, video_id, title):
+    def whether_video_exists(self, title: str) -> bool:
         """
-        whether video exists, use video_id && title
+        whether video exists, check by title only
         """
-        # first check video_id
-        sql_1 = f"""
-            select id from publish_single_video_source
-            where url_unique_md5 = '{video_id}';
-        """
-        count_1 = self.db.fetch(query=sql_1)
-        if count_1:
-            print(video_id + " video exists")
-            return True
-
         # check title
         sql_2 = f"""
             select id from publish_single_video_source
             where article_title = '{title}';
         """
-        count_2 = self.db.fetch(query=sql_2)
-        if count_2:
+        duplicate_id = self.db.fetch(query=sql_2)
+        if duplicate_id:
             print(title + " video exists")
             return True
 
         return False
 
-    def save_each_video(self, video, account_id, account_name):
+    def save_each_video(self, video: Dict, account_id: str, account_name: str) -> None:
         """
         download and save each video
         """
@@ -83,7 +77,7 @@ class BaiduVideoCrawler(object):
         title = video['title']
 
         # judge whether video exists
-        if self.whether_video_exists(video_id, title):
+        if self.whether_video_exists(title):
             return
 
         read_cnt = video.get('playcnt', 0)
@@ -133,15 +127,16 @@ class BaiduVideoCrawler(object):
                         video_category,
                         json.dumps(manual_tags, ensure_ascii=False) if manual_tags else None,
                         "baidu",
-                        0
+                        const.NO_SOURCE_ACCOUNT_STATUS
                     )
                 )
+                self.success_crawler_video_count += 1
             except Exception as e:
                 print(e)
         else:
             print(f"download video failed, video_id: {video_id}")
 
-    def save_video_list(self, account_id, account_name, video_list):
+    def save_video_list(self, account_id: str, account_name: str, video_list: List[Dict]) -> None:
         """
         save video list
         """
@@ -153,12 +148,12 @@ class BaiduVideoCrawler(object):
                     video_detail = baidu_single_video_crawler(video_id)
                     self.save_each_video(video_detail, account_id=account_id, account_name=account_name)
                 except SpiderError as e:
-                    print(e)
+                    print("save single video fail", e)
                     continue
             else:
                 continue
 
-    def crawler_each_account(self, account, cursor=None):
+    def crawler_each_account(self, account: Dict, cursor=None) -> None:
         """
         crawler each account
         response_strategy
@@ -166,7 +161,7 @@ class BaiduVideoCrawler(object):
         account_id = account['account_id']
         max_cursor = account['max_cursor']
         if not max_cursor:
-            max_cursor = DEFAULT_CURSOR
+            max_cursor = const.DEFAULT_CURSOR
         account_name = account['account_name']
         try:
             response_json = baidu_account_video_crawler(account_id, cursor=cursor)
@@ -181,7 +176,7 @@ class BaiduVideoCrawler(object):
             # check next page
             has_next_page = response_json.get("has_more", False)
             if has_next_page:
-                next_cursor = response_json.get("ctime", DEFAULT_CURSOR)
+                next_cursor = response_json.get("ctime", const.DEFAULT_CURSOR)
                 if next_cursor < max_cursor:
                     print("No more videos after 2024-01-01")
                     return
@@ -191,7 +186,7 @@ class BaiduVideoCrawler(object):
             print(e)
             return
 
-    def update_cursor(self, account_id):
+    def update_cursor(self, account_id: str) -> None:
         """
         update cursor for each account
         """
@@ -212,16 +207,38 @@ class BaiduVideoCrawler(object):
                 params=(max_cursor, account_id)
             )
 
-    def deal(self):
+    def deal(self) -> None:
         """
         deal
         """
         account_list = self.get_account_list()
+        success_cnt = 0
+        fail_cnt = 0
         for account in account_list:
-            self.crawler_each_account(account)
-            self.update_cursor(account['account_id'])
-
-
-b = BaiduVideoCrawler()
-b.connect_db()
-b.deal()
+            try:
+                self.crawler_each_account(account)
+                self.update_cursor(account['account_id'])
+                success_cnt += 1
+            except Exception as e:
+                fail_cnt += 1
+                log(
+                    task="baidu_video_crawler",
+                    function="deal",
+                    message="crawler each account failed",
+                    data={
+                        "account_id": account['account_id'],
+                        "account_name": account['account_name'],
+                        "error": str(e),
+                        "trace_back": traceback.format_exc()
+                    }
+                )
+        bot(
+            title="baidu video crawler task finished",
+            detail={
+                "success_crawl_account_num": success_cnt,
+                "fail_crawl_account_num": fail_cnt,
+                "success_crawl_video_num": self.success_crawler_video_count,
+                "success_crawl_account_rate": success_cnt / (success_cnt + fail_cnt)
+            },
+            mention=False
+        )

+ 2 - 1
requirements.txt

@@ -20,4 +20,5 @@ protobuf~=3.20.3
 openai~=1.17.0
 oss2~=2.19.1
 fake-useragent~=1.5.1
-playwright~=1.49.1
+playwright~=1.49.1
+volcengine-python-sdk[ark]

+ 26 - 0
sh/run_baidu_video_crawler.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Get the current date in YYYY-MM-DD format
+CURRENT_DATE=$(date +%F)
+
+# Log file path, with the date included
+LOG_FILE="/root/luojunhui/logs/baidu_video_crawler_log_$CURRENT_DATE.txt"
+
+# Redirect all of this script's output to the dated log file
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 run_baidu_video_crawler.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - run_baidu_video_crawler.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart run_baidu_video_crawler.py"
+    # Change to the project directory
+    cd /root/luojunhui/LongArticlesJob
+
+    # Activate the Conda environment
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # Run the Python script in the background and append its output to the log
+    nohup python3 run_baidu_video_crawler.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted run_baidu_video_crawler.py"
+fi
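Since the script exits immediately when the crawler is already running, it is presumably invoked on a schedule, e.g. via cron (the interval below is an illustrative assumption, not part of this commit):

    */10 * * * * /bin/bash /root/luojunhui/LongArticlesJob/sh/run_baidu_video_crawler.sh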