| 
					
				 | 
			
			
				@@ -4,20 +4,24 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import json 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import traceback 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from typing import List, Dict 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from pymysql.cursors import DictCursor 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from tqdm import tqdm 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from applications import Functions 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from applications import bot, log 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from applications.const import BaiduVideoCrawlerConst 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from applications.db import DatabaseConnector 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from applications.exception import SpiderError 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from config import long_articles_config 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from coldStartTasks.crawler.baidu.baidu_spider import baidu_account_video_crawler 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from coldStartTasks.crawler.baidu.baidu_spider import baidu_single_video_crawler 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+const = BaiduVideoCrawlerConst() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 empty_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 functions = Functions() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-DEFAULT_CURSOR = 17040384000000  # 最早时间为2024-01-01 00:00:00 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 class BaiduVideoCrawler(object): 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -27,54 +31,44 @@ class BaiduVideoCrawler(object): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     def __init__(self): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         self.db = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        self.success_crawler_video_count = 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def connect_db(self): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
    def connect_db(self) -> None:
        """
        Build a DatabaseConnector from long_articles_config, store it on
        self.db and call its connect().
        """
        connector = DatabaseConnector(db_config=long_articles_config)
        # Assign before connecting so self.db is set even if connect() raises.
        self.db = connector
        connector.connect()
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def get_account_list(self): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
    def get_account_list(self) -> List[Dict]:
        """
        Fetch the Baidu accounts to crawl.

        Selects account_id, account_name and max_cursor from
        baidu_account_for_videos for rows whose status equals
        const.BAIDU_ACCOUNT_GOOD_STATUS, passing DictCursor as the cursor type.
        """
        query = f"""
            select account_id, account_name, max_cursor 
            from baidu_account_for_videos
            where status = {const.BAIDU_ACCOUNT_GOOD_STATUS};
        """
        return self.db.fetch(query=query, cursor_type=DictCursor)
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def whether_video_exists(self, video_id, title): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def whether_video_exists(self, title: str) -> bool: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         whether video exists, use video_id && title 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        # first check video_id 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        sql_1 = f""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            select id from publish_single_video_source 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            where url_unique_md5 = '{video_id}'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        count_1 = self.db.fetch(query=sql_1) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        if count_1: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            print(video_id + " video exists") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            return True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # check title 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         sql_2 = f""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             select id from publish_single_video_source 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             where article_title = '{title}'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        count_2 = self.db.fetch(query=sql_2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        if count_2: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        duplicate_id = self.db.fetch(query=sql_2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if duplicate_id: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             print(title + " video exists") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             return True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         return False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def save_each_video(self, video, account_id, account_name): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def save_each_video(self, video: Dict, account_id: str, account_name: str) -> None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         download and save each video 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -83,7 +77,7 @@ class BaiduVideoCrawler(object): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         title = video['title'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # judge whether video exists 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        if self.whether_video_exists(video_id, title): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if self.whether_video_exists(title): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             return 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         read_cnt = video.get('playcnt', 0) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -133,15 +127,16 @@ class BaiduVideoCrawler(object): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                         video_category, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                         json.dumps(manual_tags, ensure_ascii=False) if manual_tags else None, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                         "baidu", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                        0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                        const.NO_SOURCE_ACCOUNT_STATUS 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                self.success_crawler_video_count += 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 print(e) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             print(f"download video failed, video_id: {video_id}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def save_video_list(self, account_id, account_name, video_list): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def save_video_list(self, account_id: str, account_name: str, video_list: List[Dict]) -> None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         save video list 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -153,12 +148,12 @@ class BaiduVideoCrawler(object): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     video_detail = baidu_single_video_crawler(video_id) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     self.save_each_video(video_detail, account_id=account_id, account_name=account_name) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 except SpiderError as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    print(e) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    print("save single video fail", e) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def crawler_each_account(self, account, cursor=None): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def crawler_each_account(self, account: Dict, cursor=None) -> None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         crawler each account 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         response_strategy 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -166,7 +161,7 @@ class BaiduVideoCrawler(object): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         account_id = account['account_id'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         max_cursor = account['max_cursor'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if not max_cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            max_cursor = DEFAULT_CURSOR 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            max_cursor = const.DEFAULT_CURSOR 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         account_name = account['account_name'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             response_json = baidu_account_video_crawler(account_id, cursor=cursor) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -181,7 +176,7 @@ class BaiduVideoCrawler(object): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             # check next page 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             has_next_page = response_json.get("has_more", False) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             if has_next_page: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                next_cursor = response_json.get("ctime", DEFAULT_CURSOR) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                next_cursor = response_json.get("ctime", const.DEFAULT_CURSOR) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 if next_cursor < max_cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     print("No more videos after 2024-01-01") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     return 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -191,7 +186,7 @@ class BaiduVideoCrawler(object): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             print(e) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             return 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def update_cursor(self, account_id): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def update_cursor(self, account_id: str) -> None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         update cursor for each account 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -212,16 +207,38 @@ class BaiduVideoCrawler(object): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 params=(max_cursor, account_id) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def deal(self): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
    def deal(self) -> None:
        """
        Crawl every account returned by get_account_list and send a summary.

        For each account, crawler_each_account and then update_cursor are
        called. An exception from either call is recorded through log()
        (with the account id, name, error text and traceback), counted as a
        failure, and the loop continues with the next account. After the loop
        a summary with the account and video counts is sent through bot().
        """
        account_list = self.get_account_list()
        success_cnt = 0
        fail_cnt = 0
        for account in account_list:
            try:
                self.crawler_each_account(account)
                self.update_cursor(account['account_id'])
                success_cnt += 1
            except Exception as e:
                fail_cnt += 1
                log(
                    task="baidu_video_crawler",
                    function="deal",
                    message="crawler each account failed",
                    data={
                        "account_id": account['account_id'],
                        "account_name": account['account_name'],
                        "error": str(e),
                        "trace_back": traceback.format_exc()
                    }
                )

        # With no accounts to crawl the denominator is zero; report a rate of
        # 0.0 instead of raising ZeroDivisionError before the summary is sent.
        total_cnt = success_cnt + fail_cnt
        success_rate = success_cnt / total_cnt if total_cnt else 0.0
        bot(
            title="baidu video crawler task finished",
            detail={
                "success_crawl_account_num": success_cnt,
                "fail_crawl_account_num": fail_cnt,
                "success_crawl_video_num": self.success_crawler_video_count,
                "success_crawl_account_rate": success_rate
            },
            # NOTE(review): `metion` looks like a misspelling of `mention`;
            # kept as-is because bot()'s signature is not visible here — confirm.
            metion=False
        )
			 |