|
@@ -4,20 +4,24 @@
|
|
|
"""
|
|
|
import json
|
|
|
import time
|
|
|
+import traceback
|
|
|
+from typing import List, Dict
|
|
|
|
|
|
from pymysql.cursors import DictCursor
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
from applications import Functions
|
|
|
+from applications import bot, log
|
|
|
+from applications.const import BaiduVideoCrawlerConst
|
|
|
from applications.db import DatabaseConnector
|
|
|
from applications.exception import SpiderError
|
|
|
from config import long_articles_config
|
|
|
from coldStartTasks.crawler.baidu.baidu_spider import baidu_account_video_crawler
|
|
|
from coldStartTasks.crawler.baidu.baidu_spider import baidu_single_video_crawler
|
|
|
|
|
|
+const = BaiduVideoCrawlerConst()
|
|
|
empty_list = []
|
|
|
functions = Functions()
|
|
|
-DEFAULT_CURSOR = 17040384000000 # 最早时间为2024-01-01 00:00:00
|
|
|
|
|
|
|
|
|
class BaiduVideoCrawler(object):
|
|
@@ -27,54 +31,44 @@ class BaiduVideoCrawler(object):
|
|
|
|
|
|
def __init__(self):
|
|
|
self.db = None
|
|
|
+ self.success_crawler_video_count = 0
|
|
|
|
|
|
- def connect_db(self):
|
|
|
+ def connect_db(self) -> None:
|
|
|
"""
|
|
|
connect db
|
|
|
"""
|
|
|
self.db = DatabaseConnector(db_config=long_articles_config)
|
|
|
self.db.connect()
|
|
|
|
|
|
- def get_account_list(self):
|
|
|
+ def get_account_list(self) -> List[Dict]:
|
|
|
"""
|
|
|
get account list
|
|
|
- status = 1 表示正常抓取的账号
|
|
|
"""
|
|
|
sql = f"""
|
|
|
select account_id, account_name, max_cursor
|
|
|
from baidu_account_for_videos
|
|
|
- where status = 1 and priority = 0;
|
|
|
+ where status = {const.BAIDU_ACCOUNT_GOOD_STATUS};
|
|
|
"""
|
|
|
account_list = self.db.fetch(query=sql, cursor_type=DictCursor)
|
|
|
return account_list
|
|
|
|
|
|
- def whether_video_exists(self, video_id, title):
|
|
|
+ def whether_video_exists(self, title: str) -> bool:
|
|
|
"""
|
|
|
whether video exists, use video_id && title
|
|
|
"""
|
|
|
- # first check video_id
|
|
|
- sql_1 = f"""
|
|
|
- select id from publish_single_video_source
|
|
|
- where url_unique_md5 = '{video_id}';
|
|
|
- """
|
|
|
- count_1 = self.db.fetch(query=sql_1)
|
|
|
- if count_1:
|
|
|
- print(video_id + " video exists")
|
|
|
- return True
|
|
|
-
|
|
|
# check title
|
|
|
sql_2 = f"""
|
|
|
select id from publish_single_video_source
|
|
|
where article_title = '{title}';
|
|
|
"""
|
|
|
- count_2 = self.db.fetch(query=sql_2)
|
|
|
- if count_2:
|
|
|
+ duplicate_id = self.db.fetch(query=sql_2)
|
|
|
+ if duplicate_id:
|
|
|
print(title + " video exists")
|
|
|
return True
|
|
|
|
|
|
return False
|
|
|
|
|
|
- def save_each_video(self, video, account_id, account_name):
|
|
|
+ def save_each_video(self, video: Dict, account_id: str, account_name: str) -> None:
|
|
|
"""
|
|
|
download and save each video
|
|
|
"""
|
|
@@ -83,7 +77,7 @@ class BaiduVideoCrawler(object):
|
|
|
title = video['title']
|
|
|
|
|
|
# judge whether video exists
|
|
|
- if self.whether_video_exists(video_id, title):
|
|
|
+ if self.whether_video_exists(title):
|
|
|
return
|
|
|
|
|
|
read_cnt = video.get('playcnt', 0)
|
|
@@ -133,15 +127,16 @@ class BaiduVideoCrawler(object):
|
|
|
video_category,
|
|
|
json.dumps(manual_tags, ensure_ascii=False) if manual_tags else None,
|
|
|
"baidu",
|
|
|
- 0
|
|
|
+ const.NO_SOURCE_ACCOUNT_STATUS
|
|
|
)
|
|
|
)
|
|
|
+ self.success_crawler_video_count += 1
|
|
|
except Exception as e:
|
|
|
print(e)
|
|
|
else:
|
|
|
print(f"download video failed, video_id: {video_id}")
|
|
|
|
|
|
- def save_video_list(self, account_id, account_name, video_list):
|
|
|
+ def save_video_list(self, account_id: str, account_name: str, video_list: List[Dict]) -> None:
|
|
|
"""
|
|
|
save video list
|
|
|
"""
|
|
@@ -153,12 +148,12 @@ class BaiduVideoCrawler(object):
|
|
|
video_detail = baidu_single_video_crawler(video_id)
|
|
|
self.save_each_video(video_detail, account_id=account_id, account_name=account_name)
|
|
|
except SpiderError as e:
|
|
|
- print(e)
|
|
|
+ print("save single video fail", e)
|
|
|
continue
|
|
|
else:
|
|
|
continue
|
|
|
|
|
|
- def crawler_each_account(self, account, cursor=None):
|
|
|
+ def crawler_each_account(self, account: Dict, cursor=None) -> None:
|
|
|
"""
|
|
|
crawler each account
|
|
|
response_strategy
|
|
@@ -166,7 +161,7 @@ class BaiduVideoCrawler(object):
|
|
|
account_id = account['account_id']
|
|
|
max_cursor = account['max_cursor']
|
|
|
if not max_cursor:
|
|
|
- max_cursor = DEFAULT_CURSOR
|
|
|
+ max_cursor = const.DEFAULT_CURSOR
|
|
|
account_name = account['account_name']
|
|
|
try:
|
|
|
response_json = baidu_account_video_crawler(account_id, cursor=cursor)
|
|
@@ -181,7 +176,7 @@ class BaiduVideoCrawler(object):
|
|
|
# check next page
|
|
|
has_next_page = response_json.get("has_more", False)
|
|
|
if has_next_page:
|
|
|
- next_cursor = response_json.get("ctime", DEFAULT_CURSOR)
|
|
|
+ next_cursor = response_json.get("ctime", const.DEFAULT_CURSOR)
|
|
|
if next_cursor < max_cursor:
|
|
|
print("No more videos after 2024-01-01")
|
|
|
return
|
|
@@ -191,7 +186,7 @@ class BaiduVideoCrawler(object):
|
|
|
print(e)
|
|
|
return
|
|
|
|
|
|
- def update_cursor(self, account_id):
|
|
|
+ def update_cursor(self, account_id: str) -> None:
|
|
|
"""
|
|
|
update cursor for each account
|
|
|
"""
|
|
@@ -212,16 +207,38 @@ class BaiduVideoCrawler(object):
|
|
|
params=(max_cursor, account_id)
|
|
|
)
|
|
|
|
|
|
- def deal(self):
|
|
|
+ def deal(self) -> None:
|
|
|
"""
|
|
|
deal
|
|
|
"""
|
|
|
account_list = self.get_account_list()
|
|
|
+ success_cnt = 0
|
|
|
+ fail_cnt = 0
|
|
|
for account in account_list:
|
|
|
- self.crawler_each_account(account)
|
|
|
- self.update_cursor(account['account_id'])
|
|
|
-
|
|
|
-
|
|
|
-b = BaiduVideoCrawler()
|
|
|
-b.connect_db()
|
|
|
-b.deal()
|
|
|
+ try:
|
|
|
+ self.crawler_each_account(account)
|
|
|
+ self.update_cursor(account['account_id'])
|
|
|
+ success_cnt += 1
|
|
|
+ except Exception as e:
|
|
|
+ fail_cnt += 1
|
|
|
+ log(
|
|
|
+ task="baidu_video_crawler",
|
|
|
+ function="deal",
|
|
|
+ message="crawler each account failed",
|
|
|
+ data={
|
|
|
+ "account_id": account['account_id'],
|
|
|
+ "account_name": account['account_name'],
|
|
|
+ "error": str(e),
|
|
|
+ "trace_back": traceback.format_exc()
|
|
|
+ }
|
|
|
+ )
|
|
|
+ bot(
|
|
|
+ title="baidu video crawler task finished",
|
|
|
+ detail={
|
|
|
+ "success_crawl_account_num": success_cnt,
|
|
|
+ "fail_crawl_account_num": fail_cnt,
|
|
|
+ "success_crawl_video_num": self.success_crawler_video_count,
|
|
|
+                "success_crawl_account_rate": success_cnt / (success_cnt + fail_cnt) if (success_cnt + fail_cnt) else 0
|
|
|
+ },
|
|
|
+        mention=False
|
|
|
+ )
|