@@ -2,6 +2,8 @@
 @author: luojunhui
 @description: video crawler
 """
+
+import os
 import json
 import time
 import traceback
@@ -32,6 +34,8 @@ class BaiduVideoCrawler(object):
     def __init__(self):
         self.db = None
         self.success_crawler_video_count = 0
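+        # open the shared MySQL connection once, at construction time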
+        self.connect_db()

     def connect_db(self) -> None:
         """
@@ -57,11 +61,12 @@ class BaiduVideoCrawler(object):
         whether video exists, use video_id && title
         """
         # check title
-        sql_2 = f"""
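+        # bind the title as a query parameter to rule out SQL injection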
+ sql = f"""
|
|
|
select id from publish_single_video_source
|
|
|
- where article_title = '{title}';
|
|
|
+ where article_title = %s;
|
|
|
"""
|
|
|
- duplicate_id = self.db.fetch(query=sql_2)
|
|
|
+ duplicate_id = self.db.fetch(query=sql, params=(title,))
|
|
|
if duplicate_id:
|
|
|
print(title + " video exists")
|
|
|
return True
|
|
@@ -73,32 +78,33 @@ class BaiduVideoCrawler(object):
         download and save each video
         """
         # print(json.dumps(video, ensure_ascii=False, indent=4))
-        video_id = video['id']
-        title = video['title']
+        video_id = video["id"]
+        title = video["title"]

         # judge whether video exists
         if self.whether_video_exists(title):
             return

-        read_cnt = video.get('playcnt', 0)
-        like_cnt = video.get('like_num', 0)
-        publish_timestamp = video['publish_time']
+        read_cnt = video.get("playcnt", 0)
+        like_cnt = video.get("like_num", 0)
+        publish_timestamp = video["publish_time"]
         # duration = video['duration']
-        cover_url = video['poster']
-        video_url = video['playurl']
+        cover_url = video["poster"]
+        video_url = video["playurl"]
         # sensitive_flag = video.get('sensitive_flag')
-        video_more_info = video.get('contentcms_intervene_data')
+        video_more_info = video.get("contentcms_intervene_data")
         if video_more_info:
-            video_category_list = video_more_info.get('category_v2')
+            video_category_list = video_more_info.get("category_v2")
             if video_category_list:
                 video_category = video_category_list[0]
             else:
                 video_category = None
         else:
             video_category = None
-        manual_tags = video.get('manual_tags')
+        manual_tags = video.get("manual_tags")

-        video_path = 'static/{}.mp4'.format(video_id)
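+        # build the local file path from the configured download dir instead of hard-coding static/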
+        video_path = os.path.join(const.LOCAL_PATH_DIR, "{}.mp4".format(video_id))
         download_path = functions.download_baidu_videos(video_url, video_path)
         if download_path:
             oss_path = functions.upload_to_oss(local_video_path=download_path)
@@ -125,28 +131,51 @@ class BaiduVideoCrawler(object):
                         int(time.time()),
                         video_id,
                         video_category,
-                        json.dumps(manual_tags, ensure_ascii=False) if manual_tags else None,
+                        (
+                            json.dumps(manual_tags, ensure_ascii=False)
+                            if manual_tags
+                            else None
+                        ),
                         "baidu",
-                        const.NO_SOURCE_ACCOUNT_STATUS
-                    )
+                        const.NO_SOURCE_ACCOUNT_STATUS,
+                    ),
                 )
                 self.success_crawler_video_count += 1
             except Exception as e:
-                print(e)
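+                # log structured failure context instead of printing the bare exception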
+                log(
+                    task="baidu_video_crawler",
+                    function="save_each_video",
+                    message="save video failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video_id": video_id,
+                        "oss_path": oss_path,
+                    },
+                )
         else:
             print(f"download video failed, video_id: {video_id}")

-    def save_video_list(self, account_id: str, account_name: str, video_list: List[Dict]) -> None:
+    def save_video_list(
+        self, account_id: str, account_name: str, video_list: List[Dict]
+    ) -> None:
         """
         save video list
         """
-        # print(json.dumps(video_list, ensure_ascii=False, indent=4))
-        for video_obj in tqdm(video_list, desc="save video list"):
-            if video_obj['type'] == 'video':
-                video_id = video_obj['content']['vid']
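+        # per-account progress bar; the postfix surfaces each video id as it is saved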
+        progress_bar = tqdm(video_list, desc="crawler account: {}".format(account_name))
+        for video_obj in progress_bar:
+            if video_obj["type"] == "video":
+                video_id = video_obj["content"]["vid"]
                 try:
                     video_detail = baidu_single_video_crawler(video_id)
-                self.save_each_video(video_detail, account_id=account_id, account_name=account_name)
+                    self.save_each_video(
+                        video=video_detail,
+                        account_id=account_id,
+                        account_name=account_name,
+                    )
+                    progress_bar.set_postfix({"videoId": video_id})
                 except SpiderError as e:
                     print("save single video fail", e)
                     continue
@@ -158,11 +187,11 @@ class BaiduVideoCrawler(object):
         crawler each account
         response_strategy
         """
-        account_id = account['account_id']
-        max_cursor = account['max_cursor']
+        account_id = account["account_id"]
+        max_cursor = account["max_cursor"]
         if not max_cursor:
             max_cursor = const.DEFAULT_CURSOR
-        account_name = account['account_name']
+        account_name = account["account_name"]
         try:
             response_json = baidu_account_video_crawler(account_id, cursor=cursor)

@@ -171,7 +200,7 @@ class BaiduVideoCrawler(object):
                 self.save_video_list(
                     account_id=account_id,
                     account_name=account_name,
-                    video_list=video_list
+                    video_list=video_list,
                 )
                 # check next page
                 has_next_page = response_json.get("has_more", False)
@@ -196,16 +225,14 @@ class BaiduVideoCrawler(object):
         response_mysql = self.db.fetch(query=select_sql)
         max_publish_timestamp = response_mysql[0][0]
         if max_publish_timestamp:
-            max_cursor = max_publish_timestamp * 10000
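+            # const.TIMESTAMP_TO_CURSOR names the 10000x timestamp-to-cursor conversion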
+            max_cursor = max_publish_timestamp * const.TIMESTAMP_TO_CURSOR
             update_sql = f"""
                 update baidu_account_for_videos
                 set max_cursor = %s
                 where account_id = %s;
             """
-            self.db.save(
-                query=update_sql,
-                params=(max_cursor, account_id)
-            )
+            self.db.save(query=update_sql, params=(max_cursor, account_id))

     def deal(self) -> None:
         """
@@ -214,10 +241,15 @@ class BaiduVideoCrawler(object):
         account_list = self.get_account_list()
         success_cnt = 0
         fail_cnt = 0
-        for account in account_list:
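+        # progress bar over accounts; the postfix shows which account is being crawled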
+        account_list_process_bar = tqdm(account_list, desc="process account list")
+        for account in account_list_process_bar:
             try:
+                account_list_process_bar.set_postfix(
+                    {"account_name": account["account_name"]}
+                )
                 self.crawler_each_account(account)
-                self.update_cursor(account['account_id'])
+                self.update_cursor(account["account_id"])
                 success_cnt += 1
             except Exception as e:
                 fail_cnt += 1
@@ -226,11 +258,11 @@ class BaiduVideoCrawler(object):
                 function="deal",
                 message="crawler each account failed",
                 data={
-                    "account_id": account['account_id'],
-                    "account_name": account['account_name'],
+                    "account_id": account["account_id"],
+                    "account_name": account["account_name"],
                     "error": str(e),
-                    "trace_back": traceback.format_exc()
-                }
+                    "trace_back": traceback.format_exc(),
+                },
             )
         bot(
             title="baidu video crawler task finished",
@@ -238,7 +270,7 @@ class BaiduVideoCrawler(object):
                 "success_crawl_account_num": success_cnt,
                 "fail_crawl_account_num": fail_cnt,
                 "success_crawl_video_num": self.success_crawler_video_count,
-                "success_crawl_account_rate": success_cnt / (success_cnt + fail_cnt)
+                "success_crawl_account_rate": success_cnt / (success_cnt + fail_cnt),
             },
-            metion=False
-        )
+            metion=False,
+        )