1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- """
- @author: luojunhui
- """
- import os
- from applications.utils import download_gzh_video
- from applications.utils import download_toutiao_video
- from applications.utils import upload_to_oss
# Module-level empty dict.
# NOTE(review): this is one shared, mutable object; any code that hands it
# out should make sure receivers never mutate it.
empty_dict = {}
def whether_duplicate_video_title(video_title, db_client) -> bool:
    """
    Check whether a video with this title has already been recorded.

    Runs a parameterised lookup on ``publish_single_video_source`` that
    matches ``article_title`` against ``video_title``.

    :param video_title: title to look up; passed as a bound query parameter.
    :param db_client: object exposing ``fetch(query=..., params=...)``.
    :return: ``True`` if the lookup result is truthy, otherwise ``False``.
    """
    # Plain string, not an f-string: the title is bound through ``params``
    # and nothing should ever be interpolated into the SQL text itself.
    sql = """
        select id from publish_single_video_source
        where article_title = %s;
    """
    duplicate_id = db_client.fetch(query=sql, params=(video_title,))
    return bool(duplicate_id)
- def scrape_video_entities_process(video_item, db_client) -> dict:
- """
- video crawler pipeline
- """
- # whether duplicate video title
- video_title = video_item["article_title"]
- if whether_duplicate_video_title(video_title, db_client):
- return empty_dict
- # video title sensitive words filter
- # download video
- article_url = video_item["article_url"]
- platform = video_item["platform"]
- match platform:
- case "toutiao":
- video_path = download_toutiao_video(article_url)
- case "gzh":
- video_path = download_gzh_video(article_url)
- case "hksp":
- video_path = ""
- case "sph":
- video_path = ""
- case _:
- return empty_dict
- if video_path:
- # upload video to oss
- oss_path = upload_to_oss(video_path)
- video_item["video_oss_path"] = oss_path
- os.remove(video_path)
- return video_item
- else:
- return empty_dict
|