"""
@author: luojunhui
"""
- import os
- from applications.utils import download_gzh_video
- from applications.utils import download_toutiao_video
- from applications.utils import upload_to_oss
# Module-level empty dict. It is one shared mutable object, so any code that
# hands it out must make sure the receiver never mutates it.
empty_dict = {}
def whether_duplicate_video_title(video_title, db_client) -> bool:
    """
    Report whether a row with the given title already exists.

    Queries ``publish_single_video_source`` for rows whose ``article_title``
    equals ``video_title``.

    :param video_title: title to look up; sent as a bound query parameter.
    :param db_client: object exposing ``fetch(query=..., params=...)``.
    :return: ``True`` when ``fetch`` returns a truthy result, else ``False``.
    """
    # Plain (non-f) string: the title is bound through ``params``, so nothing
    # is interpolated into the SQL text itself.
    sql = """
        select id from publish_single_video_source
        where article_title = %s;
    """
    return bool(db_client.fetch(query=sql, params=(video_title,)))
- def video_crawler_pipeline(video_item, db_client) -> dict:
- """
- video crawler pipeline
- """
- # whether duplicate video title
- video_title = video_item['article_title']
- if whether_duplicate_video_title(video_title, db_client):
- return empty_dict
- # video title sensitive words filter
- # download video
- article_url = video_item['article_url']
- platform = video_item['platform']
- match platform:
- case "toutiao":
- video_path = download_toutiao_video(article_url)
- case "gzh":
- video_path = download_gzh_video(article_url)
- case "hksp":
- video_path = ''
- case "sph":
- video_path = ''
- case _:
- return empty_dict
- if video_path:
- # upload video to oss
- oss_path = upload_to_oss(video_path)
- video_item['video_oss_path'] = oss_path
- os.remove(video_path)
- return video_item
- else:
- return empty_dict
|