""" @author: luojunhui """ import os from applications.utils import download_gzh_video from applications.utils import download_toutiao_video from applications.utils import upload_to_oss empty_dict = {} def whether_duplicate_video_title(video_title, db_client): """ whether duplicate video title """ sql = f""" select id from publish_single_video_source where article_title = %s; """ duplicate_id = db_client.fetch(query=sql, params=(video_title,)) if duplicate_id: return True return False def video_crawler_pipeline(video_item, db_client) -> dict: """ video crawler pipeline """ # whether duplicate video title video_title = video_item['article_title'] if whether_duplicate_video_title(video_title, db_client): return empty_dict # video title sensitive words filter # download video article_url = video_item['article_url'] platform = video_item['platform'] match platform: case "toutiao": video_path = download_toutiao_video(article_url) case "gzh": video_path = download_gzh_video(article_url) case "hksp": video_path = '' case "sph": video_path = '' case _: return empty_dict if video_path: # upload video to oss oss_path = upload_to_oss(video_path) video_item['video_oss_path'] = oss_path os.remove(video_path) return video_item else: return empty_dict