""" @author: luojunhui """ import os import json from applications import log from applications.utils import download_gzh_video from applications.utils import download_toutiao_video from applications.utils import upload_to_oss from config import apolloConfig my_config = apolloConfig() empty_dict = {} sensitive_word_list = json.loads(my_config.getConfigValue("sensitive_word_list")) def whether_title_sensitive(title: str) -> bool: """ title sensitive words filter """ for word in sensitive_word_list: if word in title: return True return False def whether_duplicate_video_title(video_title: str, db_client) -> bool: """ whether duplicate video title """ sql = f""" select id from publish_single_video_source where article_title = %s; """ duplicate_id = db_client.fetch(query=sql, params=(video_title,)) if duplicate_id: return True return False def scrape_video_entities_process(video_item, db_client, oss_path=None) -> dict: """ video crawler pipeline """ article_url = video_item["article_url"] platform = video_item["platform"] video_title = video_item["article_title"] # whether title sensitive if whether_title_sensitive(video_title): return empty_dict # whether duplicate video title if whether_duplicate_video_title(video_title, db_client): return empty_dict if oss_path: video_item["video_oss_path"] = oss_path return video_item # download video match platform: case "toutiao": video_path = download_toutiao_video(article_url) case "gzh": video_path = download_gzh_video(article_url) case "hksp": video_path = "" case "sph": video_path = "" case _: return empty_dict if video_path: # upload video to oss oss_path = upload_to_oss(video_path) video_item["video_oss_path"] = oss_path os.remove(video_path) return video_item else: return empty_dict