| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 | """@author: luojunhui"""import osimport jsonfrom applications import logfrom applications.utils import download_gzh_videofrom applications.utils import download_toutiao_videofrom applications.utils import upload_to_ossfrom config import apolloConfigmy_config = apolloConfig()empty_dict = {}sensitive_word_list = json.loads(my_config.getConfigValue("sensitive_word_list"))def whether_title_sensitive(title: str) -> bool:    """    title sensitive words filter    """    for word in sensitive_word_list:        if word in title:            return True    return Falsedef whether_duplicate_video_title(video_title: str, db_client) -> bool:    """    whether duplicate video title    """    sql = f"""        select id from publish_single_video_source        where article_title = %s;    """    duplicate_id = db_client.fetch(query=sql, params=(video_title,))    if duplicate_id:        return True    return Falsedef scrape_video_entities_process(video_item, db_client) -> dict:    """    video crawler pipeline    """    article_url = video_item["article_url"]    platform = video_item["platform"]    video_title = video_item["article_title"]    # whether title sensitive    if whether_title_sensitive(video_title):        print("title is sensitive")        return empty_dict    # whether duplicate video title    if whether_duplicate_video_title(video_title, db_client):        print("duplicate video title")        return empty_dict    # download video    match platform:        case "toutiao":            video_path = download_toutiao_video(article_url)        case "gzh":            video_path = download_gzh_video(article_url)        case "hksp":            video_path = ""        case "sph":            video_path = ""        case "sohu":            video_path = download_toutiao_video(article_url)        case _:            return empty_dict    if video_path:        # upload video to oss        oss_path = upload_to_oss(video_path)        video_item["video_oss_path"] = oss_path        os.remove(video_path)        return video_item    else:        return empty_dict
 |