123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- """
- @author: luojunhui
- """
- import os
- import json
- from applications import log
- from applications.utils import download_gzh_video
- from applications.utils import download_toutiao_video
- from applications.utils import upload_to_oss
- from config import apolloConfig
# Single module-level empty dict; it is one shared object, so treat it as read-only.
empty_dict = dict()

# Configuration client provided by the project's `config` module.
my_config = apolloConfig()

# The "sensitive_word_list" config value is stored as JSON text and decoded once
# at import time (presumably a list of strings -- confirm against the config).
sensitive_word_list = json.loads(
    my_config.getConfigValue("sensitive_word_list"),
)
def whether_title_sensitive(title: str) -> bool:
    """
    Sensitive-word filter for titles.

    :param title: title text to inspect
    :return: True if any entry of ``sensitive_word_list`` occurs as a
        substring of ``title``, otherwise False
    """
    return any(banned in title for banned in sensitive_word_list)
def whether_duplicate_video_title(video_title: str, db_client) -> bool:
    """
    Check whether a video with this title is already stored.

    Runs a parameterized lookup on ``publish_single_video_source`` filtered by
    ``article_title``.

    :param video_title: title to look up; sent to the database as a bound
        parameter, never interpolated into the SQL text
    :param db_client: object exposing ``fetch(query=..., params=...)``
    :return: True when the lookup returns a non-empty (truthy) result,
        False otherwise (including a ``None`` or empty result)
    """
    # Plain string on purpose: the only placeholder is the driver-level %s, so
    # an f-string prefix would add nothing and would mis-handle literal braces.
    sql = """
        select id from publish_single_video_source
        where article_title = %s;
    """
    return bool(db_client.fetch(query=sql, params=(video_title,)))
- def scrape_video_entities_process(video_item, db_client, oss_path=None) -> dict:
- """
- video crawler pipeline
- """
- article_url = video_item["article_url"]
- platform = video_item["platform"]
- video_title = video_item["article_title"]
- # whether title sensitive
- if whether_title_sensitive(video_title):
- return empty_dict
- # whether duplicate video title
- if whether_duplicate_video_title(video_title, db_client):
- return empty_dict
- if oss_path:
- video_item["video_oss_path"] = oss_path
- return video_item
- # download video
- match platform:
- case "toutiao":
- video_path = download_toutiao_video(article_url)
- case "gzh":
- video_path = download_gzh_video(article_url)
- case "hksp":
- video_path = ""
- case "sph":
- video_path = ""
- case _:
- return empty_dict
- if video_path:
- # upload video to oss
- oss_path = upload_to_oss(video_path)
- video_item["video_oss_path"] = oss_path
- os.remove(video_path)
- return video_item
- else:
- return empty_dict
|