"""
@author: luojunhui
"""
- import os
- import json
- from applications import log
- from applications.utils import download_sohu_video
- from applications.utils import download_gzh_video
- from applications.utils import download_toutiao_video
- from applications.utils import upload_to_oss
- from config import apolloConfig
# shared "no result" value returned by the crawler pipeline in this module
empty_dict = {}

# configuration client used to read module-level settings
my_config = apolloConfig()

# the sensitive words are stored in config as a JSON string and decoded once
sensitive_word_list = json.loads(
    my_config.getConfigValue("sensitive_word_list")
)
def whether_title_sensitive(title: str) -> bool:
    """
    Tell whether the title contains any configured sensitive word.

    :param title: title text to scan
    :return: True if at least one entry of ``sensitive_word_list`` is a
        substring of ``title``, otherwise False
    """
    return any(keyword in title for keyword in sensitive_word_list)
def whether_duplicate_video_title(video_title: str, db_client) -> bool:
    """
    Check whether a row with the same title already exists.

    Queries ``publish_single_video_source`` for rows whose ``article_title``
    equals ``video_title``.

    :param video_title: title to look up; sent as a bound query parameter
    :param db_client: object exposing ``fetch(query=..., params=...)``
    :return: True when the lookup result is truthy, otherwise False
    """
    # Plain (non-f) string on purpose: the title is bound through ``params``,
    # nothing is ever interpolated into the SQL text.
    sql = """
        select id from publish_single_video_source
        where article_title = %s;
    """
    return bool(db_client.fetch(query=sql, params=(video_title,)))
- def scrape_video_entities_process(video_item, db_client) -> dict:
- """
- video crawler pipeline
- """
- article_url = video_item["article_url"]
- platform = video_item["platform"]
- video_title = video_item["article_title"]
- # whether title sensitive
- if whether_title_sensitive(video_title):
- return empty_dict
- # whether duplicate video title
- if whether_duplicate_video_title(video_title, db_client):
- return empty_dict
- # download video
- match platform:
- case "toutiao":
- video_path = download_toutiao_video(article_url)
- case "gzh":
- video_path = download_gzh_video(article_url)
- case "hksp":
- video_path = ""
- case "sph":
- video_path = ""
- case "sohu":
- video_path = download_sohu_video(article_url)
- case _:
- return empty_dict
- if video_path:
- # upload video to oss
- oss_path = upload_to_oss(video_path)
- video_item["video_oss_path"] = oss_path
- os.remove(video_path)
- return video_item
- else:
- return empty_dict
|