|
@@ -3,25 +3,43 @@
|
|
"""
|
|
"""
|
|
|
|
|
|
import os
|
|
import os
|
|
|
|
+import json
|
|
|
|
|
|
from applications.utils import download_gzh_video
|
|
from applications.utils import download_gzh_video
|
|
from applications.utils import download_toutiao_video
|
|
from applications.utils import download_toutiao_video
|
|
from applications.utils import upload_to_oss
|
|
from applications.utils import upload_to_oss
|
|
|
|
|
|
|
|
+from config import apolloConfig
|
|
|
|
+
|
|
|
|
# Config client instance; the sensitive-word list below is read through it.
my_config = apolloConfig()

# Shared "nothing to process" result returned when a video item is rejected.
empty_dict = {}

# The config value stored under "sensitive_word_list" is parsed as JSON once,
# at import time.
sensitive_word_list = json.loads(my_config.getConfigValue("sensitive_word_list"))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def whether_title_sensitive(title: str) -> bool:
    """
    Report whether a title contains any configured sensitive word.

    :param title: video title to check.
    :return: True when at least one non-empty entry of the module-level
        ``sensitive_word_list`` occurs as a substring of ``title``;
        False otherwise, including when ``title`` is empty or None.
    """
    # Nothing can match inside an empty/None title; bail out before the scan.
    if not title:
        return False

    # An empty entry is a substring of every string, so skip it rather than
    # let one blank config value mark every title as sensitive.
    return any(word and word in title for word in sensitive_word_list)
|
|
|
|
|
|
|
|
|
|
def whether_duplicate_video_title(video_title: str, db_client) -> bool:
    """
    Report whether a video title is already stored.

    :param video_title: title to look up; passed as a bound query parameter.
    :param db_client: object exposing ``fetch(query=..., params=...)``.
    :return: True when the lookup against ``publish_single_video_source``
        returns a truthy result for this ``article_title``, otherwise False.
    """
    # Plain string literal: the only placeholder is the driver-level %s, which
    # is filled from ``params``, so no f-string interpolation is involved.
    sql = """
        select id from publish_single_video_source
        where article_title = %s;
    """
    duplicate_id = db_client.fetch(query=sql, params=(video_title,))

    return bool(duplicate_id)
|
|
|
|
|
|
|
|
|
|
@@ -29,12 +47,14 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
|
|
"""
|
|
"""
|
|
video crawler pipeline
|
|
video crawler pipeline
|
|
"""
|
|
"""
|
|
- # whether duplicate video title
|
|
|
|
video_title = video_item["article_title"]
|
|
video_title = video_item["article_title"]
|
|
- if whether_duplicate_video_title(video_title, db_client):
|
|
|
|
|
|
+ # whether title sensitive
|
|
|
|
+ if whether_title_sensitive(video_title):
|
|
return empty_dict
|
|
return empty_dict
|
|
|
|
|
|
- # video title sensitive words filter
|
|
|
|
|
|
+ # whether duplicate video title
|
|
|
|
+ if whether_duplicate_video_title(video_title, db_client):
|
|
|
|
+ return empty_dict
|
|
|
|
|
|
# download video
|
|
# download video
|
|
article_url = video_item["article_url"]
|
|
article_url = video_item["article_url"]
|