3 månader sedan · 5cd0d6f64c
--- a/applications/pipeline/crawler_pipeline.py
+++ b/applications/pipeline/crawler_pipeline.py
@@ -3,25 +3,43 @@
 
															 """
														
 
															 import os
														
 
															+import json
														
 
															 from applications.utils import download_gzh_video
														
 
															 from applications.utils import download_toutiao_video
														
 
															 from applications.utils import upload_to_oss
														
 
															+from config import apolloConfig
														
 
															+
														
 
															+my_config = apolloConfig()
														
 
															+
														
 
															 empty_dict = {}
														
 
															+sensitive_word_list = json.loads(my_config.getConfigValue("sensitive_word_list"))
														
 
															+
														
 
															+
														
 
															+def whether_title_sensitive(title: str) -> bool:
														
 
															+    """
														
 
															+    title sensitive words filter
														
 
															+    """
														
 
															+    for word in sensitive_word_list:
														
 
															+        if word in title:
														
 
															+            return True
														
 
															+
														
 
															+    return False
														
 
															-def whether_duplicate_video_title(video_title, db_client):
														
 
															+def whether_duplicate_video_title(video_title: str, db_client) -> bool:
														
 
															     """
														
 
															     whether duplicate video title
														
 
															     """
														
 
															     sql = f"""
														
 
															-            select id from publish_single_video_source
														
 
															-            where article_title = %s;
														
 
															+        select id from publish_single_video_source
														
 
															+        where article_title = %s;
														
 
															     """
														
 
															     duplicate_id = db_client.fetch(query=sql, params=(video_title,))
														
 
															     if duplicate_id:
														
 
															         return True
														
 
															+
														
 
															     return False
														
@@ -29,12 +47,14 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
 
															     """
														
 
															     video crawler pipeline
														
 
															     """
														
 
															-    # whether duplicate video title
														
 
															     video_title = video_item["article_title"]
														
 
															-    if whether_duplicate_video_title(video_title, db_client):
														
 
															+    # whether title sensitive
														
 
															+    if whether_title_sensitive(video_title):
														
 
															         return empty_dict
														
 
															-    # video title sensitive words filter
														
 
															+    # whether duplicate video title
														
 
															+    if whether_duplicate_video_title(video_title, db_client):
														
 
															+        return empty_dict
														
 
															     # download video
														
 
															     article_url = video_item["article_url"]