Quellcode durchsuchen

头条视频测试抓取

luojunhui vor 7 Monaten
Ursprung
Commit
5cd0d6f64c
1 geänderte Datei mit 26 neuen und 6 gelöschten Zeilen
  1. 26 6
      applications/pipeline/crawler_pipeline.py

+ 26 - 6
applications/pipeline/crawler_pipeline.py

@@ -3,25 +3,43 @@
 """
 
 import os
+import json
 
 from applications.utils import download_gzh_video
 from applications.utils import download_toutiao_video
 from applications.utils import upload_to_oss
 
+from config import apolloConfig
+
+my_config = apolloConfig()
+
 empty_dict = {}
+sensitive_word_list = json.loads(my_config.getConfigValue("sensitive_word_list"))
+
+
+def whether_title_sensitive(title: str) -> bool:
+    """
+    title sensitive words filter
+    """
+    for word in sensitive_word_list:
+        if word in title:
+            return True
+
+    return False
 
 
-def whether_duplicate_video_title(video_title, db_client):
+def whether_duplicate_video_title(video_title: str, db_client) -> bool:
     """
     whether duplicate video title
     """
     sql = f"""
-            select id from publish_single_video_source
-            where article_title = %s;
+        select id from publish_single_video_source
+        where article_title = %s;
     """
     duplicate_id = db_client.fetch(query=sql, params=(video_title,))
     if duplicate_id:
         return True
+
     return False
 
 
@@ -29,12 +47,14 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
     """
     video crawler pipeline
     """
-    # whether duplicate video title
     video_title = video_item["article_title"]
-    if whether_duplicate_video_title(video_title, db_client):
+    # whether title sensitive
+    if whether_title_sensitive(video_title):
         return empty_dict
 
-    # video title sensitive words filter
+    # whether duplicate video title
+    if whether_duplicate_video_title(video_title, db_client):
+        return empty_dict
 
     # download video
     article_url = video_item["article_url"]