luojunhui преди 3 месеца
родител
ревизия
3d3589ba53
променени са 3 файла, в които са добавени 13 реда и са изтрити 7 реда
  1. 1 1
      applications/pipeline/__init__.py
  2. 1 1
      applications/pipeline/crawler_pipeline.py
  3. 11 5
      tasks/crawler_toutiao_account_videos.py

+ 1 - 1
applications/pipeline/__init__.py

@@ -1,4 +1,4 @@
 """
 @author: luojunhui
 """
-from .crawler_pipeline import video_crawler_pipeline
+from .crawler_pipeline import scrape_video_entities_process

+ 1 - 1
applications/pipeline/crawler_pipeline.py

@@ -24,7 +24,7 @@ def whether_duplicate_video_title(video_title, db_client):
     return False
 
 
-def video_crawler_pipeline(video_item, db_client) -> dict:
+def scrape_video_entities_process(video_item, db_client) -> dict:
     """
     video crawler pipeline
     """

+ 11 - 5
tasks/crawler_toutiao_account_videos.py

@@ -5,7 +5,7 @@ import time
 from tqdm import tqdm
 
 from applications.db import DatabaseConnector
-from applications.pipeline import video_crawler_pipeline
+from applications.pipeline import scrape_video_entities_process
 from applications.utils import Item
 from applications.utils import str_to_md5
 from applications.utils import insert_into_single_video_source_table
@@ -74,6 +74,10 @@ class CrawlerToutiaoAccountVideos:
         video_item = Item()
         video_id = video_data['video_id']
         title = video_data['title']
+        media = video_data['video']
+        url = media["download_addr"]['url_list'][0]
+
+        # add info into item
         video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
         video_item.add("url_unique_md5", video_id)
         video_item.add("article_title", title)
@@ -82,17 +86,19 @@ class CrawlerToutiaoAccountVideos:
         video_item.add("publish_timestamp", video_data['publish_time'])
         video_item.add("platform", "toutiao")
         video_item.add("read_cnt", video_data['read_count'])
-        media = video_data['video']
-        url = media["download_addr"]['url_list'][0]
         video_item.add("article_url", url)
         video_item.add("source_account", 0)
+        video_item.add("crawler_timestamp", int(time.time()))
+
+        # check item before insert
         video_item.check(source="video")
         try:
-            item_with_oss_path = video_crawler_pipeline(
+            item_with_oss_path = scrape_video_entities_process(
                 video_item=video_item.item,
                 db_client=self.db_client
             )
-            insert_into_single_video_source_table(self.db_client, item_with_oss_path)
+            if item_with_oss_path:
+                insert_into_single_video_source_table(self.db_client, item_with_oss_path)
         except Exception as e:
             print(e)