luojunhui 3 hónapja
szülő
commit
3d3589ba53

+ 1 - 1
applications/pipeline/__init__.py

@@ -1,4 +1,4 @@
 """
 @author: luojunhui
 """
-from .crawler_pipeline import video_crawler_pipeline
+from .crawler_pipeline import scrape_video_entities_process

+ 1 - 1
applications/pipeline/crawler_pipeline.py

@@ -24,7 +24,7 @@ def whether_duplicate_video_title(video_title, db_client):
     return False
 
 
-def video_crawler_pipeline(video_item, db_client) -> dict:
+def scrape_video_entities_process(video_item, db_client) -> dict:
     """
     video crawler pipeline
     """

+ 11 - 5
tasks/crawler_toutiao_account_videos.py

@@ -5,7 +5,7 @@ import time
 from tqdm import tqdm
 
 from applications.db import DatabaseConnector
-from applications.pipeline import video_crawler_pipeline
+from applications.pipeline import scrape_video_entities_process
 from applications.utils import Item
 from applications.utils import str_to_md5
 from applications.utils import insert_into_single_video_source_table
@@ -74,6 +74,10 @@ class CrawlerToutiaoAccountVideos:
         video_item = Item()
         video_id = video_data['video_id']
         title = video_data['title']
+        media = video_data['video']
+        url = media["download_addr"]['url_list'][0]
+
+        # add info into item
         video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
         video_item.add("url_unique_md5", video_id)
         video_item.add("article_title", title)
@@ -82,17 +86,19 @@ class CrawlerToutiaoAccountVideos:
         video_item.add("publish_timestamp", video_data['publish_time'])
         video_item.add("platform", "toutiao")
         video_item.add("read_cnt", video_data['read_count'])
-        media = video_data['video']
-        url = media["download_addr"]['url_list'][0]
         video_item.add("article_url", url)
         video_item.add("source_account", 0)
+        video_item.add("crawler_timestamp", int(time.time()))
+
+        # check item before insert
         video_item.check(source="video")
         try:
-            item_with_oss_path = video_crawler_pipeline(
+            item_with_oss_path = scrape_video_entities_process(
                 video_item=video_item.item,
                 db_client=self.db_client
             )
-            insert_into_single_video_source_table(self.db_client, item_with_oss_path)
+            if item_with_oss_path:
+                insert_into_single_video_source_table(self.db_client, item_with_oss_path)
         except Exception as e:
             print(e)