@@ -5,7 +5,7 @@ import time
 from tqdm import tqdm
 from applications.db import DatabaseConnector
-from applications.pipeline import video_crawler_pipeline
+from applications.pipeline import scrape_video_entities_process
 from applications.utils import Item
 from applications.utils import str_to_md5
 from applications.utils import insert_into_single_video_source_table
@@ -74,6 +74,10 @@ class CrawlerToutiaoAccountVideos:
         video_item = Item()
         video_id = video_data['video_id']
         title = video_data['title']
+        media = video_data['video']
+        url = media["download_addr"]['url_list'][0]
+
+        # add info into item
         video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
         video_item.add("url_unique_md5", video_id)
         video_item.add("article_title", title)
@@ -82,17 +86,19 @@ class CrawlerToutiaoAccountVideos:
         video_item.add("publish_timestamp", video_data['publish_time'])
         video_item.add("platform", "toutiao")
         video_item.add("read_cnt", video_data['read_count'])
-        media = video_data['video']
-        url = media["download_addr"]['url_list'][0]
         video_item.add("article_url", url)
         video_item.add("source_account", 0)
+        video_item.add("crawler_timestamp", int(time.time()))
+
+        # check item before insert
         video_item.check(source="video")
         try:
-            item_with_oss_path = video_crawler_pipeline(
+            item_with_oss_path = scrape_video_entities_process(
                 video_item=video_item.item,
                 db_client=self.db_client
             )
-            insert_into_single_video_source_table(self.db_client, item_with_oss_path)
+            if item_with_oss_path:
+                insert_into_single_video_source_table(self.db_client, item_with_oss_path)
         except Exception as e:
            print(e)
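
For reference, a minimal sketch of the method body after this change. The enclosing method name is hypothetical, and the sketch assumes scrape_video_entities_process returns the enriched item on success and a falsy value when the video is rejected, which is what the new guard implies; only names that appear in the diff above are taken as given.

    def crawl_each_video(self, video_data):
        # hypothetical condensed view of the post-change flow
        video_item = Item()
        video_item.add("url_unique_md5", video_data['video_id'])
        video_item.add("article_title", video_data['title'])
        video_item.add("article_url", video_data['video']["download_addr"]['url_list'][0])
        video_item.add("platform", "toutiao")
        video_item.add("crawler_timestamp", int(time.time()))
        video_item.check(source="video")
        try:
            # new pipeline entry point; assumed to return None/{} for rejected items
            item_with_oss_path = scrape_video_entities_process(
                video_item=video_item.item,
                db_client=self.db_client,
            )
            # only persist items the pipeline accepted
            if item_with_oss_path:
                insert_into_single_video_source_table(self.db_client, item_with_oss_path)
        except Exception as e:
            print(e)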