|
@@ -0,0 +1,104 @@
|
|
|
+"""
|
|
|
+@author: luojunhui
|
|
|
+"""
|
|
|
+import time
|
|
|
+from tqdm import tqdm
|
|
|
+
|
|
|
+from applications.db import DatabaseConnector
|
|
|
+from applications.pipeline import video_crawler_pipeline
|
|
|
+from applications.utils import Item
|
|
|
+from applications.utils import str_to_md5
|
|
|
+from applications.utils import insert_into_single_video_source_table
|
|
|
+from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
|
|
|
+from config import apolloConfig, long_articles_config
|
|
|
+
|
|
|
# Module-level Apollo configuration; the blogger cookie fetched here is
# passed to get_toutiao_account_video_list for authenticated feed requests.
config = apolloConfig()
cookie = config.getConfigValue("toutiao_blogger_cookie")
|
|
|
+
|
|
|
+
|
|
|
class CrawlerToutiaoAccountVideos:
    """
    Toutiao blogger crawler.

    Pages through a blogger's video feed and pushes each video through the
    crawler pipeline into the single-video source table.
    """

    # Cutoff publish timestamp (2024-01-01 00:00:00 UTC+8); videos whose
    # newest page item is older than this stop the paging loop.
    MIN_PUBLISH_TIMESTAMP = 1704038400

    def __init__(self):
        # One long-lived DB connection shared by all crawl methods.
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def get_account_list(self):
        """
        get account list

        Placeholder: account discovery is not implemented yet; returns None.
        """
        return

    def crawler_each_account_video_list(self, account_id, max_behot_time=0):
        """
        Page through one account's video feed and crawl every video until the
        feed is exhausted or the items fall before MIN_PUBLISH_TIMESTAMP.

        :param account_id: Toutiao account (user) id to crawl.
        :param max_behot_time: paging cursor to resume from; 0 means "latest".
        """
        current_cursor = max_behot_time
        has_more = True

        while has_more:
            response = get_toutiao_account_video_list(
                account_id=account_id, cookie=cookie, max_behot_time=current_cursor
            )
            if response["message"] != "success":
                # Surface the payload so a failed page is diagnosable instead
                # of a bare "error".
                print("get_toutiao_account_video_list failed: {}".format(response))
                break

            video_list = response["data"]
            has_more = response["has_more"]
            current_cursor = response["next"]["max_behot_time"]

            if not video_list:
                break

            # Feed is newest-first: if the newest item of this page is already
            # older than the cutoff, every remaining page is older too.
            max_timestamp_in_this_group = video_list[0]["publish_time"]
            if max_timestamp_in_this_group < self.MIN_PUBLISH_TIMESTAMP:
                break

            crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
            for video in crawler_video_list_bar:
                crawler_video_list_bar.set_postfix({"video_id": video["id"]})
                self.crawler_each_video(video)

            if has_more:
                # Throttle between pages to avoid being rate-limited.
                time.sleep(3)

    def crawler_each_video(self, video_data):
        """
        Build an Item from one raw feed record, validate it, run it through
        the crawler pipeline and insert it into the single-video source table.

        Failures are best-effort: an exception for one video is reported and
        the caller's loop continues with the next video.

        :param video_data: raw video dict from the Toutiao feed API.
        """
        video_item = Item()
        video_id = video_data["video_id"]
        title = video_data["title"]
        # content_trace_id is derived from the md5 of the video id so the same
        # video always maps to the same trace id.
        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
        video_item.add("url_unique_md5", video_id)
        video_item.add("article_title", title)
        video_item.add("out_account_id", video_data["user"]["user_id"])
        video_item.add("out_account_name", video_data["source"])
        video_item.add("publish_timestamp", video_data["publish_time"])
        video_item.add("platform", "toutiao")
        video_item.add("read_cnt", video_data["read_count"])
        media = video_data["video"]
        url = media["download_addr"]["url_list"][0]
        video_item.add("article_url", url)
        video_item.add("source_account", 0)
        video_item.check(source="video")
        try:
            item_with_oss_path = video_crawler_pipeline(
                video_item=video_item.item,
                db_client=self.db_client,
            )
            insert_into_single_video_source_table(self.db_client, item_with_oss_path)
        except Exception as e:
            # Report which video failed instead of a context-free repr;
            # deliberately non-fatal so the page loop keeps going.
            print("crawl video {} failed: {}".format(video_id, e))

    def deal(
        self,
        account_id="MS4wLjABAAAAXp7v7A9VfXh-Pfo1TwejlJViATS7aqxuLnBHjaEb8tx1nDTLe7jF7KsNAR9RoVWk",
    ):
        """
        class entrance

        :param account_id: account to crawl; defaults to the previously
            hard-coded blogger id for backward compatibility.
        """
        self.crawler_each_account_video_list(account_id)
|