"""
@author: luojunhui
"""

import time

from tqdm import tqdm

from applications.db import DatabaseConnector
from applications.pipeline import scrape_video_entities_process
from applications.utils import Item
from applications.utils import str_to_md5
from applications.utils import insert_into_single_video_source_table
from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
from config import apolloConfig, long_articles_config

# Module-level configuration: the cookie is read once at import time and is
# passed to every list request made by the crawler below.
config = apolloConfig()
cookie = config.getConfigValue("toutiao_blogger_cookie")


class CrawlerToutiaoAccountVideos:
    """
    Crawl the videos published by a Toutiao account and hand each one to the
    video scraping pipeline, then to the single-video source table insert.
    """

    def __init__(self):
        # The connector is built from ``long_articles_config`` and connected
        # immediately; every method below reuses this client.
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def get_account_list(self):
        """
        Placeholder for fetching the accounts to crawl.

        Not implemented: always returns ``None``.
        """
        return None

    def crawler_each_account_video_list(self, account_id, max_behot_time=0):
        """
        Page through one account's video list and crawl every video found.

        :param account_id: account identifier forwarded to
            ``get_toutiao_account_video_list``.
        :param max_behot_time: cursor used for the first request; later
            requests use the cursor returned in ``response['next']``.
        :return: None

        Paging stops when the response message is not ``'success'``, when a
        page is empty, when the first video of a page is older than the
        cutoff, or when the response reports no further pages.
        """
        # Cutoff compared against ``publish_time``; 1704038400 is
        # 2023-12-31 16:00:00 UTC (2024-01-01 00:00:00 at UTC+8).
        # presumably ``publish_time`` is in epoch seconds -- TODO confirm.
        earliest_publish_time = 1704038400
        cursor = max_behot_time

        while True:
            page = get_toutiao_account_video_list(
                account_id=account_id,
                cookie=cookie,
                max_behot_time=cursor,
            )
            if page['message'] != 'success':
                # NOTE(review): the message carries no detail about the failure.
                print("error")
                return

            videos = page['data']
            more_pages = page['has_more']
            cursor = page['next']['max_behot_time']

            if not videos:
                return

            # Only the first entry of the page is checked against the cutoff.
            # NOTE(review): this looks like it relies on pages being ordered
            # newest-first -- confirm against the list API.
            if videos[0]['publish_time'] < earliest_publish_time:
                return

            progress = tqdm(videos, desc="crawler videos")
            for video in progress:
                progress.set_postfix({"video_id": video["id"]})
                self.crawler_each_video(video)

            if not more_pages:
                return

            # Pause between consecutive page requests.
            time.sleep(3)

    def crawler_each_video(self, video_data):
        """
        Build an ``Item`` from one video entry, then scrape and store it.

        :param video_data: one element of the ``data`` list returned by the
            list API; the keys read here are ``video_id``, ``title``,
            ``video``, ``user``, ``source``, ``publish_time`` and
            ``read_count``.
        :return: None

        Exceptions raised by the scrape/insert step are caught and printed.
        Exceptions raised while reading ``video_data`` or while checking the
        item are not caught here and propagate to the caller.
        """
        record = Item()
        vid = video_data['video_id']
        video_title = video_data['title']
        video_media = video_data['video']
        # The first entry of the download address list is used as the URL.
        download_url = video_media["download_addr"]['url_list'][0]

        # add info into item
        record.add("content_trace_id", f"video{str_to_md5(str(vid))}")
        # NOTE(review): despite the field name, the raw video id is stored.
        record.add("url_unique_md5", vid)
        record.add("article_title", video_title)
        record.add("out_account_id", video_data['user']['user_id'])
        record.add("out_account_name", video_data['source'])
        record.add("publish_timestamp", video_data['publish_time'])
        record.add("platform", "toutiao")
        record.add("read_cnt", video_data['read_count'])
        record.add("article_url", download_url)
        record.add("source_account", 0)
        record.add("crawler_timestamp", int(time.time()))

        # check item before insert
        record.check(source="video")

        try:
            stored_item = scrape_video_entities_process(
                video_item=record.item,
                db_client=self.db_client,
            )
            # Insert only when the scrape step returned a truthy item.
            if stored_item:
                insert_into_single_video_source_table(self.db_client, stored_item)
        except Exception as e:
            # Best-effort: a failure on one video is printed, not raised.
            print(e)

    def deal(self):
        """
        Class entrance: crawl the single hard-coded account from the default
        cursor.
        """
        account_id = 'MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi'
        self.crawler_each_account_video_list(account_id)