- """
- @author: luojunhui
- """
- import time
- from tqdm import tqdm
- from applications.db import DatabaseConnector
- from applications.pipeline import scrape_video_entities_process
- from applications.utils import Item
- from applications.utils import str_to_md5
- from applications.utils import insert_into_single_video_source_table
- from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
- from config import apolloConfig, long_articles_config
- config = apolloConfig()
- cookie = config.getConfigValue("toutiao_blogger_cookie")


class CrawlerToutiaoAccountVideos:
    """
    Crawler for Toutiao blogger videos.
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def get_account_list(self):
        """
        get account list
        """
        # TODO: fetch the list of accounts to crawl; not implemented yet
        return

    def crawler_each_account_video_list(self, account_id, max_behot_time=0):
        """
        Crawl one account's video list, paging backwards in time until
        posts older than the cutoff are reached.
        """
        # 2024-01-01 00:00:00 (UTC+8); stop once posts are older than this
        min_behot_time = 1704038400
        current_cursor = max_behot_time
        has_more = True

        while has_more:
            response = get_toutiao_account_video_list(
                account_id=account_id, cookie=cookie, max_behot_time=current_cursor
            )
            if response["message"] != "success":
                print(f"get account video list failed: {response}")
                break

            video_list = response["data"]
            has_more = response["has_more"]
            # cursor for the next (older) page
            current_cursor = response["next"]["max_behot_time"]

            if not video_list:
                break

            # items come newest first, so if the newest item on this page is
            # already older than the cutoff, every later page will be too
            max_timestamp_in_this_group = video_list[0]["publish_time"]
            if max_timestamp_in_this_group < min_behot_time:
                break

            crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
            for video in crawler_video_list_bar:
                crawler_video_list_bar.set_postfix({"video_id": video["id"]})
                self.crawler_each_video(video)

            # back off between pages to avoid hammering the API
            if has_more:
                time.sleep(3)

    def crawler_each_video(self, video_data):
        """
        Crawl a single video and persist it through the scrape pipeline.
        """
        video_item = Item()
        video_id = video_data["video_id"]
        title = video_data["title"]
        media = video_data["video"]
        url = media["download_addr"]["url_list"][0]

        # add info into item
        video_item.add("content_trace_id", f"video{str_to_md5(str(video_id))}")
        video_item.add("url_unique_md5", video_id)
        video_item.add("article_title", title)
        video_item.add("out_account_id", video_data["user"]["user_id"])
        video_item.add("out_account_name", video_data["source"])
        video_item.add("publish_timestamp", video_data["publish_time"])
        video_item.add("platform", "toutiao")
        video_item.add("read_cnt", video_data["read_count"])
        video_item.add("article_url", url)
        video_item.add("source_account", 0)
        video_item.add("crawler_timestamp", int(time.time()))

        # check item before insert
        video_item.check(source="video")
        try:
            # run the item through the scrape pipeline; on success it comes
            # back enriched with an OSS path and is written to the DB
            item_with_oss_path = scrape_video_entities_process(
                video_item=video_item.item, db_client=self.db_client
            )
            if item_with_oss_path:
                insert_into_single_video_source_table(
                    self.db_client, item_with_oss_path
                )
        except Exception as e:
            print(f"crawler video {video_id} failed: {e}")
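
    # For reference, a minimal `video_data` payload this method consumes,
    # inferred from the field accesses above (an illustrative sketch with
    # made-up values, not the full Toutiao API schema):
    #
    #     {
    #         "video_id": "7301234567890",
    #         "title": "...",
    #         "video": {"download_addr": {"url_list": ["https://..."]}},
    #         "user": {"user_id": 123456},
    #         "source": "account nickname",
    #         "publish_time": 1704067200,
    #         "read_count": 1000,
    #     }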

    def deal(self):
        """
        class entrance
        """
        # crawl a single hardcoded account until get_account_list is implemented
        account_id = "MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi"
        self.crawler_each_account_video_list(account_id)
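

# A minimal usage sketch (assumption: the original project may invoke deal()
# from a task scheduler rather than running this module directly):
if __name__ == "__main__":
    crawler = CrawlerToutiaoAccountVideos()
    crawler.deal()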