""" @author: luojunhui """ import time from tqdm import tqdm from applications.const import ToutiaoVideoCrawlerConst from applications.db import DatabaseConnector from applications.pipeline import scrape_video_entities_process from applications.utils import Item from applications.utils import str_to_md5 from applications.utils import insert_into_single_video_source_table from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list from config import apolloConfig, long_articles_config const = ToutiaoVideoCrawlerConst() config = apolloConfig() cookie = config.getConfigValue("toutiao_blogger_cookie") class CrawlerToutiaoAccountVideos: """ toutiao blogger crawler """ def __init__(self): self.db_client = DatabaseConnector(db_config=long_articles_config) self.db_client.connect() def get_account_list(self): """ get account list """ return def crawler_each_account_video_list(self, account_id, max_behot_time=0): """ get each account video list """ current_cursor = max_behot_time has_more = True while has_more: response = get_toutiao_account_video_list( account_id=account_id, cookie=cookie, max_behot_time=current_cursor ) if response["message"] != "success": print("error") break video_list = response["data"] has_more = response["has_more"] current_cursor = response["next"]["max_behot_time"] if not video_list: break max_timestamp_in_this_group = video_list[0]["publish_time"] if max_timestamp_in_this_group < const.DEFAULT_CURSOR: break crawler_video_list_bar = tqdm(video_list, desc="crawler videos") for video in crawler_video_list_bar: crawler_video_list_bar.set_postfix({"video_id": video["id"]}) self.crawler_each_video(video) if has_more: time.sleep(const.SLEEP_SECOND) else: break def crawler_each_video(self, video_data): """ crawler each video data """ video_item = Item() video_id = video_data["video_id"] title = video_data["title"] media = video_data["video"] url = media["download_addr"]["url_list"][0] # add info into item video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id)))) video_item.add("url_unique_md5", video_id) video_item.add("article_title", title) video_item.add("out_account_id", video_data["user"]["user_id"]) video_item.add("out_account_name", video_data["source"]) video_item.add("publish_timestamp", video_data["publish_time"]) video_item.add("platform", "toutiao") video_item.add("read_cnt", video_data["read_count"]) video_item.add("article_url", url) video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS) video_item.add("crawler_timestamp", int(time.time())) # check item before insert video_item.check(source="video") try: item_with_oss_path = scrape_video_entities_process( video_item=video_item.item, db_client=self.db_client ) if item_with_oss_path: insert_into_single_video_source_table( self.db_client, item_with_oss_path ) except Exception as e: print(e) def deal(self): """ class entrance """ account_id_list = [ "MS4wLjABAAAABIPlAy8EngHf3bXkYFrN935tdJuS9nu3wCdeONe5ZkMxcsQ5AQkxYEcUGqPcA6K7", "MS4wLjABAAAAXqC8gtp2uYlGyO8Bua10GOTqsmi6TPUshTullb1vsSlK1WoRPRW0b1cFmKEpKDyy", "MS4wLjABAAAAUG5on9TNGiGcDAnthjQUz8hs93QU-R37KzAqsCj_IsU", "MS4wLjABAAAAHbUq1p1NodyaVw8nBwdz7su5NIrONIcZ22xLbCRIxUC09s8FeqmQh4tg9MOCLktV", "MS4wLjABAAAAbH-9GMPXTyC9RE-aSpzi0thIrqw-SzbdPz-v7M7YGGQ", "MS4wLjABAAAAq3YelxNuDki2gDu83MEBS7zultxsY8YZ1AWcC1XRugSrFFLOgBZvFeFmNn-h_5Qa", "MS4wLjABAAAA29pf0waQ3QOGd03JpLbYgju5Bg4t1xyIZByY0ijTDYN5Y1aL9LV-DuiSAz7UNfqL", "MS4wLjABAAAAlNBBh2wsAfQIKY6XkVQj6FC9FZonfX8jjsIiVl7xV4c", "MS4wLjABAAAA_5u04HihfTRaYKILhN0ksZqGQXtPqoAS3lMe44oEKFc8NKsVrA6hR-OSN82gw-ue", "MS4wLjABAAAAG5dpmasVG0C2bgr9hNclcKxqm6DPz_1dCOr4fzNT-V0", "MS4wLjABAAAAoUESfCcb-NXbHXJr-A7TszauMxIvjXd0EhULvmVyUhpj-HSs5gsCxrbZFvEcJZzU", "MS4wLjABAAAAlshV8QVXTo4VxSjSHh9B7LpK4_DPKA1vJkbcH8-3Jmq7QohWBHpcphQ2gKAKYe7M", "MS4wLjABAAAAKO4skzt3d35FYb92Vv1lVgzpPz9PdAGsXvqs3WyXILs", "MS4wLjABAAAAp1CP5bxMGYW7fxMZOJKSuSMQeMD7AMw5MyOvP-1xC14", "MS4wLjABAAAAld-tIrZWcmQp9K_IRTI2zcT5GFlzrOH2yj7Cino8xqU", "MS4wLjABAAAAncBYHG1eIO-gSC1FIs8YmGjVTQuN9s9-NBbFs_1pOX0apGmlQd0GroZpb2TpAzVb", "MS4wLjABAAAAqYXDF25BWZBXePfjCISRSmzQRytwOJhBwii9YnzwirYt1MAzdk6kikc6QChcYC9G", "MS4wLjABAAAA_t2pW2XSRFL4P8rV4X3T0hIEnEBxCbLC_cgD3B-Q9mwYorMiNyyoGcmLuyVxnyj1", "MS4wLjABAAAAEU1n5akXZ7Fvd8wkm1BV6pMRI58mgZUPgyQGHBiRKIi4UcoRglDk6xgEgEK8Lk3n", "MS4wLjABAAAAlwoEZD-OROoX_nMoulzBDCnlMqj72GIAB-PO2A3C0GVmYGOnBEH0jhbibVyRUqir", ] for account_id in account_id_list: try: self.crawler_each_account_video_list(account_id) except Exception as e: print(e) continue