crawler_toutiao_account_videos.py

  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. from tqdm import tqdm
  6. from applications.const import ToutiaoVideoCrawlerConst
  7. from applications.db import DatabaseConnector
  8. from applications.pipeline import scrape_video_entities_process
  9. from applications.utils import Item
  10. from applications.utils import str_to_md5
  11. from applications.utils import insert_into_single_video_source_table
  12. from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
  13. from config import apolloConfig, long_articles_config
  14. const = ToutiaoVideoCrawlerConst()
  15. config = apolloConfig()
  16. cookie = config.getConfigValue("toutiao_blogger_cookie")

class CrawlerToutiaoAccountVideos:
    """
    Toutiao blogger crawler
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def get_account_list(self):
        """
        get account list
        """
        # TODO: not implemented yet; deal() currently uses a hard-coded account id
        return

    def crawler_each_account_video_list(self, account_id, max_behot_time=0):
        """
        fetch one account's video list, paging backwards with max_behot_time as the cursor
        """
        current_cursor = max_behot_time
        has_more = True
        while has_more:
            response = get_toutiao_account_video_list(
                account_id=account_id,
                cookie=cookie,
                max_behot_time=current_cursor,
            )
            if response["message"] != "success":
                print("fetch video list failed for account_id: {}".format(account_id))
                break
            video_list = response["data"]
            has_more = response["has_more"]
            current_cursor = response["next"]["max_behot_time"]
            if not video_list:
                break
            # videos come back newest first, so once the newest item on this
            # page is older than the default cursor, all remaining pages are too
            max_timestamp_in_this_group = video_list[0]["publish_time"]
            if max_timestamp_in_this_group < const.DEFAULT_CURSOR:
                break
            crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
            for video in crawler_video_list_bar:
                crawler_video_list_bar.set_postfix({"video_id": video["id"]})
                self.crawler_each_video(video)
            # throttle between pages
            if has_more:
                time.sleep(const.SLEEP_SECOND)

    def crawler_each_video(self, video_data):
        """
        crawl a single video's metadata and push it through the scrape pipeline
        """
        video_item = Item()
        video_id = video_data["video_id"]
        title = video_data["title"]
        media = video_data["video"]
        url = media["download_addr"]["url_list"][0]
        # add info into item
        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
        video_item.add("url_unique_md5", video_id)
        video_item.add("article_title", title)
        video_item.add("out_account_id", video_data["user"]["user_id"])
        video_item.add("out_account_name", video_data["source"])
        video_item.add("publish_timestamp", video_data["publish_time"])
        video_item.add("platform", "toutiao")
        video_item.add("read_cnt", video_data["read_count"])
        video_item.add("article_url", url)
        video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
        video_item.add("crawler_timestamp", int(time.time()))
        # check item before insert
        video_item.check(source="video")
        try:
            # run the scrape pipeline; it returns the item with an OSS path on success
            item_with_oss_path = scrape_video_entities_process(
                video_item=video_item.item,
                db_client=self.db_client,
            )
            if item_with_oss_path:
                insert_into_single_video_source_table(self.db_client, item_with_oss_path)
        except Exception as e:
            print("failed to process video {}: {}".format(video_id, e))

    def deal(self):
        """
        class entrance
        """
        account_id = "MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi"
        self.crawler_each_account_video_list(account_id)
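

# Minimal entry point so the module can be run directly. This is a sketch
# only; the original file does not define one, and the task may instead be
# launched by an external scheduler.
if __name__ == "__main__":
    task = CrawlerToutiaoAccountVideos()
    task.deal()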