""" @author: luojunhui """ from __future__ import annotations import time import traceback from pymysql.cursors import DictCursor from tqdm import tqdm from applications import log from applications.const import ToutiaoVideoCrawlerConst from applications.db import DatabaseConnector from applications.pipeline import scrape_video_entities_process from applications.utils import Item from applications.utils import str_to_md5 from applications.utils import insert_into_single_video_source_table from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list from config import apolloConfig, long_articles_config const = ToutiaoVideoCrawlerConst() config = apolloConfig() cookie = config.getConfigValue("toutiao_detail_recommend_cookie") class CrawlerAccounts: def __init__(self): self.db_client = DatabaseConnector(db_config=long_articles_config) self.db_client.connect() class ChannelAccountCrawler(CrawlerAccounts): """ crawler channel accounts strategy: 1. try to get search keys and titles from database 2. try to get hot_points from web 2. use search api to get accounts """ def get_seed_keys(self): """ get search keys from database """ sql = "select * from datastat_sort_strategy limit 100;" result = self.db_client.fetch(sql) return result class ToutiaoAccountCrawler(CrawlerAccounts): def get_seed_videos(self): fetch_query = f""" select article_title, url_unique_md5 from publish_single_video_source where platform = 'toutiao' and video_pool_audit_status = 1 and bad_status = 0 order by score desc limit 100; """ seed_video_list = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor) return seed_video_list def get_recommend_videos(self, seed_video_id: str): # get recommend videos for each video recommend_video_list = get_toutiao_account_video_list(seed_video_id, cookie) return recommend_video_list