""" @author: luojunhui @description: crawler toutiao account videos by recommendation """ from __future__ import annotations import time import traceback from pymysql.cursors import DictCursor from tqdm import tqdm from applications import log from applications.const import ToutiaoVideoCrawlerConst from applications.db import DatabaseConnector from coldStartTasks.crawler.toutiao import get_associated_recommendation from config import apolloConfig, long_articles_config const = ToutiaoVideoCrawlerConst() config = apolloConfig() cookie = config.getConfigValue("toutiao_detail_recommend_cookie") class CrawlerToutiaoAccounts: """ toutiao blogger crawler """ def __init__(self): self.db_client = DatabaseConnector(db_config=long_articles_config) self.db_client.connect() def get_seed_video_with_strategy(self, strategy: str = 'basic'): """ 采用策略获取种子视频 """ match strategy: case 'basic': sql = "select id, article_title, out_account_name, url_unique_md5 from publish_single_video_source where platform = 'toutiao' and audit_status = 1 and bad_status = 0;" seed_video_list = self.db_client.fetch(query=sql, cursor_type=DictCursor) return seed_video_list case _: return [] def get_exist_account_list(self) -> set: """ get already exist account list """ sql = f""" select account_id from video_meta_accounts where platform = 'toutiao' and status = {const.TOUTIAO_ACCOUNT_GOOD_STATUS}; """ account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor) account_id_set = set( [i['account_id'] for i in account_list] ) return account_id_set def insert_video_into_video_association_table(self, video_info: dict): """ insert video into video_association table """ select_sql = f"""select id from video_association where recommend_video_id = %s""" video_id = self.db_client.fetch(query=select_sql, params=(video_info['recommend_video_id'],), cursor_type=DictCursor) if video_id: print(f'duplicated video id: {video_id}') return sql = f""" insert into video_association (account_name, account_id, recommend_video_id, title, read_cnt, duration, seed_account, seed_title, recommend_date, platform) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s); """ self.db_client.save( query=sql, params=( video_info['author'], video_info['account_id'], video_info['recommend_video_id'], video_info['title'], video_info['read_cnt'], video_info['duration'], video_info['seed_account'], video_info['seed_title'], video_info['recommend_date'], video_info['platform'], ) ) def get_recommend_video_list(self, seed_video: dict, exist_account_set: set): """ group_id: toutiao group id """ group_id = seed_video['url_unique_md5'] seed_title = seed_video['article_title'] seed_account = seed_video['out_account_name'] response = get_associated_recommendation(article_id=group_id, cookie=cookie) recommend_video_list = response['data'] recommend_video_list_bar = tqdm(recommend_video_list, desc="get recommend video list") for video in recommend_video_list_bar: try: account_info = video["user_info"] author = account_info["name"] account_id = account_info["user_id"] if account_id in exist_account_set: print("exists account:\t", author, ) continue video_obj = { "author": author, "account_id": account_id, "title": video["title"], "read_cnt": video.get("read_count", 0), "duration": video.get("video_duration", 0), "recommend_video_id": video["group_id"], "seed_account": seed_account, "seed_title": seed_title, "recommend_date": time.strftime("%Y-%m-%d", time.localtime()), "platform": 'toutiao' } self.insert_video_into_video_association_table(video_obj) except Exception as e: print(e) def deal(self) -> None: """ class entrance """ # get exist account id set exist_account_id_set = self.get_exist_account_list() # get seed video list seed_video_list = self.get_seed_video_with_strategy() seed_video_list_bar = tqdm(seed_video_list, desc="get recommend video list") for video in seed_video_list_bar: seed_title = video['article_title'] seed_video_list_bar.set_postfix({"seed_title": seed_title}) try: self.get_recommend_video_list( seed_video=video, exist_account_set=exist_account_id_set ) except Exception as e: print(e)