123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
- """
- @author: luojunhui
@description: crawl Toutiao account videos via associated recommendations
- """
- from __future__ import annotations
- import time
- import traceback
- from pymysql.cursors import DictCursor
- from tqdm import tqdm
- from applications import log
- from applications.const import ToutiaoVideoCrawlerConst
- from applications.db import DatabaseConnector
- from coldStartTasks.crawler.toutiao import get_associated_recommendation
- from config import apolloConfig, long_articles_config
# Shared constants for the Toutiao video crawler (read below for the
# account status value used when querying known accounts).
const = ToutiaoVideoCrawlerConst()

# Apollo configuration client, created once when the module is imported.
config = apolloConfig()

# Cookie passed to the recommendation request; looked up once at import time.
cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
class CrawlerToutiaoAccounts:
    """
    Toutiao blogger crawler.

    For every seed video it requests the associated recommendations, skips
    videos whose author is already in the known-account set, and stores the
    remaining ones in the ``video_association`` table.
    """

    def __init__(self):
        # The database connection is opened as soon as the crawler is created.
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def get_seed_video_with_strategy(self, strategy: str = 'basic'):
        """
        Fetch the seed videos selected by the given strategy.

        :param strategy: name of the selection strategy; only ``'basic'`` is
            implemented.
        :return: whatever ``db_client.fetch`` returns for ``'basic'``; an empty
            list for any other strategy.
        """
        if strategy != 'basic':
            return []
        sql = "select id, article_title, out_account_name, url_unique_md5 from publish_single_video_source where platform = 'toutiao' and audit_status = 1 and bad_status = 0;"
        return self.db_client.fetch(query=sql, cursor_type=DictCursor)

    def get_exist_account_list(self) -> set:
        """
        Return the set of account ids already stored for Toutiao.

        Only rows of ``video_meta_accounts`` whose status equals
        ``const.TOUTIAO_ACCOUNT_GOOD_STATUS`` are included.
        """
        # The status is bound as a query parameter, matching how the
        # duplicate-check query in this class passes its values.
        sql = """
            select account_id
            from video_meta_accounts
            where platform = 'toutiao' and status = %s;
        """
        account_list = self.db_client.fetch(
            query=sql,
            params=(const.TOUTIAO_ACCOUNT_GOOD_STATUS,),
            cursor_type=DictCursor,
        )
        # ``or []`` keeps an empty/None fetch result from raising here.
        return {row['account_id'] for row in account_list or []}

    def insert_video_into_video_association_table(self, video_info: dict):
        """
        Insert one recommended video into ``video_association``.

        Nothing is written when a row with the same ``recommend_video_id``
        already exists.

        :param video_info: record with the keys ``author``, ``account_id``,
            ``recommend_video_id``, ``title``, ``read_cnt``, ``duration``,
            ``seed_account``, ``seed_title``, ``recommend_date``, ``platform``.
        """
        select_sql = """select id from video_association where recommend_video_id = %s"""
        video_id = self.db_client.fetch(
            query=select_sql,
            params=(video_info['recommend_video_id'],),
            cursor_type=DictCursor,
        )
        if video_id:
            print(f'duplicated video id: {video_id}')
            return

        sql = """
            insert into video_association
            (account_name, account_id, recommend_video_id, title, read_cnt, duration, seed_account, seed_title, recommend_date, platform)
            values
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        self.db_client.save(
            query=sql,
            params=(
                video_info['author'],
                video_info['account_id'],
                video_info['recommend_video_id'],
                video_info['title'],
                video_info['read_cnt'],
                video_info['duration'],
                video_info['seed_account'],
                video_info['seed_title'],
                video_info['recommend_date'],
                video_info['platform'],
            )
        )

    @staticmethod
    def _build_video_record(video: dict, seed_account, seed_title, recommend_date: str) -> dict:
        """Map one raw recommended-video item to the record stored in video_association."""
        account_info = video["user_info"]
        return {
            "author": account_info["name"],
            "account_id": account_info["user_id"],
            "title": video["title"],
            # Missing counters default to 0.
            "read_cnt": video.get("read_count", 0),
            "duration": video.get("video_duration", 0),
            "recommend_video_id": video["group_id"],
            "seed_account": seed_account,
            "seed_title": seed_title,
            "recommend_date": recommend_date,
            "platform": 'toutiao',
        }

    def get_recommend_video_list(self, seed_video: dict, exist_account_set: set):
        """
        Fetch the videos recommended for one seed video and store the new ones.

        The seed's ``url_unique_md5`` is sent as the article (group) id of the
        recommendation request.

        :param seed_video: seed row with ``url_unique_md5``, ``article_title``
            and ``out_account_name``.
        :param exist_account_set: account ids to skip.
        """
        group_id = seed_video['url_unique_md5']
        seed_title = seed_video['article_title']
        seed_account = seed_video['out_account_name']

        response = get_associated_recommendation(article_id=group_id, cookie=cookie)
        # An empty response or a missing/empty 'data' field means there is
        # nothing to process for this seed.
        recommend_video_list = response.get('data') if response else None
        if not recommend_video_list:
            print(f'no recommend video for seed: {group_id}')
            return

        # Same date for every video of this seed; computed once per call.
        recommend_date = time.strftime("%Y-%m-%d", time.localtime())

        recommend_video_list_bar = tqdm(recommend_video_list, desc="get recommend video list")
        for video in recommend_video_list_bar:
            try:
                account_info = video["user_info"]
                author = account_info["name"]
                account_id = account_info["user_id"]
                # NOTE(review): exact-match set lookup — assumes the API's
                # user_id and the stored account_id share a type; TODO confirm.
                if account_id in exist_account_set:
                    print("exists account:\t", author, )
                    continue

                video_obj = self._build_video_record(
                    video, seed_account, seed_title, recommend_date
                )
                self.insert_video_into_video_association_table(video_obj)
            except Exception:
                # Best effort: one bad item must not stop the rest, but keep
                # the full traceback so the failure can be diagnosed.
                print(traceback.format_exc())

    def deal(self) -> None:
        """
        Class entrance: process every seed video once.
        """
        # get exist account id set
        exist_account_id_set = self.get_exist_account_list()

        # get seed video list
        seed_video_list = self.get_seed_video_with_strategy() or []
        seed_video_list_bar = tqdm(seed_video_list, desc="get recommend video list")
        for video in seed_video_list_bar:
            seed_title = video['article_title']
            seed_video_list_bar.set_postfix({"seed_title": seed_title})
            try:
                self.get_recommend_video_list(
                    seed_video=video,
                    exist_account_set=exist_account_id_set
                )
            except Exception:
                # A failing seed is reported with its traceback and skipped.
                print(traceback.format_exc())
|