123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- """
- @author: luojunhui
- """
- from __future__ import annotations
- import time
- import traceback
- from pymysql.cursors import DictCursor
- from tqdm import tqdm
- from applications import log
- from applications.const import ToutiaoVideoCrawlerConst
- from applications.db import DatabaseConnector
- from applications.pipeline import scrape_video_entities_process
- from applications.utils import Item
- from applications.utils import str_to_md5
- from applications.utils import insert_into_single_video_source_table
- from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
- from config import apolloConfig, long_articles_config
# Module-level objects, created once when this module is imported.
const = ToutiaoVideoCrawlerConst()
config = apolloConfig()

# Cookie passed to the Toutiao video-list request; read from the config
# service under the key "toutiao_detail_recommend_cookie".
cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
class CrawlerAccounts:
    """
    Base class for the account crawlers in this module.

    Holds the database client (``self.db_client``) used by subclasses.
    """

    def __init__(self):
        """
        Build a ``DatabaseConnector`` from ``long_articles_config`` and connect it.
        """
        connector = DatabaseConnector(db_config=long_articles_config)
        # Expose the client on the instance before connecting, as subclasses
        # access it through ``self.db_client``.
        self.db_client = connector
        connector.connect()
class ChannelAccountCrawler(CrawlerAccounts):
    """
    crawler channel accounts
    strategy:
        1. try to get search keys and titles from database
        2. try to get hot_points from web
        3. use search api to get accounts
    """

    def get_seed_keys(self):
        """
        Read up to 100 rows from ``datastat_sort_strategy`` to use as search keys.

        :return: the result of ``self.db_client.fetch`` for the query, unchanged.
        """
        seed_key_query = "select * from datastat_sort_strategy limit 100;"
        return self.db_client.fetch(seed_key_query)
class ToutiaoAccountCrawler(CrawlerAccounts):
    """
    Toutiao account crawler.

    Reads seed videos from the database and looks up the video list related
    to a seed video through ``get_toutiao_account_video_list``.
    """

    def get_seed_videos(self):
        """
        Fetch seed videos from ``publish_single_video_source``.

        Selects ``article_title`` and ``url_unique_md5`` for rows where
        ``platform = 'toutiao'``, ``video_pool_audit_status = 1`` and
        ``bad_status = 0``, ordered by ``score`` descending, limited to 100.

        :return: the result of ``self.db_client.fetch`` called with
            ``DictCursor``, unchanged.
        """
        # Plain string literal: the query has no placeholders, so the
        # f-prefix was unnecessary (and would invite string-built SQL later).
        fetch_query = """
            select article_title, url_unique_md5
            from publish_single_video_source
            where platform = 'toutiao' and video_pool_audit_status = 1 and bad_status = 0
            order by score desc limit 100;
        """
        return self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)

    def get_recommend_videos(self, seed_video_id: str):
        """
        Get the recommended videos for one seed video.

        :param seed_video_id: id of the seed video, passed through as-is.
        :return: the result of ``get_toutiao_account_video_list`` called with
            the seed video id and the module-level ``cookie``, unchanged.
        """
        return get_toutiao_account_video_list(seed_video_id, cookie)
|