|
@@ -0,0 +1,147 @@
|
|
|
+"""
|
|
|
+@author: luojunhui
|
|
|
+@description: crawler toutiao account videos by recommendation
|
|
|
+"""
|
|
|
+
|
|
|
+from __future__ import annotations
|
|
|
+
|
|
|
+import time
|
|
|
+import traceback
|
|
|
+
|
|
|
+from pymysql.cursors import DictCursor
|
|
|
+from tqdm import tqdm
|
|
|
+
|
|
|
+from applications import log
|
|
|
+from applications.const import ToutiaoVideoCrawlerConst
|
|
|
+from applications.db import DatabaseConnector
|
|
|
+from coldStartTasks.crawler.toutiao import get_associated_recommendation
|
|
|
+from config import apolloConfig, long_articles_config
|
|
|
+
|
|
|
# Crawler constants; the class below reads const.TOUTIAO_ACCOUNT_GOOD_STATUS
# when selecting already-known accounts.
const = ToutiaoVideoCrawlerConst()
# Configuration accessor; note that it is instantiated and queried at import time.
config = apolloConfig()
# Cookie value passed to get_associated_recommendation() for every seed video.
cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
|
|
|
+
|
|
|
+
|
|
|
+class CrawlerToutiaoAccounts:
|
|
|
+ """
|
|
|
+ toutiao blogger crawler
|
|
|
+ """
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ self.db_client = DatabaseConnector(db_config=long_articles_config)
|
|
|
+ self.db_client.connect()
|
|
|
+
|
|
|
+ def get_seed_video_with_strategy(self, strategy: str = 'basic'):
|
|
|
+ """
|
|
|
+ 采用策略获取种子视频
|
|
|
+ """
|
|
|
+ match strategy:
|
|
|
+ case 'basic':
|
|
|
+ sql = "select id, article_title, out_account_name, url_unique_md5 from publish_single_video_source where platform = 'toutiao' and audit_status = 1 and bad_status = 0;"
|
|
|
+ seed_video_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
|
|
|
+ return seed_video_list
|
|
|
+ case _:
|
|
|
+ return []
|
|
|
+
|
|
|
+ def get_exist_account_list(self) -> set:
|
|
|
+ """
|
|
|
+ get already exist account list
|
|
|
+ """
|
|
|
+ sql = f"""
|
|
|
+ select account_id
|
|
|
+ from video_meta_accounts
|
|
|
+ where platform = 'toutiao' and status = {const.TOUTIAO_ACCOUNT_GOOD_STATUS};
|
|
|
+ """
|
|
|
+ account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
|
|
|
+ account_id_set = set(
|
|
|
+ [i['account_id'] for i in account_list]
|
|
|
+ )
|
|
|
+ return account_id_set
|
|
|
+
|
|
|
+ def insert_video_into_video_association_table(self, video_info: dict):
|
|
|
+ """
|
|
|
+ insert video into video_association table
|
|
|
+ """
|
|
|
+ select_sql = f"""select id from video_association where recommend_video_id = %s"""
|
|
|
+ video_id = self.db_client.fetch(query=select_sql, params=(video_info['recommend_video_id'],), cursor_type=DictCursor)
|
|
|
+ if video_id:
|
|
|
+ print(f'duplicated video id: {video_id}')
|
|
|
+ return
|
|
|
+
|
|
|
+ sql = f"""
|
|
|
+ insert into video_association
|
|
|
+ (account_name, account_id, recommend_video_id, title, read_cnt, duration, seed_account, seed_title, recommend_date, platform)
|
|
|
+ values
|
|
|
+ (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
|
+ """
|
|
|
+ self.db_client.save(
|
|
|
+ query=sql,
|
|
|
+ params=(
|
|
|
+ video_info['author'],
|
|
|
+ video_info['account_id'],
|
|
|
+ video_info['recommend_video_id'],
|
|
|
+ video_info['title'],
|
|
|
+ video_info['read_cnt'],
|
|
|
+ video_info['duration'],
|
|
|
+ video_info['seed_account'],
|
|
|
+ video_info['seed_title'],
|
|
|
+ video_info['recommend_date'],
|
|
|
+ video_info['platform'],
|
|
|
+ )
|
|
|
+ )
|
|
|
+
|
|
|
+ def get_recommend_video_list(self, seed_video: dict, exist_account_set: set):
|
|
|
+ """
|
|
|
+ group_id: toutiao group id
|
|
|
+ """
|
|
|
+ group_id = seed_video['url_unique_md5']
|
|
|
+ seed_title = seed_video['article_title']
|
|
|
+ seed_account = seed_video['out_account_name']
|
|
|
+ response = get_associated_recommendation(article_id=group_id, cookie=cookie)
|
|
|
+ recommend_video_list = response['data']
|
|
|
+ recommend_video_list_bar = tqdm(recommend_video_list, desc="get recommend video list")
|
|
|
+ for video in recommend_video_list_bar:
|
|
|
+ try:
|
|
|
+ account_info = video["user_info"]
|
|
|
+ author = account_info["name"]
|
|
|
+ account_id = account_info["user_id"]
|
|
|
+ if account_id in exist_account_set:
|
|
|
+ print("exists account:\t", author, )
|
|
|
+ continue
|
|
|
+
|
|
|
+ video_obj = {
|
|
|
+ "author": author,
|
|
|
+ "account_id": account_id,
|
|
|
+ "title": video["title"],
|
|
|
+ "read_cnt": video.get("read_count", 0),
|
|
|
+ "duration": video.get("video_duration", 0),
|
|
|
+ "recommend_video_id": video["group_id"],
|
|
|
+ "seed_account": seed_account,
|
|
|
+ "seed_title": seed_title,
|
|
|
+ "recommend_date": time.strftime("%Y-%m-%d", time.localtime()),
|
|
|
+ "platform": 'toutiao'
|
|
|
+ }
|
|
|
+ self.insert_video_into_video_association_table(video_obj)
|
|
|
+ except Exception as e:
|
|
|
+ print(e)
|
|
|
+
|
|
|
+ def deal(self) -> None:
|
|
|
+ """
|
|
|
+ class entrance
|
|
|
+ """
|
|
|
+ # get exist account id set
|
|
|
+ exist_account_id_set = self.get_exist_account_list()
|
|
|
+
|
|
|
+ # get seed video list
|
|
|
+ seed_video_list = self.get_seed_video_with_strategy()
|
|
|
+ seed_video_list_bar = tqdm(seed_video_list, desc="get recommend video list")
|
|
|
+ for video in seed_video_list_bar:
|
|
|
+ seed_title = video['article_title']
|
|
|
+ seed_video_list_bar.set_postfix({"seed_title": seed_title})
|
|
|
+ try:
|
|
|
+ self.get_recommend_video_list(
|
|
|
+ seed_video=video,
|
|
|
+ exist_account_set=exist_account_id_set
|
|
|
+ )
|
|
|
+ except Exception as e:
|
|
|
+ print(e)
|