Bladeren bron

头条recommend 开发中

luojunhui 7 maanden geleden
bovenliggende
commit
6de0ce1182
2 gewijzigde bestanden met toevoegingen van 148 en 1 verwijderingen
  1. 1 1
      coldStartTasks/crawler/toutiao/detail_page_recommendation.py
  2. 147 0
      tasks/crawler_toutiao_accounts.py

+ 1 - 1
coldStartTasks/crawler/toutiao/detail_page_recommendation.py

@@ -15,7 +15,7 @@ retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
 
 
 @retry(**retry_desc)
-def get_associated_recommendation(article_id, cookie):
+def get_associated_recommendation(article_id: str, cookie: str):
     """
     toutiao related recommendation
     """

+ 147 - 0
tasks/crawler_toutiao_accounts.py

@@ -0,0 +1,147 @@
+"""
+@author: luojunhui
+@description: crawler toutiao account videos by recommendation
+"""
+
+from __future__ import annotations
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.const import ToutiaoVideoCrawlerConst
+from applications.db import DatabaseConnector
+from coldStartTasks.crawler.toutiao import get_associated_recommendation
+from config import apolloConfig, long_articles_config
+
+# Crawler constants; TOUTIAO_ACCOUNT_GOOD_STATUS is read when querying existing accounts.
+const = ToutiaoVideoCrawlerConst()
+# Configuration accessor, created once when this module is imported.
+config = apolloConfig()
+# Cookie passed to get_associated_recommendation on every request.
+cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
+
+
+class CrawlerToutiaoAccounts:
+    """
+    Discover toutiao accounts through the videos recommended next to seed videos.
+
+    Seed videos are read from ``publish_single_video_source``. Every recommended
+    video whose author is not in the known-account set is stored in
+    ``video_association``.
+    """
+
+    def __init__(self):
+        # The connection is opened eagerly; nothing in this class closes it.
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def get_seed_video_with_strategy(self, strategy: str = 'basic'):
+        """
+        Fetch the seed videos selected by ``strategy``.
+
+        :param strategy: selection strategy; only ``'basic'`` is implemented
+            (toutiao rows with ``audit_status = 1`` and ``bad_status = 0``).
+        :return: whatever ``db_client.fetch`` returns for the query (rows with
+            ``id``, ``article_title``, ``out_account_name``, ``url_unique_md5``),
+            or an empty list for any other strategy.
+        """
+        match strategy:
+            case 'basic':
+                # Adjacent literals concatenate to the same single-line query.
+                sql = (
+                    "select id, article_title, out_account_name, url_unique_md5 "
+                    "from publish_single_video_source "
+                    "where platform = 'toutiao' and audit_status = 1 and bad_status = 0;"
+                )
+                return self.db_client.fetch(query=sql, cursor_type=DictCursor)
+            case _:
+                return []
+
+    def get_exist_account_list(self) -> set:
+        """
+        Return the ``account_id`` values of the toutiao accounts already stored.
+
+        Only rows whose ``status`` equals ``const.TOUTIAO_ACCOUNT_GOOD_STATUS``
+        are included.
+        """
+        sql = f"""
+            select account_id
+            from video_meta_accounts
+            where platform = 'toutiao' and status = {const.TOUTIAO_ACCOUNT_GOOD_STATUS};
+        """
+        account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
+        return {row['account_id'] for row in account_list}
+
+    def insert_video_into_video_association_table(self, video_info: dict):
+        """
+        Insert one recommended video into ``video_association``.
+
+        The insert is skipped when a row with the same ``recommend_video_id``
+        already exists.
+
+        :param video_info: dict with the keys ``author``, ``account_id``,
+            ``recommend_video_id``, ``title``, ``read_cnt``, ``duration``,
+            ``seed_account``, ``seed_title``, ``recommend_date`` and ``platform``.
+        """
+        select_sql = """select id from video_association where recommend_video_id = %s"""
+        existing_rows = self.db_client.fetch(
+            query=select_sql,
+            params=(video_info['recommend_video_id'],),
+            cursor_type=DictCursor
+        )
+        if existing_rows:
+            print(f'duplicated video id: {existing_rows}')
+            return
+
+        # NOTE(review): select-then-insert is not atomic; a unique index on
+        # recommend_video_id would be needed to rule out duplicates.
+        sql = """
+            insert into video_association
+                (account_name, account_id, recommend_video_id, title, read_cnt, duration, seed_account, seed_title, recommend_date, platform)
+            values
+                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+        """
+        self.db_client.save(
+            query=sql,
+            params=(
+                video_info['author'],
+                video_info['account_id'],
+                video_info['recommend_video_id'],
+                video_info['title'],
+                video_info['read_cnt'],
+                video_info['duration'],
+                video_info['seed_account'],
+                video_info['seed_title'],
+                video_info['recommend_date'],
+                video_info['platform'],
+            )
+        )
+
+    def get_recommend_video_list(self, seed_video: dict, exist_account_set: set):
+        """
+        Crawl the videos recommended for one seed video and store the new ones.
+
+        :param seed_video: seed row; ``url_unique_md5`` is sent to the crawler as
+            the article id, ``article_title`` and ``out_account_name`` are saved
+            with every stored video.
+        :param exist_account_set: account ids whose videos are skipped.
+        """
+        group_id = seed_video['url_unique_md5']
+        seed_title = seed_video['article_title']
+        seed_account = seed_video['out_account_name']
+        response = get_associated_recommendation(article_id=group_id, cookie=cookie)
+
+        # A missing or empty payload means there is nothing to store; report it
+        # explicitly instead of failing with an opaque KeyError/TypeError.
+        recommend_video_list = (response or {}).get('data')
+        if not recommend_video_list:
+            print(f"no recommendation returned for group id: {group_id}")
+            return
+
+        # Loop-invariant: one date stamp per seed video.
+        recommend_date = time.strftime("%Y-%m-%d", time.localtime())
+        recommend_video_list_bar = tqdm(recommend_video_list, desc="get recommend video list")
+        for video in recommend_video_list_bar:
+            try:
+                account_info = video["user_info"]
+                author = account_info["name"]
+                account_id = account_info["user_id"]
+                # NOTE(review): user_id comes from the crawler payload while the
+                # set holds database values; confirm both use the same type,
+                # otherwise this membership test never matches.
+                if account_id in exist_account_set:
+                    print("exists account:\t", author, )
+                    continue
+
+                video_obj = {
+                    "author": author,
+                    "account_id": account_id,
+                    "title": video["title"],
+                    "read_cnt": video.get("read_count", 0),
+                    "duration": video.get("video_duration", 0),
+                    "recommend_video_id": video["group_id"],
+                    "seed_account": seed_account,
+                    "seed_title": seed_title,
+                    "recommend_date": recommend_date,
+                    "platform": 'toutiao'
+                }
+                self.insert_video_into_video_association_table(video_obj)
+            except Exception:
+                # Best effort per video: a malformed item or a failed insert must
+                # not stop the remaining ones, but keep the traceback visible.
+                print(traceback.format_exc())
+
+    def deal(self) -> None:
+        """
+        Class entry point: crawl the recommendations of every seed video.
+        """
+        # get exist account id set
+        exist_account_id_set = self.get_exist_account_list()
+
+        # get seed video list
+        seed_video_list = self.get_seed_video_with_strategy()
+        # Label differs from the inner per-seed bar so the two can be told apart.
+        seed_video_list_bar = tqdm(seed_video_list, desc="crawl seed video list")
+        for video in seed_video_list_bar:
+            seed_title = video['article_title']
+            seed_video_list_bar.set_postfix({"seed_title": seed_title})
+            try:
+                self.get_recommend_video_list(
+                    seed_video=video,
+                    exist_account_set=exist_account_id_set
+                )
+            except Exception:
+                # One failing seed (request error, unexpected payload) must not
+                # abort the whole run; print the full traceback for diagnosis.
+                print(traceback.format_exc())