瀏覽代碼

今日头条-推荐抓取

luojunhui 3 月之前
父節點
當前提交
7c31352fed

+ 2 - 1
coldStartTasks/crawler/toutiao/__init__.py

@@ -1,4 +1,5 @@
 """
 @author: luojunhui
 """
-from .blogger import get_toutiao_account_video_list
+from .blogger import get_toutiao_account_video_list
+from .detail_page_recommendation import get_associated_recommendation

+ 59 - 0
coldStartTasks/crawler/toutiao/detail_page_recommendation.py

@@ -0,0 +1,59 @@
+"""
+@author: luojunhui
+"""
+from __future__ import annotations
+
+import json
+import requests
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from .use_js import call_js_function
+
# shared tenacity retry policy: 3 attempts, exponential backoff bounded to [2s, 30s]
retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
@retry(**retry_desc)
def get_associated_recommendation(article_id, cookie):
    """
    Fetch the "related recommendation" feed shown on a toutiao video detail page.

    :param article_id: toutiao article/video group id keyed into the feed request.
    :param cookie: raw Cookie header string of a toutiao web session.
    :return: parsed JSON response dict on success, or None after the request
        or JSON decoding fails (both cases are logged, not raised).
    """
    # NOTE(review): ms_token is a hard-coded browser session token; it presumably
    # expires — confirm it is rotated elsewhere or move it into config.
    ms_token = "-aYwLj97uyCi3oghPfhz2nXaekLoFR5YnYUBA5SuyQZae_NLllO4zC30-CeVLth0A6Hmm7MuGr4_IN9MjHUn8wkq-UQKXJxoGmIAokpUsPsOLjdQKffe-cGWCiZ6xqgh7XE%3D"
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
    # Inputs for the signing JS: it computes the a_bogus anti-crawler parameter
    # from the query string and user agent.
    query_params = [
        0,
        1,
        14,
        # fix: the format string has exactly two placeholders (group_id, msToken);
        # the original passed ms_token a redundant third time.
        "min_behot_time=0&channel_id=91558184576&category=pc_profile_channel&disable_raw_data=true&client_extra_params=%7B%22playparam%22%3A%22codec_type%3A0%2Cenable_dash%3A1%2Cunwatermark%3A1%22%2C%22group_id%22%3A%22{}%22%7D&aid=24&app_name=toutiao_web&msToken={}".format(
            article_id, ms_token
        ),
        "",
        user_agent,
    ]
    a_bogus = call_js_function(query_params)
    url = f"https://www.toutiao.com/api/pc/list/feed?min_behot_time=0&channel_id=91558184576&category=pc_profile_channel&disable_raw_data=true&client_extra_params=%7B%22playparam%22%3A%22codec_type%3A0%2Cenable_dash%3A1%2Cunwatermark%3A1%22%2C%22group_id%22%3A%22{article_id}%22%7D&aid=24&app_name=toutiao_web&msToken={ms_token}&a_bogus={a_bogus}"
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh',
        'referer': 'https://www.toutiao.com/video/{}/'.format(article_id),
        'user-agent': user_agent,
        'Cookie': cookie
    }
    try:
        response = requests.get(url, headers=headers, proxies=proxy())
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        log(
            task="toutiao account crawler",
            # fix: log the real function name (was copy-pasted from the blogger crawler)
            function="get_associated_recommendation",
            message=f"API请求失败: {e}",
            # fix: the id passed in is an article id, not an account id
            data={"article_id": article_id},
        )
    except json.JSONDecodeError as e:
        log(
            task="toutiao account crawler",
            function="get_associated_recommendation",
            message=f"响应解析失败: {e}",
            data={"article_id": article_id},
        )
    return None

+ 208 - 0
tasks/crawler_toutiao_detail_recommend_videos.py

@@ -0,0 +1,208 @@
+"""
+@author: luojunhui
+"""
+
+from __future__ import annotations
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.const import ToutiaoVideoCrawlerConst
+from applications.db import DatabaseConnector
+from applications.pipeline import scrape_video_entities_process
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import insert_into_single_video_source_table
+from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
+from config import apolloConfig, long_articles_config
+
# module-level singletons shared by the crawler class below
const = ToutiaoVideoCrawlerConst()
config = apolloConfig()
# toutiao web session cookie, maintained in apollo config
cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
+
+
class CrawlerToutiaoDetailRecommendVideos:
    """
    Crawl toutiao videos for the seed accounts stored in `video_meta_accounts`
    and persist them into the single-video source table.

    NOTE(review): despite the class name, the crawl still goes through
    get_toutiao_account_video_list (the blogger feed), not the detail-page
    recommendation endpoint added in this commit — confirm intent.
    """

    def __init__(self):
        # long-lived connection to the long-articles MySQL database
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def get_seed_video_list(self):
        """
        Load the seed accounts to crawl.

        :return: list of {account_id, max_cursor} dict rows for every toutiao
            account whose status equals TOUTIAO_ACCOUNT_GOOD_STATUS.
        """
        sql = f"""
            select account_id, max_cursor
            from video_meta_accounts
            where platform = 'toutiao' and status = {const.TOUTIAO_ACCOUNT_GOOD_STATUS};
        """
        account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
        return account_list

    def crawler_each_account_video_list(
        self, account_id: str, max_cursor: int | None, max_behot_time: int = 0
    ):
        """
        Crawl one account's video feed page by page until the feed is exhausted
        or videos older than the stored cursor are reached.

        :param account_id: toutiao account id
        :param max_cursor: newest publish timestamp already crawled for this
            account; None falls back to DEFAULT_CURSOR (crawl from scratch)
        :param max_behot_time: toutiao paging cursor, 0 means the first page
        """
        has_more = True
        current_cursor = max_behot_time
        max_cursor = max_cursor or const.DEFAULT_CURSOR

        while has_more:
            response = get_toutiao_account_video_list(
                account_id=account_id, cookie=cookie, max_behot_time=current_cursor
            )
            if not response:
                break

            if response["message"] != "success":
                log(
                    task="crawler_toutiao_account_videos",
                    # fix: log the real function name (was the task name repeated)
                    function="crawler_each_account_video_list",
                    message="get response from toutiao failed",
                    data={"account_id": account_id, "response": response},
                )
                break

            video_list = response["data"]
            has_more = response["has_more"]
            current_cursor = response["next"]["max_behot_time"]

            if not video_list:
                break

            # feed is newest-first: once the newest item of a page is older than
            # what we already stored, all later pages are older too — stop here
            max_timestamp_in_this_group = video_list[0]["publish_time"]
            if max_timestamp_in_this_group < max_cursor:
                break

            # crawl each video of this page; one bad item must not abort the page
            crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
            for video in crawler_video_list_bar:
                try:
                    crawler_video_list_bar.set_postfix({"video_id": video["id"]})
                    self.crawler_each_video(video)

                except Exception as e:
                    log(
                        task="crawler_toutiao_account_videos",
                        function="crawler_each_account_video_list",
                        message="crawler each video failed",
                        data={
                            "account_id": account_id,
                            "video_info": video,
                            "error": str(e),
                            "traceback": traceback.format_exc(),
                        },
                    )

            if has_more:
                # throttle between pages to stay under toutiao's rate limits
                time.sleep(const.SLEEP_SECOND)
            else:
                break

    def crawler_each_video(self, video_data):
        """
        Normalize one raw toutiao video payload into an Item, run it through the
        scrape pipeline and insert it into the single-video source table.

        :param video_data: one element of the toutiao feed response "data" list
        """
        video_item = Item()
        video_id = video_data["video_id"]
        title = video_data["title"]
        media = video_data["video"]
        url = media["download_addr"]["url_list"][0]

        # assemble the canonical video item
        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
        video_item.add("url_unique_md5", video_id)
        video_item.add("article_title", title)
        video_item.add("out_account_id", video_data["user"]["user_id"])
        video_item.add("out_account_name", video_data["source"])
        video_item.add("publish_timestamp", video_data["publish_time"])
        video_item.add("platform", const.PLATFORM)
        video_item.add("read_cnt", video_data.get("read_count", 0))
        video_item.add("article_url", url)
        video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
        video_item.add("crawler_timestamp", int(time.time()))

        # validate required fields before entering the etl pipeline
        video_item.check(source="video")
        try:
            item_with_oss_path = scrape_video_entities_process(
                video_item=video_item.item, db_client=self.db_client
            )
            if item_with_oss_path:
                insert_into_single_video_source_table(
                    self.db_client, item_with_oss_path
                )
        except Exception as e:
            log(
                task="crawler_toutiao_account_videos",
                # fix: log the real function name (was the task name repeated)
                function="crawler_each_video",
                message="etl failed",
                data={
                    "video_item": video_item.item,
                    "error": str(e),
                    "traceback": traceback.format_exc(),
                }
            )

    def update_account_max_cursor(self, account_id: str) -> None:
        """
        Advance the account's max_cursor to the newest publish timestamp stored
        for it, so the next run stops at already-crawled videos.

        :param account_id: toutiao account id
        """
        # NOTE(review): account_id is interpolated directly into the SQL; fine
        # while ids come from our own video_meta_accounts table, but prefer a
        # parameterized query if db_client.fetch supports params — confirm.
        select_sql = f"""
            select max(publish_timestamp) as max_cursor 
            from publish_single_video_source 
            where out_account_id = '{account_id}' and platform = '{const.PLATFORM}';
        """
        response_mysql = self.db_client.fetch(query=select_sql)
        max_publish_timestamp = response_mysql[0][0]

        if max_publish_timestamp:
            update_sql = f"""
                update video_meta_accounts
                set max_cursor = %s
                where account_id = %s and platform = %s;
            """
            self.db_client.save(
                query=update_sql,
                params=(max_publish_timestamp, account_id, const.PLATFORM),
            )

    def deal(self) -> None:
        """
        Entry point: crawl every seed account, then persist its new cursor.
        Failures are logged per account and do not stop the remaining accounts.
        """
        # fix: the class defines get_seed_video_list, not get_account_list;
        # the original call raised AttributeError on every run
        account_list = self.get_seed_video_list()
        account_list_bar = tqdm(account_list, desc="crawler toutiao accounts")
        for account in account_list_bar:
            account_id = account["account_id"]
            max_cursor = account["max_cursor"]
            try:
                # crawl each account
                account_list_bar.set_postfix({"account_id": account_id})
                self.crawler_each_account_video_list(
                    account_id=account_id, max_cursor=max_cursor
                )
                self.update_account_max_cursor(account_id)

            except Exception as e:
                # add log and bot
                log(
                    task="crawler_toutiao_account_videos",
                    function="deal",
                    message=account_id,
                    data={
                        "error": str(e),
                        "traceback": traceback.format_exc(),
                    },
                )