Browse Source

develop toutiao recommend

luojunhui 3 months ago
parent
commit
2e07f2863f

+ 1 - 0
applications/utils/__init__.py

@@ -10,4 +10,5 @@ from .download_video import download_toutiao_video
 from .item import Item
 from .save_to_db import insert_into_single_video_source_table
 from .save_to_db import insert_into_video_meta_accounts_table
+from .save_to_db import insert_into_associated_recommendation_table
 from .upload import upload_to_oss

+ 28 - 0
applications/utils/item.py

@@ -38,6 +38,19 @@ default_account_table_fields = {
 
 }
 
+# Field spec for association items (see Item.check_association_item).
+# Values other than 'Not NULL' are the defaults written into an item when the
+# field is missing; 'Not NULL' appears to mark a required field with no
+# default. Key order is the order in which fields are validated.
+default_association_table_fields = {
+    "account_name": 'Not NULL',
+    "account_id": 'Not NULL',
+    "recommend_video_id": 'Not NULL',
+    "title": 'Not NULL',
+    "read_cnt": 0,
+    "duration": 0,
+    "seed_account": 'Not NULL',
+    "seed_title": 'Not NULL',
+    "recommend_date": 'Not NULL',
+    "platform": 'Not NULL'
+}
+
 
 class Item(object):
     """
@@ -83,6 +96,19 @@ class Item(object):
             else:
                 self.item[key] = default_account_table_fields[key]
 
+    def check_association_item(self):
+        """
+        Validate self.item against default_association_table_fields.
+
+        Fields that already hold a non-None value are left untouched. A
+        missing (or None) field whose spec is 'Not NULL' is required; any
+        other missing field is filled with the default from the spec.
+
+        :raises ValueError: when a required field is missing or None.
+        """
+        for field, default in default_association_table_fields.items():
+            if self.item.get(field) is not None:
+                continue
+            # the spec must come from the association table: the account
+            # table has no entry for fields such as recommend_video_id
+            if default == 'Not NULL':
+                raise ValueError(f"{field} must not be None, please check your association item")
+            self.item[field] = default
+
     def check(self, source):
         """
         check item
@@ -94,3 +120,5 @@ class Item(object):
                 self.check_article_item()
             case "account":
                 self.check_account_item()
+            case "association":
+                self.check_association_item()

+ 54 - 0
applications/utils/save_to_db.py

@@ -84,4 +84,58 @@ def insert_into_video_meta_accounts_table(db_client, account_item):
                 "traceback": traceback.format_exc(),
                 "account_id": account_item["account_id"],
             },
+        )
+
+def insert_into_associated_recommendation_table(db_client, associated_recommendation_item):
+    """
+    Save one associated-recommendation item into the video_association table.
+
+    Nothing is written when a row with the same account_id, platform and
+    recommend_video_id is already present. An exception raised while saving
+    is logged together with the item instead of being propagated.
+
+    :param db_client: client exposing fetch(query, params) and save(query, params)
+    :param associated_recommendation_item: mapping holding the column values
+    """
+    item = associated_recommendation_item
+
+    # skip videos that were stored before
+    dedup_query = """
+        select id from video_association
+        where account_id = %s and platform = %s and recommend_video_id = %s;
+    """
+    existing_rows = db_client.fetch(
+        query=dedup_query,
+        params=(item["account_id"], item["platform"], item["recommend_video_id"]),
+    )
+    if existing_rows:
+        return
+
+    # column order must match the column list in the insert statement below
+    columns = (
+        "account_name",
+        "account_id",
+        "recommend_video_id",
+        "title",
+        "read_cnt",
+        "duration",
+        "seed_account",
+        "seed_title",
+        "recommend_date",
+        "platform",
+    )
+    save_query = """
+        insert into video_association
+            (account_name, account_id, recommend_video_id, title, read_cnt, duration, seed_account, seed_title, recommend_date, platform)
+            values
+            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+    """
+    try:
+        db_client.save(
+            query=save_query,
+            params=tuple(item[column] for column in columns),
+        )
+    except Exception as e:
+        log(
+            task=f"{item['platform']}_recommendation_crawler",
+            function="save_each_recommendation",
+            message="save recommendation failed",
+            data={
+                "error": str(e),
+                "traceback": traceback.format_exc(),
+                "item": item
+            }
         )

+ 2 - 1
coldStartTasks/crawler/toutiao/__init__.py

@@ -1,4 +1,5 @@
 """
 @author: luojunhui
 """
-from .blogger import get_toutiao_account_video_list
+from .blogger import get_toutiao_account_video_list
+from .detail_recommend import get_associated_recommendation

+ 59 - 0
coldStartTasks/crawler/toutiao/detail_recommend.py

@@ -0,0 +1,59 @@
+"""
+@author: luojunhui
+"""
+from __future__ import annotations
+
+import json
+import requests
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from .use_js import call_js_function
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_associated_recommendation(article_id: str, cookie: str):
+    """
+    Request the toutiao feed of videos recommended for one article/video.
+
+    :param article_id: id placed in the group_id query field and in the
+        /video/<id>/ referer
+    :param cookie: value sent as the Cookie request header
+    :return: the decoded JSON response, or None when the request or the
+        JSON decoding fails (the failure is logged)
+    """
+    # NOTE(review): both handlers below swallow the error and return None, so
+    # @retry can only re-run the call if retry_desc retries on a None result
+    # -- confirm against request_retry.
+    ms_token = "-aYwLj97uyCi3oghPfhz2nXaekLoFR5YnYUBA5SuyQZae_NLllO4zC30-CeVLth0A6Hmm7MuGr4_IN9MjHUn8wkq-UQKXJxoGmIAokpUsPsOLjdQKffe-cGWCiZ6xqgh7XE%3D"
+    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
+    # built once so the string handed to call_js_function and the one sent in
+    # the url cannot drift apart
+    query = f"min_behot_time=0&channel_id=91558184576&category=pc_profile_channel&disable_raw_data=true&client_extra_params=%7B%22playparam%22%3A%22codec_type%3A0%2Cenable_dash%3A1%2Cunwatermark%3A1%22%2C%22group_id%22%3A%22{article_id}%22%7D&aid=24&app_name=toutiao_web&msToken={ms_token}"
+    query_params = [
+        0,
+        1,
+        14,
+        query,
+        "",
+        user_agent
+    ]
+    a_bogus = call_js_function(query_params)
+    url = f"https://www.toutiao.com/api/pc/list/feed?{query}&a_bogus={a_bogus}"
+    headers = {
+        'accept': 'application/json, text/plain, */*',
+        'accept-language': 'zh',
+        'referer': f'https://www.toutiao.com/video/{article_id}/',
+        'user-agent': user_agent,
+        'Cookie': cookie
+    }
+    try:
+        # requests never times out by default; bound the wait so a stalled
+        # proxy cannot hang the crawler
+        response = requests.get(url, headers=headers, proxies=proxy(), timeout=30)
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        log(
+            task="toutiao account crawler",
+            function="get_associated_recommendation",
+            message=f"API请求失败: {e}",
+            data={"article_id": article_id},
+        )
+    except json.JSONDecodeError as e:
+        log(
+            task="toutiao account crawler",
+            function="get_associated_recommendation",
+            message=f"响应解析失败: {e}",
+            data={"article_id": article_id},
+        )
+    return None

+ 64 - 13
tasks/crawler_accounts.py

@@ -4,23 +4,22 @@
 
 from __future__ import annotations
 
+import json
 import time
+import datetime
 import traceback
 
 from pymysql.cursors import DictCursor
 from tqdm import tqdm
 
 from applications import log
-from applications.const import ToutiaoVideoCrawlerConst
 from applications.db import DatabaseConnector
-from applications.pipeline import scrape_video_entities_process
+from applications.pipeline import scrape_account_entities_process
 from applications.utils import Item
-from applications.utils import str_to_md5
-from applications.utils import insert_into_single_video_source_table
-from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
+from applications.utils import insert_into_associated_recommendation_table
+from coldStartTasks.crawler.toutiao import get_associated_recommendation
 from config import apolloConfig, long_articles_config
 
-const = ToutiaoVideoCrawlerConst()
 config = apolloConfig()
 cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
 
@@ -51,19 +50,71 @@ class ChannelAccountCrawler(CrawlerAccounts):
 
 
 class ToutiaoAccountCrawler(CrawlerAccounts):
+
     def get_seed_videos(self):
         fetch_query = f"""
-            select article_title, url_unique_md5 
+            select out_account_name, article_title, url_unique_md5 
             from publish_single_video_source
             where platform = 'toutiao' and video_pool_audit_status = 1 and bad_status = 0
             order by score desc limit 100;
         """
-        seed_video_list = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
+        seed_video_list = self.db_client.fetch(
+            query=fetch_query, cursor_type=DictCursor
+        )
         return seed_video_list
 
-    def get_recommend_videos(self, seed_video_id: str):
-        # get recommend videos for each video
-        recommend_video_list = get_toutiao_account_video_list(seed_video_id, cookie)
-        return recommend_video_list
-
+    def process_each_video(self, video, seed_account_name, seed_title):
+        """
+        Turn one recommended video into an association item and store it.
+
+        The item is validated with Item.check(source="association"), passed
+        through scrape_account_entities_process, and written to the database
+        only when that call returns a truthy item.
+
+        :param video: one entry of the recommendation response; must provide
+            user_info (name, user_id), id, title, read_count, video_duration
+        :param seed_account_name: account name of the seed video
+        :param seed_title: title of the seed video
+        """
+        record = Item()
+        author = video["user_info"]
+        record.add("account_name", author["name"])
+        record.add("account_id", author["user_id"])
+        record.add("platform", "toutiao")
+        record.add("recommend_video_id", video["id"])
+        record.add("title", video["title"])
+        record.add("read_cnt", video["read_count"])
+        record.add("duration", video["video_duration"])
+        record.add("seed_account", seed_account_name)
+        record.add("seed_title", seed_title)
+        recommend_date = datetime.datetime.today().strftime("%Y-%m-%d")
+        record.add("recommend_date", recommend_date)
+
+        # fill defaults / reject items with missing required fields
+        record.check(source="association")
+
+        # presumably filters on whether the account is already known --
+        # verify against scrape_account_entities_process
+        checked_item = scrape_account_entities_process(record.item, self.db_client)
+        if checked_item:
+            insert_into_associated_recommendation_table(
+                db_client=self.db_client,
+                associated_recommendation_item=checked_item,
+            )
+
+    def get_recommend_video_list(self, seed_video: dict):
+        """
+        Fetch the recommendations of one seed video and process each of them.
+
+        :param seed_video: row with url_unique_md5, out_account_name and
+            article_title
+        """
 
+        # get recommend videos for each video
+        seed_video_id = seed_video["url_unique_md5"]
+        seed_account_name = seed_video["out_account_name"]
+        seed_title = seed_video["article_title"]
+        recommend_response = get_associated_recommendation(seed_video_id, cookie)
+        # get_associated_recommendation returns None after logging a failed
+        # request; there is nothing to process in that case
+        if recommend_response is None:
+            return
+        recommend_video_list = recommend_response["data"]
+        for video in tqdm(recommend_video_list):
+            self.process_each_video(video, seed_account_name, seed_title)
+
+    def deal(self):
+        """
+        Crawl the associated recommendations of every seed video.
+
+        A failure on one seed video is logged with its traceback and the
+        loop continues with the next seed video.
+        """
+        seed_video_list = self.get_seed_videos()
+        for seed_video in tqdm(seed_video_list, desc="get each video recommendation"):
+            try:
+                self.get_recommend_video_list(seed_video)
+            except Exception as e:
+                # get_seed_videos does not select `platform` (it only returns
+                # toutiao rows), so indexing it here would raise KeyError
+                # inside the handler and abort the whole loop
+                platform = seed_video.get("platform", "toutiao")
+                log(
+                    task="{}_recommendation_crawler".format(platform),
+                    function="save_each_recommendation",
+                    message="save recommendation failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "seed_video": seed_video,
+                    },
+                )