Преглед изворни кода

头条--推荐接口--开发

luojunhui пре 6 месеци
родитељ
комит
48e1ae6d4c

+ 66 - 0
coldStartTasks/crawler/toutiao_get_bogus.py

@@ -0,0 +1,66 @@
+"""
+@author: 罗俊辉
+@file: toutiao_get_bogus.py
+@time: 2024/1/4
+@desc: 用浏览器工具获取头条参数,并且存储到数据库中
+"""
+
+import json
+from typing import Dict
+from playwright.sync_api import sync_playwright
+
+from applications.db import DatabaseConnector
+from config import long_articles_config
+
+
+class ToutiaoBogus:
+    """Capture Toutiao feed-request parameters in a browser and store them in the database."""
+
+    def __init__(self):
+        """Create the DatabaseConnector as ``self.db`` and open the database connection."""
+        self.db = DatabaseConnector(db_config=long_articles_config)
+        self.db.connect()
+
+    def on_request(self, request, category):
+        """
+        Save a feed API request to toutiao_request_params; other requests are ignored.
+        :param request: object exposing method, url, headers and post_data
+        :param category: value written to the category column
+        """
+        if "https://www.toutiao.com/api/pc/list/feed?" not in request.url:
+            return
+        # Plain literal (nothing to interpolate); values are handed to db.save as params.
+        insert_sql = """
+            INSERT INTO toutiao_request_params
+            (request_method, request_url, request_headers, post_data, category)
+            VALUES
+            (%s, %s, %s, %s, %s);
+        """
+        self.db.save(
+            query=insert_sql,
+            params=(
+                request.method,
+                request.url,
+                json.dumps(request.headers, ensure_ascii=False),
+                json.dumps(request.post_data, ensure_ascii=False),
+                category,
+            ),
+        )
+
+    def crawler_recommend_article_list(self, category_info: Dict):
+        """
+        Open the category page, click its button and record the feed requests that follow.
+        :param category_info: dict with the 'url', 'name' and 'category' keys
+        """
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=False)
+            context = browser.new_context()
+            page = context.new_page()
+            page.goto(category_info['url'])
+            page.wait_for_load_state("networkidle")
+            # Listener is attached after the initial load, so only later requests are saved.
+            page.on("request", lambda request: self.on_request(request, category_info['category']))
+            page.get_by_role("button", name=category_info['name']).click()
+            page.wait_for_load_state("networkidle")
+            page.wait_for_timeout(5000)
+            browser.close()

+ 52 - 52
coldStartTasks/crawler/toutiao_recommend_crawler.py

@@ -8,12 +8,15 @@ from typing import Dict
 
 import requests
 from tqdm import tqdm
+from pymysql.cursors import DictCursor
 
 from applications import bot
 from applications import log
+from applications import Functions
 from applications.db import DatabaseConnector
 from config import long_articles_config
 
+functions = Functions()
 
 class ToutiaoRecommendCrawler(object):
     """
@@ -44,36 +47,36 @@ class ToutiaoRecommendCrawler(object):
         获取历史推荐流文章
         :return:
         """
-        url = 'https://www.toutiao.com/api/pc/list/feed'
-        params = {
-            'channel_id': '3189398965',
-            'min_behot_time': '0',
-            'offset': '0',
-            'refresh_count': '1',
-            'category': 'pc_profile_channel',
-            'client_extra_params': '{"short_video_item":"filter"}',
-            'aid': '24',
-            'app_name': 'toutiao_web',
-            'msToken': '_rhOZjdccInxERSE5rot9jsH_4FDZLRNYC9HVypTRtZ2IEb0wHQCtxjLEjXSoDM4oUNW2EbhPJqomKQvt8_Jg503jGiFSl2hmP3neRKfE9uBanlfhoD1yQ==',
-            'a_bogus': 'xyRh/mL6DkdNXfyI55QLfY3qV4P3YkLG0t9bMDhqTVfSty39HMPd9exEuvhvMy8jxs/gIegjy4hbY3/DrQAJMpyUHuXLUdQ2mymsKl5Q59gCs1feejuQnU4Nmkt-tec25JZ4EKi8o7/aSYuDl2Be-wnAP6ZCcHhMHjD8CpMpvn6lErm=',
-        }
-        headers = {
-            'Accept': 'application/json, text/plain, */*',
-            'Accept-Language': 'zh,zh-CN;q=0.9',
-            'Cookie': '__ac_signature=_02B4Z6wo00f01uL1Y5QAAIDDwi9p-RyULmbi1WcAAN8B5b; ttcid=a7499fc4f17243e1a6f1d47fc054799e16; _ga=GA1.1.1771235425.1716434457; csrftoken=ee756af695a449eeb73b5a3fc78978b2; _S_IPAD=0; notRedShot=1; tt_webid=7371293454351697471; s_v_web_id=verify_m559znfi_rrsyYOnm_GvTT_4r4T_8FNl_VV6uBCq85ctv; __feed_out_channel_key=history; _S_DPR=2.200000047683716; gfkadpd=24,6457; _S_WIN_WH=1554_860; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1735808360%7Cc9062afa201717f3b20ed1ac1b940d22da7b5dd01e201c26a97350f99c2683ee; tt_scid=i1kVV8f6ncNAV18LRQKFsVz-XuURdfA2uP9P2oTf.IBYozNpDFa5qspcCeSQNAQ5df75; _ga_QEHZPBE5HH=GS1.1.1735807708.20.1.1735809077.0.0.0',
-            'Priority': 'u=1, i',
-            'Referer': 'https://www.toutiao.com/',
-            'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
-            'sec-ch-ua-mobile': '?0',
-            'sec-ch-ua-platform': '"macOS"',
-            'sec-fetch-dest': 'empty',
-            'sec-fetch-mode': 'cors',
-            'sec-fetch-site': 'same-origin',
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
-        }
-        response = requests.get(url, params=params, headers=headers)
+        select_sql = f"""
+            SELECT request_method, request_url, request_headers, post_data
+            FROM toutiao_request_params
+            WHERE category = 'history' and expire_flag = 0
+            ORDER BY id
+            LIMIT 1;
+        """
+        result = self.db_client.fetch(
+            query=select_sql,
+            cursor_type=DictCursor
+        )
+        if not result:
+            print("cookie没了报警")
+            return {}
+        cookie_obj = result[0]
+        response = requests.request(
+            method=cookie_obj['request_method'],
+            url=cookie_obj['request_url'],
+            headers=json.loads(cookie_obj['request_headers']),
+            proxies=functions.proxy()
+        )
         return response.json()
 
+    def get_tech_recommendation(self) -> Dict:
+        """
+        Fetch articles from the tech recommendation feed.
+        :return: placeholder for now; the method currently yields None
+        """
+        return None
+
     def insert_each_article(self, item: Dict) -> Dict:
         """
         提取文章信息
@@ -86,7 +89,6 @@ class ToutiaoRecommendCrawler(object):
         read_count = item['read_count']
         title = item['title']
         user_info = item['user_info']
-        user_name = user_info.get('name')
         user_id = user_info.get('user_id')
         abstract = item['Abstract']
         publish_time = item['publish_time']
@@ -138,26 +140,24 @@ class ToutiaoRecommendCrawler(object):
                             data=error_data,
                             status='fail'
                         )
-
-
-def main():
-    """
-    主函数
-    :return:
-    """
-    toutiao_recommend_crawler = ToutiaoRecommendCrawler()
-    toutiao_recommend_crawler.init_database()
-    for i in range(10):
-        try:
-            article_list = toutiao_recommend_crawler.get_history_recommendation()
-        except Exception as e:
-            error_data = {
-                "error": str(e),
-                "error_stack": traceback.format_exc()
-            }
-            print(error_data)
-            continue
-        toutiao_recommend_crawler.process_recommendation(article_list)
-
-
-main()
+                else:
+                    print("视频文章跳过")
+            else:
+                print("无链接文章跳过")
+    
+    def run(self) -> None:
+        """
+        Entry point: run ten fetch-and-store rounds over the history feed.
+        A round that raises is printed and skipped; the 3 second pause only
+        follows a round that finished.
+        """
+        for _ in range(10):
+            try:
+                feed = self.get_history_recommendation()
+                self.process_recommendation(feed)
+                time.sleep(3)
+            except Exception as exc:
+                print({
+                    "error": str(exc),
+                    "error_stack": traceback.format_exc()
+                })