- """
- @author: 罗俊辉
- @file: toutiao_get_bogus.py
- @time: 2024/1/4
- @desc: 用浏览器工具获取头条参数,并且存储到数据库中
- """
- import json
- from typing import Dict
- from playwright.sync_api import sync_playwright
- from applications.db import DatabaseConnector
- from config import long_articles_config
- class ToutiaoBogus:
- """
- 获取头条请求参数
- """
- def __init__(self):
- """
- 初始化ToutiaoBogus类的实例。
- 该方法创建了一个DatabaseConnector的实例,并调用其connect方法来建立与数据库的连接。
- Attributes:
- db (DatabaseConnector): 用于与数据库进行交互的DatabaseConnector实例。
- """
- # 创建一个DatabaseConnector实例,用于与数据库进行交互
- self.db = DatabaseConnector(db_config=long_articles_config)
- # 调用DatabaseConnector实例的connect方法,建立与数据库的连接
- self.db.connect()
    def on_request(self, request, category):
        """
        Request listener: persist the method, URL, headers and post data of feed requests.
        """
        if "https://www.toutiao.com/api/pc/list/feed?" in request.url:
            insert_sql = """
                INSERT INTO toutiao_request_params
                    (request_method, request_url, request_headers, post_data, category)
                VALUES
                    (%s, %s, %s, %s, %s);
            """
            self.db.save(
                query=insert_sql,
                params=(
                    request.method,
                    request.url,
                    json.dumps(request.headers, ensure_ascii=False),
                    json.dumps(request.post_data, ensure_ascii=False),
                    category
                )
            )
    def crawler_recommend_article_list(self, category_info: Dict):
        """
        Open the category page, click the target category button, and record
        the feed requests that the click triggers.
        """
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=False)
            context = browser.new_context()
            page = context.new_page()
            page.goto(category_info['url'])
            page.wait_for_load_state("networkidle")
            # Listen for request events (captures the feed requests fired by the click below)
            page.on("request", lambda request: self.on_request(request, category_info['category']))
            page.get_by_role("button", name=category_info['name']).click()
            page.wait_for_load_state("networkidle")
            page.wait_for_timeout(5000)
            browser.close()
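

# A minimal usage sketch (not part of the original file), assuming category_info
# carries the 'url', 'category' and 'name' keys used above; the dict values and
# this __main__ entry point are hypothetical illustrations, not the real crawler
# configuration.
if __name__ == "__main__":
    task = ToutiaoBogus()
    demo_category = {
        "url": "https://www.toutiao.com/",  # hypothetical page to open
        "category": "news_hot",             # hypothetical value stored in the category column
        "name": "热点",                      # hypothetical button label to click
    }
    task.crawler_recommend_article_list(demo_category)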