toutiao_get_bogus.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. """
  2. @author: 罗俊辉
  3. @file: toutiao_get_bogus.py
  4. @time: 2024/1/4
  5. @desc: 用浏览器工具获取头条参数,并且存储到数据库中
  6. """
  7. import json
  8. from typing import Dict
  9. from playwright.sync_api import sync_playwright
  10. from applications.db import DatabaseConnector
  11. from config import long_articles_config
  12. class ToutiaoBogus:
  13. """
  14. 获取头条请求参数
  15. """
  16. def __init__(self):
  17. """
  18. 初始化ToutiaoBogus类的实例。
  19. 该方法创建了一个DatabaseConnector的实例,并调用其connect方法来建立与数据库的连接。
  20. Attributes:
  21. db (DatabaseConnector): 用于与数据库进行交互的DatabaseConnector实例。
  22. """
  23. # 创建一个DatabaseConnector实例,用于与数据库进行交互
  24. self.db = DatabaseConnector(db_config=long_articles_config)
  25. # 调用DatabaseConnector实例的connect方法,建立与数据库的连接
  26. self.db.connect()
  27. def on_request(self, request, category):
  28. if "https://www.toutiao.com/api/pc/list/feed?" in request.url:
  29. # request_info = {
  30. # 'method': request.method,
  31. # 'url': request.url,
  32. # 'headers': request.headers if request.headers else {},
  33. # 'postData': request.post_data if request.post_data else {}
  34. # }
  35. insert_sql = f"""
  36. INSERT INTO toutiao_request_params
  37. (request_method, request_url, request_headers, post_data, category)
  38. VALUES
  39. (%s, %s, %s, %s, %s);
  40. """
  41. self.db.save(
  42. query=insert_sql,
  43. params=(
  44. request.method,
  45. request.url,
  46. json.dumps(request.headers, ensure_ascii=False),
  47. json.dumps(request.post_data, ensure_ascii=False),
  48. category
  49. )
  50. )
  51. def crawler_recommend_article_list(self, category_info: Dict):
  52. with sync_playwright() as p:
  53. browser = p.chromium.launch(headless=False)
  54. context = browser.new_context()
  55. page = context.new_page()
  56. page.goto(category_info['url'])
  57. page.wait_for_load_state("networkidle")
  58. # 监听请求事件
  59. page.on("request", lambda request: self.on_request(request, category_info['category']))
  60. page.get_by_role("button", name=category_info['name']).click()
  61. page.wait_for_load_state("networkidle")
  62. page.wait_for_timeout(5000)
  63. browser.close()