toutiao_get_bogus.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. """
  2. @author: 罗俊辉
  3. @file: toutiao_get_bogus.py
  4. @time: 2024/1/4
  5. @desc: 用浏览器工具获取头条参数,并且存储到数据库中
  6. """
  7. import json
  8. from typing import Dict
  9. from playwright.sync_api import sync_playwright
  10. from applications.db import DatabaseConnector
  11. from config import long_articles_config
  12. class ToutiaoBogus:
  13. def __init__(self):
  14. """
  15. 初始化ToutiaoBogus类的实例。
  16. 该方法创建了一个DatabaseConnector的实例,并调用其connect方法来建立与数据库的连接。
  17. Attributes:
  18. db (DatabaseConnector): 用于与数据库进行交互的DatabaseConnector实例。
  19. """
  20. # 创建一个DatabaseConnector实例,用于与数据库进行交互
  21. self.db = DatabaseConnector(db_config=long_articles_config)
  22. # 调用DatabaseConnector实例的connect方法,建立与数据库的连接
  23. self.db.connect()
  24. def on_request(self, request, category):
  25. if "https://www.toutiao.com/api/pc/list/feed?" in request.url:
  26. request_info = {
  27. 'method': request.method,
  28. 'url': request.url,
  29. 'headers': request.headers if request.headers else {},
  30. 'postData': request.post_data if request.post_data else {}
  31. }
  32. insert_sql = f"""
  33. INSERT INTO toutiao_request_params
  34. (request_method, request_url, request_headers, post_data, category)
  35. VALUES
  36. (%s, %s, %s, %s, %s);
  37. """
  38. self.db.save(
  39. query=insert_sql,
  40. params=(
  41. request.method,
  42. request.url,
  43. json.dumps(request.headers, ensure_ascii=False),
  44. json.dumps(request.post_data, ensure_ascii=False),
  45. category
  46. )
  47. )
  48. def crawler_recommend_article_list(self, category_info: Dict):
  49. with sync_playwright() as p:
  50. browser = p.chromium.launch(headless=False)
  51. context = browser.new_context()
  52. page = context.new_page()
  53. page.goto(category_info['url'])
  54. page.wait_for_load_state("networkidle")
  55. # 监听请求事件
  56. page.on("request", lambda request: self.on_request(request, category_info['category']))
  57. page.get_by_role("button", name=category_info['name']).click()
  58. page.wait_for_load_state("networkidle")
  59. page.wait_for_timeout(5000)
  60. browser.close()