baijiahao_article.py

  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. import requests
  6. from selenium import webdriver
  7. from selenium.webdriver.chrome.service import Service
  8. from selenium.webdriver.chrome.options import Options
  9. from selenium.webdriver.common.by import By
  10. from webdriver_manager.chrome import ChromeDriverManager


def tunnel_proxies():
    """
    Kuaidaili tunnel proxy settings.
    :return: proxies dict in the format expected by requests
    """
    # Tunnel domain:port
    tunnel = "l901.kdltps.com:15818"
    # Username/password authentication
    username = "t11983523373311"
    password = "mtuhdr2z"
    proxies = {
        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
    }
    return proxies
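

# Usage sketch (an assumption, not part of the original flow): the dict returned by
# tunnel_proxies() follows the standard `requests` proxies format, so it can back any
# outbound request in this module, e.g.
#
#   requests.get("https://baijiahao.baidu.com", proxies=tunnel_proxies(), timeout=10)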


def bjh_url_list(search_title):
    """
    Search Baidu images (via the magiconch proxy API) and collect Baijiahao article URLs.
    :param search_title: title text to search for
    :return: list of baijiahao.baidu.com article URLs
    """
    url = "https://lab.magiconch.com/api/baidu/images"
    params = {
        "text": search_title,
        "index": 0,
        "size": 60
    }
    headers = {
        'accept': '*/*',
        'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8',
        'content-type': 'application/json',
        'cookie': 'Hm_lvt_f4e477c61adf5c145ce938a05611d5f0=1718784293; Hm_lpvt_f4e477c61adf5c145ce938a05611d5f0=1718784293',
        'if-none-match': 'W/"5e03-9dK2z/6rD0/7aX0R6HraLuFnLjI"',
        'priority': 'u=1, i',
        'referer': 'https://lab.magiconch.com/baidu-images/',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }
    response = requests.request(
        "GET",
        url,
        headers=headers,
        params=params,
        proxies=tunnel_proxies()
    )
    res = response.json()
    url_list = []
    for item in res:
        # Keep only Baijiahao article links and strip the trailing query parameters
        if "baijiahao.baidu.com" in item['url']:
            url_list.append(item['url'].split("&")[0])
    return url_list
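

# Defensive-caller sketch (an assumption: the magiconch endpoint returns a JSON list of
# objects that each carry a "url" field, which is what the loop above relies on):
#
#   try:
#       urls = bjh_url_list("some search title")
#   except (requests.RequestException, ValueError, KeyError):
#       urls = []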


def bjh_article(content_url):
    """
    Fetch a Baijiahao article (title, body text, image URLs) with a headless browser.
    :param content_url: URL of the Baijiahao article
    """
    # Configure headless browser mode
    print(content_url)
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Install and configure the Chrome driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    # Open the target page
    driver.get(content_url)
    # Wait for the page to finish loading
    driver.implicitly_wait(5)

    # Simulate scrolling through the page so lazy-loaded content is rendered
    def scroll_page():
        # Get the current page height
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to the bottom of the page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait for new content to load
            time.sleep(2)
            # Compare the new page height with the previous one; stop once it no longer grows
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    scroll_page()
    # Note: these class names ("sKHSJ", "_1NCGf") come from Baijiahao's current front-end
    # build and may change without notice.
    title_element = driver.find_element(By.XPATH, '//div[@class="sKHSJ"]')
    title = title_element.text
    page_text_elements = driver.find_elements(By.XPATH, '//div[@data-testid="article"]//p')
    page_text = '\n'.join([element.text for element in page_text_elements])
    img_elements = driver.find_elements(By.XPATH, '//div[@class="_1NCGf"]/img')
    img_url_list = [element.get_attribute("src") for element in img_elements]
    # Print the scraped content
    print(title)
    print(page_text)
    print(img_url_list)
    # Close the browser
    driver.quit()
    return title, page_text, img_url_list
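

# Optional persistence sketch (an assumption, not part of the original script): the tuple
# returned by bjh_article() could be dumped to JSON for later processing, e.g.
#
#   import json
#   title, text, images = bjh_article(article_url)
#   with open("article.json", "w", encoding="utf-8") as f:
#       json.dump({"title": title, "text": text, "images": images}, f, ensure_ascii=False)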


if __name__ == '__main__':
    # Example Chinese search title, kept verbatim so the Baidu image search still matches
    title = "老祖宗说“寿有三不过,子孙福气多”!原来话里藏着大秘密!"
    url_list = bjh_url_list(title)
    # print(url_list)
    for url in url_list:
        try:
            bjh_article(url)
        except Exception:
            # Skip articles whose page structure does not match the expected selectors
            continue