xigua_search_dev.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/6/25
  4. import json
  5. import os
  6. import sys
  7. import time
  8. import requests
  9. import urllib.parse
  10. import urllib3
  11. from requests.adapters import HTTPAdapter
  12. from selenium.webdriver import DesiredCapabilities
  13. from selenium.webdriver.chrome.service import Service
  14. from selenium.webdriver.common.by import By
  15. from seleniumwire import webdriver
  16. sys.path.append(os.getcwd())
  17. from common.common import Common
  18. class SearchDev:
  19. @classmethod
  20. def get_videoList_requests(cls, word):
  21. while True:
  22. url = f"https://www.ixigua.com/api/searchv2/complex/{str(word)}/0?" \
  23. "fss=default_search&" \
  24. "order_type=publish_time&" \
  25. "click_position=new&" \
  26. "aid=1768&" \
  27. "msToken=EV6DlzmvSZH6yBIIm7tCdxb6EY7xuV7p0EZw4nZUyznGvXk9Wkyx0GiT39zCO2HRROdUYZc0XYpAztUSzg14q3a1Fkoj01Avy_BGjKFFn5wRQDP8nVWECA==&" \
  28. "X-Bogus=DFSzswVuSIsANrq4tnr0UFm4pID1&" \
  29. "_signature=_02B4Z6wo00001jeNZ4AAAIDCr-bw8w.DSLY3jWMAAOmJTnwirif4XNCUKjt3Ms0gS9-upb8jMBZJL5RSZ5dHBQm6GRMtSyn8h6D5rc1Y7tmwZL7a2nP390R3ARXFwF6tVQi97vqO5viH53M0c3"
  30. headers = {
  31. 'authority': 'www.ixigua.com',
  32. 'accept': 'application/json, text/plain, */*',
  33. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  34. 'cache-control': 'no-cache',
  35. 'cookie': 'MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; s_v_web_id=verify_lhoket5d_0qlKZtzS_YZkf_4Uaj_82mX_j6lRT4PcYJ7A; __ac_signature=_02B4Z6wo00f01yB6eXwAAIDCWLSSerYAxYsgWn3AAKx5S2D2PsJJ92YblwdDE-9rnwnzZ87S0CUowZ3Xi8XmxMU3JHd0xfP-9VucrE9D.l9E7Vgn6y95sGbL2H6mgsddoCZX0cCgfcfKAzWgcd; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; SEARCH_CARD_MODE=7168304743566296612_1; msToken=EV6DlzmvSZH6yBIIm7tCdxb6EY7xuV7p0EZw4nZUyznGvXk9Wkyx0GiT39zCO2HRROdUYZc0XYpAztUSzg14q3a1Fkoj01Avy_BGjKFFn5wRQDP8nVWECA==; tt_scid=rP8nVwFTm4wPZyREet0crbp-ZRgJsK.x5TE0lqU2uibGbUDAhlM.oA14pKRcGzXW0955; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1687685218%7Ca985a413a36bb156ba577dac11fbc14593e5a2a4000001f9cfc7fd72781c4cc5; ixigua-a-s=1',
  36. 'pragma': 'no-cache',
  37. 'referer': f'https://www.ixigua.com/search/{urllib.parse.quote(str(word))}/?logTag=e0b95015015c05e60b1b&tab_name=home&fss=default_search',
  38. 'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  39. 'sec-ch-ua-mobile': '?0',
  40. 'sec-ch-ua-platform': '"macOS"',
  41. 'sec-fetch-dest': 'empty',
  42. 'sec-fetch-mode': 'cors',
  43. 'sec-fetch-site': 'same-origin',
  44. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
  45. 'x-secsdk-csrf-token': '0001000000011fd0adbaee655439e86800862b81e3e34974cab6a8656af77695b76ff5c76c96176bdcbf2631eeb7'
  46. }
  47. urllib3.disable_warnings()
  48. s = requests.session()
  49. # max_retries=3 重试3次
  50. s.mount('http://', HTTPAdapter(max_retries=3))
  51. s.mount('https://', HTTPAdapter(max_retries=3))
  52. response = requests.get(url=url, headers=headers, verify=False, proxies=Common.tunnel_proxies(), timeout=5)
  53. if response.status_code != 200 or "data" not in response.text:
  54. print(f"response:{response.text}\n")
  55. return
  56. elif len(response.json()["data"]["data"]) == 0:
  57. print("没有更多数据啦~")
  58. return
  59. else:
  60. feeds = response.json()["data"]["data"]
  61. for i in range(len(feeds)):
  62. video_type = feeds[i].get("type", "")
  63. title = feeds[i].get("data", {}).get("title", "")
  64. publish_time = feeds[i].get("data", {}).get("publish_time", "")
  65. item_id = feeds[i].get("data", {}).get("group_id", "")
  66. print(f"title:{title}")
  67. print(f"video_type:{video_type}")
  68. print(f"publish_time:{publish_time}")
  69. print(f"item_id:{item_id}")
  70. print("\n")
  71. @classmethod
  72. def get_videoList_selenium(cls):
  73. # 打印请求配置
  74. ca = DesiredCapabilities.CHROME
  75. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  76. # # 不打开浏览器运行
  77. chrome_options = webdriver.ChromeOptions()
  78. chrome_options.add_argument(
  79. f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  80. # chrome_options.add_argument("--headless")
  81. chrome_options.add_argument("--window-size=1920,1080")
  82. # chrome_options.add_argument("--no-sandbox")
  83. chromedriver = "/Users/wangkun/Downloads/chromedriver/chromedriver_v114/chromedriver"
  84. # driver初始化
  85. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(chromedriver))
  86. driver.implicitly_wait(10)
  87. print("打开搜索页:健康")
  88. driver.get(f"https://www.ixigua.com/search/健康/")
  89. time.sleep(3)
  90. # logs = driver.get_log("performance")
  91. print("关闭登录弹框")
  92. if driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]') != 0:
  93. driver.find_element(By.XPATH, '//*[@class="xg-notification-close"]').click()
  94. driver.get_screenshot_as_file("./关闭弹框.png")
  95. print("点击筛选按钮")
  96. driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
  97. print("点击最新排序")
  98. driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-categories-wrapper"]/*[1]/*[2]/*[1]').click()
  99. time.sleep(3)
  100. driver.get_screenshot_as_file("./最新排序.png")
  101. driver.quit()
  102. if __name__ == "__main__":
  103. SearchDev.get_videoList_requests("猪八戒")
  104. # SearchDev.get_videoList_selenium()