# xigua_search_dev.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/6/25
  4. import os
  5. import random
  6. import string
  7. import sys
  8. import time
  9. import requests
  10. import urllib.parse
  11. import urllib3
  12. from requests.adapters import HTTPAdapter
  13. from selenium.webdriver import DesiredCapabilities
  14. from selenium.webdriver.chrome.service import Service
  15. from selenium.webdriver.common.by import By
  16. from seleniumwire import webdriver
  17. sys.path.append(os.getcwd())
  18. from common.common import Common
  19. from common.userAgent import get_random_user_agent
  20. class SearchDev:
  21. @classmethod
  22. def random_signature(cls):
  23. src_digits = string.digits # string_数字
  24. src_uppercase = string.ascii_uppercase # string_大写字母
  25. src_lowercase = string.ascii_lowercase # string_小写字母
  26. digits_num = random.randint(1, 6)
  27. uppercase_num = random.randint(1, 26 - digits_num - 1)
  28. lowercase_num = 26 - (digits_num + uppercase_num)
  29. password = random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(
  30. src_lowercase, lowercase_num)
  31. random.shuffle(password)
  32. new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
  33. new_password_start = new_password[0:18]
  34. new_password_end = new_password[-7:]
  35. if new_password[18] == '8':
  36. new_password = new_password_start + 'w' + new_password_end
  37. elif new_password[18] == '9':
  38. new_password = new_password_start + 'x' + new_password_end
  39. elif new_password[18] == '-':
  40. new_password = new_password_start + 'y' + new_password_end
  41. elif new_password[18] == '.':
  42. new_password = new_password_start + 'z' + new_password_end
  43. else:
  44. new_password = new_password_start + 'y' + new_password_end
  45. return new_password
  46. @classmethod
  47. def get_videoList_requests(cls, log_type, crawler, rule_dict, word, env):
  48. offset = 0
  49. while True:
  50. url = f"https://www.ixigua.com/api/searchv2/complex/{str(word)}/{offset}?" \
  51. "fss=input&" \
  52. "order_type=publish_time&" \
  53. "click_position=new&" \
  54. "aid=1768&" \
  55. f"_signature={cls.random_signature()}"
  56. headers = {
  57. # 'authority': 'www.ixigua.com',
  58. # 'accept': 'application/json, text/plain, */*',
  59. # 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  60. # 'cache-control': 'no-cache',
  61. # 'cookie': 'MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; s_v_web_id=verify_lhoket5d_0qlKZtzS_YZkf_4Uaj_82mX_j6lRT4PcYJ7A; __ac_signature=_02B4Z6wo00f01yB6eXwAAIDCWLSSerYAxYsgWn3AAKx5S2D2PsJJ92YblwdDE-9rnwnzZ87S0CUowZ3Xi8XmxMU3JHd0xfP-9VucrE9D.l9E7Vgn6y95sGbL2H6mgsddoCZX0cCgfcfKAzWgcd; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; SEARCH_CARD_MODE=7168304743566296612_1; msToken=EV6DlzmvSZH6yBIIm7tCdxb6EY7xuV7p0EZw4nZUyznGvXk9Wkyx0GiT39zCO2HRROdUYZc0XYpAztUSzg14q3a1Fkoj01Avy_BGjKFFn5wRQDP8nVWECA==; tt_scid=rP8nVwFTm4wPZyREet0crbp-ZRgJsK.x5TE0lqU2uibGbUDAhlM.oA14pKRcGzXW0955; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1687685218%7Ca985a413a36bb156ba577dac11fbc14593e5a2a4000001f9cfc7fd72781c4cc5; ixigua-a-s=1',
  62. # 'pragma': 'no-cache',
  63. 'referer': f'https://www.ixigua.com/search/{urllib.parse.quote(str(word))}/?tab_name=home&fss=default_search',
  64. # 'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
  65. # 'sec-ch-ua-mobile': '?0',
  66. # 'sec-ch-ua-platform': '"macOS"',
  67. # 'sec-fetch-dest': 'empty',
  68. # 'sec-fetch-mode': 'cors',
  69. # 'sec-fetch-site': 'same-origin',
  70. # 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
  71. 'user-agent': get_random_user_agent("pc"),
  72. # 'x-secsdk-csrf-token': '0001000000011fd0adbaee655439e86800862b81e3e34974cab6a8656af77695b76ff5c76c96176bdcbf2631eeb7'
  73. }
  74. urllib3.disable_warnings()
  75. s = requests.session()
  76. # max_retries=3 重试3次
  77. s.mount('http://', HTTPAdapter(max_retries=3))
  78. s.mount('https://', HTTPAdapter(max_retries=3))
  79. response = requests.get(url=url, headers=headers, verify=False, proxies=Common.tunnel_proxies(), timeout=5)
  80. if response.status_code != 200 or "data" not in response.text:
  81. Common.logger(log_type, crawler).info(f"response:{response.text}\n")
  82. return
  83. elif len(response.json()["data"]["data"]) == 0:
  84. Common.logger(log_type, crawler).info("没有更多数据啦~")
  85. return
  86. else:
  87. offset += 10
  88. feeds = response.json()["data"]["data"]
  89. for i in range(len(feeds)):
  90. video_type = feeds[i].get("type", "")
  91. title = feeds[i].get("data", {}).get("title", "")
  92. publish_time = feeds[i].get("data", {}).get("publish_time", "")
  93. item_id = feeds[i].get("data", {}).get("group_id", "")
  94. Common.logger(log_type, crawler).info(f"title:{title}")
  95. Common.logger(log_type, crawler).info(f"video_type:{video_type}")
  96. Common.logger(log_type, crawler).info(f"publish_time:{publish_time}")
  97. Common.logger(log_type, crawler).info(f"item_id:{item_id}")
  98. if video_type != "video":
  99. Common.logger(log_type, crawler).info("合集,跳过\n")
  100. continue
  101. if int(time.time()) - publish_time > 3600 * 24 * int(rule_dict.get('period', {}).get('max', 1000)):
  102. Common.logger(log_type, crawler).info(f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n")
  103. return
  104. @classmethod
  105. def get_videoList_selenium(cls, log_type, crawler):
  106. # 打印请求配置
  107. ca = DesiredCapabilities.CHROME
  108. ca["goog:loggingPrefs"] = {"performance": "ALL"}
  109. # # 不打开浏览器运行
  110. chrome_options = webdriver.ChromeOptions()
  111. chrome_options.add_argument(
  112. f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
  113. # chrome_options.add_argument("--headless")
  114. chrome_options.add_argument("--window-size=1920,1080")
  115. # chrome_options.add_argument("--no-sandbox")
  116. chromedriver = "/Users/wangkun/Downloads/chromedriver/chromedriver_v114/chromedriver"
  117. # driver初始化
  118. driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(chromedriver))
  119. driver.implicitly_wait(10)
  120. Common.logger(log_type, crawler).info("打开搜索页:健康")
  121. driver.get(f"https://www.ixigua.com/search/健康/")
  122. time.sleep(3)
  123. # logs = driver.get_log("performance")
  124. Common.logger(log_type, crawler).info("关闭登录弹框")
  125. if driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]') != 0:
  126. driver.find_element(By.XPATH, '//*[@class="xg-notification-close"]').click()
  127. driver.get_screenshot_as_file("./关闭弹框.png")
  128. Common.logger(log_type, crawler).info("点击筛选按钮")
  129. driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()
  130. Common.logger(log_type, crawler).info("点击最新排序")
  131. driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-categories-wrapper"]/*[1]/*[2]/*[1]').click()
  132. time.sleep(3)
  133. driver.get_screenshot_as_file("./最新排序.png")
  134. driver.quit()
  135. if __name__ == "__main__":
  136. SearchDev.get_videoList_requests(log_type="search", crawler="xigua", rule_dict={"period": {"min":365, "max":365}}, word="健康", env="dev")
  137. # SearchDev.get_videoList_selenium()
  138. # print(get_random_user_agent("pc"))
  139. pass