# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/6/25
"""Development helpers for crawling Xigua (ixigua.com) search results."""
import os
import random
import string
import sys
import time
import requests
import urllib.parse
import urllib3
from requests.adapters import HTTPAdapter
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from seleniumwire import webdriver

# Make the project root importable when the script is run from the repo root.
sys.path.append(os.getcwd())
from common.common import Common
from common.userAgent import get_random_user_agent


class SearchDev:
    """Experimental search crawlers for Xigua: one via HTTP, one via Selenium."""

    @classmethod
    def random_signature(cls):
        """Return a pseudo-random 26-character value for the ``_signature`` query parameter.

        Layout of the result: ten ``'A'`` characters, eight random
        alphanumerics, one marker character, three random alphanumerics and
        the fixed suffix ``'AAAB'``.

        The marker replaces the character at index 18: ``'8' -> 'w'``,
        ``'9' -> 'x'``, ``'-' -> 'y'``, ``'.' -> 'z'`` and anything else
        becomes ``'y'``.
        NOTE(review): the pool only contains digits and ASCII letters, so the
        ``'-'`` and ``'.'`` entries can never match; they are kept to mirror
        the original mapping.
        """
        digits_num = random.randint(1, 6)
        uppercase_num = random.randint(1, 26 - digits_num - 1)
        lowercase_num = 26 - (digits_num + uppercase_num)

        # 26 characters in total, drawn without repetition inside each class.
        chars = (
            random.sample(string.digits, digits_num)
            + random.sample(string.ascii_uppercase, uppercase_num)
            + random.sample(string.ascii_lowercase, lowercase_num)
        )
        random.shuffle(chars)

        signature = 'AAAAAAAAAA' + ''.join(chars)[10:-4] + 'AAAB'
        marker = {'8': 'w', '9': 'x', '-': 'y', '.': 'z'}.get(signature[18], 'y')
        return signature[:18] + marker + signature[-7:]

    @classmethod
    def get_videoList_requests(cls, log_type, crawler, rule_dict, word, env):
        """Page through the Xigua search API for ``word`` and log every result.

        Results are requested ordered by publish time, ten per page. Paging
        stops (the method returns ``None``) when:

        * the response status is not 200, or the body has no ``data`` payload;
        * a page contains no items;
        * an item of type ``"video"`` is older than
          ``rule_dict['period']['max']`` days (default 1000).

        Items whose type is not ``"video"`` are logged and skipped.

        :param log_type: passed to ``Common.logger``.
        :param crawler: passed to ``Common.logger``.
        :param rule_dict: crawl rules; only ``period.max`` (days) is read.
        :param word: search keyword.
        :param env: unused by this method; kept for interface compatibility.
        """
        quoted_word = urllib.parse.quote(str(word))
        max_days = int(rule_dict.get('period', {}).get('max', 1000))
        # Requests are sent with verify=False, so silence the TLS warnings once.
        urllib3.disable_warnings()

        offset = 0
        # One session for all pages, so the retry adapters mounted below are
        # actually used by the request (max_retries=3: retry up to 3 times).
        with requests.Session() as session:
            session.mount('http://', HTTPAdapter(max_retries=3))
            session.mount('https://', HTTPAdapter(max_retries=3))

            while True:
                url = f"https://www.ixigua.com/api/searchv2/complex/{quoted_word}/{offset}?" \
                      "fss=input&" \
                      "order_type=publish_time&" \
                      "click_position=new&" \
                      "aid=1768&" \
                      f"_signature={cls.random_signature()}"
                # Only referer and user-agent are sent. A browser additionally
                # sends authority, accept, accept-language, cache-control,
                # cookie, pragma, sec-ch-ua*, sec-fetch-* and
                # x-secsdk-csrf-token headers.
                headers = {
                    'referer': f'https://www.ixigua.com/search/{quoted_word}/?tab_name=home&fss=default_search',
                    'user-agent': get_random_user_agent("pc"),
                }
                response = session.get(url=url, headers=headers, verify=False,
                                       proxies=Common.tunnel_proxies(), timeout=5)
                if response.status_code != 200 or "data" not in response.text:
                    Common.logger(log_type, crawler).info(f"response:{response.text}\n")
                    return

                # Parse once; a non-JSON body or an unexpected shape is logged
                # instead of raising.
                try:
                    feeds = response.json()["data"]["data"]
                except (ValueError, KeyError, TypeError):
                    Common.logger(log_type, crawler).info(f"response:{response.text}\n")
                    return
                if not feeds:
                    Common.logger(log_type, crawler).info("没有更多数据啦~")
                    return

                offset += 10
                for feed in feeds:
                    video = feed.get("data") or {}
                    video_type = feed.get("type", "")
                    title = video.get("title", "")
                    publish_time = video.get("publish_time", "")
                    item_id = video.get("group_id", "")
                    Common.logger(log_type, crawler).info(f"title:{title}")
                    Common.logger(log_type, crawler).info(f"video_type:{video_type}")
                    Common.logger(log_type, crawler).info(f"publish_time:{publish_time}")
                    Common.logger(log_type, crawler).info(f"item_id:{item_id}")
                    if video_type != "video":
                        Common.logger(log_type, crawler).info("合集,跳过\n")
                        continue
                    # publish_time may be missing ("") or a string; it must be
                    # an int before it can be compared with the current time.
                    try:
                        publish_ts = int(publish_time)
                    except (TypeError, ValueError):
                        Common.logger(log_type, crawler).info("invalid publish_time, skip\n")
                        continue
                    if int(time.time()) - publish_ts > 3600 * 24 * max_days:
                        Common.logger(log_type, crawler).info(f"发布时间超过{max_days}天\n")
                        return

    @classmethod
    def get_videoList_selenium(cls, log_type, crawler):
        """Open the Xigua search page for "健康" in Chrome and sort by newest.

        Steps: open the page, close the login popup if present, click the
        filter button, click the "newest" sort option. Screenshots are saved
        to ``./关闭弹框.png`` and ``./最新排序.png``. The browser is always
        closed, even when a step fails.

        :param log_type: passed to ``Common.logger``.
        :param crawler: passed to ``Common.logger``.
        """
        # Enable performance logging; copy so the shared
        # DesiredCapabilities.CHROME dict is not mutated.
        ca = DesiredCapabilities.CHROME.copy()
        ca["goog:loggingPrefs"] = {"performance": "ALL"}

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
        # Headless / no-sandbox are intentionally left off for local debugging.
        chrome_options.add_argument("--window-size=1920,1080")
        chromedriver = "/Users/wangkun/Downloads/chromedriver/chromedriver_v114/chromedriver"

        # Initialise the driver.
        driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(chromedriver))
        try:
            driver.implicitly_wait(10)

            Common.logger(log_type, crawler).info("打开搜索页:健康")
            driver.get("https://www.ixigua.com/search/健康/")
            time.sleep(3)

            Common.logger(log_type, crawler).info("关闭登录弹框")
            # find_elements returns a list; click only if the popup exists.
            close_buttons = driver.find_elements(By.XPATH, '//*[@class="xg-notification-close"]')
            if close_buttons:
                close_buttons[0].click()
            driver.get_screenshot_as_file("./关闭弹框.png")

            Common.logger(log_type, crawler).info("点击筛选按钮")
            driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-icons-categories"]').click()

            Common.logger(log_type, crawler).info("点击最新排序")
            driver.find_element(By.XPATH, '//*[@class="searchPageV2__header-categories-wrapper"]/*[1]/*[2]/*[1]').click()
            time.sleep(3)
            driver.get_screenshot_as_file("./最新排序.png")
        finally:
            driver.quit()


if __name__ == "__main__":
    SearchDev.get_videoList_requests(log_type="search",
                                     crawler="xigua",
                                     rule_dict={"period": {"min": 365, "max": 365}},
                                     word="健康",
                                     env="dev")
    # Alternative entry point for manual debugging:
    # SearchDev.get_videoList_selenium(log_type="search", crawler="xigua")