# cloud_xhs.py
  1. import os
  2. import time
  3. import random
  4. from dotenv import load_dotenv
  5. from browser_use import BrowserUse
  6. from fake_useragent import UserAgent
  7. # 加载环境变量(建议将敏感信息存在.env文件中)
  8. load_dotenv()
  9. API_KEY = os.getenv("BROWSER_USE_API_KEY") # 你的BrowserUse API密钥
  10. PROFILE_ID = os.getenv("XHS_PROFILE_ID") # 你的小红书专属Profile ID
  11. # 小红书关键Cookie(替换为你从浏览器导出的完整Cookie)
  12. XHS_COOKIES = [
  13. {"name": "web_session", "value": "你的web_session值"},
  14. {"name": "a1", "value": "你的a1值"},
  15. {"name": "webId", "value": "你的webId值"},
  16. {"name": "gid", "value": "你的gid值"},
  17. {"name": "xhsTrackerId", "value": "你的xhsTrackerId值"},
  18. {"name": "timestamp", "value": str(int(time.time()))}
  19. ]
  20. # 真实浏览器UA池(匹配小红书常用设备)
  21. ua = UserAgent()
  22. XHS_UA = ua.random if ua.random else "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
  23. def init_xhs_browser():
  24. """初始化小红书专用浏览器(防指纹+IP+Cookie配置)"""
  25. try:
  26. browser = BrowserUse(
  27. api_key=API_KEY,
  28. # 核心反指纹配置
  29. stealth=True, # 开启Stealth模式,清除自动化特征
  30. browser_profile_id=PROFILE_ID, # 绑定专属Profile,持久化Cookie/存储
  31. # 真实浏览器指纹配置
  32. user_agent=XHS_UA,
  33. screen_resolution="1920,1080", # 匹配真实屏幕分辨率
  34. timezone="Asia/Shanghai", # 时区与IP归属地一致
  35. language="zh-CN,zh;q=0.9", # 中文语言环境
  36. # 禁用自动化标识(关键)
  37. extra_args=[
  38. "--disable-blink-features=AutomationControlled",
  39. "--disable-dev-shm-usage",
  40. "--no-sandbox",
  41. "--disable-extensions", # 禁用扩展,减少特征暴露
  42. "--disable-images=false" # 加载图片,模拟真实访问
  43. ],
  44. # 代理配置(替换为你的专属静态IP)
  45. proxy={
  46. "type": "http", # 推荐http/socks5
  47. "host": "你的代理IP",
  48. "port": 你的代理端口,
  49. "username": "代理账号(如有)",
  50. "password": "代理密码(如有)"
  51. }
  52. )
  53. print("✅ 浏览器初始化成功")
  54. return browser
  55. except Exception as e:
  56. print(f"❌ 浏览器初始化失败: {e}")
  57. return None
  58. def inject_xhs_cookies(browser, url="https://www.xiaohongshu.com"):
  59. """注入小红书完整Cookie并验证登录状态"""
  60. try:
  61. # 先访问小红书首页(建立会话)
  62. browser.get(url)
  63. time.sleep(random.uniform(2, 4)) # 随机延迟,模拟人类加载
  64. # 注入Cookie(逐个添加,确保完整)
  65. for cookie in XHS_COOKIES:
  66. browser.add_cookie(cookie)
  67. print("✅ Cookie注入完成")
  68. # 刷新页面,验证登录状态
  69. browser.refresh()
  70. time.sleep(random.uniform(3, 5))
  71. # 检查是否登录成功(通过页面元素判断)
  72. page_source = browser.page_source
  73. if "未登录" not in page_source and "我的主页" in page_source:
  74. print("✅ 小红书登录验证成功")
  75. return True
  76. else:
  77. print("❌ Cookie无效或登录状态未维持")
  78. return False
  79. except Exception as e:
  80. print(f"❌ Cookie注入失败: {e}")
  81. return False
  82. def simulate_human_behavior(browser):
  83. """模拟人类行为(滚动、延迟、点击),规避行为风控"""
  84. try:
  85. # 模拟滚动浏览(随机步长)
  86. scroll_steps = random.randint(3, 8)
  87. for _ in range(scroll_steps):
  88. scroll_height = random.randint(300, 800)
  89. browser.execute_script(f"window.scrollBy(0, {scroll_height});")
  90. time.sleep(random.uniform(1.5, 3.5)) # 随机滚动间隔
  91. # 模拟随机停留
  92. time.sleep(random.uniform(5, 10))
  93. print("✅ 人类行为模拟完成")
  94. except Exception as e:
  95. print(f"❌ 行为模拟失败: {e}")
  96. def get_xhs_data(browser, target_url):
  97. """获取小红书目标页面数据(笔记/用户信息)"""
  98. try:
  99. # 访问目标页面(非直接请求,模拟点击跳转)
  100. browser.get(target_url)
  101. time.sleep(random.uniform(4, 6)) # 页面加载延迟
  102. # 模拟人类行为后再提取数据
  103. simulate_human_behavior(browser)
  104. # 提取页面核心数据(可根据需求修改)
  105. page_title = browser.title
  106. page_html = browser.page_source
  107. # 示例:提取笔记标题(小红书笔记页专属)
  108. note_title = browser.execute_script("""
  109. return document.querySelector('h1[class*="note-title"]')?.innerText || '未找到标题';
  110. """)
  111. print(f"\n📊 数据提取结果:")
  112. print(f"页面标题: {page_title}")
  113. print(f"笔记标题: {note_title}")
  114. # 可扩展:提取点赞、评论、内容等信息
  115. return {
  116. "title": note_title,
  117. "page_html": page_html,
  118. "status": "success"
  119. }
  120. except Exception as e:
  121. print(f"❌ 数据提取失败: {e}")
  122. return {"status": "failed", "error": str(e)}
  123. def main():
  124. """主流程:初始化→注入Cookie→获取数据→清理会话"""
  125. # 1. 初始化浏览器
  126. browser = init_xhs_browser()
  127. if not browser:
  128. return
  129. try:
  130. # 2. 注入Cookie并验证登录
  131. login_success = inject_xhs_cookies(browser)
  132. if not login_success:
  133. return
  134. # 3. 访问目标小红书页面(替换为你要爬取的URL)
  135. target_url = "https://www.xiaohongshu.com/explore/65a1b2c3d4e5f67890abcdef"
  136. data = get_xhs_data(browser, target_url)
  137. if data["status"] == "success":
  138. print("\n🎉 小红书数据获取成功!")
  139. else:
  140. print("\n❌ 小红书数据获取失败,请检查Cookie/IP/指纹配置")
  141. finally:
  142. # 4. 安全关闭会话(避免资源浪费)
  143. time.sleep(random.uniform(2, 3))
  144. browser.quit()
  145. print("\n🔚 浏览器会话已关闭")
  146. if __name__ == "__main__":
  147. main()