# cloud_browser_demo_db.py
  1. """
  2. 小红书云浏览器数据获取脚本(数据库配置版)
  3. 从数据库 agent_channel_cookies 获取 Cookie 和 cloud_profile_id
  4. """
  5. import sys
  6. import os
  7. import asyncio
  8. import json
  9. import re
  10. from datetime import datetime
  11. from pathlib import Path
  12. from urllib.parse import quote
  13. from dotenv import load_dotenv
  14. load_dotenv()
  15. project_root = Path(__file__).parent.parent
  16. sys.path.insert(0, str(project_root))
  17. from agent.tools.builtin.browser.baseClass import (
  18. init_browser_session,
  19. cleanup_browser_session,
  20. kill_browser_session,
  21. browser_navigate_to_url,
  22. browser_scroll_page,
  23. browser_evaluate,
  24. browser_wait,
  25. browser_get_page_html,
  26. _fetch_cookie_row,
  27. _fetch_profile_id,
  28. _normalize_cookies,
  29. _cookie_domain_for_type,
  30. _extract_cookie_value,
  31. )
  32. async def example_xhs_fitness_search(cookie_type: str = "xhs") -> dict:
  33. """
  34. 小红书搜索示例
  35. Args:
  36. cookie_type: Cookie 类型,用于从数据库获取配置
  37. """
  38. print("\n" + "="*60)
  39. print("示例: 小红书云浏览器搜索 - 健身")
  40. print("="*60)
  41. api_key = os.getenv("BROWSER_USE_API_KEY")
  42. if not api_key:
  43. raise RuntimeError("未找到 BROWSER_USE_API_KEY")
  44. keyword = "健身"
  45. search_url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}&type=51"
  46. last_data: dict = {
  47. "success": False,
  48. "keyword": keyword,
  49. "count": 0,
  50. "results": [],
  51. "error": "未知错误",
  52. "timestamp": datetime.now().isoformat(),
  53. }
  54. # 从数据库获取配置
  55. print(f"\n🔍 从数据库获取配置 (type={cookie_type})...")
  56. profile_id = _fetch_profile_id(cookie_type)
  57. cookie_row = _fetch_cookie_row(cookie_type)
  58. if profile_id:
  59. print(f"✅ 获取到 cloud_profile_id: {profile_id}")
  60. else:
  61. print("⚠️ 未找到 cloud_profile_id,将使用环境变量或默认值")
  62. profile_id = os.getenv("XHS_PROFILE_ID")
  63. if cookie_row:
  64. print(f"✅ 获取到 Cookie 配置")
  65. else:
  66. print("⚠️ 未找到 Cookie 配置")
  67. for attempt in range(3):
  68. try:
  69. # 确保每次重试都清理旧会话
  70. if attempt > 0:
  71. try:
  72. await kill_browser_session()
  73. except Exception:
  74. pass
  75. await asyncio.sleep(2) # 等待清理完成
  76. print(f"\n🌐 启动云浏览器 (尝试 {attempt + 1}/3)...")
  77. browser, tools = await init_browser_session(
  78. headless=False,
  79. use_cloud=True,
  80. cloud_profile_id=profile_id,
  81. user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  82. disable_security=False,
  83. )
  84. if browser is None or tools is None:
  85. raise RuntimeError("浏览器初始化失败")
  86. print("✅ 云浏览器启动成功")
  87. # 访问首页
  88. print("\n🏠 访问小红书首页...")
  89. nav_result = await browser_navigate_to_url("https://www.xiaohongshu.com")
  90. if nav_result.error:
  91. raise RuntimeError(nav_result.error)
  92. await browser_wait(3)
  93. # 注入 Cookie(如果有)
  94. if cookie_row:
  95. print("\n🍪 注入 Cookie...")
  96. cookie_value = _extract_cookie_value(cookie_row)
  97. if cookie_value:
  98. domain, base_url = _cookie_domain_for_type(cookie_type, "https://www.xiaohongshu.com")
  99. cookies = _normalize_cookies(cookie_value, domain, base_url)
  100. if cookies:
  101. await browser._cdp_set_cookies(cookies)
  102. print(f"✅ 成功注入 {len(cookies)} 个 Cookie")
  103. # 刷新页面使 Cookie 生效
  104. await navigate_to_url("https://www.xiaohongshu.com")
  105. await browser_wait(2)
  106. else:
  107. print("⚠️ Cookie 解析失败")
  108. else:
  109. print("⚠️ 未找到 Cookie 值")
  110. # 访问搜索页面
  111. print(f"\n🔗 访问搜索页面: {keyword}")
  112. nav_result = await browser_navigate_to_url(search_url)
  113. if nav_result.error:
  114. raise RuntimeError(nav_result.error)
  115. await browser_wait(8)
  116. # 滚动页面
  117. print("\n📜 滚动页面...")
  118. for i in range(3):
  119. await browser_scroll_page(down=True, pages=2.0)
  120. await browser_wait(2)
  121. # 提取数据
  122. print("\n🔍 提取数据...")
  123. html_result = await browser_get_page_html()
  124. if html_result.error:
  125. raise RuntimeError(html_result.error)
  126. html = html_result.metadata.get("html", "")
  127. output_dir = project_root / "output"
  128. output_dir.mkdir(parents=True, exist_ok=True)
  129. output_path = output_dir / "xhs.html"
  130. output_path.write_text(html or "", encoding="utf-8")
  131. print(f"✅ 已保存页面 HTML: {output_path}")
  132. extract_js = """
  133. (function(){
  134. const maxCount = 20;
  135. const seen = new Set();
  136. const results = [];
  137. function pushItem(item){
  138. if (!item || !item.link || seen.has(item.link)) return;
  139. seen.add(item.link);
  140. results.push(item);
  141. }
  142. const anchors = document.querySelectorAll('a[href*="/explore/"]');
  143. anchors.forEach(a => {
  144. if (results.length >= maxCount) return;
  145. const link = a.href || '';
  146. const img = a.querySelector('img');
  147. const title = ((img && img.alt) || a.textContent || '').trim();
  148. const cover = (img && img.src) || '';
  149. if (link && title) {
  150. pushItem({ title, link, cover });
  151. }
  152. });
  153. const scriptNodes = document.querySelectorAll('script[type="application/json"], script#__NEXT_DATA__, script#__NUXT__');
  154. const walk = (node) => {
  155. if (!node || results.length >= maxCount) return;
  156. if (Array.isArray(node)) {
  157. for (const item of node) {
  158. walk(item);
  159. if (results.length >= maxCount) return;
  160. }
  161. return;
  162. }
  163. if (typeof node === 'object') {
  164. const title = (node.title || node.desc || node.name || node.noteTitle || '').toString().trim();
  165. const id = node.noteId || node.note_id || node.id || node.noteID;
  166. const cover = (node.cover && (node.cover.url || node.cover.urlDefault)) || node.coverUrl || node.image || '';
  167. let link = '';
  168. if (id) {
  169. link = `https://www.xiaohongshu.com/explore/${id}`;
  170. }
  171. if (title && link) {
  172. pushItem({ title, link, cover });
  173. }
  174. for (const key in node) {
  175. if (typeof node[key] === 'object') walk(node[key]);
  176. }
  177. }
  178. };
  179. scriptNodes.forEach(node => {
  180. if (results.length >= maxCount) return;
  181. const text = node.textContent || '';
  182. if (!text) return;
  183. try {
  184. const data = JSON.parse(text);
  185. walk(data);
  186. } catch (e) {}
  187. });
  188. return {
  189. success: true,
  190. keyword: __KEYWORD__,
  191. count: results.length,
  192. results: results,
  193. timestamp: new Date().toISOString(),
  194. };
  195. })()
  196. """
  197. extract_js = extract_js.replace("__KEYWORD__", json.dumps(keyword, ensure_ascii=False))
  198. async def run_extract() -> dict:
  199. result = await browser_evaluate(extract_js)
  200. if result.error:
  201. raise RuntimeError(result.error)
  202. output = result.output
  203. if isinstance(output, str) and output.startswith("Result: "):
  204. output = output[8:]
  205. if not output:
  206. return {
  207. "success": False,
  208. "keyword": keyword,
  209. "count": 0,
  210. "results": [],
  211. "error": "可能被登录或验证码拦截",
  212. "timestamp": datetime.now().isoformat(),
  213. }
  214. try:
  215. data = json.loads(output)
  216. except Exception:
  217. data = {
  218. "success": False,
  219. "keyword": keyword,
  220. "count": 0,
  221. "results": [],
  222. "error": "JSON 解析失败",
  223. "raw_output": str(output)[:2000],
  224. "timestamp": datetime.now().isoformat(),
  225. }
  226. if isinstance(data, dict) and data.get("count", 0) == 0:
  227. html_result = await browser_get_page_html()
  228. if html_result.error:
  229. raise RuntimeError(html_result.error)
  230. html = html_result.metadata.get("html", "")
  231. blocked_markers = ["登录", "验证", "验证码", "请先登录", "异常访问"]
  232. if html and any(marker in html for marker in blocked_markers):
  233. data = {
  234. "success": False,
  235. "keyword": keyword,
  236. "count": 0,
  237. "results": [],
  238. "error": "可能被登录或验证码拦截",
  239. "timestamp": datetime.now().isoformat(),
  240. }
  241. elif html:
  242. results = []
  243. seen = set()
  244. pattern = re.compile(r'"noteId":"(.*?)".*?"title":"(.*?)"', re.S)
  245. for match in pattern.finditer(html):
  246. note_id = match.group(1)
  247. title = match.group(2).encode("utf-8", "ignore").decode("unicode_escape").strip()
  248. link = f"https://www.xiaohongshu.com/explore/{note_id}"
  249. if note_id and link not in seen and title:
  250. seen.add(link)
  251. results.append({"title": title, "link": link})
  252. if len(results) >= 20:
  253. break
  254. if results:
  255. data = {
  256. "success": True,
  257. "keyword": keyword,
  258. "count": len(results),
  259. "results": results,
  260. "timestamp": datetime.now().isoformat(),
  261. "source": "html_fallback",
  262. }
  263. return data
  264. data = await run_extract()
  265. last_data = data if isinstance(data, dict) else last_data
  266. # 输出结果
  267. if isinstance(last_data, dict) and last_data.get("count", 0) > 0:
  268. print(f"\n✅ 成功获取 {last_data['count']} 条数据")
  269. print(f"数据来源: {last_data.get('source', 'javascript')}")
  270. print("\n前 5 条结果:")
  271. for i, item in enumerate(last_data["results"][:5], 1):
  272. print(f"{i}. {item['title'][:50]}...")
  273. # 成功获取数据,清理并返回
  274. await cleanup_browser_session()
  275. return last_data
  276. if isinstance(last_data, dict) and last_data.get("error") == "可能被登录或验证码拦截":
  277. print("\n⚠️ 检测到登录或验证码拦截")
  278. print("💡 建议:在数据库中配置有效的 Cookie")
  279. except Exception as e:
  280. err_text = str(e)
  281. print(f"⚠️ 尝试 {attempt + 1}/3 失败: {err_text}")
  282. last_data = {
  283. "success": False,
  284. "keyword": keyword,
  285. "count": 0,
  286. "results": [],
  287. "error": err_text,
  288. "timestamp": datetime.now().isoformat(),
  289. }
  290. finally:
  291. # 清理当前会话
  292. try:
  293. await cleanup_browser_session()
  294. except Exception:
  295. pass
  296. # 如果不是最后一次尝试,等待后继续
  297. if attempt < 2:
  298. print(f"等待 5 秒后重试...")
  299. await asyncio.sleep(5)
  300. return last_data
  301. async def main():
  302. # 可以通过命令行参数指定 cookie_type
  303. cookie_type = sys.argv[1] if len(sys.argv) > 1 else "xhs"
  304. data = await example_xhs_fitness_search(cookie_type)
  305. print("\n" + "="*60)
  306. print("📊 最终结果")
  307. print("="*60)
  308. print(json.dumps(data, ensure_ascii=False, indent=2))
  309. if __name__ == "__main__":
  310. asyncio.run(main())