# cloud_browser_demo.py
  1. import sys
  2. import os
  3. import asyncio
  4. import json
  5. import re
  6. from datetime import datetime
  7. from pathlib import Path
  8. from urllib.parse import quote
  9. from dotenv import load_dotenv
  10. load_dotenv()
  11. project_root = Path(__file__).parent.parent
  12. sys.path.insert(0, str(project_root))
  13. from agent.tools.builtin.browser.baseClass import (
  14. init_browser_session,
  15. cleanup_browser_session,
  16. kill_browser_session,
  17. navigate_to_url,
  18. scroll_page,
  19. evaluate,
  20. wait,
  21. get_page_html,
  22. ensure_login_with_cookies,
  23. wait_for_user_action,
  24. )
  25. async def example_xhs_fitness_search() -> dict:
  26. print("\n" + "="*60)
  27. print("示例: 小红书云浏览器搜索 - 健身")
  28. print("="*60)
  29. api_key = os.getenv("BROWSER_USE_API_KEY")
  30. if not api_key:
  31. raise RuntimeError("未找到 BROWSER_USE_API_KEY")
  32. keyword = "健身"
  33. search_url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}&type=51"
  34. last_data: dict = {
  35. "success": False,
  36. "keyword": keyword,
  37. "count": 0,
  38. "results": [],
  39. "error": "未知错误",
  40. "timestamp": datetime.now().isoformat(),
  41. }
  42. for _ in range(3):
  43. try:
  44. browser, tools = await init_browser_session(
  45. headless=False,
  46. use_cloud=True,
  47. )
  48. if browser is None or tools is None:
  49. raise RuntimeError("浏览器初始化失败")
  50. nav_result = await navigate_to_url("https://www.xiaohongshu.com")
  51. if nav_result.error:
  52. raise RuntimeError(nav_result.error)
  53. await wait(3)
  54. login_result = await ensure_login_with_cookies(
  55. cookie_type="xhs",
  56. url="https://www.xiaohongshu.com"
  57. )
  58. if login_result.error and "未找到 cookies" not in login_result.error:
  59. raise RuntimeError(login_result.error)
  60. login_payload = {}
  61. if isinstance(login_result.output, str) and login_result.output:
  62. try:
  63. login_payload = json.loads(login_result.output)
  64. except Exception:
  65. login_payload = {}
  66. if login_payload.get("need_login") and login_payload.get("cookies_count", 0) == 0:
  67. await wait_for_user_action(
  68. message="未找到可用 cookies,请在云浏览器中完成小红书登录或验证码,完成后按 Enter 继续",
  69. timeout=300
  70. )
  71. nav_result = await navigate_to_url(search_url)
  72. if nav_result.error:
  73. raise RuntimeError(nav_result.error)
  74. await wait(8)
  75. for _ in range(3):
  76. await scroll_page(down=True, pages=2.0)
  77. await wait(2)
  78. extract_js = """
  79. (function(){
  80. const maxCount = 20;
  81. const seen = new Set();
  82. const results = [];
  83. function pushItem(item){
  84. if (!item || !item.link || seen.has(item.link)) return;
  85. seen.add(item.link);
  86. results.push(item);
  87. }
  88. const anchors = document.querySelectorAll('a[href*="/explore/"]');
  89. anchors.forEach(a => {
  90. if (results.length >= maxCount) return;
  91. const link = a.href || '';
  92. const img = a.querySelector('img');
  93. const title = ((img && img.alt) || a.textContent || '').trim();
  94. const cover = (img && img.src) || '';
  95. if (link && title) {
  96. pushItem({ title, link, cover });
  97. }
  98. });
  99. const scriptNodes = document.querySelectorAll('script[type="application/json"], script#__NEXT_DATA__, script#__NUXT__');
  100. const walk = (node) => {
  101. if (!node || results.length >= maxCount) return;
  102. if (Array.isArray(node)) {
  103. for (const item of node) {
  104. walk(item);
  105. if (results.length >= maxCount) return;
  106. }
  107. return;
  108. }
  109. if (typeof node === 'object') {
  110. const title = (node.title || node.desc || node.name || node.noteTitle || '').toString().trim();
  111. const id = node.noteId || node.note_id || node.id || node.noteID;
  112. const cover = (node.cover && (node.cover.url || node.cover.urlDefault)) || node.coverUrl || node.image || '';
  113. let link = '';
  114. if (id) {
  115. link = `https://www.xiaohongshu.com/explore/${id}`;
  116. }
  117. if (title && link) {
  118. pushItem({ title, link, cover });
  119. }
  120. for (const key in node) {
  121. if (typeof node[key] === 'object') walk(node[key]);
  122. }
  123. }
  124. };
  125. scriptNodes.forEach(node => {
  126. if (results.length >= maxCount) return;
  127. const text = node.textContent || '';
  128. if (!text) return;
  129. try {
  130. const data = JSON.parse(text);
  131. walk(data);
  132. } catch (e) {}
  133. });
  134. return {
  135. success: true,
  136. keyword: __KEYWORD__,
  137. count: results.length,
  138. results: results,
  139. timestamp: new Date().toISOString(),
  140. };
  141. })()
  142. """
  143. extract_js = extract_js.replace("__KEYWORD__", json.dumps(keyword, ensure_ascii=False))
  144. async def run_extract() -> dict:
  145. result = await evaluate(extract_js)
  146. if result.error:
  147. raise RuntimeError(result.error)
  148. output = result.output
  149. if isinstance(output, str) and output.startswith("Result: "):
  150. output = output[8:]
  151. if not output:
  152. return {
  153. "success": False,
  154. "keyword": keyword,
  155. "count": 0,
  156. "results": [],
  157. "error": "可能被登录或验证码拦截",
  158. "timestamp": datetime.now().isoformat(),
  159. }
  160. try:
  161. data = json.loads(output)
  162. except Exception:
  163. data = {
  164. "success": False,
  165. "keyword": keyword,
  166. "count": 0,
  167. "results": [],
  168. "error": "JSON 解析失败",
  169. "raw_output": str(output)[:2000],
  170. "timestamp": datetime.now().isoformat(),
  171. }
  172. if isinstance(data, dict) and data.get("count", 0) == 0:
  173. html_result = await get_page_html()
  174. if html_result.error:
  175. raise RuntimeError(html_result.error)
  176. html = html_result.metadata.get("html", "")
  177. blocked_markers = ["登录", "验证", "验证码", "请先登录", "异常访问"]
  178. if html and any(marker in html for marker in blocked_markers):
  179. data = {
  180. "success": False,
  181. "keyword": keyword,
  182. "count": 0,
  183. "results": [],
  184. "error": "可能被登录或验证码拦截",
  185. "timestamp": datetime.now().isoformat(),
  186. }
  187. elif html:
  188. results = []
  189. seen = set()
  190. pattern = re.compile(r'"noteId":"(.*?)".*?"title":"(.*?)"', re.S)
  191. for match in pattern.finditer(html):
  192. note_id = match.group(1)
  193. title = match.group(2).encode("utf-8", "ignore").decode("unicode_escape").strip()
  194. link = f"https://www.xiaohongshu.com/explore/{note_id}"
  195. if note_id and link not in seen and title:
  196. seen.add(link)
  197. results.append({"title": title, "link": link})
  198. if len(results) >= 20:
  199. break
  200. if results:
  201. data = {
  202. "success": True,
  203. "keyword": keyword,
  204. "count": len(results),
  205. "results": results,
  206. "timestamp": datetime.now().isoformat(),
  207. "source": "html_fallback",
  208. }
  209. return data
  210. data = await run_extract()
  211. if isinstance(data, dict) and data.get("error") == "可能被登录或验证码拦截":
  212. await wait_for_user_action(
  213. message="请在云浏览器中完成小红书登录或验证码,完成后按 Enter 继续",
  214. timeout=300
  215. )
  216. nav_result = await navigate_to_url(search_url)
  217. if nav_result.error:
  218. raise RuntimeError(nav_result.error)
  219. await wait(8)
  220. for _ in range(3):
  221. await scroll_page(down=True, pages=2.0)
  222. await wait(2)
  223. data = await run_extract()
  224. last_data = data if isinstance(data, dict) else last_data
  225. if isinstance(last_data, dict) and last_data.get("count", 0) > 0:
  226. return last_data
  227. if isinstance(last_data, dict) and last_data.get("error") != "可能被登录或验证码拦截":
  228. return last_data
  229. except Exception as e:
  230. err_text = str(e)
  231. if any(key in err_text for key in ["WebSocket", "browser not connected", "NoneType"]):
  232. try:
  233. await kill_browser_session()
  234. except Exception:
  235. pass
  236. last_data = {
  237. "success": False,
  238. "keyword": keyword,
  239. "count": 0,
  240. "results": [],
  241. "error": err_text,
  242. "timestamp": datetime.now().isoformat(),
  243. }
  244. finally:
  245. await cleanup_browser_session()
  246. await wait(5)
  247. return last_data
  248. async def main():
  249. data = await example_xhs_fitness_search()
  250. print(json.dumps(data, ensure_ascii=False, indent=2))
# Script entry point: run the async demo only when executed directly, not on import.
if __name__ == "__main__":
    asyncio.run(main())