baseClass.py 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488
  1. """
  2. Browser-Use 原生工具适配器
  3. Native Browser-Use Tools Adapter
  4. 直接使用 browser-use 的原生类(BrowserSession, Tools)实现所有浏览器操作工具。
  5. 不依赖 Playwright,完全基于 CDP 协议。
  6. 核心特性:
  7. 1. 浏览器会话持久化 - 只启动一次浏览器
  8. 2. 状态自动保持 - 登录状态、Cookie、LocalStorage 等
  9. 3. 完整的底层访问 - 可以直接使用 CDP 协议
  10. 4. 性能优异 - 避免频繁创建/销毁浏览器实例
  11. 使用方法:
  12. 1. 在 Agent 初始化时调用 init_browser_session()
  13. 2. 使用各个工具函数执行浏览器操作
  14. 3. 任务结束时调用 cleanup_browser_session()
  15. 文件操作说明:
  16. - 浏览器专用文件目录:.browser_use_files/ (在当前工作目录下)
  17. 用于存储浏览器会话产生的临时文件(下载、上传、截图等)
  18. - 一般文件操作:请使用 agent.tools.builtin 中的文件工具 (read_file, write_file, edit_file)
  19. 这些工具功能更完善,支持diff预览、智能匹配、分页读取等
  20. """
  21. import sys
  22. import os
  23. import json
  24. import asyncio
  25. from typing import Optional, List, Dict, Any, Tuple
  26. from pathlib import Path
  27. from urllib.parse import urlparse
  28. # 将项目根目录添加到 Python 路径
  29. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  30. # 导入框架的工具装饰器和结果类
  31. from agent.tools import tool, ToolResult
  32. from agent.tools.builtin.browser.sync_mysql_help import mysql
  33. # 导入 browser-use 的核心类
  34. from browser_use import BrowserSession, BrowserProfile
  35. from browser_use.tools.service import Tools
  36. from browser_use.agent.views import ActionResult
  37. from browser_use.filesystem.file_system import FileSystem
# ============================================================
# Global browser session management
# ============================================================
# Module-level singletons shared by every tool in this file. They are set by
# init_browser_session() and reset to None by cleanup_browser_session() /
# kill_browser_session().
_browser_session: Optional[BrowserSession] = None
_browser_tools: Optional[Tools] = None
_file_system: Optional[FileSystem] = None
  45. async def init_browser_session(
  46. headless: bool = False,
  47. user_data_dir: Optional[str] = None,
  48. profile_name: str = "default",
  49. browser_profile: Optional[BrowserProfile] = None,
  50. use_cloud: bool = False,
  51. **kwargs
  52. ) -> tuple[BrowserSession, Tools]:
  53. """
  54. 初始化全局浏览器会话
  55. Args:
  56. headless: 是否无头模式
  57. user_data_dir: 用户数据目录(用于保存登录状态)
  58. profile_name: 配置文件名称
  59. browser_profile: BrowserProfile 对象(用于预设 cookies 等)
  60. use_cloud: 是否使用云浏览器(默认 False,使用本地浏览器)
  61. **kwargs: 其他 BrowserSession 参数
  62. Returns:
  63. (BrowserSession, Tools) 元组
  64. """
  65. global _browser_session, _browser_tools, _file_system
  66. if _browser_session is not None:
  67. return _browser_session, _browser_tools
  68. # 设置用户数据目录(持久化登录状态)
  69. if user_data_dir is None and profile_name and not use_cloud:
  70. user_data_dir = str(Path.home() / ".browser_use" / "profiles" / profile_name)
  71. Path(user_data_dir).mkdir(parents=True, exist_ok=True)
  72. # 创建浏览器会话
  73. session_params = {
  74. "headless": headless,
  75. }
  76. if use_cloud:
  77. # 云浏览器模式
  78. session_params["use_cloud"] = True
  79. print("🌐 使用云浏览器模式")
  80. else:
  81. # 本地浏览器模式
  82. session_params["is_local"] = True
  83. # macOS 上显式指定 Chrome 路径
  84. import platform
  85. if platform.system() == "Darwin": # macOS
  86. chrome_path = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
  87. if Path(chrome_path).exists():
  88. session_params["executable_path"] = chrome_path
  89. # 只在有值时才添加 user_data_dir
  90. if user_data_dir:
  91. session_params["user_data_dir"] = user_data_dir
  92. # 只在有值时才添加 browser_profile
  93. if browser_profile:
  94. session_params["browser_profile"] = browser_profile
  95. # 合并其他参数
  96. session_params.update(kwargs)
  97. _browser_session = BrowserSession(**session_params)
  98. # 启动浏览器
  99. await _browser_session.start()
  100. # 创建工具实例
  101. _browser_tools = Tools()
  102. # 创建文件系统实例(用于浏览器会话产生的文件)
  103. # 注意:这个目录仅用于浏览器操作相关的临时文件(下载、上传、截图等)
  104. # 对于一般文件读写操作,请使用 agent.tools.builtin 中的文件工具
  105. base_dir = Path.cwd() / ".browser_use_files"
  106. base_dir.mkdir(parents=True, exist_ok=True)
  107. _file_system = FileSystem(base_dir=str(base_dir))
  108. return _browser_session, _browser_tools
  109. async def get_browser_session() -> tuple[BrowserSession, Tools]:
  110. """
  111. 获取当前浏览器会话,如果不存在则自动创建
  112. Returns:
  113. (BrowserSession, Tools) 元组
  114. """
  115. global _browser_session, _browser_tools
  116. if _browser_session is None:
  117. await init_browser_session()
  118. return _browser_session, _browser_tools
  119. async def cleanup_browser_session():
  120. """
  121. 清理浏览器会话
  122. 优雅地停止浏览器但保留会话状态
  123. """
  124. global _browser_session, _browser_tools, _file_system
  125. if _browser_session is not None:
  126. await _browser_session.stop()
  127. _browser_session = None
  128. _browser_tools = None
  129. _file_system = None
  130. async def kill_browser_session():
  131. """
  132. 强制终止浏览器会话
  133. 完全关闭浏览器进程
  134. """
  135. global _browser_session, _browser_tools, _file_system
  136. if _browser_session is not None:
  137. await _browser_session.kill()
  138. _browser_session = None
  139. _browser_tools = None
  140. _file_system = None
  141. # ============================================================
  142. # 辅助函数:ActionResult 转 ToolResult
  143. # ============================================================
  144. def action_result_to_tool_result(result: ActionResult, title: str = None) -> ToolResult:
  145. """
  146. 将 browser-use 的 ActionResult 转换为框架的 ToolResult
  147. Args:
  148. result: browser-use 的 ActionResult
  149. title: 可选的标题(如果不提供则从 result 推断)
  150. Returns:
  151. ToolResult
  152. """
  153. if result.error:
  154. return ToolResult(
  155. title=title or "操作失败",
  156. output="",
  157. error=result.error,
  158. long_term_memory=result.long_term_memory or result.error
  159. )
  160. return ToolResult(
  161. title=title or "操作成功",
  162. output=result.extracted_content or "",
  163. long_term_memory=result.long_term_memory or result.extracted_content or "",
  164. metadata=result.metadata or {}
  165. )
  166. def _cookie_domain_for_type(cookie_type: str, url: str) -> Tuple[str, str]:
  167. if cookie_type:
  168. key = cookie_type.lower()
  169. if key in {"xiaohongshu", "xhs"}:
  170. return ".xiaohongshu.com", "https://www.xiaohongshu.com"
  171. parsed = urlparse(url or "")
  172. domain = parsed.netloc or ""
  173. domain = domain.replace("www.", "")
  174. if domain:
  175. domain = f".{domain}"
  176. base_url = f"{parsed.scheme}://{parsed.netloc}" if parsed.scheme and parsed.netloc else url
  177. return domain, base_url
  178. def _parse_cookie_string(cookie_str: str, domain: str, url: str) -> List[Dict[str, Any]]:
  179. cookies: List[Dict[str, Any]] = []
  180. if not cookie_str:
  181. return cookies
  182. parts = cookie_str.split(";")
  183. for part in parts:
  184. if not part:
  185. continue
  186. if "=" not in part:
  187. continue
  188. name, value = part.split("=", 1)
  189. cookie = {
  190. "name": str(name).strip(),
  191. "value": str(value).strip(),
  192. "domain": domain,
  193. "path": "/",
  194. "expires": -1,
  195. "httpOnly": False,
  196. "secure": True,
  197. "sameSite": "None"
  198. }
  199. if url:
  200. cookie["url"] = url
  201. cookies.append(cookie)
  202. return cookies
  203. def _normalize_cookies(cookie_value: Any, domain: str, url: str) -> List[Dict[str, Any]]:
  204. if cookie_value is None:
  205. return []
  206. if isinstance(cookie_value, list):
  207. return cookie_value
  208. if isinstance(cookie_value, dict):
  209. if "cookies" in cookie_value:
  210. return _normalize_cookies(cookie_value.get("cookies"), domain, url)
  211. if "name" in cookie_value and "value" in cookie_value:
  212. return [cookie_value]
  213. return []
  214. if isinstance(cookie_value, (bytes, bytearray)):
  215. cookie_value = cookie_value.decode("utf-8", errors="ignore")
  216. if isinstance(cookie_value, str):
  217. text = cookie_value.strip()
  218. if not text:
  219. return []
  220. try:
  221. parsed = json.loads(text)
  222. except Exception:
  223. parsed = None
  224. if parsed is not None:
  225. return _normalize_cookies(parsed, domain, url)
  226. return _parse_cookie_string(text, domain, url)
  227. return []
  228. def _extract_cookie_value(row: Optional[Dict[str, Any]]) -> Any:
  229. if not row:
  230. return None
  231. # 优先使用 cookies 字段
  232. if "cookies" in row:
  233. return row["cookies"]
  234. # 兼容其他可能的字段名
  235. for key, value in row.items():
  236. if "cookie" in key.lower():
  237. return value
  238. return None
  239. def _fetch_cookie_row(cookie_type: str) -> Optional[Dict[str, Any]]:
  240. if not cookie_type:
  241. return None
  242. try:
  243. return mysql.fetchone(
  244. "select * from agent_channel_cookies where type=%s limit 1",
  245. (cookie_type,)
  246. )
  247. except Exception:
  248. return None
  249. def _fetch_profile_id(cookie_type: str) -> Optional[str]:
  250. """从数据库获取 cloud_profile_id"""
  251. if not cookie_type:
  252. return None
  253. try:
  254. row = mysql.fetchone(
  255. "select profileId from agent_channel_cookies where type=%s limit 1",
  256. (cookie_type,)
  257. )
  258. if row and "profileId" in row:
  259. return row["profileId"]
  260. return None
  261. except Exception:
  262. return None
  263. # ============================================================
  264. # 导航类工具 (Navigation Tools)
  265. # ============================================================
  266. @tool()
  267. async def navigate_to_url(url: str, new_tab: bool = False) -> ToolResult:
  268. """
  269. 导航到指定的 URL
  270. Navigate to a specific URL
  271. 使用 browser-use 的原生导航功能,支持在新标签页打开。
  272. Args:
  273. url: 要访问的 URL 地址
  274. new_tab: 是否在新标签页中打开(默认 False)
  275. Returns:
  276. ToolResult: 包含导航结果的工具返回对象
  277. Example:
  278. navigate_to_url("https://www.baidu.com")
  279. navigate_to_url("https://www.google.com", new_tab=True)
  280. """
  281. try:
  282. browser, tools = await get_browser_session()
  283. # 使用 browser-use 的 navigate 工具
  284. result = await tools.navigate(
  285. url=url,
  286. new_tab=new_tab,
  287. browser_session=browser
  288. )
  289. return action_result_to_tool_result(result, f"导航到 {url}")
  290. except Exception as e:
  291. return ToolResult(
  292. title="导航失败",
  293. output="",
  294. error=f"Failed to navigate to {url}: {str(e)}",
  295. long_term_memory=f"导航到 {url} 失败"
  296. )
  297. @tool()
  298. async def search_web(query: str, engine: str = "google") -> ToolResult:
  299. """
  300. 使用搜索引擎搜索
  301. Search the web using a search engine
  302. Args:
  303. query: 搜索关键词
  304. engine: 搜索引擎 (google, duckduckgo, bing) - 默认: google
  305. Returns:
  306. ToolResult: 搜索结果
  307. Example:
  308. search_web("Python async programming", engine="google")
  309. """
  310. try:
  311. browser, tools = await get_browser_session()
  312. # 使用 browser-use 的 search 工具
  313. result = await tools.search(
  314. query=query,
  315. engine=engine,
  316. browser_session=browser
  317. )
  318. return action_result_to_tool_result(result, f"搜索: {query}")
  319. except Exception as e:
  320. return ToolResult(
  321. title="搜索失败",
  322. output="",
  323. error=f"Search failed: {str(e)}",
  324. long_term_memory=f"搜索 '{query}' 失败"
  325. )
  326. @tool()
  327. async def go_back() -> ToolResult:
  328. """
  329. 返回到上一个页面
  330. Go back to the previous page
  331. 模拟浏览器的"后退"按钮功能。
  332. Returns:
  333. ToolResult: 包含返回操作结果的工具返回对象
  334. """
  335. try:
  336. browser, tools = await get_browser_session()
  337. result = await tools.go_back(browser_session=browser)
  338. return action_result_to_tool_result(result, "返回上一页")
  339. except Exception as e:
  340. return ToolResult(
  341. title="返回失败",
  342. output="",
  343. error=f"Failed to go back: {str(e)}",
  344. long_term_memory="返回上一页失败"
  345. )
  346. @tool()
  347. async def wait(seconds: int = 3) -> ToolResult:
  348. """
  349. 等待指定的秒数
  350. Wait for a specified number of seconds
  351. 用于等待页面加载、动画完成或其他异步操作。
  352. Args:
  353. seconds: 等待时间(秒),最大30秒
  354. Returns:
  355. ToolResult: 包含等待操作结果的工具返回对象
  356. Example:
  357. wait(5) # 等待5秒
  358. """
  359. try:
  360. browser, tools = await get_browser_session()
  361. result = await tools.wait(seconds=seconds, browser_session=browser)
  362. return action_result_to_tool_result(result, f"等待 {seconds} 秒")
  363. except Exception as e:
  364. return ToolResult(
  365. title="等待失败",
  366. output="",
  367. error=f"Failed to wait: {str(e)}",
  368. long_term_memory="等待失败"
  369. )
  370. # ============================================================
  371. # 元素交互工具 (Element Interaction Tools)
  372. # ============================================================
  373. @tool()
  374. async def click_element(index: int) -> ToolResult:
  375. """
  376. 通过索引点击页面元素
  377. Click an element by index
  378. Args:
  379. index: 元素索引(从浏览器状态中获取)
  380. Returns:
  381. ToolResult: 包含点击操作结果的工具返回对象
  382. Example:
  383. click_element(index=5)
  384. Note:
  385. 需要先通过 get_selector_map 获取页面元素索引
  386. """
  387. try:
  388. browser, tools = await get_browser_session()
  389. result = await tools.click(
  390. index=index,
  391. browser_session=browser
  392. )
  393. return action_result_to_tool_result(result, f"点击元素 {index}")
  394. except Exception as e:
  395. return ToolResult(
  396. title="点击失败",
  397. output="",
  398. error=f"Failed to click element {index}: {str(e)}",
  399. long_term_memory=f"点击元素 {index} 失败"
  400. )
  401. @tool()
  402. async def input_text(index: int, text: str, clear: bool = True) -> ToolResult:
  403. """
  404. 在指定元素中输入文本
  405. Input text into an element
  406. Args:
  407. index: 元素索引(从浏览器状态中获取)
  408. text: 要输入的文本内容
  409. clear: 是否先清除现有文本(默认 True)
  410. Returns:
  411. ToolResult: 包含输入操作结果的工具返回对象
  412. Example:
  413. input_text(index=0, text="Hello World", clear=True)
  414. """
  415. try:
  416. browser, tools = await get_browser_session()
  417. result = await tools.input(
  418. index=index,
  419. text=text,
  420. clear=clear,
  421. browser_session=browser
  422. )
  423. return action_result_to_tool_result(result, f"输入文本到元素 {index}")
  424. except Exception as e:
  425. return ToolResult(
  426. title="输入失败",
  427. output="",
  428. error=f"Failed to input text into element {index}: {str(e)}",
  429. long_term_memory=f"输入文本失败"
  430. )
  431. @tool()
  432. async def send_keys(keys: str) -> ToolResult:
  433. """
  434. 发送键盘按键或快捷键
  435. Send keyboard keys or shortcuts
  436. 支持发送单个按键、组合键和快捷键。
  437. Args:
  438. keys: 要发送的按键字符串
  439. - 单个按键: "Enter", "Escape", "PageDown", "Tab"
  440. - 组合键: "Control+o", "Shift+Tab", "Alt+F4"
  441. - 功能键: "F1", "F2", ..., "F12"
  442. Returns:
  443. ToolResult: 包含按键操作结果的工具返回对象
  444. Example:
  445. send_keys("Enter")
  446. send_keys("Control+A")
  447. """
  448. try:
  449. browser, tools = await get_browser_session()
  450. result = await tools.send_keys(
  451. keys=keys,
  452. browser_session=browser
  453. )
  454. return action_result_to_tool_result(result, f"发送按键: {keys}")
  455. except Exception as e:
  456. return ToolResult(
  457. title="发送按键失败",
  458. output="",
  459. error=f"Failed to send keys: {str(e)}",
  460. long_term_memory="发送按键失败"
  461. )
  462. @tool()
  463. async def upload_file(index: int, path: str) -> ToolResult:
  464. """
  465. 上传文件到文件输入元素
  466. Upload a file to a file input element
  467. Args:
  468. index: 文件输入框的元素索引
  469. path: 要上传的文件路径(绝对路径)
  470. Returns:
  471. ToolResult: 包含上传操作结果的工具返回对象
  472. Example:
  473. upload_file(index=7, path="/path/to/file.pdf")
  474. Note:
  475. 文件必须存在且路径必须是绝对路径
  476. """
  477. try:
  478. browser, tools = await get_browser_session()
  479. result = await tools.upload_file(
  480. index=index,
  481. path=path,
  482. browser_session=browser,
  483. available_file_paths=[path],
  484. file_system=_file_system
  485. )
  486. return action_result_to_tool_result(result, f"上传文件: {path}")
  487. except Exception as e:
  488. return ToolResult(
  489. title="上传失败",
  490. output="",
  491. error=f"Failed to upload file: {str(e)}",
  492. long_term_memory=f"上传文件 {path} 失败"
  493. )
  494. # ============================================================
  495. # 滚动和视图工具 (Scroll & View Tools)
  496. # ============================================================
  497. @tool()
  498. async def scroll_page(down: bool = True, pages: float = 1.0,
  499. index: Optional[int] = None) -> ToolResult:
  500. """
  501. 滚动页面或元素
  502. Scroll the page or a specific element
  503. Args:
  504. down: True 向下滚动,False 向上滚动
  505. pages: 滚动页数(0.5=半页,1=全页,10=滚动到底部/顶部)
  506. index: 可选,滚动特定元素(如下拉框内部)
  507. Returns:
  508. ToolResult: 滚动结果
  509. Example:
  510. scroll_page(down=True, pages=2.0) # 向下滚动2页
  511. scroll_page(down=False, pages=1.0) # 向上滚动1页
  512. """
  513. try:
  514. browser, tools = await get_browser_session()
  515. result = await tools.scroll(
  516. down=down,
  517. pages=pages,
  518. index=index,
  519. browser_session=browser
  520. )
  521. direction = "向下" if down else "向上"
  522. return action_result_to_tool_result(result, f"{direction}滚动 {pages} 页")
  523. except Exception as e:
  524. return ToolResult(
  525. title="滚动失败",
  526. output="",
  527. error=f"Failed to scroll: {str(e)}",
  528. long_term_memory="滚动失败"
  529. )
  530. @tool()
  531. async def find_text(text: str) -> ToolResult:
  532. """
  533. 查找页面中的文本并滚动到该位置
  534. Find text on the page and scroll to it
  535. 在页面中搜索指定的文本,找到后自动滚动到该位置。
  536. Args:
  537. text: 要查找的文本内容
  538. Returns:
  539. ToolResult: 包含查找结果的工具返回对象
  540. Example:
  541. find_text("Privacy Policy")
  542. """
  543. try:
  544. browser, tools = await get_browser_session()
  545. result = await tools.find_text(
  546. text=text,
  547. browser_session=browser
  548. )
  549. return action_result_to_tool_result(result, f"查找文本: {text}")
  550. except Exception as e:
  551. return ToolResult(
  552. title="查找失败",
  553. output="",
  554. error=f"Failed to find text: {str(e)}",
  555. long_term_memory=f"查找文本 '{text}' 失败"
  556. )
  557. @tool()
  558. async def screenshot() -> ToolResult:
  559. """
  560. 请求在下次观察中包含页面截图
  561. Request a screenshot to be included in the next observation
  562. 用于视觉检查页面状态,帮助理解页面布局和内容。
  563. Returns:
  564. ToolResult: 包含截图请求结果的工具返回对象
  565. Example:
  566. screenshot()
  567. Note:
  568. 截图会在下次页面观察时自动包含在结果中。
  569. """
  570. try:
  571. browser, tools = await get_browser_session()
  572. result = await tools.screenshot(browser_session=browser)
  573. return action_result_to_tool_result(result, "截图请求")
  574. except Exception as e:
  575. return ToolResult(
  576. title="截图失败",
  577. output="",
  578. error=f"Failed to capture screenshot: {str(e)}",
  579. long_term_memory="截图失败"
  580. )
  581. # ============================================================
  582. # 标签页管理工具 (Tab Management Tools)
  583. # ============================================================
  584. @tool()
  585. async def switch_tab(tab_id: str) -> ToolResult:
  586. """
  587. 切换到指定标签页
  588. Switch to a different browser tab
  589. Args:
  590. tab_id: 4字符标签ID(target_id 的最后4位)
  591. Returns:
  592. ToolResult: 切换结果
  593. Example:
  594. switch_tab(tab_id="a3f2")
  595. """
  596. try:
  597. browser, tools = await get_browser_session()
  598. normalized_tab_id = tab_id[-4:] if tab_id else tab_id
  599. result = await tools.switch(
  600. tab_id=normalized_tab_id,
  601. browser_session=browser
  602. )
  603. return action_result_to_tool_result(result, f"切换到标签页 {normalized_tab_id}")
  604. except Exception as e:
  605. return ToolResult(
  606. title="切换标签页失败",
  607. output="",
  608. error=f"Failed to switch tab: {str(e)}",
  609. long_term_memory=f"切换到标签页 {tab_id} 失败"
  610. )
  611. @tool()
  612. async def close_tab(tab_id: str) -> ToolResult:
  613. """
  614. 关闭指定标签页
  615. Close a browser tab
  616. Args:
  617. tab_id: 4字符标签ID
  618. Returns:
  619. ToolResult: 关闭结果
  620. Example:
  621. close_tab(tab_id="a3f2")
  622. """
  623. try:
  624. browser, tools = await get_browser_session()
  625. normalized_tab_id = tab_id[-4:] if tab_id else tab_id
  626. result = await tools.close(
  627. tab_id=normalized_tab_id,
  628. browser_session=browser
  629. )
  630. return action_result_to_tool_result(result, f"关闭标签页 {normalized_tab_id}")
  631. except Exception as e:
  632. return ToolResult(
  633. title="关闭标签页失败",
  634. output="",
  635. error=f"Failed to close tab: {str(e)}",
  636. long_term_memory=f"关闭标签页 {tab_id} 失败"
  637. )
  638. # ============================================================
  639. # 下拉框工具 (Dropdown Tools)
  640. # ============================================================
  641. @tool()
  642. async def get_dropdown_options(index: int) -> ToolResult:
  643. """
  644. 获取下拉框的所有选项
  645. Get options from a dropdown element
  646. Args:
  647. index: 下拉框的元素索引
  648. Returns:
  649. ToolResult: 包含所有选项的结果
  650. Example:
  651. get_dropdown_options(index=8)
  652. """
  653. try:
  654. browser, tools = await get_browser_session()
  655. result = await tools.dropdown_options(
  656. index=index,
  657. browser_session=browser
  658. )
  659. return action_result_to_tool_result(result, f"获取下拉框选项: {index}")
  660. except Exception as e:
  661. return ToolResult(
  662. title="获取下拉框选项失败",
  663. output="",
  664. error=f"Failed to get dropdown options: {str(e)}",
  665. long_term_memory=f"获取下拉框 {index} 选项失败"
  666. )
  667. @tool()
  668. async def select_dropdown_option(index: int, text: str) -> ToolResult:
  669. """
  670. 选择下拉框选项
  671. Select an option from a dropdown
  672. Args:
  673. index: 下拉框的元素索引
  674. text: 要选择的选项文本(精确匹配)
  675. Returns:
  676. ToolResult: 选择结果
  677. Example:
  678. select_dropdown_option(index=8, text="Option 2")
  679. """
  680. try:
  681. browser, tools = await get_browser_session()
  682. result = await tools.select_dropdown(
  683. index=index,
  684. text=text,
  685. browser_session=browser
  686. )
  687. return action_result_to_tool_result(result, f"选择下拉框选项: {text}")
  688. except Exception as e:
  689. return ToolResult(
  690. title="选择下拉框选项失败",
  691. output="",
  692. error=f"Failed to select dropdown option: {str(e)}",
  693. long_term_memory=f"选择选项 '{text}' 失败"
  694. )
  695. # ============================================================
  696. # 内容提取工具 (Content Extraction Tools)
  697. # ============================================================
  698. @tool()
  699. async def extract_content(query: str, extract_links: bool = False,
  700. start_from_char: int = 0) -> ToolResult:
  701. """
  702. 使用 LLM 从页面提取结构化数据
  703. Extract content from the current page using LLM
  704. Args:
  705. query: 提取查询(告诉 LLM 要提取什么内容)
  706. extract_links: 是否提取链接(默认 False,节省 token)
  707. start_from_char: 从哪个字符开始提取(用于分页提取大内容)
  708. Returns:
  709. ToolResult: 提取的内容
  710. Example:
  711. extract_content(query="提取页面上所有产品的名称和价格", extract_links=True)
  712. Note:
  713. 需要配置 page_extraction_llm,否则会失败
  714. 支持分页提取,最大100k字符
  715. """
  716. try:
  717. browser, tools = await get_browser_session()
  718. # 注意:extract 需要 page_extraction_llm 参数
  719. # 这里我们假设用户会在初始化时配置 LLM
  720. # 如果没有配置,会抛出异常
  721. result = await tools.extract(
  722. query=query,
  723. extract_links=extract_links,
  724. start_from_char=start_from_char,
  725. browser_session=browser,
  726. page_extraction_llm=None, # 需要用户配置
  727. file_system=_file_system
  728. )
  729. return action_result_to_tool_result(result, f"提取内容: {query}")
  730. except Exception as e:
  731. return ToolResult(
  732. title="内容提取失败",
  733. output="",
  734. error=f"Failed to extract content: {str(e)}",
  735. long_term_memory=f"提取内容失败: {query}"
  736. )
  737. @tool()
  738. async def get_page_html() -> ToolResult:
  739. """
  740. 获取当前页面的完整 HTML
  741. Get the full HTML of the current page
  742. 返回当前页面的完整 HTML 源代码。
  743. Returns:
  744. ToolResult: 包含页面 HTML 的工具返回对象
  745. Example:
  746. get_page_html()
  747. Note:
  748. - 返回的是完整的 HTML 源代码
  749. - 输出会被限制在 10000 字符以内(完整内容保存在 metadata 中)
  750. """
  751. try:
  752. browser, tools = await get_browser_session()
  753. # 使用 CDP 获取页面 HTML
  754. cdp = await browser.get_or_create_cdp_session()
  755. # 获取页面内容
  756. result = await cdp.cdp_client.send.Runtime.evaluate(
  757. params={'expression': 'document.documentElement.outerHTML'},
  758. session_id=cdp.session_id
  759. )
  760. html = result.get('result', {}).get('value', '')
  761. # 获取 URL 和标题
  762. url = await browser.get_current_page_url()
  763. title_result = await cdp.cdp_client.send.Runtime.evaluate(
  764. params={'expression': 'document.title'},
  765. session_id=cdp.session_id
  766. )
  767. title = title_result.get('result', {}).get('value', '')
  768. # 限制输出大小
  769. output_html = html
  770. if len(html) > 10000:
  771. output_html = html[:10000] + "... (truncated)"
  772. return ToolResult(
  773. title=f"获取 HTML: {url}",
  774. output=f"页面: {title}\nURL: {url}\n\nHTML:\n{output_html}",
  775. long_term_memory=f"获取 HTML: {url}",
  776. metadata={"url": url, "title": title, "html": html}
  777. )
  778. except Exception as e:
  779. return ToolResult(
  780. title="获取 HTML 失败",
  781. output="",
  782. error=f"Failed to get page HTML: {str(e)}",
  783. long_term_memory="获取 HTML 失败"
  784. )
  785. @tool()
  786. async def get_selector_map() -> ToolResult:
  787. """
  788. 获取当前页面的元素索引映射
  789. Get the selector map of interactive elements on the current page
  790. 返回页面所有可交互元素的索引字典,用于后续的元素操作。
  791. Returns:
  792. ToolResult: 包含元素映射的工具返回对象
  793. Example:
  794. get_selector_map()
  795. Note:
  796. 返回的索引可以用于 click_element, input_text 等操作
  797. """
  798. try:
  799. browser, tools = await get_browser_session()
  800. # 获取选择器映射
  801. selector_map = await browser.get_selector_map()
  802. # 构建输出信息
  803. elements_info = []
  804. for index, node in list(selector_map.items())[:20]: # 只显示前20个
  805. tag = node.tag_name
  806. attrs = node.attributes or {}
  807. text = attrs.get('aria-label') or attrs.get('placeholder') or attrs.get('value', '')
  808. elements_info.append(f"索引 {index}: <{tag}> {text[:50]}")
  809. output = f"找到 {len(selector_map)} 个交互元素\n\n"
  810. output += "\n".join(elements_info)
  811. if len(selector_map) > 20:
  812. output += f"\n... 还有 {len(selector_map) - 20} 个元素"
  813. return ToolResult(
  814. title="获取元素映射",
  815. output=output,
  816. long_term_memory=f"获取到 {len(selector_map)} 个交互元素",
  817. metadata={"selector_map": {k: str(v) for k, v in list(selector_map.items())[:100]}}
  818. )
  819. except Exception as e:
  820. return ToolResult(
  821. title="获取元素映射失败",
  822. output="",
  823. error=f"Failed to get selector map: {str(e)}",
  824. long_term_memory="获取元素映射失败"
  825. )
  826. # ============================================================
  827. # JavaScript 执行工具 (JavaScript Tools)
  828. # ============================================================
  829. @tool()
  830. async def evaluate(code: str) -> ToolResult:
  831. """
  832. 在页面中执行 JavaScript 代码
  833. Execute JavaScript code in the page context
  834. 允许在当前页面中执行任意 JavaScript 代码,用于复杂的页面操作或数据提取。
  835. Args:
  836. code: 要执行的 JavaScript 代码字符串
  837. Returns:
  838. ToolResult: 包含执行结果的工具返回对象
  839. Example:
  840. evaluate("document.title")
  841. evaluate("document.querySelectorAll('a').length")
  842. Note:
  843. - 代码在页面上下文中执行,可以访问 DOM 和全局变量
  844. - 返回值会被自动序列化为字符串
  845. - 执行结果限制在 20k 字符以内
  846. """
  847. try:
  848. browser, tools = await get_browser_session()
  849. result = await tools.evaluate(
  850. code=code,
  851. browser_session=browser
  852. )
  853. return action_result_to_tool_result(result, "执行 JavaScript")
  854. except Exception as e:
  855. return ToolResult(
  856. title="JavaScript 执行失败",
  857. output="",
  858. error=f"Failed to execute JavaScript: {str(e)}",
  859. long_term_memory="JavaScript 执行失败"
  860. )
  861. @tool()
  862. async def ensure_login_with_cookies(cookie_type: str, url: str = "https://www.xiaohongshu.com") -> ToolResult:
  863. """
  864. 检查登录状态并在需要时注入 cookies
  865. """
  866. try:
  867. browser, tools = await get_browser_session()
  868. if url:
  869. await tools.navigate(url=url, browser_session=browser)
  870. await tools.wait(seconds=2, browser_session=browser)
  871. check_login_js = """
  872. (function() {
  873. const loginBtn = document.querySelector('[class*="login"]') ||
  874. document.querySelector('[href*="login"]') ||
  875. Array.from(document.querySelectorAll('button, a')).find(el => (el.textContent || '').includes('登录'));
  876. const userInfo = document.querySelector('[class*="user"]') ||
  877. document.querySelector('[class*="avatar"]');
  878. return {
  879. needLogin: !!loginBtn && !userInfo,
  880. hasLoginBtn: !!loginBtn,
  881. hasUserInfo: !!userInfo
  882. };
  883. })()
  884. """
  885. result = await tools.evaluate(code=check_login_js, browser_session=browser)
  886. status_output = result.extracted_content
  887. if isinstance(status_output, str) and status_output.startswith("Result: "):
  888. status_output = status_output[8:]
  889. login_info: Dict[str, Any] = {}
  890. if isinstance(status_output, str):
  891. try:
  892. login_info = json.loads(status_output)
  893. except Exception:
  894. login_info = {}
  895. elif isinstance(status_output, dict):
  896. login_info = status_output
  897. if not login_info.get("needLogin"):
  898. output = json.dumps({"need_login": False}, ensure_ascii=False)
  899. return ToolResult(
  900. title="已登录",
  901. output=output,
  902. long_term_memory=output
  903. )
  904. row = _fetch_cookie_row(cookie_type)
  905. cookie_value = _extract_cookie_value(row)
  906. if not cookie_value:
  907. output = json.dumps({"need_login": True, "cookies_count": 0}, ensure_ascii=False)
  908. return ToolResult(
  909. title="未找到 cookies",
  910. output=output,
  911. error="未找到 cookies",
  912. long_term_memory=output
  913. )
  914. domain, base_url = _cookie_domain_for_type(cookie_type, url)
  915. cookies = _normalize_cookies(cookie_value, domain, base_url)
  916. if not cookies:
  917. output = json.dumps({"need_login": True, "cookies_count": 0}, ensure_ascii=False)
  918. return ToolResult(
  919. title="cookies 解析失败",
  920. output=output,
  921. error="cookies 解析失败",
  922. long_term_memory=output
  923. )
  924. await browser._cdp_set_cookies(cookies)
  925. if url:
  926. await tools.navigate(url=url, browser_session=browser)
  927. await tools.wait(seconds=2, browser_session=browser)
  928. output = json.dumps({"need_login": True, "cookies_count": len(cookies)}, ensure_ascii=False)
  929. return ToolResult(
  930. title="已注入 cookies",
  931. output=output,
  932. long_term_memory=output
  933. )
  934. except Exception as e:
  935. return ToolResult(
  936. title="登录检查失败",
  937. output="",
  938. error=str(e),
  939. long_term_memory="登录检查失败"
  940. )
  941. # ============================================================
  942. # 等待用户操作工具 (Wait for User Action)
  943. # ============================================================
  944. @tool()
  945. async def wait_for_user_action(message: str = "Please complete the action in browser",
  946. timeout: int = 300) -> ToolResult:
  947. """
  948. 等待用户在浏览器中完成操作(如登录)
  949. Wait for user to complete an action in the browser (e.g., login)
  950. 暂停自动化流程,等待用户手动完成某些操作(如登录、验证码等)。
  951. Args:
  952. message: 提示用户需要完成的操作
  953. timeout: 最大等待时间(秒),默认 300 秒(5 分钟)
  954. Returns:
  955. ToolResult: 包含等待结果的工具返回对象
  956. Example:
  957. wait_for_user_action("Please login to Xiaohongshu", timeout=180)
  958. wait_for_user_action("Please complete the CAPTCHA", timeout=60)
  959. Note:
  960. - 用户需要在浏览器窗口中手动完成操作
  961. - 完成后按回车键继续
  962. - 超时后会自动继续执行
  963. """
  964. try:
  965. import asyncio
  966. print(f"\n{'='*60}")
  967. print(f"⏸️ WAITING FOR USER ACTION")
  968. print(f"{'='*60}")
  969. print(f"📝 {message}")
  970. print(f"⏱️ Timeout: {timeout} seconds")
  971. print(f"\n👉 Please complete the action in the browser window")
  972. print(f"👉 Press ENTER when done, or wait for timeout")
  973. print(f"{'='*60}\n")
  974. # Wait for user input or timeout
  975. try:
  976. loop = asyncio.get_event_loop()
  977. # Wait for either user input or timeout
  978. await asyncio.wait_for(
  979. loop.run_in_executor(None, input),
  980. timeout=timeout
  981. )
  982. return ToolResult(
  983. title="用户操作完成",
  984. output=f"User completed: {message}",
  985. long_term_memory=f"用户完成操作: {message}"
  986. )
  987. except asyncio.TimeoutError:
  988. return ToolResult(
  989. title="用户操作超时",
  990. output=f"Timeout waiting for: {message}",
  991. long_term_memory=f"等待用户操作超时: {message}"
  992. )
  993. except Exception as e:
  994. return ToolResult(
  995. title="等待用户操作失败",
  996. output="",
  997. error=f"Failed to wait for user action: {str(e)}",
  998. long_term_memory="等待用户操作失败"
  999. )
  1000. # ============================================================
  1001. # 任务完成工具 (Task Completion)
  1002. # ============================================================
  1003. @tool()
  1004. async def done(text: str, success: bool = True,
  1005. files_to_display: Optional[List[str]] = None) -> ToolResult:
  1006. """
  1007. 标记任务完成并返回最终消息
  1008. Mark the task as complete and return final message to user
  1009. Args:
  1010. text: 给用户的最终消息
  1011. success: 任务是否成功完成
  1012. files_to_display: 可选的要显示的文件路径列表
  1013. Returns:
  1014. ToolResult: 完成结果
  1015. Example:
  1016. done("任务已完成,提取了10个产品信息", success=True)
  1017. """
  1018. try:
  1019. browser, tools = await get_browser_session()
  1020. result = await tools.done(
  1021. text=text,
  1022. success=success,
  1023. files_to_display=files_to_display,
  1024. file_system=_file_system
  1025. )
  1026. return action_result_to_tool_result(result, "任务完成")
  1027. except Exception as e:
  1028. return ToolResult(
  1029. title="标记任务完成失败",
  1030. output="",
  1031. error=f"Failed to complete task: {str(e)}",
  1032. long_term_memory="标记任务完成失败"
  1033. )
  1034. # ============================================================
  1035. # 容器管理工具 (Container Management Tools)
  1036. # ============================================================
  1037. import aiohttp
  1038. async def create_container(url: str, account_name: str = "liuwenwu") -> Dict[str, Any]:
  1039. """
  1040. 创建浏览器容器并导航到指定URL
  1041. 按照 test.md 的要求:
  1042. 1.1 调用接口创建容器
  1043. 1.2 调用接口创建窗口并导航到URL
  1044. Args:
  1045. url: 要导航的URL地址
  1046. account_name: 账户名称
  1047. Returns:
  1048. 包含容器信息的字典:
  1049. - success: 是否成功
  1050. - container_id: 容器ID
  1051. - vnc: VNC访问URL
  1052. - cdp: CDP协议URL(用于浏览器连接)
  1053. - connection_id: 窗口连接ID
  1054. - error: 错误信息(如果失败)
  1055. """
  1056. result = {
  1057. "success": False,
  1058. "container_id": None,
  1059. "vnc": None,
  1060. "cdp": None,
  1061. "connection_id": None,
  1062. "error": None
  1063. }
  1064. try:
  1065. async with aiohttp.ClientSession() as session:
  1066. # 步骤1.1: 创建容器
  1067. print("📦 步骤1.1: 创建容器...")
  1068. create_url = "http://47.84.182.56:8200/api/v1/container/create"
  1069. create_payload = {
  1070. "auto_remove": True,
  1071. "need_port_binding": True,
  1072. "max_lifetime_seconds": 900
  1073. }
  1074. async with session.post(create_url, json=create_payload) as resp:
  1075. if resp.status != 200:
  1076. raise RuntimeError(f"创建容器失败: HTTP {resp.status}")
  1077. create_result = await resp.json()
  1078. if create_result.get("code") != 0:
  1079. raise RuntimeError(f"创建容器失败: {create_result.get('msg')}")
  1080. data = create_result.get("data", {})
  1081. result["container_id"] = data.get("container_id")
  1082. result["vnc"] = data.get("vnc")
  1083. result["cdp"] = data.get("cdp")
  1084. print(f"✅ 容器创建成功")
  1085. print(f" Container ID: {result['container_id']}")
  1086. print(f" VNC: {result['vnc']}")
  1087. print(f" CDP: {result['cdp']}")
  1088. # 等待容器内的浏览器启动
  1089. print(f"\n⏳ 等待容器内浏览器启动...")
  1090. await asyncio.sleep(5)
  1091. # 步骤1.2: 创建页面并导航
  1092. print(f"\n📱 步骤1.2: 创建页面并导航到 {url}...")
  1093. page_create_url = "http://47.84.182.56:8200/api/v1/browser/page/create"
  1094. page_payload = {
  1095. "container_id": result["container_id"],
  1096. "url": url,
  1097. "account_name": account_name,
  1098. "need_wait": True,
  1099. "timeout": 30
  1100. }
  1101. # 重试机制:最多尝试3次
  1102. max_retries = 3
  1103. page_created = False
  1104. last_error = None
  1105. for attempt in range(max_retries):
  1106. try:
  1107. if attempt > 0:
  1108. print(f" 重试 {attempt + 1}/{max_retries}...")
  1109. await asyncio.sleep(3) # 重试前等待
  1110. async with session.post(page_create_url, json=page_payload, timeout=aiohttp.ClientTimeout(total=60)) as resp:
  1111. if resp.status != 200:
  1112. response_text = await resp.text()
  1113. last_error = f"HTTP {resp.status}: {response_text[:200]}"
  1114. continue
  1115. page_result = await resp.json()
  1116. if page_result.get("code") != 0:
  1117. last_error = f"{page_result.get('msg')}"
  1118. continue
  1119. page_data = page_result.get("data", {})
  1120. result["connection_id"] = page_data.get("connection_id")
  1121. result["success"] = True
  1122. page_created = True
  1123. print(f"✅ 页面创建成功")
  1124. print(f" Connection ID: {result['connection_id']}")
  1125. break
  1126. except asyncio.TimeoutError:
  1127. last_error = "请求超时"
  1128. continue
  1129. except aiohttp.ClientError as e:
  1130. last_error = f"网络错误: {str(e)}"
  1131. continue
  1132. except Exception as e:
  1133. last_error = f"未知错误: {str(e)}"
  1134. continue
  1135. if not page_created:
  1136. raise RuntimeError(f"创建页面失败(尝试{max_retries}次后): {last_error}")
  1137. except Exception as e:
  1138. result["error"] = str(e)
  1139. print(f"❌ 错误: {str(e)}")
  1140. return result
  1141. # ============================================================
  1142. # 导出所有工具函数(供外部使用)
  1143. # ============================================================
  1144. __all__ = [
  1145. # 会话管理
  1146. 'init_browser_session',
  1147. 'get_browser_session',
  1148. 'cleanup_browser_session',
  1149. 'kill_browser_session',
  1150. # 导航类工具
  1151. 'navigate_to_url',
  1152. 'search_web',
  1153. 'go_back',
  1154. 'wait',
  1155. # 元素交互工具
  1156. 'click_element',
  1157. 'input_text',
  1158. 'send_keys',
  1159. 'upload_file',
  1160. # 滚动和视图工具
  1161. 'scroll_page',
  1162. 'find_text',
  1163. 'screenshot',
  1164. # 标签页管理工具
  1165. 'switch_tab',
  1166. 'close_tab',
  1167. # 下拉框工具
  1168. 'get_dropdown_options',
  1169. 'select_dropdown_option',
  1170. # 内容提取工具
  1171. 'extract_content',
  1172. 'get_page_html',
  1173. 'get_selector_map',
  1174. # JavaScript 执行工具
  1175. 'evaluate',
  1176. 'ensure_login_with_cookies',
  1177. # 等待用户操作
  1178. 'wait_for_user_action',
  1179. # 任务完成
  1180. 'done',
  1181. # 容器管理
  1182. 'create_container',
  1183. ]