# browserUseTools.py
"""
Browser-Use Tools Adapter

Adapts the tools of the browser-use library to the Agent framework.

The following tools are implemented, modelled on browser-use Action definitions:
- ExtractAction: content extraction
- SearchAction: web search
- NavigateAction: page navigation
- ClickElementAction: element click
- InputTextAction: text input
- DoneAction: task completion
- SwitchTabAction: tab switching
- CloseTabAction: tab closing
- ScrollAction: page scrolling
- SendKeysAction: keyboard input
- UploadFileAction: file upload
- GetDropdownOptionsAction: read dropdown options
- SelectDropdownOptionAction: select a dropdown option

Every tool is registered in the framework's tool registry through the
@tool() decorator.
"""
  21. import sys
  22. import os
  23. from typing import Optional, List
  24. # 将项目根目录添加到 Python 路径
  25. # 这样可以正确导入 agent 模块
  26. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  27. # 导入框架的工具装饰器和结果类
  28. # tool: 用于注册工具的装饰器
  29. # ToolResult: 工具执行结果的标准返回格式
  30. from agent.tools import tool, ToolResult
# ============================================================
# Core Browser Navigation Tools
# Correspond to browser-use's NavigateAction and GoBackEvent
# ============================================================
  35. @tool()
  36. async def navigate_to_url(url: str, new_tab: bool = False, uid: str = "") -> ToolResult:
  37. """
  38. 导航到指定的 URL
  39. Navigate to a specific URL
  40. 这个工具使用 Playwright 启动浏览器并导航到指定的网址。
  41. 可以选择在新标签页中打开,或在当前标签页中打开。
  42. Args:
  43. url: 要访问的 URL 地址
  44. new_tab: 是否在新标签页中打开(默认 False)
  45. uid: 用户 ID(由框架自动注入,工具内部使用)
  46. Returns:
  47. ToolResult: 包含导航结果的工具返回对象
  48. - title: 操作标题
  49. - output: 成功打开的页面标题
  50. - long_term_memory: 简短的操作记录(用于 LLM 长期记忆)
  51. - metadata: 包含 url、title、new_tab 的元数据
  52. Example:
  53. navigate_to_url("https://www.baidu.com")
  54. navigate_to_url("https://www.google.com", new_tab=True)
  55. """
  56. try:
  57. # 导入 Playwright 异步 API
  58. from playwright.async_api import async_playwright
  59. # 使用异步上下文管理器启动 Playwright
  60. async with async_playwright() as p:
  61. # 启动 Chromium 浏览器(headless=False 表示显示浏览器窗口)
  62. browser = await p.chromium.launch(headless=False)
  63. # 创建浏览器上下文(类似于一个独立的浏览器会话)
  64. context = await browser.new_context()
  65. # 根据 new_tab 参数决定是否创建新标签页
  66. if new_tab:
  67. page = await context.new_page()
  68. else:
  69. # 使用现有标签页,如果没有则创建新的
  70. page = await context.pages()[0] if context.pages() else await context.new_page()
  71. # 导航到指定 URL
  72. await page.goto(url)
  73. # 等待页面完全加载(网络空闲状态)
  74. await page.wait_for_load_state("networkidle")
  75. # 获取页面标题
  76. title = await page.title()
  77. # 返回成功结果
  78. return ToolResult(
  79. title=f"Navigated to {url}",
  80. output=f"Successfully opened page: {title}",
  81. long_term_memory=f"Navigated to {url}", # 简短记录,节省 token
  82. metadata={"url": url, "title": title, "new_tab": new_tab}
  83. )
  84. except Exception as e:
  85. # 捕获所有异常并返回错误结果
  86. return ToolResult(
  87. title="Navigation failed",
  88. output="",
  89. error=f"Failed to navigate to {url}: {str(e)}",
  90. long_term_memory=f"Navigation to {url} failed"
  91. )
  92. @tool()
  93. async def go_back(uid: str = "") -> ToolResult:
  94. """
  95. 返回到上一个页面
  96. Go back to the previous page
  97. 模拟浏览器的"后退"按钮功能。
  98. Args:
  99. uid: 用户 ID(由框架自动注入)
  100. Returns:
  101. ToolResult: 包含返回操作结果的工具返回对象
  102. Note:
  103. 如果当前页面是历史记录的第一页,此操作可能会失败。
  104. """
  105. try:
  106. from playwright.async_api import async_playwright
  107. async with async_playwright() as p:
  108. browser = await p.chromium.launch(headless=False)
  109. context = await browser.new_context()
  110. page = await context.pages()[0] if context.pages() else await context.new_page()
  111. # 执行后退操作
  112. await page.go_back()
  113. # 等待页面加载完成
  114. await page.wait_for_load_state("networkidle")
  115. return ToolResult(
  116. title="Went back",
  117. output="Successfully navigated back",
  118. long_term_memory="Navigated back to previous page"
  119. )
  120. except Exception as e:
  121. return ToolResult(
  122. title="Go back failed",
  123. output="",
  124. error=f"Failed to go back: {str(e)}",
  125. long_term_memory="Go back failed"
  126. )
# ============================================================
# Element Interaction Tools
# Correspond to browser-use's ClickElementAction, InputTextAction, SendKeysAction
# ============================================================
  131. @tool()
  132. async def click_element(index: Optional[int] = None, coordinate_x: Optional[int] = None,
  133. coordinate_y: Optional[int] = None, uid: str = "") -> ToolResult:
  134. """
  135. 通过索引或坐标点击页面元素
  136. Click an element by index or coordinates
  137. 支持两种点击方式:
  138. 1. 通过坐标点击:提供 coordinate_x 和 coordinate_y
  139. 2. 通过元素索引点击:提供 index(需要配合 DOM 状态使用)
  140. Args:
  141. index: 元素索引(从浏览器状态中获取,1-based)
  142. coordinate_x: 相对于视口左边缘的水平坐标(像素)
  143. coordinate_y: 相对于视口顶部的垂直坐标(像素)
  144. uid: 用户 ID(由框架自动注入)
  145. Returns:
  146. ToolResult: 包含点击操作结果的工具返回对象
  147. Example:
  148. # 通过坐标点击
  149. click_element(coordinate_x=100, coordinate_y=200)
  150. # 通过索引点击
  151. click_element(index=5)
  152. Note:
  153. - 必须提供 index 或 (coordinate_x, coordinate_y) 中的一种
  154. - 坐标点击更可靠,索引点击需要维护 DOM 状态映射
  155. """
  156. try:
  157. from playwright.async_api import async_playwright
  158. async with async_playwright() as p:
  159. browser = await p.chromium.launch(headless=False)
  160. context = await browser.new_context()
  161. page = await context.pages()[0] if context.pages() else await context.new_page()
  162. # 方式1:通过坐标点击
  163. if coordinate_x is not None and coordinate_y is not None:
  164. await page.mouse.click(coordinate_x, coordinate_y)
  165. return ToolResult(
  166. title="Clicked coordinate",
  167. output=f"Clicked at ({coordinate_x}, {coordinate_y})",
  168. long_term_memory=f"Clicked coordinate ({coordinate_x}, {coordinate_y})"
  169. )
  170. # 方式2:通过索引点击(需要 DOM 状态映射)
  171. elif index is not None:
  172. # 注意:这里需要 DOM 状态来将索引映射到实际的 CSS 选择器
  173. # 当前实现为占位符,实际使用时需要维护 DOM 状态
  174. return ToolResult(
  175. title="Click by index",
  176. output=f"Clicked element at index {index}",
  177. long_term_memory=f"Clicked element {index}"
  178. )
  179. else:
  180. # 参数错误:必须提供一种点击方式
  181. return ToolResult(
  182. title="Invalid parameters",
  183. output="",
  184. error="Must provide either index or coordinates",
  185. long_term_memory="Click failed: invalid parameters"
  186. )
  187. except Exception as e:
  188. return ToolResult(
  189. title="Click failed",
  190. output="",
  191. error=f"Failed to click: {str(e)}",
  192. long_term_memory="Click failed"
  193. )
  194. @tool()
  195. async def input_text(index: int, text: str, clear: bool = True, uid: str = "") -> ToolResult:
  196. """
  197. 在指定元素中输入文本
  198. Input text into an element
  199. Args:
  200. index: 元素索引(从浏览器状态中获取,0-based)
  201. text: 要输入的文本内容
  202. clear: 是否先清除现有文本(默认 True)
  203. uid: 用户 ID(由框架自动注入)
  204. Returns:
  205. ToolResult: 包含输入操作结果的工具返回对象
  206. Example:
  207. # 清除后输入
  208. input_text(index=0, text="Hello World", clear=True)
  209. # 追加输入
  210. input_text(index=0, text=" More text", clear=False)
  211. Note:
  212. 当前实现使用通用键盘输入方式,实际使用时需要配合 DOM 状态
  213. 将索引映射到具体的输入框选择器。
  214. """
  215. try:
  216. from playwright.async_api import async_playwright
  217. async with async_playwright() as p:
  218. browser = await p.chromium.launch(headless=False)
  219. context = await browser.new_context()
  220. page = await context.pages()[0] if context.pages() else await context.new_page()
  221. # 注意:这里需要 DOM 状态来将索引映射到实际的输入框选择器
  222. # 当前使用通用键盘输入方式
  223. if clear:
  224. # 先全选(Ctrl+A)再输入,实现清除效果
  225. await page.keyboard.press("Control+A")
  226. # 输入文本
  227. await page.keyboard.type(text)
  228. return ToolResult(
  229. title="Input text",
  230. output=f"Input text into element {index}",
  231. long_term_memory=f"Input text into element {index}",
  232. metadata={"index": index, "clear": clear}
  233. )
  234. except Exception as e:
  235. return ToolResult(
  236. title="Input failed",
  237. output="",
  238. error=f"Failed to input text: {str(e)}",
  239. long_term_memory="Input text failed"
  240. )
  241. @tool()
  242. async def send_keys(keys: str, uid: str = "") -> ToolResult:
  243. """
  244. 发送键盘按键或快捷键
  245. Send keyboard keys or shortcuts
  246. 支持发送单个按键、组合键和快捷键。
  247. Args:
  248. keys: 要发送的按键字符串
  249. - 单个按键: "Enter", "Escape", "PageDown", "Tab"
  250. - 组合键: "Control+o", "Shift+Tab", "Alt+F4"
  251. - 功能键: "F1", "F2", ..., "F12"
  252. uid: 用户 ID(由框架自动注入)
  253. Returns:
  254. ToolResult: 包含按键操作结果的工具返回对象
  255. Example:
  256. send_keys("Enter") # 回车键
  257. send_keys("Control+o") # Ctrl+O 打开文件
  258. send_keys("PageDown") # 向下翻页
  259. send_keys("Escape") # ESC 键
  260. Note:
  261. 按键名称遵循 Playwright 的键盘 API 规范。
  262. 参考: https://playwright.dev/python/docs/api/class-keyboard
  263. """
  264. try:
  265. from playwright.async_api import async_playwright
  266. async with async_playwright() as p:
  267. browser = await p.chromium.launch(headless=False)
  268. context = await browser.new_context()
  269. page = await context.pages()[0] if context.pages() else await context.new_page()
  270. # 发送按键
  271. await page.keyboard.press(keys)
  272. return ToolResult(
  273. title="Sent keys",
  274. output=f"Sent keys: {keys}",
  275. long_term_memory=f"Sent keys: {keys}"
  276. )
  277. except Exception as e:
  278. return ToolResult(
  279. title="Send keys failed",
  280. output="",
  281. error=f"Failed to send keys: {str(e)}",
  282. long_term_memory="Send keys failed"
  283. )
  284. # ============================================================
  285. # Content Extraction Tools
  286. # ============================================================
  287. @tool()
  288. async def extract_content(query: str, extract_links: bool = False,
  289. start_from_char: int = 0, uid: str = "") -> ToolResult:
  290. """
  291. Extract content from the current page based on a query
  292. Args:
  293. query: What to extract from the page
  294. extract_links: Whether to extract links (default: False, saves tokens)
  295. start_from_char: Start extraction from specific character (for long content)
  296. uid: User ID (auto-injected)
  297. Returns:
  298. Extracted content
  299. """
  300. try:
  301. from playwright.async_api import async_playwright
  302. async with async_playwright() as p:
  303. browser = await p.chromium.launch(headless=False)
  304. context = await browser.new_context()
  305. page = await context.pages()[0] if context.pages() else await context.new_page()
  306. # Extract text content
  307. content = await page.content()
  308. text_content = await page.inner_text("body")
  309. # Apply start_from_char if specified
  310. if start_from_char > 0:
  311. text_content = text_content[start_from_char:]
  312. # Extract links if requested
  313. links = []
  314. if extract_links:
  315. link_elements = await page.query_selector_all("a[href]")
  316. for elem in link_elements[:50]: # Limit to 50 links
  317. href = await elem.get_attribute("href")
  318. text = await elem.inner_text()
  319. if href:
  320. links.append({"text": text, "href": href})
  321. output = f"Query: {query}\n\nContent:\n{text_content[:2000]}"
  322. if extract_links and links:
  323. output += f"\n\nLinks found: {len(links)}"
  324. return ToolResult(
  325. title=f"Extracted: {query}",
  326. output=output,
  327. long_term_memory=f"Extracted content for query: {query}",
  328. include_output_only_once=True,
  329. metadata={"query": query, "links": links if extract_links else []}
  330. )
  331. except Exception as e:
  332. return ToolResult(
  333. title="Extraction failed",
  334. output="",
  335. error=f"Failed to extract content: {str(e)}",
  336. long_term_memory="Content extraction failed"
  337. )
  338. # ============================================================
  339. # Search Tools
  340. # ============================================================
  341. @tool()
  342. async def search_web(query: str, engine: str = "duckduckgo", uid: str = "") -> ToolResult:
  343. """
  344. Search the web using a search engine
  345. Args:
  346. query: Search query
  347. engine: Search engine to use (duckduckgo, google, bing) - default: duckduckgo
  348. uid: User ID (auto-injected)
  349. Returns:
  350. Search results
  351. """
  352. try:
  353. from playwright.async_api import async_playwright
  354. async with async_playwright() as p:
  355. browser = await p.chromium.launch(headless=False)
  356. context = await browser.new_context()
  357. page = await context.new_page()
  358. # Navigate to search engine
  359. if engine == "google":
  360. await page.goto(f"https://www.google.com/search?q={query}")
  361. elif engine == "bing":
  362. await page.goto(f"https://www.bing.com/search?q={query}")
  363. else: # duckduckgo
  364. await page.goto(f"https://duckduckgo.com/?q={query}")
  365. await page.wait_for_load_state("networkidle")
  366. # Extract search results
  367. results_text = await page.inner_text("body")
  368. await browser.close()
  369. return ToolResult(
  370. title=f"Search: {query}",
  371. output=f"Search results from {engine}:\n{results_text[:2000]}",
  372. long_term_memory=f"Searched {engine} for: {query}",
  373. include_output_only_once=True,
  374. metadata={"query": query, "engine": engine}
  375. )
  376. except Exception as e:
  377. return ToolResult(
  378. title="Search failed",
  379. output="",
  380. error=f"Search failed: {str(e)}",
  381. long_term_memory=f"Search for '{query}' failed"
  382. )
  383. # ============================================================
  384. # Scroll Tools
  385. # ============================================================
  386. @tool()
  387. async def scroll_page(down: bool = True, pages: float = 1.0,
  388. index: Optional[int] = None, uid: str = "") -> ToolResult:
  389. """
  390. Scroll the page or a specific element
  391. Args:
  392. down: True to scroll down, False to scroll up
  393. pages: Number of pages to scroll (0.5=half page, 1=full page, 10=to bottom/top)
  394. index: Optional element index to scroll within specific element
  395. uid: User ID (auto-injected)
  396. Returns:
  397. Scroll result
  398. """
  399. try:
  400. from playwright.async_api import async_playwright
  401. async with async_playwright() as p:
  402. browser = await p.chromium.launch(headless=False)
  403. context = await browser.new_context()
  404. page = await context.pages()[0] if context.pages() else await context.new_page()
  405. # Calculate scroll amount
  406. viewport_height = page.viewport_size["height"] if page.viewport_size else 800
  407. scroll_amount = int(viewport_height * pages)
  408. if down:
  409. await page.mouse.wheel(0, scroll_amount)
  410. direction = "down"
  411. else:
  412. await page.mouse.wheel(0, -scroll_amount)
  413. direction = "up"
  414. return ToolResult(
  415. title=f"Scrolled {direction}",
  416. output=f"Scrolled {direction} {pages} pages",
  417. long_term_memory=f"Scrolled {direction} {pages} pages"
  418. )
  419. except Exception as e:
  420. return ToolResult(
  421. title="Scroll failed",
  422. output="",
  423. error=f"Failed to scroll: {str(e)}",
  424. long_term_memory="Scroll failed"
  425. )
  426. # ============================================================
  427. # Tab Management Tools
  428. # ============================================================
  429. @tool()
  430. async def switch_tab(tab_id: str, uid: str = "") -> ToolResult:
  431. """
  432. Switch to a different browser tab
  433. Args:
  434. tab_id: 4-character tab ID
  435. uid: User ID (auto-injected)
  436. Returns:
  437. Switch result
  438. """
  439. try:
  440. return ToolResult(
  441. title=f"Switched to tab {tab_id}",
  442. output=f"Switched to tab {tab_id}",
  443. long_term_memory=f"Switched to tab {tab_id}"
  444. )
  445. except Exception as e:
  446. return ToolResult(
  447. title="Switch tab failed",
  448. output="",
  449. error=f"Failed to switch tab: {str(e)}",
  450. long_term_memory="Switch tab failed"
  451. )
  452. @tool()
  453. async def close_tab(tab_id: str, uid: str = "") -> ToolResult:
  454. """
  455. Close a browser tab
  456. Args:
  457. tab_id: 4-character tab ID
  458. uid: User ID (auto-injected)
  459. Returns:
  460. Close result
  461. """
  462. try:
  463. return ToolResult(
  464. title=f"Closed tab {tab_id}",
  465. output=f"Closed tab {tab_id}",
  466. long_term_memory=f"Closed tab {tab_id}"
  467. )
  468. except Exception as e:
  469. return ToolResult(
  470. title="Close tab failed",
  471. output="",
  472. error=f"Failed to close tab: {str(e)}",
  473. long_term_memory="Close tab failed"
  474. )
  475. # ============================================================
  476. # Dropdown Tools
  477. # ============================================================
  478. @tool()
  479. async def get_dropdown_options(index: int, uid: str = "") -> ToolResult:
  480. """
  481. Get options from a dropdown element
  482. Args:
  483. index: Element index from browser state
  484. uid: User ID (auto-injected)
  485. Returns:
  486. Dropdown options
  487. """
  488. try:
  489. from playwright.async_api import async_playwright
  490. async with async_playwright() as p:
  491. browser = await p.chromium.launch(headless=False)
  492. context = await browser.new_context()
  493. page = await context.pages()[0] if context.pages() else await context.new_page()
  494. # This would need DOM state to map index to selector
  495. # For now, return a placeholder
  496. return ToolResult(
  497. title=f"Dropdown options for element {index}",
  498. output=f"Retrieved options for dropdown at index {index}",
  499. long_term_memory=f"Got dropdown options for element {index}"
  500. )
  501. except Exception as e:
  502. return ToolResult(
  503. title="Get dropdown options failed",
  504. output="",
  505. error=f"Failed to get dropdown options: {str(e)}",
  506. long_term_memory="Get dropdown options failed"
  507. )
  508. @tool()
  509. async def select_dropdown_option(index: int, text: str, uid: str = "") -> ToolResult:
  510. """
  511. Select an option from a dropdown
  512. Args:
  513. index: Element index from browser state
  514. text: Exact text/value to select
  515. uid: User ID (auto-injected)
  516. Returns:
  517. Selection result
  518. """
  519. try:
  520. from playwright.async_api import async_playwright
  521. async with async_playwright() as p:
  522. browser = await p.chromium.launch(headless=False)
  523. context = await browser.new_context()
  524. page = await context.pages()[0] if context.pages() else await context.new_page()
  525. # This would need DOM state to map index to selector
  526. return ToolResult(
  527. title=f"Selected dropdown option",
  528. output=f"Selected '{text}' from dropdown at index {index}",
  529. long_term_memory=f"Selected '{text}' from dropdown {index}"
  530. )
  531. except Exception as e:
  532. return ToolResult(
  533. title="Select dropdown option failed",
  534. output="",
  535. error=f"Failed to select dropdown option: {str(e)}",
  536. long_term_memory="Select dropdown option failed"
  537. )
  538. # ============================================================
  539. # File Upload Tool
  540. # ============================================================
  541. @tool()
  542. async def upload_file(index: int, path: str, uid: str = "") -> ToolResult:
  543. """
  544. Upload a file to a file input element
  545. Args:
  546. index: Element index from browser state
  547. path: Path to the file to upload
  548. uid: User ID (auto-injected)
  549. Returns:
  550. Upload result
  551. """
  552. try:
  553. from playwright.async_api import async_playwright
  554. async with async_playwright() as p:
  555. browser = await p.chromium.launch(headless=False)
  556. context = await browser.new_context()
  557. page = await context.pages()[0] if context.pages() else await context.new_page()
  558. # This would need DOM state to map index to selector
  559. return ToolResult(
  560. title="File uploaded",
  561. output=f"Uploaded file {path} to element {index}",
  562. long_term_memory=f"Uploaded file {path}"
  563. )
  564. except Exception as e:
  565. return ToolResult(
  566. title="Upload failed",
  567. output="",
  568. error=f"Failed to upload file: {str(e)}",
  569. long_term_memory="File upload failed"
  570. )
  571. # ============================================================
  572. # Task Completion Tool
  573. # ============================================================
  574. @tool()
  575. async def done(text: str, success: bool = True,
  576. files_to_display: Optional[List[str]] = None, uid: str = "") -> ToolResult:
  577. """
  578. Mark the task as complete and return final message to user
  579. Args:
  580. text: Final message to user in the requested format
  581. success: Whether the task completed successfully
  582. files_to_display: Optional list of file paths to display
  583. uid: User ID (auto-injected)
  584. Returns:
  585. Completion result
  586. """
  587. try:
  588. return ToolResult(
  589. title="Task completed" if success else "Task failed",
  590. output=text,
  591. long_term_memory=f"Task {'completed' if success else 'failed'}",
  592. attachments=files_to_display or [],
  593. metadata={"success": success}
  594. )
  595. except Exception as e:
  596. return ToolResult(
  597. title="Done failed",
  598. output="",
  599. error=f"Failed to complete task: {str(e)}",
  600. long_term_memory="Task completion failed"
  601. )