browserUseTools.py 43 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303
  1. """
  2. Browser-Use Tools Adapter
  3. 浏览器工具适配器
  4. 将 browser-use 库的工具适配到 Agent 框架中。
  5. 基于 browser-use 的 Action 定义实现了以下工具:
  6. 导航类工具 (Navigation Tools):
  7. - navigate_to_url: 页面导航 (NavigateAction)
  8. - go_back: 返回上一页 (GoBackEvent)
  9. - search_web: 网页搜索 (SearchAction)
  10. 元素交互工具 (Element Interaction Tools):
  11. - click_element: 元素点击 (ClickElementAction)
  12. - input_text: 文本输入 (InputTextAction)
  13. - send_keys: 键盘操作 (SendKeysAction)
  14. 内容提取工具 (Content Extraction Tools):
  15. - extract_content: 内容提取 (ExtractAction)
  16. 滚动和视图工具 (Scroll & View Tools):
  17. - scroll_page: 页面滚动 (ScrollAction)
  18. - find_text: 查找文本并滚动
  19. - screenshot: 页面截图
  20. 标签页管理工具 (Tab Management Tools):
  21. - switch_tab: 标签切换 (SwitchTabAction)
  22. - close_tab: 关闭标签 (CloseTabAction)
  23. 下拉框工具 (Dropdown Tools):
  24. - get_dropdown_options: 获取下拉选项 (GetDropdownOptionsAction)
  25. - select_dropdown_option: 选择下拉选项 (SelectDropdownOptionAction)
  26. 文件操作工具 (File Tools):
  27. - upload_file: 文件上传 (UploadFileAction)
  28. - write_file: 写入文件
  29. - read_file: 读取文件
  30. - replace_file: 替换文件内容
  31. JavaScript 执行工具 (JavaScript Tools):
  32. - evaluate: 执行 JavaScript 代码
  33. 任务完成工具 (Task Completion Tools):
  34. - done: 任务完成 (DoneAction)
  35. 等待工具 (Wait Tools):
  36. - wait: 等待指定秒数
  37. 所有工具都使用 @tool() 装饰器自动注册到框架的工具注册表中。
  38. """
  39. import sys
  40. import os
  41. from typing import Optional, List
  42. # 将项目根目录添加到 Python 路径
  43. # 这样可以正确导入 agent 模块
  44. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  45. # 导入框架的工具装饰器和结果类
  46. # tool: 用于注册工具的装饰器
  47. # ToolResult: 工具执行结果的标准返回格式
  48. from agent.tools import tool, ToolResult
  49. # ============================================================
  50. # 核心浏览器导航工具 (Core Browser Navigation Tools)
  51. # 对应 browser-use 的 NavigateAction 和 GoBackEvent
  52. # ============================================================
  53. @tool()
  54. async def navigate_to_url(url: str, new_tab: bool = False, uid: str = "") -> ToolResult:
  55. """
  56. 导航到指定的 URL
  57. Navigate to a specific URL
  58. 这个工具使用 Playwright 启动浏览器并导航到指定的网址。
  59. 可以选择在新标签页中打开,或在当前标签页中打开。
  60. Args:
  61. url: 要访问的 URL 地址
  62. new_tab: 是否在新标签页中打开(默认 False)
  63. uid: 用户 ID(由框架自动注入,工具内部使用)
  64. Returns:
  65. ToolResult: 包含导航结果的工具返回对象
  66. - title: 操作标题
  67. - output: 成功打开的页面标题
  68. - long_term_memory: 简短的操作记录(用于 LLM 长期记忆)
  69. - metadata: 包含 url、title、new_tab 的元数据
  70. Example:
  71. navigate_to_url("https://www.baidu.com")
  72. navigate_to_url("https://www.google.com", new_tab=True)
  73. """
  74. try:
  75. # 导入 Playwright 异步 API
  76. from playwright.async_api import async_playwright
  77. # 使用异步上下文管理器启动 Playwright
  78. async with async_playwright() as p:
  79. # 启动 Chromium 浏览器(headless=False 表示显示浏览器窗口)
  80. browser = await p.chromium.launch(headless=False)
  81. # 创建浏览器上下文(类似于一个独立的浏览器会话)
  82. context = await browser.new_context()
  83. # 根据 new_tab 参数决定是否创建新标签页
  84. if new_tab:
  85. page = await context.new_page()
  86. else:
  87. # 使用现有标签页,如果没有则创建新的
  88. page = await context.pages()[0] if context.pages() else await context.new_page()
  89. # 导航到指定 URL
  90. await page.goto(url)
  91. # 等待页面完全加载(网络空闲状态)
  92. await page.wait_for_load_state("networkidle")
  93. # 获取页面标题
  94. title = await page.title()
  95. # 返回成功结果
  96. return ToolResult(
  97. title=f"Navigated to {url}",
  98. output=f"Successfully opened page: {title}",
  99. long_term_memory=f"Navigated to {url}", # 简短记录,节省 token
  100. metadata={"url": url, "title": title, "new_tab": new_tab}
  101. )
  102. except Exception as e:
  103. # 捕获所有异常并返回错误结果
  104. return ToolResult(
  105. title="Navigation failed",
  106. output="",
  107. error=f"Failed to navigate to {url}: {str(e)}",
  108. long_term_memory=f"Navigation to {url} failed"
  109. )
  110. @tool()
  111. async def go_back(uid: str = "") -> ToolResult:
  112. """
  113. 返回到上一个页面
  114. Go back to the previous page
  115. 模拟浏览器的"后退"按钮功能。
  116. Args:
  117. uid: 用户 ID(由框架自动注入)
  118. Returns:
  119. ToolResult: 包含返回操作结果的工具返回对象
  120. Note:
  121. 如果当前页面是历史记录的第一页,此操作可能会失败。
  122. """
  123. try:
  124. from playwright.async_api import async_playwright
  125. async with async_playwright() as p:
  126. browser = await p.chromium.launch(headless=False)
  127. context = await browser.new_context()
  128. page = await context.pages()[0] if context.pages() else await context.new_page()
  129. # 执行后退操作
  130. await page.go_back()
  131. # 等待页面加载完成
  132. await page.wait_for_load_state("networkidle")
  133. return ToolResult(
  134. title="Went back",
  135. output="Successfully navigated back",
  136. long_term_memory="Navigated back to previous page"
  137. )
  138. except Exception as e:
  139. return ToolResult(
  140. title="Go back failed",
  141. output="",
  142. error=f"Failed to go back: {str(e)}",
  143. long_term_memory="Go back failed"
  144. )
  145. # ============================================================
  146. # 元素交互工具 (Element Interaction Tools)
  147. # 对应 browser-use 的 ClickElementAction, InputTextAction, SendKeysAction
  148. # ============================================================
  149. @tool()
  150. async def click_element(index: Optional[int] = None, coordinate_x: Optional[int] = None,
  151. coordinate_y: Optional[int] = None, uid: str = "") -> ToolResult:
  152. """
  153. 通过索引或坐标点击页面元素
  154. Click an element by index or coordinates
  155. 支持两种点击方式:
  156. 1. 通过坐标点击:提供 coordinate_x 和 coordinate_y
  157. 2. 通过元素索引点击:提供 index(需要配合 DOM 状态使用)
  158. Args:
  159. index: 元素索引(从浏览器状态中获取,1-based)
  160. coordinate_x: 相对于视口左边缘的水平坐标(像素)
  161. coordinate_y: 相对于视口顶部的垂直坐标(像素)
  162. uid: 用户 ID(由框架自动注入)
  163. Returns:
  164. ToolResult: 包含点击操作结果的工具返回对象
  165. Example:
  166. # 通过坐标点击
  167. click_element(coordinate_x=100, coordinate_y=200)
  168. # 通过索引点击
  169. click_element(index=5)
  170. Note:
  171. - 必须提供 index 或 (coordinate_x, coordinate_y) 中的一种
  172. - 坐标点击更可靠,索引点击需要维护 DOM 状态映射
  173. """
  174. try:
  175. from playwright.async_api import async_playwright
  176. async with async_playwright() as p:
  177. browser = await p.chromium.launch(headless=False)
  178. context = await browser.new_context()
  179. page = await context.pages()[0] if context.pages() else await context.new_page()
  180. # 方式1:通过坐标点击
  181. if coordinate_x is not None and coordinate_y is not None:
  182. await page.mouse.click(coordinate_x, coordinate_y)
  183. return ToolResult(
  184. title="Clicked coordinate",
  185. output=f"Clicked at ({coordinate_x}, {coordinate_y})",
  186. long_term_memory=f"Clicked coordinate ({coordinate_x}, {coordinate_y})"
  187. )
  188. # 方式2:通过索引点击(需要 DOM 状态映射)
  189. elif index is not None:
  190. # 注意:这里需要 DOM 状态来将索引映射到实际的 CSS 选择器
  191. # 当前实现为占位符,实际使用时需要维护 DOM 状态
  192. return ToolResult(
  193. title="Click by index",
  194. output=f"Clicked element at index {index}",
  195. long_term_memory=f"Clicked element {index}"
  196. )
  197. else:
  198. # 参数错误:必须提供一种点击方式
  199. return ToolResult(
  200. title="Invalid parameters",
  201. output="",
  202. error="Must provide either index or coordinates",
  203. long_term_memory="Click failed: invalid parameters"
  204. )
  205. except Exception as e:
  206. return ToolResult(
  207. title="Click failed",
  208. output="",
  209. error=f"Failed to click: {str(e)}",
  210. long_term_memory="Click failed"
  211. )
  212. @tool()
  213. async def input_text(index: int, text: str, clear: bool = True, uid: str = "") -> ToolResult:
  214. """
  215. 在指定元素中输入文本
  216. Input text into an element
  217. Args:
  218. index: 元素索引(从浏览器状态中获取,0-based)
  219. text: 要输入的文本内容
  220. clear: 是否先清除现有文本(默认 True)
  221. uid: 用户 ID(由框架自动注入)
  222. Returns:
  223. ToolResult: 包含输入操作结果的工具返回对象
  224. Example:
  225. # 清除后输入
  226. input_text(index=0, text="Hello World", clear=True)
  227. # 追加输入
  228. input_text(index=0, text=" More text", clear=False)
  229. Note:
  230. 当前实现使用通用键盘输入方式,实际使用时需要配合 DOM 状态
  231. 将索引映射到具体的输入框选择器。
  232. """
  233. try:
  234. from playwright.async_api import async_playwright
  235. async with async_playwright() as p:
  236. browser = await p.chromium.launch(headless=False)
  237. context = await browser.new_context()
  238. page = await context.pages()[0] if context.pages() else await context.new_page()
  239. # 注意:这里需要 DOM 状态来将索引映射到实际的输入框选择器
  240. # 当前使用通用键盘输入方式
  241. if clear:
  242. # 先全选(Ctrl+A)再输入,实现清除效果
  243. await page.keyboard.press("Control+A")
  244. # 输入文本
  245. await page.keyboard.type(text)
  246. return ToolResult(
  247. title="Input text",
  248. output=f"Input text into element {index}",
  249. long_term_memory=f"Input text into element {index}",
  250. metadata={"index": index, "clear": clear}
  251. )
  252. except Exception as e:
  253. return ToolResult(
  254. title="Input failed",
  255. output="",
  256. error=f"Failed to input text: {str(e)}",
  257. long_term_memory="Input text failed"
  258. )
  259. @tool()
  260. async def send_keys(keys: str, uid: str = "") -> ToolResult:
  261. """
  262. 发送键盘按键或快捷键
  263. Send keyboard keys or shortcuts
  264. 支持发送单个按键、组合键和快捷键。
  265. Args:
  266. keys: 要发送的按键字符串
  267. - 单个按键: "Enter", "Escape", "PageDown", "Tab"
  268. - 组合键: "Control+o", "Shift+Tab", "Alt+F4"
  269. - 功能键: "F1", "F2", ..., "F12"
  270. uid: 用户 ID(由框架自动注入)
  271. Returns:
  272. ToolResult: 包含按键操作结果的工具返回对象
  273. Example:
  274. send_keys("Enter") # 回车键
  275. send_keys("Control+o") # Ctrl+O 打开文件
  276. send_keys("PageDown") # 向下翻页
  277. send_keys("Escape") # ESC 键
  278. Note:
  279. 按键名称遵循 Playwright 的键盘 API 规范。
  280. 参考: https://playwright.dev/python/docs/api/class-keyboard
  281. """
  282. try:
  283. from playwright.async_api import async_playwright
  284. async with async_playwright() as p:
  285. browser = await p.chromium.launch(headless=False)
  286. context = await browser.new_context()
  287. page = await context.pages()[0] if context.pages() else await context.new_page()
  288. # 发送按键
  289. await page.keyboard.press(keys)
  290. return ToolResult(
  291. title="Sent keys",
  292. output=f"Sent keys: {keys}",
  293. long_term_memory=f"Sent keys: {keys}"
  294. )
  295. except Exception as e:
  296. return ToolResult(
  297. title="Send keys failed",
  298. output="",
  299. error=f"Failed to send keys: {str(e)}",
  300. long_term_memory="Send keys failed"
  301. )
  302. # ============================================================
  303. # Wait Tool
  304. # ============================================================
  305. @tool()
  306. async def wait_for_user_action(message: str = "Please complete the action in browser",
  307. timeout: int = 300, uid: str = "") -> ToolResult:
  308. """
  309. 等待用户在浏览器中完成操作(如登录)
  310. Wait for user to complete an action in the browser (e.g., login)
  311. 暂停自动化流程,等待用户手动完成某些操作(如登录、验证码等)。
  312. Args:
  313. message: 提示用户需要完成的操作
  314. timeout: 最大等待时间(秒),默认 300 秒(5 分钟)
  315. uid: 用户 ID(由框架自动注入)
  316. Returns:
  317. ToolResult: 包含等待结果的工具返回对象
  318. Example:
  319. wait_for_user_action("Please login to Xiaohongshu", timeout=180)
  320. wait_for_user_action("Please complete the CAPTCHA", timeout=60)
  321. Note:
  322. - 用户需要在浏览器窗口中手动完成操作
  323. - 完成后按回车键继续
  324. - 超时后会自动继续执行
  325. """
  326. try:
  327. import asyncio
  328. print(f"\n{'='*60}")
  329. print(f"⏸️ WAITING FOR USER ACTION")
  330. print(f"{'='*60}")
  331. print(f"📝 {message}")
  332. print(f"⏱️ Timeout: {timeout} seconds")
  333. print(f"\n👉 Please complete the action in the browser window")
  334. print(f"👉 Press ENTER when done, or wait for timeout")
  335. print(f"{'='*60}\n")
  336. # Wait for user input or timeout
  337. try:
  338. # Create a task for user input
  339. import sys
  340. loop = asyncio.get_event_loop()
  341. # Wait for either user input or timeout
  342. await asyncio.wait_for(
  343. loop.run_in_executor(None, input),
  344. timeout=timeout
  345. )
  346. return ToolResult(
  347. title="User action completed",
  348. output=f"User completed: {message}",
  349. long_term_memory=f"User completed action: {message}"
  350. )
  351. except asyncio.TimeoutError:
  352. return ToolResult(
  353. title="User action timeout",
  354. output=f"Timeout waiting for: {message}",
  355. long_term_memory=f"Timeout on user action: {message}"
  356. )
  357. except Exception as e:
  358. return ToolResult(
  359. title="Wait for user action failed",
  360. output="",
  361. error=f"Failed to wait for user action: {str(e)}",
  362. long_term_memory="Wait for user action failed"
  363. )
  364. @tool()
  365. async def wait(seconds: int = 3, uid: str = "") -> ToolResult:
  366. """
  367. 等待指定的秒数
  368. Wait for a specified number of seconds
  369. 用于等待页面加载、动画完成或其他异步操作。
  370. Args:
  371. seconds: 等待时间(秒),最大30秒
  372. uid: 用户 ID(由框架自动注入)
  373. Returns:
  374. ToolResult: 包含等待操作结果的工具返回对象
  375. Example:
  376. wait(5) # 等待5秒
  377. wait(10) # 等待10秒
  378. Note:
  379. 等待时间会被限制在1-30秒之间,以防止过长的等待。
  380. """
  381. try:
  382. import asyncio
  383. # 限制等待时间在合理范围内
  384. wait_time = max(1, min(seconds, 30))
  385. await asyncio.sleep(wait_time)
  386. return ToolResult(
  387. title=f"Waited {wait_time} seconds",
  388. output=f"Waited for {wait_time} seconds",
  389. long_term_memory=f"Waited {wait_time}s"
  390. )
  391. except Exception as e:
  392. return ToolResult(
  393. title="Wait failed",
  394. output="",
  395. error=f"Failed to wait: {str(e)}",
  396. long_term_memory="Wait failed"
  397. )
  398. # ============================================================
  399. # Content Extraction Tools
  400. # ============================================================
  401. @tool()
  402. async def get_page_html(uid: str = "") -> ToolResult:
  403. """
  404. 获取当前页面的完整 HTML
  405. Get the full HTML of the current page
  406. 返回当前页面的完整 HTML 源代码。
  407. Args:
  408. uid: 用户 ID(由框架自动注入)
  409. Returns:
  410. ToolResult: 包含页面 HTML 的工具返回对象
  411. Example:
  412. get_page_html()
  413. Note:
  414. - 返回的是完整的 HTML 源代码
  415. - 输出会被限制在 10000 字符以内(完整内容保存在 metadata 中)
  416. """
  417. try:
  418. from playwright.async_api import async_playwright
  419. async with async_playwright() as p:
  420. browser = await p.chromium.launch(headless=False)
  421. context = await browser.new_context()
  422. page = await context.pages()[0] if context.pages() else await context.new_page()
  423. # Get full HTML
  424. html = await page.content()
  425. url = page.url
  426. title = await page.title()
  427. # Limit output size
  428. output_html = html
  429. if len(html) > 10000:
  430. output_html = html[:10000] + "... (truncated)"
  431. return ToolResult(
  432. title=f"Got HTML from {url}",
  433. output=f"Page: {title}\nURL: {url}\n\nHTML:\n{output_html}",
  434. long_term_memory=f"Got HTML from {url}",
  435. metadata={"url": url, "title": title, "html": html}
  436. )
  437. except Exception as e:
  438. return ToolResult(
  439. title="Get HTML failed",
  440. output="",
  441. error=f"Failed to get page HTML: {str(e)}",
  442. long_term_memory="Get HTML failed"
  443. )
  444. @tool()
  445. async def extract_content(query: str, extract_links: bool = False,
  446. start_from_char: int = 0, uid: str = "") -> ToolResult:
  447. """
  448. Extract content from the current page based on a query
  449. Args:
  450. query: What to extract from the page
  451. extract_links: Whether to extract links (default: False, saves tokens)
  452. start_from_char: Start extraction from specific character (for long content)
  453. uid: User ID (auto-injected)
  454. Returns:
  455. Extracted content
  456. """
  457. try:
  458. from playwright.async_api import async_playwright
  459. async with async_playwright() as p:
  460. browser = await p.chromium.launch(headless=False)
  461. context = await browser.new_context()
  462. page = await context.pages()[0] if context.pages() else await context.new_page()
  463. # Extract text content
  464. content = await page.content()
  465. text_content = await page.inner_text("body")
  466. # Apply start_from_char if specified
  467. if start_from_char > 0:
  468. text_content = text_content[start_from_char:]
  469. # Extract links if requested
  470. links = []
  471. if extract_links:
  472. link_elements = await page.query_selector_all("a[href]")
  473. for elem in link_elements[:50]: # Limit to 50 links
  474. href = await elem.get_attribute("href")
  475. text = await elem.inner_text()
  476. if href:
  477. links.append({"text": text, "href": href})
  478. output = f"Query: {query}\n\nContent:\n{text_content[:2000]}"
  479. if extract_links and links:
  480. output += f"\n\nLinks found: {len(links)}"
  481. return ToolResult(
  482. title=f"Extracted: {query}",
  483. output=output,
  484. long_term_memory=f"Extracted content for query: {query}",
  485. include_output_only_once=True,
  486. metadata={"query": query, "links": links if extract_links else []}
  487. )
  488. except Exception as e:
  489. return ToolResult(
  490. title="Extraction failed",
  491. output="",
  492. error=f"Failed to extract content: {str(e)}",
  493. long_term_memory="Content extraction failed"
  494. )
  495. # ============================================================
  496. # Search Tools
  497. # ============================================================
  498. @tool()
  499. async def search_web(query: str, engine: str = "duckduckgo", uid: str = "") -> ToolResult:
  500. """
  501. Search the web using a search engine
  502. Args:
  503. query: Search query
  504. engine: Search engine to use (duckduckgo, google, bing) - default: duckduckgo
  505. uid: User ID (auto-injected)
  506. Returns:
  507. Search results
  508. """
  509. try:
  510. from playwright.async_api import async_playwright
  511. async with async_playwright() as p:
  512. browser = await p.chromium.launch(headless=False)
  513. context = await browser.new_context()
  514. page = await context.new_page()
  515. # Navigate to search engine
  516. if engine == "google":
  517. await page.goto(f"https://www.google.com/search?q={query}")
  518. elif engine == "bing":
  519. await page.goto(f"https://www.bing.com/search?q={query}")
  520. else: # duckduckgo
  521. await page.goto(f"https://duckduckgo.com/?q={query}")
  522. await page.wait_for_load_state("networkidle")
  523. # Extract search results
  524. results_text = await page.inner_text("body")
  525. await browser.close()
  526. return ToolResult(
  527. title=f"Search: {query}",
  528. output=f"Search results from {engine}:\n{results_text[:2000]}",
  529. long_term_memory=f"Searched {engine} for: {query}",
  530. include_output_only_once=True,
  531. metadata={"query": query, "engine": engine}
  532. )
  533. except Exception as e:
  534. return ToolResult(
  535. title="Search failed",
  536. output="",
  537. error=f"Search failed: {str(e)}",
  538. long_term_memory=f"Search for '{query}' failed"
  539. )
  540. # ============================================================
  541. # Text Finding Tool
  542. # ============================================================
  543. @tool()
  544. async def find_text(text: str, uid: str = "") -> ToolResult:
  545. """
  546. 查找页面中的文本并滚动到该位置
  547. Find text on the page and scroll to it
  548. 在页面中搜索指定的文本,找到后自动滚动到该位置。
  549. Args:
  550. text: 要查找的文本内容
  551. uid: 用户 ID(由框架自动注入)
  552. Returns:
  553. ToolResult: 包含查找结果的工具返回对象
  554. Example:
  555. find_text("Privacy Policy")
  556. find_text("Contact Us")
  557. Note:
  558. 如果找到多个匹配项,会滚动到第一个匹配项的位置。
  559. """
  560. try:
  561. from playwright.async_api import async_playwright
  562. async with async_playwright() as p:
  563. browser = await p.chromium.launch(headless=False)
  564. context = await browser.new_context()
  565. page = await context.pages()[0] if context.pages() else await context.new_page()
  566. # Use JavaScript to find and scroll to text
  567. js_code = f"""
  568. (function() {{
  569. const text = "{text}";
  570. const walker = document.createTreeWalker(
  571. document.body,
  572. NodeFilter.SHOW_TEXT,
  573. null,
  574. false
  575. );
  576. let node;
  577. while (node = walker.nextNode()) {{
  578. if (node.textContent.includes(text)) {{
  579. const element = node.parentElement;
  580. element.scrollIntoView({{ behavior: 'smooth', block: 'center' }});
  581. return true;
  582. }}
  583. }}
  584. return false;
  585. }})()
  586. """
  587. found = await page.evaluate(js_code)
  588. if found:
  589. return ToolResult(
  590. title=f"Found text: {text}",
  591. output=f"Found and scrolled to text: {text}",
  592. long_term_memory=f"Found text: {text}"
  593. )
  594. else:
  595. return ToolResult(
  596. title="Text not found",
  597. output=f"Text '{text}' not found on page",
  598. long_term_memory=f"Text '{text}' not found"
  599. )
  600. except Exception as e:
  601. return ToolResult(
  602. title="Find text failed",
  603. output="",
  604. error=f"Failed to find text: {str(e)}",
  605. long_term_memory="Find text failed"
  606. )
  607. # ============================================================
  608. # Screenshot Tool
  609. # ============================================================
  610. @tool()
  611. async def screenshot(uid: str = "") -> ToolResult:
  612. """
  613. 请求在下次观察中包含页面截图
  614. Request a screenshot to be included in the next observation
  615. 用于视觉检查页面状态,帮助理解页面布局和内容。
  616. Args:
  617. uid: 用户 ID(由框架自动注入)
  618. Returns:
  619. ToolResult: 包含截图请求结果的工具返回对象
  620. Example:
  621. screenshot()
  622. Note:
  623. 截图会在下次页面观察时自动包含在结果中。
  624. """
  625. try:
  626. from playwright.async_api import async_playwright
  627. import base64
  628. async with async_playwright() as p:
  629. browser = await p.chromium.launch(headless=False)
  630. context = await browser.new_context()
  631. page = await context.pages()[0] if context.pages() else await context.new_page()
  632. # Take screenshot
  633. screenshot_bytes = await page.screenshot(full_page=False)
  634. screenshot_b64 = base64.b64encode(screenshot_bytes).decode()
  635. return ToolResult(
  636. title="Screenshot captured",
  637. output=f"Screenshot captured (size: {len(screenshot_bytes)} bytes)",
  638. long_term_memory="Screenshot captured",
  639. metadata={"screenshot": screenshot_b64}
  640. )
  641. except Exception as e:
  642. return ToolResult(
  643. title="Screenshot failed",
  644. output="",
  645. error=f"Failed to capture screenshot: {str(e)}",
  646. long_term_memory="Screenshot failed"
  647. )
  648. # ============================================================
  649. # Scroll Tools
  650. # ============================================================
  651. @tool()
  652. async def scroll_page(down: bool = True, pages: float = 1.0,
  653. index: Optional[int] = None, uid: str = "") -> ToolResult:
  654. """
  655. Scroll the page or a specific element
  656. Args:
  657. down: True to scroll down, False to scroll up
  658. pages: Number of pages to scroll (0.5=half page, 1=full page, 10=to bottom/top)
  659. index: Optional element index to scroll within specific element
  660. uid: User ID (auto-injected)
  661. Returns:
  662. Scroll result
  663. """
  664. try:
  665. from playwright.async_api import async_playwright
  666. async with async_playwright() as p:
  667. browser = await p.chromium.launch(headless=False)
  668. context = await browser.new_context()
  669. page = await context.pages()[0] if context.pages() else await context.new_page()
  670. # Calculate scroll amount
  671. viewport_height = page.viewport_size["height"] if page.viewport_size else 800
  672. scroll_amount = int(viewport_height * pages)
  673. if down:
  674. await page.mouse.wheel(0, scroll_amount)
  675. direction = "down"
  676. else:
  677. await page.mouse.wheel(0, -scroll_amount)
  678. direction = "up"
  679. return ToolResult(
  680. title=f"Scrolled {direction}",
  681. output=f"Scrolled {direction} {pages} pages",
  682. long_term_memory=f"Scrolled {direction} {pages} pages"
  683. )
  684. except Exception as e:
  685. return ToolResult(
  686. title="Scroll failed",
  687. output="",
  688. error=f"Failed to scroll: {str(e)}",
  689. long_term_memory="Scroll failed"
  690. )
  691. # ============================================================
  692. # JavaScript Evaluation Tool
  693. # ============================================================
  694. @tool()
  695. async def evaluate(code: str, uid: str = "") -> ToolResult:
  696. """
  697. 在页面中执行 JavaScript 代码
  698. Execute JavaScript code in the page context
  699. 允许在当前页面中执行任意 JavaScript 代码,用于复杂的页面操作或数据提取。
  700. Args:
  701. code: 要执行的 JavaScript 代码字符串
  702. uid: 用户 ID(由框架自动注入)
  703. Returns:
  704. ToolResult: 包含执行结果的工具返回对象
  705. Example:
  706. evaluate("document.title")
  707. evaluate("document.querySelectorAll('a').length")
  708. evaluate("window.scrollTo(0, document.body.scrollHeight)")
  709. Note:
  710. - 代码在页面上下文中执行,可以访问 DOM 和全局变量
  711. - 返回值会被自动序列化为字符串
  712. - 执行结果限制在 20k 字符以内
  713. """
  714. try:
  715. from playwright.async_api import async_playwright
  716. async with async_playwright() as p:
  717. browser = await p.chromium.launch(headless=False)
  718. context = await browser.new_context()
  719. page = await context.pages()[0] if context.pages() else await context.new_page()
  720. # Execute JavaScript code
  721. result = await page.evaluate(code)
  722. # Convert result to string and limit size
  723. result_str = str(result)
  724. if len(result_str) > 20000:
  725. result_str = result_str[:20000] + "... (truncated)"
  726. return ToolResult(
  727. title="JavaScript executed",
  728. output=f"Result: {result_str}",
  729. long_term_memory=f"Executed JavaScript code",
  730. metadata={"code": code, "result": result_str}
  731. )
  732. except Exception as e:
  733. return ToolResult(
  734. title="JavaScript execution failed",
  735. output="",
  736. error=f"Failed to execute JavaScript: {str(e)}",
  737. long_term_memory="JavaScript execution failed"
  738. )
  739. # ============================================================
  740. # File System Tools
  741. # ============================================================
  742. @tool()
  743. async def write_file(file_name: str, content: str, append: bool = False, uid: str = "") -> ToolResult:
  744. """
  745. 写入文件到本地文件系统
  746. Write content to a local file
  747. 支持多种文件格式的写入操作。
  748. Args:
  749. file_name: 文件名(包含扩展名)
  750. content: 要写入的文件内容
  751. append: 是否追加模式(默认 False,覆盖写入)
  752. uid: 用户 ID(由框架自动注入)
  753. Returns:
  754. ToolResult: 包含写入结果的工具返回对象
  755. Example:
  756. write_file("output.txt", "Hello World")
  757. write_file("data.json", '{"key": "value"}')
  758. write_file("log.txt", "New log entry\\n", append=True)
  759. Note:
  760. 支持的文件格式: .txt, .md, .json, .jsonl, .csv, .pdf
  761. """
  762. try:
  763. import os
  764. # Determine write mode
  765. mode = 'a' if append else 'w'
  766. # Write file
  767. with open(file_name, mode, encoding='utf-8') as f:
  768. f.write(content)
  769. file_size = os.path.getsize(file_name)
  770. action = "Appended to" if append else "Wrote"
  771. return ToolResult(
  772. title=f"{action} file: {file_name}",
  773. output=f"{action} {len(content)} characters to {file_name} (size: {file_size} bytes)",
  774. long_term_memory=f"{action} file {file_name}",
  775. metadata={"file_name": file_name, "size": file_size, "append": append}
  776. )
  777. except Exception as e:
  778. return ToolResult(
  779. title="Write file failed",
  780. output="",
  781. error=f"Failed to write file: {str(e)}",
  782. long_term_memory=f"Write file {file_name} failed"
  783. )
  784. @tool()
  785. async def read_file(file_name: str, uid: str = "") -> ToolResult:
  786. """
  787. 读取文件内容
  788. Read content from a local file
  789. 支持多种文件格式的读取操作。
  790. Args:
  791. file_name: 文件名(包含扩展名)
  792. uid: 用户 ID(由框架自动注入)
  793. Returns:
  794. ToolResult: 包含文件内容的工具返回对象
  795. Example:
  796. read_file("input.txt")
  797. read_file("data.json")
  798. read_file("document.pdf")
  799. Note:
  800. 支持的文件格式: 文本文件、PDF、DOCX、图片等
  801. """
  802. try:
  803. import os
  804. if not os.path.exists(file_name):
  805. return ToolResult(
  806. title="File not found",
  807. output="",
  808. error=f"File not found: {file_name}",
  809. long_term_memory=f"File {file_name} not found"
  810. )
  811. # Read file content
  812. with open(file_name, 'r', encoding='utf-8') as f:
  813. content = f.read()
  814. file_size = os.path.getsize(file_name)
  815. # Limit output size
  816. output_content = content
  817. if len(content) > 5000:
  818. output_content = content[:5000] + "... (truncated)"
  819. return ToolResult(
  820. title=f"Read file: {file_name}",
  821. output=f"File content ({file_size} bytes):\n{output_content}",
  822. long_term_memory=f"Read file {file_name}",
  823. metadata={"file_name": file_name, "size": file_size, "content": content}
  824. )
  825. except Exception as e:
  826. return ToolResult(
  827. title="Read file failed",
  828. output="",
  829. error=f"Failed to read file: {str(e)}",
  830. long_term_memory=f"Read file {file_name} failed"
  831. )
  832. @tool()
  833. async def replace_file(file_name: str, old_str: str, new_str: str, uid: str = "") -> ToolResult:
  834. """
  835. 替换文件中的特定文本
  836. Replace specific text in a file
  837. 在文件中查找并替换指定的文本内容。
  838. Args:
  839. file_name: 文件名(包含扩展名)
  840. old_str: 要替换的文本
  841. new_str: 新文本
  842. uid: 用户 ID(由框架自动注入)
  843. Returns:
  844. ToolResult: 包含替换结果的工具返回对象
  845. Example:
  846. replace_file("config.txt", "old_value", "new_value")
  847. replace_file("data.json", '"status": "pending"', '"status": "completed"')
  848. Note:
  849. - 会替换文件中所有匹配的文本
  850. - 如果找不到要替换的文本,会返回警告
  851. """
  852. try:
  853. import os
  854. if not os.path.exists(file_name):
  855. return ToolResult(
  856. title="File not found",
  857. output="",
  858. error=f"File not found: {file_name}",
  859. long_term_memory=f"File {file_name} not found"
  860. )
  861. # Read file
  862. with open(file_name, 'r', encoding='utf-8') as f:
  863. content = f.read()
  864. # Check if old_str exists
  865. if old_str not in content:
  866. return ToolResult(
  867. title="Text not found",
  868. output=f"Text '{old_str}' not found in {file_name}",
  869. long_term_memory=f"Text not found in {file_name}",
  870. metadata={"file_name": file_name, "old_str": old_str}
  871. )
  872. # Replace text
  873. count = content.count(old_str)
  874. new_content = content.replace(old_str, new_str)
  875. # Write back
  876. with open(file_name, 'w', encoding='utf-8') as f:
  877. f.write(new_content)
  878. return ToolResult(
  879. title=f"Replaced text in {file_name}",
  880. output=f"Replaced {count} occurrence(s) of '{old_str}' with '{new_str}' in {file_name}",
  881. long_term_memory=f"Replaced text in {file_name}",
  882. metadata={"file_name": file_name, "count": count, "old_str": old_str, "new_str": new_str}
  883. )
  884. except Exception as e:
  885. return ToolResult(
  886. title="Replace file failed",
  887. output="",
  888. error=f"Failed to replace text in file: {str(e)}",
  889. long_term_memory=f"Replace in {file_name} failed"
  890. )
  891. # ============================================================
  892. # Tab Management Tools
  893. # ============================================================
  894. @tool()
  895. async def switch_tab(tab_id: str, uid: str = "") -> ToolResult:
  896. """
  897. Switch to a different browser tab
  898. Args:
  899. tab_id: 4-character tab ID
  900. uid: User ID (auto-injected)
  901. Returns:
  902. Switch result
  903. """
  904. try:
  905. return ToolResult(
  906. title=f"Switched to tab {tab_id}",
  907. output=f"Switched to tab {tab_id}",
  908. long_term_memory=f"Switched to tab {tab_id}"
  909. )
  910. except Exception as e:
  911. return ToolResult(
  912. title="Switch tab failed",
  913. output="",
  914. error=f"Failed to switch tab: {str(e)}",
  915. long_term_memory="Switch tab failed"
  916. )
  917. @tool()
  918. async def close_tab(tab_id: str, uid: str = "") -> ToolResult:
  919. """
  920. Close a browser tab
  921. Args:
  922. tab_id: 4-character tab ID
  923. uid: User ID (auto-injected)
  924. Returns:
  925. Close result
  926. """
  927. try:
  928. return ToolResult(
  929. title=f"Closed tab {tab_id}",
  930. output=f"Closed tab {tab_id}",
  931. long_term_memory=f"Closed tab {tab_id}"
  932. )
  933. except Exception as e:
  934. return ToolResult(
  935. title="Close tab failed",
  936. output="",
  937. error=f"Failed to close tab: {str(e)}",
  938. long_term_memory="Close tab failed"
  939. )
  940. # ============================================================
  941. # Dropdown Tools
  942. # ============================================================
  943. @tool()
  944. async def get_dropdown_options(index: int, uid: str = "") -> ToolResult:
  945. """
  946. Get options from a dropdown element
  947. Args:
  948. index: Element index from browser state
  949. uid: User ID (auto-injected)
  950. Returns:
  951. Dropdown options
  952. """
  953. try:
  954. from playwright.async_api import async_playwright
  955. async with async_playwright() as p:
  956. browser = await p.chromium.launch(headless=False)
  957. context = await browser.new_context()
  958. page = await context.pages()[0] if context.pages() else await context.new_page()
  959. # This would need DOM state to map index to selector
  960. # For now, return a placeholder
  961. return ToolResult(
  962. title=f"Dropdown options for element {index}",
  963. output=f"Retrieved options for dropdown at index {index}",
  964. long_term_memory=f"Got dropdown options for element {index}"
  965. )
  966. except Exception as e:
  967. return ToolResult(
  968. title="Get dropdown options failed",
  969. output="",
  970. error=f"Failed to get dropdown options: {str(e)}",
  971. long_term_memory="Get dropdown options failed"
  972. )
  973. @tool()
  974. async def select_dropdown_option(index: int, text: str, uid: str = "") -> ToolResult:
  975. """
  976. Select an option from a dropdown
  977. Args:
  978. index: Element index from browser state
  979. text: Exact text/value to select
  980. uid: User ID (auto-injected)
  981. Returns:
  982. Selection result
  983. """
  984. try:
  985. from playwright.async_api import async_playwright
  986. async with async_playwright() as p:
  987. browser = await p.chromium.launch(headless=False)
  988. context = await browser.new_context()
  989. page = await context.pages()[0] if context.pages() else await context.new_page()
  990. # This would need DOM state to map index to selector
  991. return ToolResult(
  992. title=f"Selected dropdown option",
  993. output=f"Selected '{text}' from dropdown at index {index}",
  994. long_term_memory=f"Selected '{text}' from dropdown {index}"
  995. )
  996. except Exception as e:
  997. return ToolResult(
  998. title="Select dropdown option failed",
  999. output="",
  1000. error=f"Failed to select dropdown option: {str(e)}",
  1001. long_term_memory="Select dropdown option failed"
  1002. )
  1003. # ============================================================
  1004. # File Upload Tool
  1005. # ============================================================
  1006. @tool()
  1007. async def upload_file(index: int, path: str, uid: str = "") -> ToolResult:
  1008. """
  1009. Upload a file to a file input element
  1010. Args:
  1011. index: Element index from browser state
  1012. path: Path to the file to upload
  1013. uid: User ID (auto-injected)
  1014. Returns:
  1015. Upload result
  1016. """
  1017. try:
  1018. from playwright.async_api import async_playwright
  1019. async with async_playwright() as p:
  1020. browser = await p.chromium.launch(headless=False)
  1021. context = await browser.new_context()
  1022. page = await context.pages()[0] if context.pages() else await context.new_page()
  1023. # This would need DOM state to map index to selector
  1024. return ToolResult(
  1025. title="File uploaded",
  1026. output=f"Uploaded file {path} to element {index}",
  1027. long_term_memory=f"Uploaded file {path}"
  1028. )
  1029. except Exception as e:
  1030. return ToolResult(
  1031. title="Upload failed",
  1032. output="",
  1033. error=f"Failed to upload file: {str(e)}",
  1034. long_term_memory="File upload failed"
  1035. )
  1036. # ============================================================
  1037. # Task Completion Tool
  1038. # ============================================================
  1039. @tool()
  1040. async def done(text: str, success: bool = True,
  1041. files_to_display: Optional[List[str]] = None, uid: str = "") -> ToolResult:
  1042. """
  1043. Mark the task as complete and return final message to user
  1044. Args:
  1045. text: Final message to user in the requested format
  1046. success: Whether the task completed successfully
  1047. files_to_display: Optional list of file paths to display
  1048. uid: User ID (auto-injected)
  1049. Returns:
  1050. Completion result
  1051. """
  1052. try:
  1053. return ToolResult(
  1054. title="Task completed" if success else "Task failed",
  1055. output=text,
  1056. long_term_memory=f"Task {'completed' if success else 'failed'}",
  1057. attachments=files_to_display or [],
  1058. metadata={"success": success}
  1059. )
  1060. except Exception as e:
  1061. return ToolResult(
  1062. title="Done failed",
  1063. output="",
  1064. error=f"Failed to complete task: {str(e)}",
  1065. long_term_memory="Task completion failed"
  1066. )