run.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
"""
Real-environment integration tests (Agent-main).
"""
  4. import asyncio
  5. import json
  6. import os
  7. import sys
  8. from dataclasses import dataclass
  9. from pathlib import Path
  10. from tempfile import TemporaryDirectory
  11. from typing import Any, Dict, List
# Keep browser_use from writing to ~/.config, which raises a permission error
# in restricted environments. setdefault leaves an existing value untouched.
os.environ.setdefault("BROWSER_USE_CONFIG_DIR", "/tmp/browseruse-test")
# Two levels above this file's directory; put first on sys.path —
# presumably so the `agent` package imported by the checks below resolves.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(PROJECT_ROOT))
  16. @dataclass
  17. class CheckResult:
  18. name: str
  19. ok: bool
  20. detail: str
  21. def record(results: List[CheckResult], name: str, ok: bool, detail: str) -> None:
  22. results.append(CheckResult(name=name, ok=ok, detail=detail))
  23. mark = "PASS" if ok else "FAIL"
  24. print(f"[{mark}] {name}: {detail}")
  25. async def mock_llm_call(messages, model="gpt-4o", tools=None, **kwargs):
  26. """
  27. 测试专用 mock LLM:
  28. - 当有工具可用时:第一轮触发 bash_command,第二轮返回文本结论
  29. - 当是 subagent 任务时:按 prompt 类型返回固定文本
  30. """
  31. state = kwargs.get("_test_state")
  32. if isinstance(state, dict):
  33. call_no = state.get("call_no", 0)
  34. state["call_no"] = call_no + 1
  35. else:
  36. call_no = 0
  37. last_user = ""
  38. for msg in reversed(messages):
  39. if msg.get("role") == "user":
  40. last_user = str(msg.get("content", ""))
  41. break
  42. if "# 评估任务" in last_user:
  43. return {
  44. "content": "## 评估结论\n通过\n\n## 评估理由\n结果满足要求。",
  45. "tool_calls": None,
  46. "prompt_tokens": 10,
  47. "completion_tokens": 10,
  48. "finish_reason": "stop",
  49. "cost": 0.0,
  50. }
  51. if "# 探索任务" in last_user:
  52. return {
  53. "content": "探索结论:优先采用方案 1。",
  54. "tool_calls": None,
  55. "prompt_tokens": 10,
  56. "completion_tokens": 10,
  57. "finish_reason": "stop",
  58. "cost": 0.0,
  59. }
  60. if "委托" in last_user or "实现" in last_user or "继续" in last_user or "优化" in last_user:
  61. return {
  62. "content": "委托任务执行完成。",
  63. "tool_calls": None,
  64. "prompt_tokens": 10,
  65. "completion_tokens": 10,
  66. "finish_reason": "stop",
  67. "cost": 0.0,
  68. }
  69. if call_no == 0 and tools:
  70. return {
  71. "content": "",
  72. "tool_calls": [
  73. {
  74. "id": "tc_1",
  75. "type": "function",
  76. "function": {
  77. "name": "bash_command",
  78. "arguments": json.dumps(
  79. {
  80. "command": "echo runner_run_ok",
  81. "description": "integration",
  82. }
  83. ),
  84. },
  85. }
  86. ],
  87. "prompt_tokens": 12,
  88. "completion_tokens": 8,
  89. "finish_reason": "tool_calls",
  90. "cost": 0.0,
  91. }
  92. return {
  93. "content": "run_fallback_ok",
  94. "tool_calls": None,
  95. "prompt_tokens": 8,
  96. "completion_tokens": 6,
  97. "finish_reason": "stop",
  98. "cost": 0.0,
  99. }
  100. def check_tool_registry(results: List[CheckResult]) -> None:
  101. from agent.tools import get_tool_registry
  102. registry = get_tool_registry()
  103. names = set(registry.get_tool_names())
  104. core_required = {
  105. "read_file",
  106. "edit_file",
  107. "write_file",
  108. "glob_files",
  109. "grep_content",
  110. "bash_command",
  111. "skill",
  112. "list_skills",
  113. "subagent",
  114. }
  115. core_missing = sorted(core_required - names)
  116. record(
  117. results,
  118. "tool_registry_core",
  119. len(core_missing) == 0,
  120. "all core tools registered" if not core_missing else f"missing: {core_missing}",
  121. )
  122. browser_subset = {
  123. "browser_search_web",
  124. "browser_navigate_to_url",
  125. "browser_screenshot",
  126. }
  127. browser_missing = sorted(browser_subset - names)
  128. record(
  129. results,
  130. "tool_registry_browser",
  131. len(browser_missing) == 0,
  132. "browser tools visible" if not browser_missing else f"missing: {browser_missing}",
  133. )
  134. async def check_file_tools(results: List[CheckResult]) -> None:
  135. from agent.tools.builtin.file.write import write_file
  136. from agent.tools.builtin.file.read import read_file
  137. from agent.tools.builtin.file.edit import edit_file
  138. from agent.tools.builtin.file.glob import glob_files
  139. from agent.tools.builtin.file.grep import grep_content
  140. from agent.tools.builtin.bash import bash_command
  141. with TemporaryDirectory(prefix="agent-main-int-") as tmp:
  142. tmp_path = Path(tmp)
  143. target = tmp_path / "notes.txt"
  144. wr = await write_file(file_path=str(target), content="hello\npython\nagent\n")
  145. record(results, "write_file", wr.error is None, wr.error or "write success")
  146. rd = await read_file(file_path=str(target))
  147. read_ok = (rd.error is None) and ("python" in rd.output)
  148. record(results, "read_file", read_ok, rd.error or "content contains python")
  149. ed = await edit_file(file_path=str(target), old_string="python", new_string="python3")
  150. record(results, "edit_file", ed.error is None, ed.error or "edit success")
  151. gp = await grep_content(pattern="python3", path=str(tmp_path))
  152. grep_ok = gp.error is None and "notes.txt" in gp.output
  153. record(results, "grep_content", grep_ok, gp.error or "pattern found")
  154. gb = await glob_files(pattern="**/*.txt", path=str(tmp_path))
  155. glob_ok = gb.error is None and "notes.txt" in gb.output
  156. record(results, "glob_files", glob_ok, gb.error or "glob matched")
  157. bs = await bash_command(
  158. command="echo integration_ok",
  159. description="integration test",
  160. workdir=str(tmp_path),
  161. )
  162. bash_ok = bs.error is None and "integration_ok" in bs.output
  163. record(results, "bash_command", bash_ok, bs.error or "command output ok")
  164. async def check_runner(results: List[CheckResult]) -> None:
  165. from agent.core.runner import AgentRunner
  166. from agent.trace.store import FileSystemTraceStore
  167. from agent.trace.models import Trace, Message
  168. with TemporaryDirectory(prefix="agent-main-runner-") as tmp:
  169. store = FileSystemTraceStore(base_path=tmp)
  170. # call 模式
  171. runner_call = AgentRunner(trace_store=store, llm_call=mock_llm_call)
  172. call_result = await runner_call.call(messages=[{"role": "user", "content": "ping"}], trace=True)
  173. call_ok = bool(call_result.trace_id) and isinstance(call_result.reply, str)
  174. record(results, "runner_call", call_ok, f"trace_id={call_result.trace_id}, reply={call_result.reply}")
  175. # run 模式(含工具调用)
  176. state = {"call_no": 0}
  177. async def llm_with_state(messages, model="gpt-4o", tools=None, **kwargs):
  178. kwargs["_test_state"] = state
  179. return await mock_llm_call(messages=messages, model=model, tools=tools, **kwargs)
  180. runner_run = AgentRunner(trace_store=store, llm_call=llm_with_state)
  181. events: List[Any] = []
  182. async for item in runner_run.run(
  183. task="请执行一次bash并给出结果",
  184. system_prompt="你是测试助手",
  185. model="gpt-4o-mini",
  186. ):
  187. events.append(item)
  188. final_trace = None
  189. assistant_texts = []
  190. for item in events:
  191. if isinstance(item, Trace):
  192. final_trace = item
  193. if isinstance(item, Message) and item.role == "assistant":
  194. content = item.content
  195. text = content.get("text", "") if isinstance(content, dict) else str(content)
  196. if text:
  197. assistant_texts.append(text)
  198. run_ok = bool(final_trace) and final_trace.status == "completed" and "run_fallback_ok" in assistant_texts
  199. record(
  200. results,
  201. "runner_run",
  202. run_ok,
  203. f"status={getattr(final_trace, 'status', 'n/a')}, assistant_count={len(assistant_texts)}",
  204. )
  205. async def check_subagent(results: List[CheckResult]) -> None:
  206. from agent.core.runner import AgentRunner
  207. from agent.trace.store import FileSystemTraceStore
  208. from agent.trace.models import Trace
  209. from agent.trace.goal_models import GoalTree
  210. from agent.tools.builtin.subagent import subagent
  211. with TemporaryDirectory(prefix="agent-main-subagent-") as tmp:
  212. store = FileSystemTraceStore(base_path=tmp)
  213. runner = AgentRunner(trace_store=store, llm_call=mock_llm_call)
  214. main_trace = Trace(
  215. trace_id="main-trace",
  216. mode="agent",
  217. task="主任务",
  218. agent_type="default",
  219. status="running",
  220. )
  221. await store.create_trace(main_trace)
  222. goal_tree = GoalTree(mission="主任务")
  223. goals = goal_tree.add_goals(["验证 subagent 功能"])
  224. goal_tree.focus(goals[0].id)
  225. await store.update_goal_tree(main_trace.trace_id, goal_tree)
  226. ctx = {"store": store, "trace_id": main_trace.trace_id, "goal_id": goals[0].id, "runner": runner}
  227. r1 = await subagent(mode="delegate", task="实现登录", context=ctx)
  228. r2 = await subagent(mode="explore", branches=["方案A", "方案B"], background="请比较", context=ctx)
  229. r3 = await subagent(
  230. mode="evaluate",
  231. target_goal_id=goals[0].id,
  232. evaluation_input={"actual_result": "实现完成"},
  233. requirements="给出是否通过",
  234. context=ctx,
  235. )
  236. r4 = await subagent(mode="delegate", task="继续优化", continue_from=r1["sub_trace_id"], context=ctx)
  237. s1 = str(r1.get("status", "")).strip()
  238. s2 = str(r2.get("status", "")).strip()
  239. s3 = str(r3.get("status", "")).strip()
  240. s4 = str(r4.get("status", "")).strip()
  241. same_trace = str(r4.get("sub_trace_id", "")).strip() == str(r1.get("sub_trace_id", "")).strip()
  242. ok = (s1 == "completed" and s2 == "completed" and s3 == "completed" and s4 == "completed" and same_trace)
  243. detail = (
  244. f"delegate={s1}, explore={s2}, evaluate={s3}, continue={s4}, continue_same={same_trace}"
  245. )
  246. record(results, "subagent_unified", ok, detail)
  247. async def main() -> int:
  248. results: List[CheckResult] = []
  249. try:
  250. check_tool_registry(results)
  251. await check_file_tools(results)
  252. await check_runner(results)
  253. await check_subagent(results)
  254. except Exception as exc:
  255. record(results, "unexpected_exception", False, repr(exc))
  256. total = len(results)
  257. passed = sum(1 for r in results if r.ok)
  258. failed = total - passed
  259. print("\n=== Integration Summary ===")
  260. print(f"Total: {total}")
  261. print(f"Passed: {passed}")
  262. print(f"Failed: {failed}")
  263. return 0 if failed == 0 else 1
  264. if __name__ == "__main__":
  265. raise SystemExit(asyncio.run(main()))