"""Integration test - real-world scenario.

Scenario: a code refactoring and testing task.
Goal: let the Agent use its tools naturally in a realistic scenario, in order
to verify the functionality after the refactor.

What is exercised:
1. Goal tool - create and manage an execution plan
2. SubAgent tool - delegate mode (delegating a task)
3. SubAgent tool - evaluate mode (evaluating results)
4. File operation tools - read, write and edit files
5. Bash tool - run tests

No single feature is tested deliberately; instead the Agent is left to
complete a real task naturally.
"""

import os
import sys
import asyncio
import json
from collections import Counter
from pathlib import Path

# Add the project root to the Python path so the ``agent`` package resolves.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from dotenv import load_dotenv
load_dotenv()

from agent.llm.prompts import SimplePrompt
from agent.core.runner import AgentRunner
from agent.execution import FileSystemTraceStore, Trace, Message
from agent.llm import create_openrouter_llm_call

# Single source of truth for the model id used by both the LLM call factory
# and the runner invocation.
_MODEL_ID = "anthropic/claude-sonnet-4.5"

# Tools whose console summary shows only the target file name.
_FILE_TOOLS = ("read_file", "write_file", "edit_file")


def _clip(value, limit):
    """Return ``value`` as text cut to at most ``limit`` characters.

    ``None`` becomes an empty string so that a tool argument that is present
    but null cannot crash the progress display.
    """
    text = "" if value is None else str(value)
    return text[:limit]


def _parse_tool_args(raw):
    """Decode tool-call arguments that arrive as a JSON string.

    Non-string values are returned unchanged; a string that is not valid JSON
    yields an empty dict.
    """
    if not isinstance(raw, str):
        return raw
    try:
        return json.loads(raw)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass. Catching only this
        # keeps KeyboardInterrupt/SystemExit propagating, unlike a bare except.
        return {}


def _describe_tool_call(tool_name, args):
    """Build the one-line console summary for a single tool call."""
    has_fields = isinstance(args, dict)

    if tool_name == "goal":
        if has_fields:
            if args.get("add"):
                return f" → goal(add): {_clip(args['add'], 80)}..."
            if args.get("done"):
                return f" → goal(done): {_clip(args['done'], 80)}..."
            if args.get("focus"):
                return f" → goal(focus): {args['focus']}"
        # Fallback for non-dict arguments and for a dict without a recognised
        # operation, so that every goal call leaves a trace in the log.
        return " → goal(...)"

    if tool_name == "subagent":
        if not has_fields:
            return " → subagent(...)"
        mode = args.get("mode", "unknown")
        if mode == "evaluate":
            target = args.get("target_goal_id", "?")
            return f" → subagent(evaluate): 评估目标 {target}"
        if mode == "delegate":
            return f" → subagent(delegate): {_clip(args.get('task', ''), 60)}..."
        return f" → subagent({mode})"

    if tool_name in _FILE_TOOLS:
        file_path = args.get("file_path", "") if has_fields else ""
        if file_path:
            return f" → {tool_name}: {Path(file_path).name}"
        return f" → {tool_name}"

    if tool_name == "bash_command":
        if has_fields:
            return f" → bash: {_clip(args.get('command', ''), 60)}..."
        return " → bash"

    return f" → {tool_name}"


def _report_trace(trace):
    """Print the progress output for a Trace status update."""
    if trace.status == "running":
        print(f"[Trace] 开始: {trace.trace_id[:8]}...")
    elif trace.status == "completed":
        print()
        print("=" * 80)
        print("[Trace] 完成")
        print(f" - 总消息数: {trace.total_messages}")
        print(f" - 总 Token 数: {trace.total_tokens}")
        print(f" - 总成本: ${trace.total_cost:.4f}")
        print("=" * 80)
    elif trace.status == "failed":
        print()
        print(f"[Trace] 失败: {trace.error}")


async def main():
    """Run the refactoring scenario end to end and print a usage report.

    Loads the task prompt, streams the agent run while echoing a condensed
    view of each assistant message and tool call, then prints which tools
    were used and checks the files the agent was expected to produce.
    """
    # Path configuration
    base_dir = Path(__file__).parent
    project_root = base_dir.parent.parent
    prompt_path = base_dir / "task.prompt"
    project_dir = base_dir / "project"

    print("=" * 80)
    print("集成测试 - 真实场景:代码重构与测试")
    print("=" * 80)
    print()

    # 1. Load the prompt
    print("1. 加载任务 prompt...")
    prompt = SimplePrompt(prompt_path)
    # NOTE(review): reads the private ``_messages`` mapping; switch to a
    # public accessor if SimplePrompt offers one.
    system_prompt = prompt._messages.get("system", "")
    user_prompt = prompt._messages.get("user", "")
    print(" ✓ System prompt 已加载")
    print(" ✓ User prompt 已加载")
    print()

    # 2. Create the Agent Runner
    print("2. 创建 Agent Runner...")
    print(" - 模型: Claude Sonnet 4.5 (via OpenRouter)")
    print(" - Trace 存储: .trace/")
    print()

    runner = AgentRunner(
        trace_store=FileSystemTraceStore(base_path=".trace"),
        llm_call=create_openrouter_llm_call(model=_MODEL_ID),
        skills_dir=str(project_root / "agent" / "skills"),
        debug=False,
    )

    # 3. Run the Agent
    print("3. 启动 Agent 执行任务...")
    print("=" * 80)
    print()

    goal_used = False
    subagent_used = False
    evaluate_used = False
    delegate_used = False
    iteration_count = 0
    tool_calls_count = Counter()

    async for item in runner.run(
        task=user_prompt,
        system_prompt=system_prompt,
        model=_MODEL_ID,
        temperature=0.3,
        max_iterations=30,
    ):
        # Trace objects carry run-level status updates.
        if isinstance(item, Trace):
            _report_trace(item)
            continue

        # Only assistant messages are displayed; tool results (role "tool")
        # and anything else are deliberately skipped to keep the log short.
        if not isinstance(item, Message) or item.role != "assistant":
            continue

        iteration_count += 1
        content = item.content
        if not isinstance(content, dict):
            continue

        text = content.get("text", "")
        tool_calls = content.get("tool_calls")

        # Show the agent's text: a final reply when no tools are requested,
        # otherwise the (shorter) reasoning that accompanies the tool calls.
        if text and not tool_calls:
            print(f"\n[{iteration_count}] Agent 回复:")
            print(f" {text[:200]}{'...' if len(text) > 200 else ''}")
        elif text:
            print(f"\n[{iteration_count}] Agent 思考:")
            print(f" {text[:150]}{'...' if len(text) > 150 else ''}")

        # Show and tally the tool calls.
        for tc in tool_calls or []:
            function = tc.get("function") or {}
            tool_name = function.get("name") or "unknown"
            args = _parse_tool_args(function.get("arguments", {}))

            tool_calls_count[tool_name] += 1

            # Record usage of the key tools for the final summary.
            if tool_name == "goal":
                goal_used = True
            elif tool_name == "subagent":
                subagent_used = True
                if isinstance(args, dict):
                    mode = args.get("mode", "unknown")
                    evaluate_used = evaluate_used or mode == "evaluate"
                    delegate_used = delegate_used or mode == "delegate"

            print(_describe_tool_call(tool_name, args))

    # 4. Summary of the test results
    print()
    print("=" * 80)
    print("测试结果总结")
    print("=" * 80)
    print()

    print("功能使用情况:")
    print(f" ✓ Goal 工具: {'已使用' if goal_used else '未使用'}")
    print(f" ✓ SubAgent 工具: {'已使用' if subagent_used else '未使用'}")
    print(f" - Evaluate 模式: {'已使用' if evaluate_used else '未使用'}")
    print(f" - Delegate 模式: {'已使用' if delegate_used else '未使用'}")
    print()

    print("工具调用统计:")
    for tool_name, count in sorted(tool_calls_count.items()):
        print(f" - {tool_name}: {count} 次")
    print()

    print(f"总迭代次数: {iteration_count}")
    print()

    # 5. Verify the outcome
    print("验证生成的文件:")

    # Check whether the test file was generated.
    test_file = project_dir / "test_calculator.py"
    if test_file.exists():
        print(f" ✓ 测试文件已生成: {test_file.name}")
    else:
        print(" ✗ 测试文件未生成")

    # Check whether calculator.py was modified (an average function added).
    # A missing file is reported as "feature not added" rather than silently.
    calc_file = project_dir / "calculator.py"
    feature_added = False
    if calc_file.exists():
        # Explicit encoding: the default is locale-dependent and the project
        # files may contain non-ASCII text.
        source_text = calc_file.read_text(encoding="utf-8")
        feature_added = "average" in source_text or "mean" in source_text
    if feature_added:
        print(" ✓ Calculator 已添加新功能")
    else:
        print(" ✗ Calculator 未添加新功能")

    print()
    print("=" * 80)
    print("集成测试完成")
    print("=" * 80)


if __name__ == "__main__":
    asyncio.run(main())