#!/usr/bin/env python3
"""
Integration test 5: user authentication module implementation (forced evaluation).

Test goals:
- Verify that the Agent can use subagent(mode="evaluate") to evaluate code
- Verify that the Agent can fix the code based on the evaluation result
- Verify the evaluate -> fix -> re-evaluate iteration loop
"""

import asyncio
import json
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Optional

# Add the project root to the Python path so the `agent` package resolves
# when this script is run directly.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from dotenv import load_dotenv
load_dotenv()

from agent.llm.prompts import SimplePrompt
from agent.core.runner import AgentRunner
from agent.execution import FileSystemTraceStore, Trace, Message
from agent.llm import create_openrouter_llm_call

# Tools whose call is summarised by the base name of their `file_path` argument.
_FILE_TOOLS = ("read_file", "write_file", "edit_file")


def _preview(value: object, limit: int) -> str:
    """Return at most *limit* characters of *value* rendered as text.

    Tool arguments come from the model and are not guaranteed to be strings,
    so the value is stringified before slicing instead of sliced directly.
    """
    return str(value)[:limit]


def _used_mark(flag: bool) -> str:
    """Return the used / not-used label shown in the summary."""
    return "✅ 使用" if flag else "❌ 未使用"


def _parse_tool_args(raw_args):
    """Decode tool-call arguments that arrive as a JSON string.

    Non-string values are returned unchanged. A string that is not valid JSON
    yields an empty dict so the caller can keep going.
    """
    if not isinstance(raw_args, str):
        return raw_args
    try:
        return json.loads(raw_args)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return {}


def _parse_evaluation_result(content) -> Optional[dict]:
    """Extract an evaluation verdict from the content of a tool message.

    Returns a dict with keys ``"passed"`` and ``"reason"`` (reason truncated
    to 100 characters) when *content* is a JSON object string containing a
    ``"passed"`` key, otherwise ``None``.
    """
    if not isinstance(content, str):
        return None
    try:
        result = json.loads(content)
    except ValueError:
        return None
    if not isinstance(result, dict) or "passed" not in result:
        return None
    # A null or non-string reason must not cause the verdict to be dropped.
    reason = result.get("reason")
    reason_text = "" if reason is None else str(reason)
    return {"passed": result.get("passed", False), "reason": reason_text[:100]}


def _format_goal_call(args) -> Optional[str]:
    """Return the display line for a `goal` tool call, or None if nothing to show."""
    if not isinstance(args, dict):
        return " → goal(...)"
    if args.get("add"):
        return f" → goal(add): {_preview(args['add'], 80)}..."
    if args.get("done"):
        return f" → goal(done): {_preview(args['done'], 80)}..."
    if args.get("focus"):
        return f" → goal(focus): {args['focus']}"
    return None


def _format_plain_tool_call(tool_name: str, args) -> str:
    """Return the display line for any tool other than `goal` / `subagent`."""
    if tool_name in _FILE_TOOLS:
        file_path = args.get("file_path", "") if isinstance(args, dict) else ""
        if file_path:
            return f" → {tool_name}: {Path(str(file_path)).name}"
        return f" → {tool_name}"
    if tool_name == "bash_command":
        if isinstance(args, dict):
            return f" → bash: {_preview(args.get('command', ''), 60)}..."
        return " → bash"
    return f" → {tool_name}"


async def main():
    """Run the agent on the task prompt and report how it used its tools.

    Streams items from ``AgentRunner.run``, prints a condensed log of the
    assistant messages and tool calls, records whether ``goal`` and the
    ``subagent`` modes (evaluate / delegate / explore) were used, collects
    evaluation verdicts found in tool messages, and finally prints a summary
    including whether the expected output files exist.
    """
    # Path configuration
    base_dir = Path(__file__).parent
    prompt_path = base_dir / "task.prompt"
    output_dir = base_dir / "output"

    print("=" * 80)
    print("集成测试 5: 用户认证模块实现(强制评估)")
    print("=" * 80)
    print()

    # 1. Load the prompt
    print("1. 加载任务...")
    prompt = SimplePrompt(prompt_path)
    system_prompt = prompt._messages.get("system", "")
    user_prompt = prompt._messages.get("user", "")

    print(" ✓ 任务类型: 用户认证模块实现")
    print(" ✓ 强制要求: 必须使用 subagent 评估")
    print(" ✓ 安全检查: 密码加密、SQL注入、输入验证")
    print()

    # 2. Create the Agent Runner
    print("2. 创建 Agent Runner...")
    print(" - 模型: Claude Sonnet 4.5")
    print()

    runner = AgentRunner(
        trace_store=FileSystemTraceStore(base_path=".trace"),
        llm_call=create_openrouter_llm_call(model="anthropic/claude-sonnet-4.5"),
        skills_dir=str(project_root / "agent" / "skills"),
        debug=False,
    )

    # 3. Run the Agent
    print("3. 启动 Agent...")
    print("=" * 80)
    print()

    # Create the output directory
    output_dir.mkdir(exist_ok=True)

    # Monitoring state
    current_trace_id = None
    goal_used = False
    subagent_used = False
    evaluate_used = False
    delegate_used = False
    explore_used = False
    iteration_count = 0
    tool_calls_count = Counter()
    evaluation_count = 0
    evaluation_results = []

    async for item in runner.run(
        task=user_prompt,
        system_prompt=system_prompt,
        model="anthropic/claude-sonnet-4.5",
        temperature=0.5,
        max_iterations=50,
    ):
        # Trace objects: lifecycle events of the run
        if isinstance(item, Trace):
            current_trace_id = item.trace_id
            if item.status == "running":
                print(f"[Trace] 开始: {item.trace_id[:8]}...")
            elif item.status == "completed":
                print()
                print("=" * 80)
                print("[Trace] 完成")
                print(f" - 总消息数: {item.total_messages}")
                print(f" - 总 Token 数: {item.total_tokens}")
                print(f" - 总成本: ${item.total_cost:.4f}")
                print("=" * 80)
            elif item.status == "failed":
                print()
                print(f"[Trace] 失败: {item.error_message}")
            continue

        if not isinstance(item, Message):
            continue

        if item.role == "assistant":
            iteration_count += 1
            content = item.content
            if not isinstance(content, dict):
                continue

            text = content.get("text", "")
            tool_calls = content.get("tool_calls")

            # Show the agent's text: a final reply when there are no tool
            # calls, otherwise the reasoning that accompanies the calls.
            if text and not tool_calls:
                print(f"\n[{iteration_count}] Agent 回复:")
                print(f" {text[:200]}{'...' if len(text) > 200 else ''}")
            elif text:
                print(f"\n[{iteration_count}] Agent 思考:")
                print(f" {text[:150]}{'...' if len(text) > 150 else ''}")

            # Show and count the tool calls
            for tc in tool_calls or ():
                function = tc.get("function", {})
                tool_name = function.get("name", "unknown")
                args = _parse_tool_args(function.get("arguments", {}))

                tool_calls_count[tool_name] += 1

                if tool_name == "goal":
                    goal_used = True
                    line = _format_goal_call(args)
                    if line is not None:
                        print(line)
                elif tool_name == "subagent":
                    subagent_used = True
                    if not isinstance(args, dict):
                        print(" → subagent(...)")
                        continue
                    mode = args.get("mode", "unknown")
                    if mode == "evaluate":
                        evaluate_used = True
                        evaluation_count += 1
                        target = args.get("target_goal_id", "?")
                        print(f" → subagent(evaluate): 评估目标 {target} [评估 #{evaluation_count}]")
                    elif mode == "delegate":
                        delegate_used = True
                        print(f" → subagent(delegate): {_preview(args.get('task', ''), 60)}...")
                    elif mode == "explore":
                        explore_used = True
                        # `or []` guards against an explicit null value.
                        branches = args.get("branches") or []
                        print(f" → subagent(explore): {len(branches)} 个分支")
                    else:
                        print(f" → subagent({mode})")
                else:
                    # Every other tool gets a condensed one-line display
                    print(_format_plain_tool_call(tool_name, args))

        elif item.role == "tool":
            # A tool message may carry an evaluation verdict
            verdict = _parse_evaluation_result(item.content)
            if verdict is not None:
                evaluation_results.append(verdict)
                status = "✅ 通过" if verdict["passed"] else "❌ 不通过"
                print(f" [评估结果] {status}")
                if verdict["reason"]:
                    print(f" 理由: {verdict['reason']}...")

    # 4. Summary of the test results
    print()
    print("=" * 80)
    print("测试结果总结")
    print("=" * 80)
    print()

    print("功能使用情况:")
    print(f" - goal 工具: {_used_mark(goal_used)}")
    print(f" - subagent 工具: {_used_mark(subagent_used)}")
    print(f" - evaluate 模式: {_used_mark(evaluate_used)} ({evaluation_count} 次)")
    print(f" - delegate 模式: {_used_mark(delegate_used)}")
    print(f" - explore 模式: {_used_mark(explore_used)}")
    print()

    print("工具调用统计:")
    # most_common() orders by count descending, ties in first-seen order.
    for tool_name, count in tool_calls_count.most_common():
        print(f" - {tool_name}: {count} 次")
    print()

    # Evaluation results
    if evaluation_results:
        print("评估结果:")
        for i, eval_result in enumerate(evaluation_results, 1):
            status = "✅ 通过" if eval_result["passed"] else "❌ 不通过"
            print(f" {i}. {status}")
            print(f" 理由: {eval_result['reason']}")
        print()

    # Check the output files
    print("输出文件:")
    auth_file = output_dir / "auth.py"
    report_file = output_dir / "IMPLEMENTATION_REPORT.md"

    for produced in (auth_file, report_file):
        if produced.exists():
            size = produced.stat().st_size
            print(f" ✅ {produced.name} ({size} bytes)")
        else:
            print(f" ❌ {produced.name} (未生成)")
    print()

    # Verify the test goals
    print("测试目标验证:")
    print()

    success = True

    if evaluate_used:
        print(f" ✅ Agent 使用了 subagent(mode='evaluate') ({evaluation_count} 次)")
    else:
        print(" ❌ Agent 未使用 subagent(mode='evaluate')")
        success = False

    if evaluation_results:
        print(f" ✅ 获得了评估结果 ({len(evaluation_results)} 次)")
    else:
        print(" ❌ 未获得评估结果")
        success = False

    if auth_file.exists():
        print(" ✅ 生成了代码文件")
    else:
        print(" ❌ 未生成代码文件")
        success = False

    print()
    if success:
        print("🎉 测试成功!Agent 正确使用了 subagent 评估功能。")
    else:
        print("⚠️ 测试未完全通过,请检查 Agent 行为。")
    print()

    if current_trace_id:
        print(f"详细日志: .trace/{current_trace_id}/")
    print("=" * 80)


if __name__ == "__main__":
    asyncio.run(main())