#!/usr/bin/env python3
"""
Integration test 5: user-authentication module implementation (forced evaluation).

Test goals:
- Verify the agent uses subagent(mode="evaluate") for code evaluation.
- Verify the agent can fix code based on evaluation results.
- Verify the evaluate -> fix -> re-evaluate iteration loop.
"""
import asyncio
import sys
from pathlib import Path

# Make the project root importable before pulling in project modules.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from dotenv import load_dotenv

# Load API keys (e.g. OpenRouter) from .env before the LLM client is created.
load_dotenv()

from agent.llm.prompts import SimplePrompt
from agent.core.runner import AgentRunner
from agent.execution import FileSystemTraceStore, Trace, Message
from agent.llm import create_openrouter_llm_call
async def main() -> None:
    """Run the integration test: drive the agent and verify the evaluation workflow.

    Streams Trace/Message items from AgentRunner.run(), prints a live log of
    tool usage, then summarizes whether the agent used subagent(mode="evaluate"),
    whether evaluation results came back, and whether output files were produced.
    """
    import json  # used for tool-call argument and tool-result parsing below

    # Path configuration
    base_dir = Path(__file__).parent
    prompt_path = base_dir / "task.prompt"
    output_dir = base_dir / "output"

    print("=" * 80)
    print("集成测试 5: 用户认证模块实现(强制评估)")
    print("=" * 80)
    print()

    # 1. Load the task prompt.
    print("1. 加载任务...")
    prompt = SimplePrompt(prompt_path)
    # NOTE(review): reaches into SimplePrompt's private `_messages` dict —
    # confirm whether a public accessor exists and switch to it if so.
    system_prompt = prompt._messages.get("system", "")
    user_prompt = prompt._messages.get("user", "")
    print(" ✓ 任务类型: 用户认证模块实现")
    print(" ✓ 强制要求: 必须使用 subagent 评估")
    print(" ✓ 安全检查: 密码加密、SQL注入、输入验证")
    print()

    # 2. Create the agent runner.
    print("2. 创建 Agent Runner...")
    print(" - 模型: Claude Sonnet 4.5")
    print()
    runner = AgentRunner(
        trace_store=FileSystemTraceStore(base_path=".trace"),
        llm_call=create_openrouter_llm_call(model="anthropic/claude-sonnet-4.5"),
        skills_dir=str(project_root / "agent" / "skills"),
        debug=False,
    )

    # 3. Run the agent.
    print("3. 启动 Agent...")
    print("=" * 80)
    print()
    output_dir.mkdir(exist_ok=True)

    # Monitoring state accumulated while streaming the run.
    current_trace_id = None          # last trace id seen (for the log path hint)
    goal_used = False                # did the agent call the `goal` tool?
    subagent_used = False            # did the agent call the `subagent` tool?
    evaluate_used = False            # subagent(mode="evaluate") observed
    delegate_used = False            # subagent(mode="delegate") observed
    explore_used = False             # subagent(mode="explore") observed
    iteration_count = 0              # assistant-message counter
    tool_calls_count = {}            # tool name -> number of calls
    evaluation_count = 0             # number of evaluate-mode calls
    evaluation_results = []          # parsed {"passed", "reason"} dicts

    async for item in runner.run(
        task=user_prompt,
        system_prompt=system_prompt,
        model="anthropic/claude-sonnet-4.5",
        temperature=0.5,
        max_iterations=50,
    ):
        # Trace objects report run lifecycle transitions.
        if isinstance(item, Trace):
            current_trace_id = item.trace_id
            if item.status == "running":
                print(f"[Trace] 开始: {item.trace_id[:8]}...")
            elif item.status == "completed":
                print()
                print("=" * 80)
                print("[Trace] 完成")
                print(f" - 总消息数: {item.total_messages}")
                print(f" - 总 Token 数: {item.total_tokens}")
                print(f" - 总成本: ${item.total_cost:.4f}")
                print("=" * 80)
            elif item.status == "failed":
                print()
                print(f"[Trace] 失败: {item.error_message}")

        # Message objects carry assistant turns and tool results.
        elif isinstance(item, Message):
            if item.role == "assistant":
                iteration_count += 1
                content = item.content
                if isinstance(content, dict):
                    text = content.get("text", "")
                    tool_calls = content.get("tool_calls")

                    # Show the agent's reply (no tools) or its interim thinking.
                    if text and not tool_calls:
                        print(f"\n[{iteration_count}] Agent 回复:")
                        print(f" {text[:200]}{'...' if len(text) > 200 else ''}")
                    elif text:
                        print(f"\n[{iteration_count}] Agent 思考:")
                        print(f" {text[:150]}{'...' if len(text) > 150 else ''}")

                    # Show tool calls and update the usage statistics.
                    if tool_calls:
                        for tc in tool_calls:
                            tool_name = tc.get("function", {}).get("name", "unknown")
                            args = tc.get("function", {}).get("arguments", {})
                            # Arguments may arrive as a JSON string; decode best-effort.
                            if isinstance(args, str):
                                try:
                                    args = json.loads(args)
                                except json.JSONDecodeError:
                                    args = {}

                            tool_calls_count[tool_name] = tool_calls_count.get(tool_name, 0) + 1

                            # Key tools get a detailed one-line display.
                            if tool_name == "goal":
                                goal_used = True
                                if isinstance(args, dict):
                                    if args.get("add"):
                                        print(f" → goal(add): {args['add'][:80]}...")
                                    elif args.get("done"):
                                        print(f" → goal(done): {args['done'][:80]}...")
                                    elif args.get("focus"):
                                        print(f" → goal(focus): {args['focus']}")
                                    else:
                                        # Fallback display; assumed to belong to the
                                        # add/done/focus chain (mirrors the subagent
                                        # branch) — indentation lost in the original.
                                        print(" → goal(...)")
                            elif tool_name == "subagent":
                                subagent_used = True
                                if isinstance(args, dict):
                                    mode = args.get("mode", "unknown")
                                    if mode == "evaluate":
                                        evaluate_used = True
                                        evaluation_count += 1
                                        target = args.get("target_goal_id", "?")
                                        print(f" → subagent(evaluate): 评估目标 {target} [评估 #{evaluation_count}]")
                                    elif mode == "delegate":
                                        delegate_used = True
                                        task = args.get("task", "")
                                        print(f" → subagent(delegate): {task[:60]}...")
                                    elif mode == "explore":
                                        explore_used = True
                                        branches = args.get("branches", [])
                                        print(f" → subagent(explore): {len(branches)} 个分支")
                                    else:
                                        print(f" → subagent({mode})")
                                else:
                                    print(" → subagent(...)")
                            else:
                                # Everything else gets a compact display.
                                if tool_name in ["read_file", "write_file", "edit_file"]:
                                    if isinstance(args, dict):
                                        file_path = args.get("file_path", "")
                                        if file_path:
                                            file_name = Path(file_path).name
                                            print(f" → {tool_name}: {file_name}")
                                        else:
                                            print(f" → {tool_name}")
                                    else:
                                        print(f" → {tool_name}")
                                elif tool_name == "bash_command":
                                    if isinstance(args, dict):
                                        cmd = args.get("command", "")
                                        print(f" → bash: {cmd[:60]}...")
                                    else:
                                        print(" → bash")
                                else:
                                    print(f" → {tool_name}")

            elif item.role == "tool":
                # Detect evaluation results: a JSON dict with a "passed" key.
                content = item.content
                if isinstance(content, str):
                    try:
                        result = json.loads(content)
                        if isinstance(result, dict) and "passed" in result:
                            passed = result.get("passed", False)
                            reason = result.get("reason", "")[:100]
                            evaluation_results.append({
                                "passed": passed,
                                "reason": reason,
                            })
                            status = "✅ 通过" if passed else "❌ 不通过"
                            print(f" [评估结果] {status}")
                            if reason:
                                print(f" 理由: {reason}...")
                    except Exception:
                        # Deliberate best-effort: tool output may not be JSON, or
                        # not an evaluation payload — skip it silently.
                        pass

    # 4. Test-result summary.
    print()
    print("=" * 80)
    print("测试结果总结")
    print("=" * 80)
    print()
    print("功能使用情况:")
    print(f" - goal 工具: {'✅ 使用' if goal_used else '❌ 未使用'}")
    print(f" - subagent 工具: {'✅ 使用' if subagent_used else '❌ 未使用'}")
    print(f" - evaluate 模式: {'✅ 使用' if evaluate_used else '❌ 未使用'} ({evaluation_count} 次)")
    print(f" - delegate 模式: {'✅ 使用' if delegate_used else '❌ 未使用'}")
    print(f" - explore 模式: {'✅ 使用' if explore_used else '❌ 未使用'}")
    print()
    print("工具调用统计:")
    for tool_name, count in sorted(tool_calls_count.items(), key=lambda x: x[1], reverse=True):
        print(f" - {tool_name}: {count} 次")
    print()

    # Evaluation outcomes, in the order they were observed.
    if evaluation_results:
        print("评估结果:")
        for i, eval_result in enumerate(evaluation_results, 1):
            status = "✅ 通过" if eval_result["passed"] else "❌ 不通过"
            print(f" {i}. {status}")
            print(f" 理由: {eval_result['reason']}")
        print()

    # Check the expected output files.
    print("输出文件:")
    auth_file = output_dir / "auth.py"
    report_file = output_dir / "IMPLEMENTATION_REPORT.md"
    if auth_file.exists():
        size = auth_file.stat().st_size
        print(f" ✅ auth.py ({size} bytes)")
    else:
        print(" ❌ auth.py (未生成)")
    if report_file.exists():
        size = report_file.stat().st_size
        print(f" ✅ IMPLEMENTATION_REPORT.md ({size} bytes)")
    else:
        print(" ❌ IMPLEMENTATION_REPORT.md (未生成)")
    print()

    # Validate the stated test goals.
    print("测试目标验证:")
    print()
    success = True
    if evaluate_used:
        print(f" ✅ Agent 使用了 subagent(mode='evaluate') ({evaluation_count} 次)")
    else:
        print(" ❌ Agent 未使用 subagent(mode='evaluate')")
        success = False
    if evaluation_results:
        print(f" ✅ 获得了评估结果 ({len(evaluation_results)} 次)")
    else:
        print(" ❌ 未获得评估结果")
        success = False
    if auth_file.exists():
        print(" ✅ 生成了代码文件")
    else:
        print(" ❌ 未生成代码文件")
        success = False
    print()
    if success:
        print("🎉 测试成功!Agent 正确使用了 subagent 评估功能。")
    else:
        print("⚠️ 测试未完全通过,请检查 Agent 行为。")
    print()
    if current_trace_id:
        print(f"详细日志: .trace/{current_trace_id}/")
    print("=" * 80)
if __name__ == "__main__":
    asyncio.run(main())