# howard / Agent
"""
集成测试 - 真实场景测试

测试场景：代码重构与测试任务
目标：让 Agent 在真实场景中自然使用各种工具，验证重构后的功能

测试内容：
1. Goal 工具 - 创建和管理执行计划
2. SubAgent 工具 - delegate 模式（委托任务）
3. SubAgent 工具 - evaluate 模式（评估结果）
4. 文件操作工具 - 读写编辑文件
5. Bash 工具 - 运行测试

不刻意测试某个功能，而是让 Agent 自然地完成一个真实任务。
"""

import asyncio
import json
import os
import sys
from pathlib import Path

# 添加项目根目录到 Python 路径
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from dotenv import load_dotenv
load_dotenv()

from agent.llm.prompts import SimplePrompt
from agent.core.runner import AgentRunner
from agent.execution import FileSystemTraceStore, Trace, Message
from agent.llm import create_openrouter_llm_call


def _clip(value, limit):
    """Return ``value`` as text cut to at most ``limit`` characters.

    An ellipsis is appended only when something was actually cut off, and
    ``None`` is rendered as an empty string so that a missing tool argument
    cannot crash the progress display.
    """
    text = "" if value is None else str(value)
    if len(text) > limit:
        return f"{text[:limit]}..."
    return text


def _parse_tool_args(raw_args):
    """Decode tool-call arguments that arrive as a JSON string.

    Non-string values are returned unchanged. A string that is not valid JSON
    yields an empty dict so the caller can still report the tool name.
    """
    if not isinstance(raw_args, str):
        return raw_args
    try:
        return json.loads(raw_args)
    except json.JSONDecodeError:
        return {}


def _describe_tool_call(tool_name, args):
    """Build the one-line console summary for a single tool call.

    Returns ``None`` when there is nothing to show (a ``goal`` call whose
    arguments contain none of ``add`` / ``done`` / ``focus``).
    """
    has_args = isinstance(args, dict)

    if tool_name == "goal":
        if not has_args:
            return "  → goal(...)"
        if args.get("add"):
            return f"  → goal(add): {_clip(args['add'], 80)}"
        if args.get("done"):
            return f"  → goal(done): {_clip(args['done'], 80)}"
        if args.get("focus"):
            return f"  → goal(focus): {args['focus']}"
        return None

    if tool_name == "subagent":
        if not has_args:
            return "  → subagent(...)"
        mode = args.get("mode", "unknown")
        if mode == "evaluate":
            target = args.get("target_goal_id", "?")
            return f"  → subagent(evaluate): 评估目标 {target}"
        if mode == "delegate":
            return f"  → subagent(delegate): {_clip(args.get('task', ''), 60)}"
        return f"  → subagent({mode})"

    if tool_name in ("read_file", "write_file", "edit_file"):
        file_path = args.get("file_path", "") if has_args else ""
        if file_path:
            # Only the file name is shown to keep the line short.
            return f"  → {tool_name}: {Path(file_path).name}"
        return f"  → {tool_name}"

    if tool_name == "bash_command":
        if has_args:
            return f"  → bash: {_clip(args.get('command', ''), 60)}"
        return "  → bash"

    return f"  → {tool_name}"


def _print_trace_status(trace):
    """Print a progress line (or the final totals) for a Trace update."""
    if trace.status == "running":
        print(f"[Trace] 开始: {trace.trace_id[:8]}...")
    elif trace.status == "completed":
        print()
        print("=" * 80)
        print("[Trace] 完成")
        print(f"  - 总消息数: {trace.total_messages}")
        print(f"  - 总 Token 数: {trace.total_tokens}")
        print(f"  - 总成本: ${trace.total_cost:.4f}")
        print("=" * 80)
    elif trace.status == "failed":
        print()
        print(f"[Trace] 失败: {trace.error}")


async def main():
    """Run the refactor-and-test scenario and print a usage report.

    Steps:
      1. Load the system/user prompts from ``task.prompt`` next to this file.
      2. Build an ``AgentRunner`` that stores traces under ``.trace``.
      3. Stream the run, echoing assistant text and tool calls while recording
         which tools (goal, subagent evaluate/delegate, ...) were used.
      4. Print the tool-usage summary.
      5. Check the files expected under ``project/``.

    Returns:
        None. All results are reported on stdout.
    """
    # Path configuration
    base_dir = Path(__file__).parent
    project_root = base_dir.parent.parent
    prompt_path = base_dir / "task.prompt"
    project_dir = base_dir / "project"

    # Single source of truth for the model passed to both the LLM client
    # and the runner invocation.
    model_id = "anthropic/claude-sonnet-4.5"
    banner = "=" * 80

    print(banner)
    print("集成测试 - 真实场景：代码重构与测试")
    print(banner)
    print()

    # 1. Load the prompt
    print("1. 加载任务 prompt...")
    prompt = SimplePrompt(prompt_path)
    # NOTE(review): this reads the private ``_messages`` mapping; switch to a
    # public accessor if SimplePrompt provides one.
    system_prompt = prompt._messages.get("system", "")
    user_prompt = prompt._messages.get("user", "")

    print("   ✓ System prompt 已加载")
    print("   ✓ User prompt 已加载")
    print()

    # 2. Create the agent runner
    print("2. 创建 Agent Runner...")
    print("   - 模型: Claude Sonnet 4.5 (via OpenRouter)")
    print("   - Trace 存储: .trace/")
    print()

    runner = AgentRunner(
        trace_store=FileSystemTraceStore(base_path=".trace"),
        llm_call=create_openrouter_llm_call(model=model_id),
        skills_dir=str(project_root / "agent" / "skills"),
        debug=False,
    )

    # 3. Run the agent
    print("3. 启动 Agent 执行任务...")
    print(banner)
    print()

    goal_used = False
    subagent_used = False
    evaluate_used = False
    delegate_used = False

    iteration_count = 0
    tool_calls_count = {}

    async for item in runner.run(
        task=user_prompt,
        system_prompt=system_prompt,
        model=model_id,
        temperature=0.3,
        max_iterations=30,
    ):
        if isinstance(item, Trace):
            _print_trace_status(item)
            continue

        # Tool results and any other message roles are intentionally not echoed.
        if not isinstance(item, Message) or item.role != "assistant":
            continue

        iteration_count += 1

        content = item.content
        if not isinstance(content, dict):
            continue

        text = content.get("text", "")
        tool_calls = content.get("tool_calls")

        # Text without tool calls is a reply; text alongside tool calls is
        # shown as the agent's reasoning.
        if text and not tool_calls:
            print(f"\n[{iteration_count}] Agent 回复:")
            print(f"  {_clip(text, 200)}")
        elif text:
            print(f"\n[{iteration_count}] Agent 思考:")
            print(f"  {_clip(text, 150)}")

        for call in tool_calls or []:
            function = call.get("function") or {}
            tool_name = function.get("name", "unknown")
            args = _parse_tool_args(function.get("arguments", {}))

            # Per-tool usage statistics
            tool_calls_count[tool_name] = tool_calls_count.get(tool_name, 0) + 1

            # Track the key tools this scenario is meant to exercise
            if tool_name == "goal":
                goal_used = True
            elif tool_name == "subagent":
                subagent_used = True
                mode = args.get("mode") if isinstance(args, dict) else None
                if mode == "evaluate":
                    evaluate_used = True
                elif mode == "delegate":
                    delegate_used = True

            summary = _describe_tool_call(tool_name, args)
            if summary is not None:
                print(summary)

    # 4. Summary of the run
    print()
    print(banner)
    print("测试结果总结")
    print(banner)
    print()

    def used_label(flag):
        return "已使用" if flag else "未使用"

    def mark(flag):
        # Same ✓/✗ convention as the file checks below, so an unused tool is
        # not reported with a success mark.
        return "✓" if flag else "✗"

    print("功能使用情况:")
    print(f"  {mark(goal_used)} Goal 工具: {used_label(goal_used)}")
    print(f"  {mark(subagent_used)} SubAgent 工具: {used_label(subagent_used)}")
    print(f"    - Evaluate 模式: {used_label(evaluate_used)}")
    print(f"    - Delegate 模式: {used_label(delegate_used)}")
    print()

    print("工具调用统计:")
    for tool_name, count in sorted(tool_calls_count.items()):
        print(f"  - {tool_name}: {count} 次")
    print()

    print(f"总迭代次数: {iteration_count}")
    print()

    # 5. Verify the generated artifacts
    print("验证生成的文件:")

    # Was the test file generated?
    test_file = project_dir / "test_calculator.py"
    if test_file.exists():
        print(f"  ✓ 测试文件已生成: {test_file.name}")
    else:
        print("  ✗ 测试文件未生成")

    # Was calculator.py modified (an average/mean function added)?
    calc_file = project_dir / "calculator.py"
    if calc_file.exists():
        # Explicit encoding so the check does not depend on the platform
        # locale; undecodable bytes are replaced because only a substring
        # search is performed.
        source = calc_file.read_text(encoding="utf-8", errors="replace")
        if "average" in source or "mean" in source:
            print("  ✓ Calculator 已添加新功能")
        else:
            print("  ✗ Calculator 未添加新功能")
    else:
        print(f"  ✗ 未找到文件: {calc_file.name}")

    print()
    print(banner)
    print("集成测试完成")
    print(banner)


# Script entry point: run the async scenario only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())