run.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. #!/usr/bin/env python3
  2. """
  3. 集成测试 5: 用户认证模块实现(强制评估)
  4. 测试目标:
  5. - 验证 Agent 能够使用 subagent(mode="evaluate") 进行代码评估
  6. - 验证 Agent 能够根据评估结果修复代码
  7. - 验证评估-修复-重新评估的迭代流程
  8. """
  9. import asyncio
  10. import sys
  11. import os
  12. from pathlib import Path
  13. # 添加项目根目录到 Python 路径
  14. project_root = Path(__file__).parent.parent.parent
  15. sys.path.insert(0, str(project_root))
  16. from dotenv import load_dotenv
  17. load_dotenv()
  18. from agent.llm.prompts import SimplePrompt
  19. from agent.core.runner import AgentRunner
  20. from agent.execution import FileSystemTraceStore, Trace, Message
  21. from agent.llm import create_openrouter_llm_call
  22. async def main():
  23. """运行测试"""
  24. # 路径配置
  25. base_dir = Path(__file__).parent
  26. prompt_path = base_dir / "task.prompt"
  27. output_dir = base_dir / "output"
  28. print("=" * 80)
  29. print("集成测试 5: 用户认证模块实现(强制评估)")
  30. print("=" * 80)
  31. print()
  32. # 1. 加载 prompt
  33. print("1. 加载任务...")
  34. prompt = SimplePrompt(prompt_path)
  35. system_prompt = prompt._messages.get("system", "")
  36. user_prompt = prompt._messages.get("user", "")
  37. print(f" ✓ 任务类型: 用户认证模块实现")
  38. print(f" ✓ 强制要求: 必须使用 subagent 评估")
  39. print(f" ✓ 安全检查: 密码加密、SQL注入、输入验证")
  40. print()
  41. # 2. 创建 Agent Runner
  42. print("2. 创建 Agent Runner...")
  43. print(f" - 模型: Claude Sonnet 4.5")
  44. print()
  45. runner = AgentRunner(
  46. trace_store=FileSystemTraceStore(base_path=".trace"),
  47. llm_call=create_openrouter_llm_call(model="anthropic/claude-sonnet-4.5"),
  48. skills_dir=str(project_root / "agent" / "skills"),
  49. debug=False
  50. )
  51. # 3. 运行 Agent
  52. print("3. 启动 Agent...")
  53. print("=" * 80)
  54. print()
  55. # 创建输出目录
  56. output_dir.mkdir(exist_ok=True)
  57. # 监控变量
  58. current_trace_id = None
  59. goal_used = False
  60. subagent_used = False
  61. evaluate_used = False
  62. delegate_used = False
  63. explore_used = False
  64. iteration_count = 0
  65. tool_calls_count = {}
  66. evaluation_count = 0
  67. evaluation_results = []
  68. async for item in runner.run(
  69. task=user_prompt,
  70. system_prompt=system_prompt,
  71. model="anthropic/claude-sonnet-4.5",
  72. temperature=0.5,
  73. max_iterations=50,
  74. ):
  75. # 处理 Trace 对象
  76. if isinstance(item, Trace):
  77. current_trace_id = item.trace_id
  78. if item.status == "running":
  79. print(f"[Trace] 开始: {item.trace_id[:8]}...")
  80. elif item.status == "completed":
  81. print()
  82. print("=" * 80)
  83. print(f"[Trace] 完成")
  84. print(f" - 总消息数: {item.total_messages}")
  85. print(f" - 总 Token 数: {item.total_tokens}")
  86. print(f" - 总成本: ${item.total_cost:.4f}")
  87. print("=" * 80)
  88. elif item.status == "failed":
  89. print()
  90. print(f"[Trace] 失败: {item.error_message}")
  91. # 处理 Message 对象
  92. elif isinstance(item, Message):
  93. if item.role == "assistant":
  94. iteration_count += 1
  95. content = item.content
  96. if isinstance(content, dict):
  97. text = content.get("text", "")
  98. tool_calls = content.get("tool_calls")
  99. # 显示 Agent 的思考
  100. if text and not tool_calls:
  101. print(f"\n[{iteration_count}] Agent 回复:")
  102. print(f" {text[:200]}{'...' if len(text) > 200 else ''}")
  103. elif text:
  104. print(f"\n[{iteration_count}] Agent 思考:")
  105. print(f" {text[:150]}{'...' if len(text) > 150 else ''}")
  106. # 显示工具调用
  107. if tool_calls:
  108. for tc in tool_calls:
  109. tool_name = tc.get("function", {}).get("name", "unknown")
  110. args = tc.get("function", {}).get("arguments", {})
  111. # 如果 args 是字符串,尝试解析为 JSON
  112. if isinstance(args, str):
  113. import json
  114. try:
  115. args = json.loads(args)
  116. except:
  117. args = {}
  118. # 统计工具使用
  119. tool_calls_count[tool_name] = tool_calls_count.get(tool_name, 0) + 1
  120. # 检测关键工具使用
  121. if tool_name == "goal":
  122. goal_used = True
  123. if isinstance(args, dict):
  124. if args.get("add"):
  125. print(f" → goal(add): {args['add'][:80]}...")
  126. elif args.get("done"):
  127. print(f" → goal(done): {args['done'][:80]}...")
  128. elif args.get("focus"):
  129. print(f" → goal(focus): {args['focus']}")
  130. else:
  131. print(f" → goal(...)")
  132. elif tool_name == "subagent":
  133. subagent_used = True
  134. if isinstance(args, dict):
  135. mode = args.get("mode", "unknown")
  136. if mode == "evaluate":
  137. evaluate_used = True
  138. evaluation_count += 1
  139. target = args.get("target_goal_id", "?")
  140. print(f" → subagent(evaluate): 评估目标 {target} [评估 #{evaluation_count}]")
  141. elif mode == "delegate":
  142. delegate_used = True
  143. task = args.get("task", "")
  144. print(f" → subagent(delegate): {task[:60]}...")
  145. elif mode == "explore":
  146. explore_used = True
  147. branches = args.get("branches", [])
  148. print(f" → subagent(explore): {len(branches)} 个分支")
  149. else:
  150. print(f" → subagent({mode})")
  151. else:
  152. print(f" → subagent(...)")
  153. else:
  154. # 其他工具简化显示
  155. if tool_name in ["read_file", "write_file", "edit_file"]:
  156. if isinstance(args, dict):
  157. file_path = args.get("file_path", "")
  158. if file_path:
  159. file_name = Path(file_path).name
  160. print(f" → {tool_name}: {file_name}")
  161. else:
  162. print(f" → {tool_name}")
  163. else:
  164. print(f" → {tool_name}")
  165. elif tool_name == "bash_command":
  166. if isinstance(args, dict):
  167. cmd = args.get("command", "")
  168. print(f" → bash: {cmd[:60]}...")
  169. else:
  170. print(f" → bash")
  171. else:
  172. print(f" → {tool_name}")
  173. elif item.role == "tool":
  174. # 检查是否是评估结果
  175. content = item.content
  176. if isinstance(content, str):
  177. import json
  178. try:
  179. result = json.loads(content)
  180. if isinstance(result, dict) and "passed" in result:
  181. passed = result.get("passed", False)
  182. reason = result.get("reason", "")[:100]
  183. evaluation_results.append({
  184. "passed": passed,
  185. "reason": reason
  186. })
  187. status = "✅ 通过" if passed else "❌ 不通过"
  188. print(f" [评估结果] {status}")
  189. if reason:
  190. print(f" 理由: {reason}...")
  191. except:
  192. pass
  193. # 4. 测试结果总结
  194. print()
  195. print("=" * 80)
  196. print("测试结果总结")
  197. print("=" * 80)
  198. print()
  199. print("功能使用情况:")
  200. print(f" - goal 工具: {'✅ 使用' if goal_used else '❌ 未使用'}")
  201. print(f" - subagent 工具: {'✅ 使用' if subagent_used else '❌ 未使用'}")
  202. print(f" - evaluate 模式: {'✅ 使用' if evaluate_used else '❌ 未使用'} ({evaluation_count} 次)")
  203. print(f" - delegate 模式: {'✅ 使用' if delegate_used else '❌ 未使用'}")
  204. print(f" - explore 模式: {'✅ 使用' if explore_used else '❌ 未使用'}")
  205. print()
  206. print("工具调用统计:")
  207. for tool_name, count in sorted(tool_calls_count.items(), key=lambda x: x[1], reverse=True):
  208. print(f" - {tool_name}: {count} 次")
  209. print()
  210. # 评估结果
  211. if evaluation_results:
  212. print("评估结果:")
  213. for i, eval_result in enumerate(evaluation_results, 1):
  214. status = "✅ 通过" if eval_result["passed"] else "❌ 不通过"
  215. print(f" {i}. {status}")
  216. print(f" 理由: {eval_result['reason']}")
  217. print()
  218. # 检查输出文件
  219. print("输出文件:")
  220. auth_file = output_dir / "auth.py"
  221. report_file = output_dir / "IMPLEMENTATION_REPORT.md"
  222. if auth_file.exists():
  223. size = auth_file.stat().st_size
  224. print(f" ✅ auth.py ({size} bytes)")
  225. else:
  226. print(f" ❌ auth.py (未生成)")
  227. if report_file.exists():
  228. size = report_file.stat().st_size
  229. print(f" ✅ IMPLEMENTATION_REPORT.md ({size} bytes)")
  230. else:
  231. print(f" ❌ IMPLEMENTATION_REPORT.md (未生成)")
  232. print()
  233. # 验证测试目标
  234. print("测试目标验证:")
  235. print()
  236. success = True
  237. if evaluate_used:
  238. print(f" ✅ Agent 使用了 subagent(mode='evaluate') ({evaluation_count} 次)")
  239. else:
  240. print(f" ❌ Agent 未使用 subagent(mode='evaluate')")
  241. success = False
  242. if evaluation_results:
  243. print(f" ✅ 获得了评估结果 ({len(evaluation_results)} 次)")
  244. else:
  245. print(f" ❌ 未获得评估结果")
  246. success = False
  247. if auth_file.exists():
  248. print(f" ✅ 生成了代码文件")
  249. else:
  250. print(f" ❌ 未生成代码文件")
  251. success = False
  252. print()
  253. if success:
  254. print("🎉 测试成功!Agent 正确使用了 subagent 评估功能。")
  255. else:
  256. print("⚠️ 测试未完全通过,请检查 Agent 行为。")
  257. print()
  258. if current_trace_id:
  259. print(f"详细日志: .trace/{current_trace_id}/")
  260. print("=" * 80)
  261. if __name__ == "__main__":
  262. asyncio.run(main())