Просмотр исходного кода

feat: rewind, msg compression, control API

Talegorithm 3 недель назад
Родитель
Commit
106a660487
11 измененных файлов с 1086 добавлено и 315 удалено
  1. 141 74
      agent/core/runner.py
  2. 145 102
      agent/trace/compaction.py
  3. 49 3
      agent/trace/goal_models.py
  4. 15 3
      agent/trace/models.py
  5. 17 0
      agent/trace/protocols.py
  6. 144 43
      agent/trace/run_api.py
  7. 29 0
      agent/trace/store.py
  8. 8 3
      api_server.py
  9. 167 69
      docs/README.md
  10. 263 0
      docs/decisions.md
  11. 108 18
      docs/trace-api.md

+ 141 - 74
agent/core/runner.py

@@ -14,8 +14,10 @@ Agent Runner - Agent 执行引擎
 - Messages: OpenAI SDK 格式的任务消息
 """
 
+import asyncio
 import json
 import logging
+import os
 import uuid
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -24,7 +26,7 @@ from typing import AsyncIterator, Optional, Dict, Any, List, Callable, Literal,
 from agent.trace.models import Trace, Message
 from agent.trace.protocols import TraceStore
 from agent.trace.goal_models import GoalTree
-from agent.memory.models import Experience, Skill
+from agent.memory.models import Skill
 from agent.memory.protocols import MemoryStore, StateStore
 from agent.memory.skill_loader import load_skills_from_dir
 from agent.tools import ToolRegistry, get_tool_registry
@@ -170,6 +172,7 @@ class AgentRunner:
         utility_llm_call: Optional[Callable] = None,
         config: Optional[AgentConfig] = None,
         skills_dir: Optional[str] = None,
+        experiences_path: Optional[str] = "./cache/experiences.md",
         goal_tree: Optional[GoalTree] = None,
         debug: bool = False,
     ):
@@ -185,6 +188,7 @@ class AgentRunner:
             utility_llm_call: 轻量 LLM(用于生成任务标题等),可选
             config: [向后兼容] AgentConfig
             skills_dir: Skills 目录路径
+            experiences_path: 经验文件路径(默认 ./cache/experiences.md)
             goal_tree: 初始 GoalTree(可选)
             debug: 保留参数(已废弃)
         """
@@ -196,8 +200,10 @@ class AgentRunner:
         self.utility_llm_call = utility_llm_call
         self.config = config or AgentConfig()
         self.skills_dir = skills_dir
+        self.experiences_path = experiences_path
         self.goal_tree = goal_tree
         self.debug = debug
+        self._cancel_events: Dict[str, asyncio.Event] = {}  # trace_id → cancel event
 
     # ===== 核心公开方法 =====
 
@@ -228,12 +234,16 @@ class AgentRunner:
         try:
             # Phase 1: PREPARE TRACE
             trace, goal_tree, sequence = await self._prepare_trace(messages, config)
+            # 注册取消事件
+            self._cancel_events[trace.trace_id] = asyncio.Event()
             yield trace
 
             # Phase 2: BUILD HISTORY
-            history, sequence, created_messages = await self._build_history(
+            history, sequence, created_messages, head_seq = await self._build_history(
                 trace.trace_id, messages, goal_tree, config, sequence
             )
+            # Update trace's head_sequence in memory
+            trace.head_sequence = head_seq
             for msg in created_messages:
                 yield msg
 
@@ -255,6 +265,10 @@ class AgentRunner:
                 if trace_obj:
                     yield trace_obj
             raise
+        finally:
+            # 清理取消事件
+            if trace:
+                self._cancel_events.pop(trace.trace_id, None)
 
     async def run_result(
         self,
@@ -306,6 +320,22 @@ class AgentRunner:
             },
         }
 
+    async def stop(self, trace_id: str) -> bool:
+        """
+        Request that a running Trace stop.
+
+        Sets the trace's cancel event; the agent loop checks it at the top of each
+        iteration and, when set, marks the Trace "stopped" (if a store exists) and exits.
+
+        Returns:
+            True if the stop signal was set, False if no cancel event is registered for trace_id.
+        """
+        cancel_event = self._cancel_events.get(trace_id)
+        if cancel_event is None:
+            return False
+        cancel_event.set()
+        return True
+
     # ===== 单次调用(保留)=====
 
     async def call(
@@ -432,11 +462,8 @@ class AgentRunner:
             # 回溯模式
             sequence = await self._rewind(config.trace_id, config.insert_after, goal_tree)
         else:
-            # 续跑模式:从最大 sequence + 1 开始
-            all_messages = await self.trace_store.get_trace_messages(
-                config.trace_id, include_abandoned=True
-            )
-            sequence = max((m.sequence for m in all_messages), default=0) + 1
+            # 续跑模式:从 last_sequence + 1 开始
+            sequence = trace_obj.last_sequence + 1
 
         # 状态置为 running
         await self.trace_store.update_trace(
@@ -461,21 +488,30 @@ class AgentRunner:
         """
         构建完整的 LLM 消息历史
 
-        1. 加载已有 active messages(续跑/回溯场景)
-        2. 构建 system prompt(新建时注入 skills/experiences)
-        3. 追加 input messages
+        1. 从 head_sequence 沿 parent chain 加载主路径消息(续跑/回溯场景)
+        2. 构建 system prompt(新建时注入 skills)
+        3. 新建时:在第一条 user message 末尾注入当前经验
+        4. 追加 input messages(设置 parent_sequence 链接到当前 head)
 
         Returns:
-            (history, next_sequence, created_messages)
+            (history, next_sequence, created_messages, head_sequence)
             created_messages: 本次新创建并持久化的 Message 列表,供 run() yield 给调用方
+            head_sequence: 当前主路径头节点的 sequence
         """
         history: List[Dict] = []
         created_messages: List[Message] = []
+        head_seq: Optional[int] = None  # 当前主路径的头节点 sequence
 
-        # 1. 加载已有 messages
+        # 1. 加载已有 messages(通过主路径遍历)
         if config.trace_id and self.trace_store:
-            existing_messages = await self.trace_store.get_trace_messages(trace_id)
-            history = [msg.to_llm_dict() for msg in existing_messages]
+            trace_obj = await self.trace_store.get_trace(trace_id)
+            if trace_obj and trace_obj.head_sequence > 0:
+                main_path = await self.trace_store.get_main_path_messages(
+                    trace_id, trace_obj.head_sequence
+                )
+                history = [msg.to_llm_dict() for msg in main_path]
+                if main_path:
+                    head_seq = main_path[-1].sequence
 
         # 2. 构建 system prompt(如果历史中没有 system message)
         has_system = any(m.get("role") == "system" for m in history)
@@ -490,24 +526,41 @@ class AgentRunner:
                     system_msg = Message.create(
                         trace_id=trace_id, role="system", sequence=sequence,
                         goal_id=None, content=system_prompt,
+                        parent_sequence=None,  # system message 是 root
                     )
                     await self.trace_store.add_message(system_msg)
                     created_messages.append(system_msg)
+                    head_seq = sequence
                     sequence += 1
 
-        # 3. 追加新 messages
+        # 3. 新建时:在第一条 user message 末尾注入当前经验
+        if not config.trace_id:  # 新建模式
+            experiences_text = self._load_experiences()
+            if experiences_text:
+                for msg in new_messages:
+                    if msg.get("role") == "user" and isinstance(msg.get("content"), str):
+                        msg["content"] += f"\n\n## 参考经验\n\n{experiences_text}"
+                        break
+
+        # 4. 追加新 messages(设置 parent_sequence 链接到当前 head)
         for msg_dict in new_messages:
             history.append(msg_dict)
 
             if self.trace_store:
                 stored_msg = Message.from_llm_dict(
-                    msg_dict, trace_id=trace_id, sequence=sequence, goal_id=None
+                    msg_dict, trace_id=trace_id, sequence=sequence,
+                    goal_id=None, parent_sequence=head_seq,
                 )
                 await self.trace_store.add_message(stored_msg)
                 created_messages.append(stored_msg)
+                head_seq = sequence
                 sequence += 1
 
-        return history, sequence, created_messages
+        # 5. 更新 trace 的 head_sequence
+        if self.trace_store and head_seq is not None:
+            await self.trace_store.update_trace(trace_id, head_sequence=head_seq)
+
+        return history, sequence, created_messages, head_seq or 0
 
     # ===== Phase 3: AGENT LOOP =====
 
@@ -523,12 +576,30 @@ class AgentRunner:
         trace_id = trace.trace_id
         tool_schemas = self._get_tool_schemas(config.tools)
 
+        # 当前主路径头节点的 sequence(用于设置 parent_sequence)
+        head_seq = trace.head_sequence
+
         # 设置 goal_tree 到 goal 工具
         if goal_tree and self.trace_store:
             from agent.trace.goal_tool import set_goal_tree
             set_goal_tree(goal_tree)
 
         for iteration in range(config.max_iterations):
+            # 检查取消信号
+            cancel_event = self._cancel_events.get(trace_id)
+            if cancel_event and cancel_event.is_set():
+                logger.info(f"Trace {trace_id} stopped by user")
+                if self.trace_store:
+                    await self.trace_store.update_trace(
+                        trace_id,
+                        status="stopped",
+                        completed_at=datetime.now(),
+                    )
+                    trace_obj = await self.trace_store.get_trace(trace_id)
+                    if trace_obj:
+                        yield trace_obj
+                return
+
             # 构建 LLM messages(注入上下文)
             llm_messages = list(history)
 
@@ -577,12 +648,13 @@ class AgentRunner:
             # 获取当前 goal_id
             current_goal_id = goal_tree.current_id if (goal_tree and goal_tree.current_id) else None
 
-            # 记录 assistant Message
+            # 记录 assistant Message(parent_sequence 指向当前 head)
             assistant_msg = Message.create(
                 trace_id=trace_id,
                 role="assistant",
                 sequence=sequence,
                 goal_id=current_goal_id,
+                parent_sequence=head_seq if head_seq > 0 else None,
                 content={"text": response_content, "tool_calls": tool_calls},
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
@@ -594,6 +666,7 @@ class AgentRunner:
                 await self.trace_store.add_message(assistant_msg)
 
             yield assistant_msg
+            head_seq = sequence
             sequence += 1
 
             # 处理工具调用
@@ -632,6 +705,7 @@ class AgentRunner:
                         role="tool",
                         sequence=sequence,
                         goal_id=current_goal_id,
+                        parent_sequence=head_seq,
                         tool_call_id=tc["id"],
                         content={"tool_name": tool_name, "result": tool_result},
                     )
@@ -640,6 +714,7 @@ class AgentRunner:
                         await self.trace_store.add_message(tool_msg)
 
                     yield tool_msg
+                    head_seq = sequence
                     sequence += 1
 
                     history.append({
@@ -654,11 +729,12 @@ class AgentRunner:
             # 无工具调用,任务完成
             break
 
-        # 完成 Trace
+        # 更新 head_sequence 并完成 Trace
         if self.trace_store:
             await self.trace_store.update_trace(
                 trace_id,
                 status="completed",
+                head_sequence=head_seq,
                 completed_at=datetime.now(),
             )
             trace_obj = await self.trace_store.get_trace(trace_id)
@@ -674,7 +750,9 @@ class AgentRunner:
         goal_tree: Optional[GoalTree],
     ) -> int:
         """
-        执行回溯:标记 insert_after 之后的 messages 和 goals 为 abandoned
+        执行回溯:快照 GoalTree,重建干净树,设置 head_sequence
+
+        新消息的 parent_sequence 将指向 rewind 点,旧消息通过树结构自然脱离主路径。
 
         Returns:
             下一个可用的 sequence 号
@@ -682,7 +760,7 @@ class AgentRunner:
         if not self.trace_store:
             raise ValueError("trace_store required for rewind")
 
-        # 1. 加载所有 messages(含已 abandoned 的)
+        # 1. 加载所有 messages
         all_messages = await self.trace_store.get_trace_messages(
             trace_id, include_abandoned=True
         )
@@ -693,40 +771,37 @@ class AgentRunner:
         # 2. 找到安全截断点(确保不截断在 tool_call 和 tool response 之间)
         cutoff = self._find_safe_cutoff(all_messages, insert_after)
 
-        # 3. 批量标记 messages 为 abandoned
-        abandoned_ids = await self.trace_store.abandon_messages_after(trace_id, cutoff)
-
-        # 4. 处理 Goals
+        # 3. 快照并重建 GoalTree
         if goal_tree:
-            active_messages = [m for m in all_messages if m.sequence <= cutoff]
-            active_goal_ids = {m.goal_id for m in active_messages if m.goal_id}
-
+            # 找出 rewind 点之前已完成的 goal IDs
+            # 通过主路径消息来判断:cutoff 之前的消息引用的 completed goals
+            messages_before = [m for m in all_messages if m.sequence <= cutoff]
+            completed_goal_ids = set()
             for goal in goal_tree.goals:
-                if goal.status == "abandoned":
-                    continue  # 已 abandoned,跳过
-                if goal.status == "completed" and goal.id in active_goal_ids:
-                    continue  # 已完成且有截断点之前的 messages → 保留
-                # 其余全部 abandon(含无 active messages 的 completed goal)
-                goal.status = "abandoned"
-                goal.summary = "回溯导致放弃"
-
-            # 重置 current_id
-            goal_tree._current_id = None
-
-            await self.trace_store.update_goal_tree(trace_id, goal_tree)
-
-        # 5. 记录 rewind 事件
-        abandoned_sequences = [
-            m.sequence for m in all_messages
-            if m.sequence > cutoff and m.status != "abandoned"  # 本次新 abandon 的
-        ]
-        await self.trace_store.append_event(trace_id, "rewind", {
-            "insert_after_sequence": cutoff,
-            "abandoned_message_count": len(abandoned_ids),
-            "abandoned_sequences": abandoned_sequences[:20],  # 只记前 20 条
-        })
-
-        # 6. 返回 next sequence
+                if goal.status == "completed":
+                    # 检查该 goal 是否在 rewind 点之前就已完成(有关联消息在 cutoff 之前)
+                    goal_msgs = [m for m in messages_before if m.goal_id == goal.id]
+                    if goal_msgs:
+                        completed_goal_ids.add(goal.id)
+
+            # 快照到 events
+            await self.trace_store.append_event(trace_id, "rewind", {
+                "insert_after_sequence": cutoff,
+                "goal_tree_snapshot": goal_tree.to_dict(),
+            })
+
+            # 重建干净的 GoalTree
+            new_tree = goal_tree.rebuild_for_rewind(completed_goal_ids)
+            await self.trace_store.update_goal_tree(trace_id, new_tree)
+
+            # 更新内存中的引用
+            goal_tree.goals = new_tree.goals
+            goal_tree.current_id = new_tree.current_id
+
+        # 4. 更新 head_sequence 到 rewind 点
+        await self.trace_store.update_trace(trace_id, head_sequence=cutoff)
+
+        # 5. 返回 next sequence(全局递增,不复用)
         max_seq = max((m.sequence for m in all_messages), default=0)
         return max_seq + 1
 
@@ -806,7 +881,7 @@ class AgentRunner:
         return self.tools.get_schemas(tool_names)
 
     async def _build_system_prompt(self, config: RunConfig) -> Optional[str]:
-        """构建 system prompt(注入 skills 和 experiences)"""
+        """构建 system prompt(注入 skills)"""
         system_prompt = config.system_prompt
 
         # 加载 Skills
@@ -815,27 +890,12 @@ class AgentRunner:
         if skills:
             skills_text = self._format_skills(skills)
 
-        # 加载 Experiences
-        experiences_text = ""
-        if config.enable_memory and self.memory_store:
-            scope = f"agent:{config.agent_type}"
-            # 从 messages 提取文本作为查询
-            experiences = await self.memory_store.search_experiences(scope, system_prompt or "")
-            experiences_text = self._format_experiences(experiences)
-
         # 拼装
         if system_prompt:
             if skills_text:
                 system_prompt += f"\n\n## Skills\n{skills_text}"
-            if experiences_text:
-                system_prompt += f"\n\n## 相关经验\n{experiences_text}"
-        elif skills_text or experiences_text:
-            parts = []
-            if skills_text:
-                parts.append(f"## Skills\n{skills_text}")
-            if experiences_text:
-                parts.append(f"## 相关经验\n{experiences_text}")
-            system_prompt = "\n\n".join(parts)
+        elif skills_text:
+            system_prompt = f"## Skills\n{skills_text}"
 
         return system_prompt
 
@@ -880,7 +940,14 @@ class AgentRunner:
             return ""
         return "\n\n".join(s.to_prompt_text() for s in skills)
 
-    def _format_experiences(self, experiences: List[Experience]) -> str:
-        if not experiences:
+    def _load_experiences(self) -> str:
+        """Load experience notes from self.experiences_path; return an empty string if the path is unset, missing, or unreadable."""
+        if not self.experiences_path:
             return ""
-        return "\n".join(f"- {e.to_prompt_text()}" for e in experiences)
+        try:
+            if os.path.exists(self.experiences_path):
+                with open(self.experiences_path, "r", encoding="utf-8") as f:
+                    return f.read().strip()
+        except Exception as e:
+            logger.warning(f"Failed to load experiences from {self.experiences_path}: {e}")
+        return ""

+ 145 - 102
agent/trace/compaction.py

@@ -1,135 +1,115 @@
 """
-Context 压缩
+Context 压缩 — 两级压缩策略
 
-基于 Goal 状态进行增量压缩:
-- 当 Goal 完成或放弃时,将相关的详细 messages 替换为 summary
+Level 1: GoalTree 过滤(确定性,零成本)
+  - 跳过 completed/abandoned goals 的消息(信息已在 GoalTree summary 中)
+  - 始终保留:system prompt、第一条 user message、当前 focus goal 的消息
+
+Level 2: LLM 总结(仅在 Level 1 后仍超限时触发)
+  - 在消息列表末尾追加压缩 prompt → 主模型回复 → summary 存为新消息
+  - summary 的 parent_sequence 跳过被压缩的范围
+
+压缩不修改存储:原始消息永远保留在 messages/,通过 parent_sequence 树结构实现跳过。
 """
 
-from typing import List, Dict, Any, Optional
-from .goal_models import GoalTree, Goal
+from dataclasses import dataclass
+from typing import List, Dict, Any, Optional, Set
+
+from .goal_models import GoalTree
+from .models import Message
+
+
+# ===== 配置 =====
 
+@dataclass
+class CompressionConfig:
+    """Compression settings."""
+    max_tokens: int = 100000           # Maximum token budget
+    threshold_ratio: float = 0.8       # Fraction of max_tokens above which Level 2 compression triggers (80%)
+    keep_recent_messages: int = 10     # Intended: always keep the N most recent messages in Level 1. NOTE(review): not read by the filter code shown in this diff - confirm
 
-def compress_messages_for_goal(
-    messages: List[Dict[str, Any]],
-    goal_id: str,
-    summary: str,
-) -> List[Dict[str, Any]]:
+
+# ===== Level 1: GoalTree 过滤 =====
+
+def filter_by_goal_status(
+    messages: List[Message],
+    goal_tree: Optional[GoalTree],
+) -> List[Message]:
     """
-    压缩指定 goal 关联的 messages
+    Level 1 过滤:跳过 completed/abandoned goals 的消息
+
+    始终保留:
+    - goal_id 为 None 的消息(system prompt、初始 user message)
+    - 当前 focus goal 及其祖先链上的消息
+    - in_progress 和 pending goals 的消息
 
-    将 goal_id 关联的所有详细 messages 替换为一条 summary message。
+    跳过:
+    - completed 且不在焦点路径上的 goals 的消息
+    - abandoned goals 的消息
 
     Args:
-        messages: 原始消息列表
-        goal_id: 要压缩的 goal ID
-        summary: 压缩后的摘要
+        messages: 主路径上的有序消息列表
+        goal_tree: GoalTree 实例
 
     Returns:
-        压缩后的消息列表
+        过滤后的消息列表
     """
-    # 分离:关联的 messages vs 其他 messages
-    related = []
-    other = []
+    if not goal_tree or not goal_tree.goals:
+        return messages
 
-    for msg in messages:
-        if msg.get("goal_id") == goal_id:
-            related.append(msg)
-        else:
-            other.append(msg)
+    # 构建焦点路径(当前焦点 + 父链 + 直接子节点)
+    focus_path = _get_focus_path(goal_tree)
 
-    # 如果没有关联的消息,直接返回
-    if not related:
-        return messages
+    # 构建需要跳过的 goal IDs
+    skip_goal_ids: Set[str] = set()
+    for goal in goal_tree.goals:
+        if goal.id in focus_path:
+            continue  # 焦点路径上的 goal 始终保留
+        if goal.status in ("completed", "abandoned"):
+            skip_goal_ids.add(goal.id)
 
-    # 找到第一条关联消息的位置(用于插入 summary)
-    first_related_index = None
-    for i, msg in enumerate(messages):
-        if msg.get("goal_id") == goal_id:
-            first_related_index = i
-            break
-
-    # 创建 summary message
-    summary_message = {
-        "role": "assistant",
-        "content": f"[Goal {goal_id} Summary] {summary}",
-        "goal_id": goal_id,
-        "is_summary": True,
-    }
-
-    # 构建新的消息列表
+    # 过滤消息
     result = []
-    summary_inserted = False
-
-    for i, msg in enumerate(messages):
-        if msg.get("goal_id") == goal_id:
-            # 跳过关联的详细消息,在第一个位置插入 summary
-            if not summary_inserted:
-                result.append(summary_message)
-                summary_inserted = True
-        else:
-            result.append(msg)
+    for msg in messages:
+        if msg.goal_id is None:
+            result.append(msg)  # 无 goal 的消息始终保留
+        elif msg.goal_id not in skip_goal_ids:
+            result.append(msg)  # 不在跳过列表中的消息保留
 
     return result
 
 
-def should_compress(goal: Goal) -> bool:
-    """判断 goal 是否需要压缩"""
-    return goal.status in ("completed", "abandoned") and goal.summary is not None
+def _get_focus_path(goal_tree: GoalTree) -> Set[str]:
+    """获取焦点路径上的所有 goal IDs(焦点 + 父链 + 直接子节点)"""
+    focus_ids: Set[str] = set()
 
+    if not goal_tree.current_id:
+        return focus_ids
 
-def compress_all_completed(
-    messages: List[Dict[str, Any]],
-    tree: GoalTree,
-) -> List[Dict[str, Any]]:
-    """
-    压缩所有已完成/已放弃的 goals
-
-    遍历 GoalTree,对所有需要压缩的 goal 执行压缩。
+    # 焦点自身
+    focus_ids.add(goal_tree.current_id)
 
-    Args:
-        messages: 原始消息列表
-        tree: GoalTree 实例
-
-    Returns:
-        压缩后的消息列表
-    """
-    result = messages
-
-    def process_goal(goal: Goal):
-        nonlocal result
-        if should_compress(goal):
-            # 检查是否已经压缩过(避免重复压缩)
-            already_compressed = any(
-                msg.get("goal_id") == goal.id and msg.get("is_summary")
-                for msg in result
-            )
-            if not already_compressed:
-                result = compress_messages_for_goal(result, goal.id, goal.summary)
-
-        # 递归处理子目标
-        for child in goal.children:
-            process_goal(child)
-
-    for goal in tree.goals:
-        process_goal(goal)
+    # 父链
+    goal = goal_tree.find(goal_tree.current_id)
+    while goal and goal.parent_id:
+        focus_ids.add(goal.parent_id)
+        goal = goal_tree.find(goal.parent_id)
 
-    return result
+    # 直接子节点
+    children = goal_tree.get_children(goal_tree.current_id)
+    for child in children:
+        focus_ids.add(child.id)
 
+    return focus_ids
 
-def get_messages_for_goal(
-    messages: List[Dict[str, Any]],
-    goal_id: str,
-) -> List[Dict[str, Any]]:
-    """获取指定 goal 关联的所有 messages"""
-    return [msg for msg in messages if msg.get("goal_id") == goal_id]
 
+# ===== Token 估算 =====
 
-def count_tokens_estimate(messages: List[Dict[str, Any]]) -> int:
+def estimate_tokens(messages: List[Dict[str, Any]]) -> int:
     """
-    估算消息的 token 数量(简单估算)
+    估算消息列表的 token 数量
 
-    实际使用时应该用 tiktoken 或 API 返回的 token 数。
-    这里用简单的字符数 / 4 来估算。
+    简单估算:字符数 / 4。实际使用时应该用 tiktoken 或 API 返回的 token 数。
     """
     total_chars = 0
     for msg in messages:
@@ -137,9 +117,72 @@ def count_tokens_estimate(messages: List[Dict[str, Any]]) -> int:
         if isinstance(content, str):
             total_chars += len(content)
         elif isinstance(content, list):
-            # 多模态消息
             for part in content:
                 if isinstance(part, dict) and part.get("type") == "text":
                     total_chars += len(part.get("text", ""))
+        # tool_calls
+        tool_calls = msg.get("tool_calls")
+        if tool_calls and isinstance(tool_calls, list):
+            for tc in tool_calls:
+                if isinstance(tc, dict):
+                    func = tc.get("function", {})
+                    total_chars += len(func.get("name", ""))
+                    args = func.get("arguments", "")
+                    if isinstance(args, str):
+                        total_chars += len(args)
 
     return total_chars // 4
+
+
+def estimate_tokens_from_messages(messages: List[Message]) -> int:
+    """Estimate the token count of a list of Message objects by converting each with to_llm_dict()."""
+    return estimate_tokens([msg.to_llm_dict() for msg in messages])
+
+
+def needs_level2_compression(
+    token_count: int,
+    config: CompressionConfig,
+) -> bool:
+    """Return True when token_count exceeds config.max_tokens * config.threshold_ratio (the Level 2 trigger)."""
+    return token_count > config.max_tokens * config.threshold_ratio
+
+
+# ===== Level 2: 压缩 Prompt =====
+
+COMPRESSION_PROMPT = """请对以上对话历史进行压缩总结。
+
+要求:
+1. 保留关键决策、结论和产出(如创建的文件、修改的代码、得出的分析结论)
+2. 保留重要的上下文(如用户的要求、约束条件、之前的讨论结果)
+3. 省略中间探索过程、重复的工具调用细节
+4. 使用结构化格式(标题 + 要点)
+5. 控制在 2000 字以内
+
+当前 GoalTree 状态(完整版,含 summary):
+{goal_tree_prompt}
+"""
+
+REFLECT_PROMPT = """请回顾以上整个执行过程,提取有价值的经验教训。
+
+关注以下方面:
+1. **人工干预**:如果有用户中途修改了指令或纠正了方向,说明之前的决策哪里有问题
+2. **弯路**:哪些尝试是不必要的,有没有更直接的方法
+3. **好的决策**:哪些判断和选择是正确的,值得记住
+4. **工具使用**:哪些工具用法是高效的,哪些可以改进
+
+请以简洁的规则列表形式输出,每条规则格式为:
+- 当遇到 [条件] 时,应该 [动作](原因:[简短说明])
+"""
+
+
+def build_compression_prompt(goal_tree: Optional[GoalTree]) -> str:
+    """Build the Level 2 compression prompt, embedding goal_tree.to_prompt(include_summary=True) when a tree is given (empty text otherwise)."""
+    goal_prompt = ""
+    if goal_tree:
+        goal_prompt = goal_tree.to_prompt(include_summary=True)
+    return COMPRESSION_PROMPT.format(goal_tree_prompt=goal_prompt)
+
+
+def build_reflect_prompt() -> str:
+    """Return the reflection prompt (the REFLECT_PROMPT constant, unchanged)."""
+    return REFLECT_PROMPT

+ 49 - 3
agent/trace/goal_models.py

@@ -326,15 +326,22 @@ class GoalTree:
 
         return goal
 
-    def to_prompt(self, include_abandoned: bool = False) -> str:
+    def to_prompt(self, include_abandoned: bool = False, include_summary: bool = False) -> str:
         """
         格式化为 Prompt 注入文本
 
+        Args:
+            include_abandoned: 是否包含已废弃的目标
+            include_summary: 是否显示 completed/abandoned goals 的 summary 详情
+                False(默认)= 精简视图,用于日常周期性注入
+                True = 完整视图(含 summary),用于压缩时提供上下文
+
         展示策略:
         - 过滤掉 abandoned 目标(除非明确要求)
         - 完整展示所有顶层目标
         - 完整展示当前 focus 目标的父链及其所有子孙
         - 其他分支的子目标折叠显示(只显示数量和状态)
+        - include_summary=True 时不折叠,全部展开并显示 summary
         """
         lines = []
         lines.append(f"**Mission**: {self.mission}")
@@ -384,13 +391,19 @@ class GoalTree:
 
             result = [f"{prefix}{icon} {display_id}. {goal.description}{current_mark}"]
 
-            # 显示 summary(如果有)
-            if goal.summary:
+            # 显示 summary:include_summary=True 时全部显示,否则只在焦点路径上显示
+            if goal.summary and (include_summary or goal.id in current_path):
                 result.append(f"{prefix}    → {goal.summary}")
 
             # 递归处理子目标
             children = self.get_children(goal.id)
 
+            # include_summary 模式下不折叠,全部展开
+            if include_summary:
+                for child in children:
+                    result.extend(format_goal(child, indent + 1))
+                return result
+
             # 判断是否需要折叠
             # 如果当前 goal 或其子孙在焦点路径上,完整展示
             should_expand = goal.id in current_path or any(
@@ -464,6 +477,39 @@ class GoalTree:
             created_at=created_at or datetime.now(),
         )
 
+    def rebuild_for_rewind(self, completed_goal_ids: set) -> "GoalTree":
+        """
+        Build a clean GoalTree for a rewind.
+
+        Keeps only goals that are listed in completed_goal_ids and still have status
+        "completed"; every other goal is dropped. current_id is reset to None.
+
+        Args:
+            completed_goal_ids: IDs of goals considered completed before the rewind point
+
+        Returns:
+            A new GoalTree that reuses (and may re-parent) the surviving Goal objects
+        """
+        surviving_goals = []
+        for goal in self.goals:
+            if goal.id in completed_goal_ids and goal.status == "completed":
+                surviving_goals.append(goal)
+
+        # Detach orphans: a survivor whose parent was dropped becomes a root (mutates the shared Goal objects in place)
+        surviving_ids = {g.id for g in surviving_goals}
+        for goal in surviving_goals:
+            if goal.parent_id and goal.parent_id not in surviving_ids:
+                goal.parent_id = None
+
+        new_tree = GoalTree(
+            mission=self.mission,
+            goals=surviving_goals,
+            current_id=None,
+            _next_id=self._next_id,
+            created_at=self.created_at,
+        )
+        return new_tree
+        return new_tree
+
     def save(self, path: str) -> None:
         """保存到 JSON 文件"""
         with open(path, "w", encoding="utf-8") as f:

+ 15 - 3
agent/trace/models.py

@@ -51,7 +51,7 @@ class Trace:
     parent_goal_id: Optional[str] = None      # 哪个 Goal 启动的
 
     # 状态
-    status: Literal["running", "completed", "failed"] = "running"
+    status: Literal["running", "completed", "failed", "stopped"] = "running"
 
     # 统计
     total_messages: int = 0      # 消息总数(改名自 total_steps)
@@ -66,6 +66,7 @@ class Trace:
 
     # 进度追踪(head)
     last_sequence: int = 0      # 最新 message 的 sequence
+    head_sequence: int = 0      # 当前主路径的头节点 sequence(用于 build_llm_messages)
     last_event_id: int = 0      # 最新事件 ID(用于 WS 续传)
 
     # 配置
@@ -120,6 +121,7 @@ class Trace:
             "total_cost": self.total_cost,
             "total_duration_ms": self.total_duration_ms,
             "last_sequence": self.last_sequence,
+            "head_sequence": self.head_sequence,
             "last_event_id": self.last_event_id,
             "uid": self.uid,
             "model": self.model,
@@ -151,7 +153,8 @@ class Message:
     trace_id: str
     role: Literal["system", "user", "assistant", "tool"]   # 和 LLM API 一致
     sequence: int                        # 全局顺序
-    status: Literal["active", "abandoned"] = "active"  # 回溯时后续消息标记为 abandoned
+    parent_sequence: Optional[int] = None  # 父消息的 sequence(构成消息树)
+    status: Literal["active", "abandoned"] = "active"  # [已弃用] 由 parent_sequence 树结构替代
     goal_id: Optional[str] = None        # 关联的 Goal 内部 ID(None = 还没有创建 Goal)
     description: str = ""                # 消息描述(系统自动生成)
     tool_call_id: Optional[str] = None   # tool 消息关联对应的 tool_call
@@ -166,7 +169,7 @@ class Message:
     cost: Optional[float] = None
     duration_ms: Optional[int] = None
     created_at: datetime = field(default_factory=datetime.now)
-    abandoned_at: Optional[datetime] = None  # 回溯标记时间
+    abandoned_at: Optional[datetime] = None  # [已弃用] 由 parent_sequence 树结构替代
 
     # LLM 响应信息(仅 role="assistant" 时使用)
     finish_reason: Optional[str] = None  # stop, length, tool_calls, content_filter 等
@@ -230,6 +233,7 @@ class Message:
         trace_id: str,
         sequence: int,
         goal_id: Optional[str] = None,
+        parent_sequence: Optional[int] = None,
     ) -> "Message":
         """从 OpenAI SDK 格式创建 Message"""
         role = d["role"]
@@ -246,6 +250,7 @@ class Message:
             role=role,
             sequence=sequence,
             goal_id=goal_id,
+            parent_sequence=parent_sequence,
             content=content,
             tool_call_id=d.get("tool_call_id"),
         )
@@ -266,6 +271,10 @@ class Message:
         if "status" not in filtered_data:
             filtered_data["status"] = "active"
 
+        # 向后兼容:旧消息没有 parent_sequence 字段
+        if "parent_sequence" not in filtered_data:
+            filtered_data["parent_sequence"] = None
+
         return cls(**filtered_data)
 
     @classmethod
@@ -277,6 +286,7 @@ class Message:
         goal_id: Optional[str] = None,
         content: Any = None,
         tool_call_id: Optional[str] = None,
+        parent_sequence: Optional[int] = None,
         prompt_tokens: Optional[int] = None,
         completion_tokens: Optional[int] = None,
         reasoning_tokens: Optional[int] = None,
@@ -294,6 +304,7 @@ class Message:
             trace_id=trace_id,
             role=role,
             sequence=sequence,
+            parent_sequence=parent_sequence,
             goal_id=goal_id,
             content=content,
             description=description,
@@ -377,6 +388,7 @@ class Message:
             "trace_id": self.trace_id,
             "role": self.role,
             "sequence": self.sequence,
+            "parent_sequence": self.parent_sequence,
             "status": self.status,
             "goal_id": self.goal_id,
             "tool_call_id": self.tool_call_id,

+ 17 - 0
agent/trace/protocols.py

@@ -135,6 +135,23 @@ class TraceStore(Protocol):
         """
         ...
 
+    async def get_main_path_messages(
+        self,
+        trace_id: str,
+        head_sequence: int
+    ) -> List[Message]:
+        """
+        Return the messages on the main path of a trace.
+
+        The main path is obtained by starting at head_sequence and following
+        each message's parent_sequence link back to the root.
+
+        Args:
+            trace_id: Trace ID
+            head_sequence: sequence of the head message of the main path
+
+        Returns:
+            The main-path Message list in ascending sequence order
+        """
+        ...
+
     async def get_messages_by_goal(
         self,
         trace_id: str,

+ 144 - 43
agent/trace/run_api.py

@@ -1,12 +1,22 @@
 """
-Trace 操作 API — 新建 / 续跑 / 回溯
+Trace 控制 API — 新建 / 运行 / 停止 / 反思
 
-提供 POST 端点触发 Agent 执行。需要通过 set_runner() 注入 AgentRunner 实例。
+提供 POST 端点触发 Agent 执行和控制。需要通过 set_runner() 注入 AgentRunner 实例。
 执行在后台异步进行,客户端通过 WebSocket (/api/traces/{trace_id}/watch) 监听实时更新。
+
+端点:
+  POST /api/traces              — 新建 Trace 并执行
+  POST /api/traces/{id}/run     — 运行(统一续跑 + 回溯)
+  POST /api/traces/{id}/stop    — 停止运行中的 Trace
+  POST /api/traces/{id}/reflect — 反思,在 trace 末尾追加反思 prompt 运行,结果追加到 experiences 文件
+  GET  /api/traces/running      — 列出正在运行的 Trace
+  GET  /api/experiences         — 读取经验文件内容
 """
 
 import asyncio
 import logging
+import os
+from datetime import datetime
 from typing import Any, Dict, List, Optional
 
 from fastapi import APIRouter, HTTPException
@@ -16,6 +26,9 @@ logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/api/traces", tags=["run"])
 
+# 经验 API 使用独立 prefix
+experiences_router = APIRouter(prefix="/api", tags=["experiences"])
+
 
 # ===== 全局 Runner(由 api_server.py 注入)=====
 
@@ -40,7 +53,7 @@ def _get_runner():
 # ===== Request / Response 模型 =====
 
 
-class RunRequest(BaseModel):
+class CreateRequest(BaseModel):
     """新建执行"""
     messages: List[Dict[str, Any]] = Field(..., description="OpenAI SDK 格式的输入消息")
     model: str = Field("gpt-4o", description="模型名称")
@@ -52,31 +65,42 @@ class RunRequest(BaseModel):
     uid: Optional[str] = Field(None)
 
 
-class ContinueRequest(BaseModel):
-    """续跑"""
+class TraceRunRequest(BaseModel):
+    """Request body of the run endpoint (unified continue + rewind)."""
     messages: List[Dict[str, Any]] = Field(
-        default=[{"role": "user", "content": "继续"}],
-        description="追加到末尾的新消息",
+        default_factory=list,
+        description="追加的新消息(可为空,用于重新生成场景)",
+    )
+    insert_after: Optional[int] = Field(
+        None,
+        description="回溯插入点的 message sequence。None = 从末尾续跑,int = 回溯到该 sequence 后运行",
     )
 
 
-class RewindRequest(BaseModel):
-    """回溯重放"""
-    insert_after: int = Field(..., description="截断点的 message sequence(保留该 sequence 及之前的消息)")
-    messages: List[Dict[str, Any]] = Field(
-        default=[{"role": "user", "content": "继续"}],
-        description="在截断点之后插入的新消息",
-    )
+class ReflectRequest(BaseModel):
+    """Request body of the reflect endpoint."""
+    focus: Optional[str] = Field(None, description="反思重点(可选)")
 
 
 class RunResponse(BaseModel):
     """操作响应(立即返回,后台执行)"""
     trace_id: str
-    mode: str  # "new" | "continue" | "rewind"
     status: str = "started"
     message: str = ""
 
 
+class StopResponse(BaseModel):
+    """Response of the stop endpoint."""
+    trace_id: str
+    status: str  # "stopping" | "not_running"
+
+
+class ReflectResponse(BaseModel):
+    """Response of the reflect endpoint."""
+    trace_id: str
+    reflection: str
+
+
 # ===== 后台执行 =====
 
 _running_tasks: Dict[str, asyncio.Task] = {}
@@ -120,7 +144,7 @@ async def _run_with_trace_signal(
 
 
 @router.post("", response_model=RunResponse)
-async def create_and_run(req: RunRequest):
+async def create_and_run(req: CreateRequest):
     """
     新建 Trace 并开始执行
 
@@ -152,18 +176,22 @@ async def create_and_run(req: RunRequest):
 
     return RunResponse(
         trace_id=trace_id,
-        mode="new",
         status="started",
         message=f"Execution started. Watch via WebSocket: /api/traces/{trace_id}/watch",
     )
 
 
-@router.post("/{trace_id}/continue", response_model=RunResponse)
-async def continue_trace(trace_id: str, req: ContinueRequest):
+@router.post("/{trace_id}/run", response_model=RunResponse)
+async def run_trace(trace_id: str, req: TraceRunRequest):
     """
-    续跑已有 Trace
+    运行已有 Trace(统一续跑 + 回溯)
+
+    - insert_after 为 null(或省略):从末尾续跑
+    - insert_after 为 int:回溯到该 sequence 后运行
+    - messages 为空 + insert_after 为 int:重新生成(从该位置重跑,不插入新消息)
 
-    在已有 trace 末尾追加消息,继续执行。
+    insert_after 的值是 message 的 sequence 号。如果指定的 sequence 是一条带
+    tool_calls 的 assistant 消息,系统会自动扩展截断点到其所有 tool response 之后。
     """
     from agent.core.runner import RunConfig
 
@@ -179,50 +207,105 @@ async def continue_trace(trace_id: str, req: ContinueRequest):
     if trace_id in _running_tasks and not _running_tasks[trace_id].done():
         raise HTTPException(status_code=409, detail="Trace is already running")
 
-    config = RunConfig(trace_id=trace_id)
+    config = RunConfig(trace_id=trace_id, insert_after=req.insert_after)
     task = asyncio.create_task(_run_in_background(trace_id, req.messages, config))
     _running_tasks[trace_id] = task
 
+    mode = "rewind" if req.insert_after is not None else "continue"
     return RunResponse(
         trace_id=trace_id,
-        mode="continue",
         status="started",
-        message=f"Continue started. Watch via WebSocket: /api/traces/{trace_id}/watch",
+        message=f"Run ({mode}) started. Watch via WebSocket: /api/traces/{trace_id}/watch",
     )
 
 
-@router.post("/{trace_id}/rewind", response_model=RunResponse)
-async def rewind_trace(trace_id: str, req: RewindRequest):
+@router.post("/{trace_id}/stop", response_model=StopResponse)
+async def stop_trace(trace_id: str):
     """
-    回溯重放
+    Stop a running Trace.
 
-    从指定 sequence 处截断,abandon 后续消息和 goals,插入新消息重新执行。
-    insert_after 的值是 message 的 sequence 号,可通过 GET /api/traces/{trace_id}/messages 查看。
-    如果指定的 sequence 是一条带 tool_calls 的 assistant 消息,系统会自动扩展截断点到其所有 tool response 之后。
+    Sets the cancel signal; the agent loop checks it before the next LLM call and exits.
+    The Trace status is set to "stopped".
+
+    Returns:
+        StopResponse with status "stopping" when runner.stop() succeeded or an
+        unfinished background task was cancelled here, otherwise "not_running".
+    """
+    runner = _get_runner()
+
+    # Set the cancel signal through the runner's stop method
+    stopped = await runner.stop(trace_id)
+
+    if not stopped:
+        # The runner did not stop it; check whether a task is still registered in _running_tasks (it may already have finished)
+        if trace_id in _running_tasks:
+            task = _running_tasks[trace_id]
+            if not task.done():
+                task.cancel()
+                _running_tasks.pop(trace_id, None)
+                return StopResponse(trace_id=trace_id, status="stopping")
+        return StopResponse(trace_id=trace_id, status="not_running")
+
+    return StopResponse(trace_id=trace_id, status="stopping")
+
+
+@router.post("/{trace_id}/reflect", response_model=ReflectResponse)
+async def reflect_trace(trace_id: str, req: ReflectRequest):
+    """
+    Trigger a reflection run.
+
+    Appends a user message containing the reflection prompt to the end of the
+    trace, runs the agent to obtain the reflection, and appends the result to
+    the experiences file (default ./cache/experiences.md).
+
+    The reflection messages form a side branch: head_sequence is saved before
+    the run and restored afterwards (also when the run raises), so they do not
+    appear on the main conversation path.
+
+    Raises:
+        HTTPException: 503 when no TraceStore is configured, 404 when the trace
+            does not exist, 409 when the trace is currently running.
     """
     from agent.core.runner import RunConfig
+    from agent.trace.compaction import build_reflect_prompt
 
     runner = _get_runner()
 
+    if not runner.trace_store:
+        raise HTTPException(status_code=503, detail="TraceStore not configured")
+
     # 验证 trace 存在
-    if runner.trace_store:
-        trace = await runner.trace_store.get_trace(trace_id)
-        if not trace:
-            raise HTTPException(status_code=404, detail=f"Trace not found: {trace_id}")
+    trace = await runner.trace_store.get_trace(trace_id)
+    if not trace:
+        raise HTTPException(status_code=404, detail=f"Trace not found: {trace_id}")
 
-    # 检查是否已在运行
+    # Refuse to reflect while the trace is running
     if trace_id in _running_tasks and not _running_tasks[trace_id].done():
-        raise HTTPException(status_code=409, detail="Trace is already running")
+        raise HTTPException(status_code=409, detail="Cannot reflect on a running trace. Stop it first.")
 
-    config = RunConfig(trace_id=trace_id, insert_after=req.insert_after)
-    task = asyncio.create_task(_run_in_background(trace_id, req.messages, config))
-    _running_tasks[trace_id] = task
+    # Remember the current head_sequence so it can be restored after the run
+    saved_head_sequence = trace.head_sequence
 
-    return RunResponse(
+    # Build the reflection prompt
+    prompt = build_reflect_prompt()
+    if req.focus:
+        prompt += f"\n\n请特别关注:{req.focus}"
+
+    # Run as a continuation: append the user message, the agent replies with the reflection
+    config = RunConfig(trace_id=trace_id)
+    try:
+        result = await runner.run_result(
+            messages=[{"role": "user", "content": prompt}],
+            config=config,
+        )
+    finally:
+        # Restore head_sequence even when the run fails; otherwise the
+        # partially written reflection messages would stay on the main path
+        await runner.trace_store.update_trace(trace_id, head_sequence=saved_head_sequence)
+
+    # A missing or None summary becomes "" so ReflectResponse validation cannot fail
+    reflection_text = result.get("summary") or ""
+
+    # Append to the experiences file
+    if reflection_text:
+        experiences_path = getattr(runner, "experiences_path", "./cache/experiences.md")
+        if experiences_path:
+            parent_dir = os.path.dirname(experiences_path)
+            # A bare file name has no directory part; os.makedirs("") would raise FileNotFoundError
+            if parent_dir:
+                os.makedirs(parent_dir, exist_ok=True)
+            header = f"\n\n---\n\n## {trace_id} ({datetime.now().strftime('%Y-%m-%d %H:%M')})\n\n"
+            with open(experiences_path, "a", encoding="utf-8") as f:
+                f.write(header + reflection_text + "\n")
+            logger.info(f"Reflection appended to {experiences_path}")
+
+    return ReflectResponse(
         trace_id=trace_id,
-        mode="rewind",
-        status="started",
-        message=f"Rewind to sequence {req.insert_after} started. Watch via WebSocket: /api/traces/{trace_id}/watch",
+        reflection=reflection_text,
     )
 
 
@@ -236,3 +319,21 @@ async def list_running():
         else:
             running.append(tid)
     return {"running": running}
+
+
+# ===== 经验 API =====
+
+
+@experiences_router.get("/experiences")
+async def list_experiences():
+    """
+    Return the content of the experiences file.
+
+    Returns:
+        A dict {"content": <file text>, "path": <configured path>}. content is
+        "" when no path is configured or the file does not exist.
+    """
+    runner = _get_runner()
+    experiences_path = getattr(runner, "experiences_path", "./cache/experiences.md")
+
+    if not experiences_path:
+        return {"content": "", "path": experiences_path}
+
+    # Open directly instead of exists() + open(): a file removed between the
+    # two calls would otherwise surface as an unhandled FileNotFoundError
+    try:
+        with open(experiences_path, "r", encoding="utf-8") as f:
+            content = f.read()
+    except FileNotFoundError:
+        content = ""
+
+    return {"content": content, "path": experiences_path}

+ 29 - 0
agent/trace/store.py

@@ -491,6 +491,35 @@ class FileSystemTraceStore:
         messages.sort(key=lambda m: m.sequence)
         return messages
 
+    async def get_main_path_messages(
+        self,
+        trace_id: str,
+        head_sequence: int
+    ) -> List[Message]:
+        """
+        Return the messages on the main path, ordered root → head.
+
+        Starts at head_sequence and follows parent_sequence links until a
+        message whose parent_sequence is None is reached. The walk also stops
+        at a sequence that cannot be found, or at one that was already visited
+        (a parent cycle), so malformed data cannot loop forever.
+
+        Args:
+            trace_id: Trace ID
+            head_sequence: sequence of the head message of the main path
+
+        Returns:
+            The main-path Message list in root → head order
+        """
+        # Load every message (abandoned ones included) and index it by sequence
+        all_messages = await self.get_trace_messages(trace_id, include_abandoned=True)
+        messages_by_seq = {m.sequence: m for m in all_messages}
+
+        # Walk back from the head along the parent chain
+        path = []
+        visited = set()
+        seq = head_sequence
+        while seq is not None and seq not in visited:
+            msg = messages_by_seq.get(seq)
+            if msg is None:
+                break
+            visited.add(seq)
+            path.append(msg)
+            seq = msg.parent_sequence
+
+        # Reverse into forward order (root → head)
+        path.reverse()
+        return path
+
     async def get_messages_by_goal(
         self,
         trace_id: str,

+ 8 - 3
api_server.py

@@ -3,8 +3,9 @@ API Server - FastAPI 应用入口
 
 聚合所有模块的 API 路由:
 - GET  /api/traces — 查询(trace/api.py)
-- POST /api/traces — 执行(trace/run_api.py,需配置 Runner)
+- POST /api/traces — 执行控制(trace/run_api.py,需配置 Runner)
 - WS   /api/traces/{id}/watch — 实时推送(trace/websocket.py)
+- GET  /api/experiences — 经验查询(trace/run_api.py,需配置 Runner)
 """
 
 import logging
@@ -16,7 +17,7 @@ import uvicorn
 
 from agent.trace import FileSystemTraceStore
 from agent.trace.api import router as api_router, set_trace_store as set_api_trace_store
-from agent.trace.run_api import router as run_router, set_runner
+from agent.trace.run_api import router as run_router, experiences_router, set_runner
 from agent.trace.websocket import router as ws_router, set_trace_store as set_ws_trace_store
 
 
@@ -59,7 +60,7 @@ set_ws_trace_store(trace_store)
 
 # ===== 可选:配置 Runner(启用执行 API)=====
 
-# 如需启用 POST /api/traces(新建/续跑/回溯),取消以下注释并配置 LLM:
+# 如需启用 POST /api/traces(新建/运行/停止/反思),取消以下注释并配置 LLM:
 #
 # from agent.core.runner import AgentRunner
 # from agent.llm import create_openrouter_llm_call
@@ -67,6 +68,7 @@ set_ws_trace_store(trace_store)
 # runner = AgentRunner(
 #     trace_store=trace_store,
 #     llm_call=create_openrouter_llm_call(model="google/gemini-2.5-flash"),
+#     experiences_path="./cache/experiences.md",  # 经验文件路径
 # )
 # set_runner(runner)
 
@@ -77,6 +79,9 @@ set_ws_trace_store(trace_store)
 # 注意:run_router 必须在 api_router 之前注册,否则 GET /running 会被 /{trace_id} 捕获
 app.include_router(run_router)
 
+# 经验 API(GET /api/experiences,需配置 Runner)
+app.include_router(experiences_router)
+
 # Trace 查询 API(GET)
 app.include_router(api_router)
 

+ 167 - 69
docs/README.md

@@ -156,11 +156,11 @@ class RunConfig:
 
 通过 RunConfig 参数自然区分,统一入口 `run(messages, config)`:
 
-| 模式 | trace_id | insert_after | messages 含义 |
-|------|----------|-------------|--------------|
-| 新建 | None | - | 初始任务消息 |
-| 续跑 | 已有 ID | None | 追加到末尾的新消息 |
-| 回溯 | 已有 ID | 指定 sequence | 在插入点之后追加的新消息 |
+| 模式 | trace_id | insert_after | messages 含义 | API 端点 |
+|------|----------|-------------|--------------|----------|
+| 新建 | None | - | 初始任务消息 | `POST /api/traces` |
+| 续跑 | 已有 ID | None | 追加到末尾的新消息 | `POST /api/traces/{id}/run` |
+| 回溯 | 已有 ID | 指定 sequence | 在插入点之后追加的新消息 | `POST /api/traces/{id}/run` |
 
 ### 执行流程
 
@@ -169,14 +169,14 @@ async def run(messages: List[Dict], config: RunConfig = None) -> AsyncIterator[U
     # Phase 1: PREPARE TRACE
     #   无 trace_id → 创建新 Trace(生成 name,初始化 GoalTree)
     #   有 trace_id + 无 insert_after → 加载已有 Trace,状态置为 running
-    #   有 trace_id + 有 insert_after → 加载 Trace,执行 rewind(标记后续 msgs/goals 为 abandoned
+    #   有 trace_id + 有 insert_after → 加载 Trace,执行 rewind(快照 GoalTree,重建,设 parent_sequence)
     trace = await _prepare_trace(config)
     yield trace
 
     # Phase 2: BUILD HISTORY
-    #   加载已有 active messages(续跑/回溯场景)
+    #   从 head_sequence 沿 parent chain 回溯构建主路径消息
     #   构建 system prompt(新建时注入 skills/experiences;续跑时复用已有)
-    #   追加 input messages
+    #   追加 input messages(设置 parent_sequence 指向当前 head)
     history, sequence = await _build_history(trace, messages, config)
 
     # Phase 3: AGENT LOOP
@@ -205,12 +205,12 @@ async def run(messages: List[Dict], config: RunConfig = None) -> AsyncIterator[U
 回溯通过 `RunConfig(trace_id=..., insert_after=N)` 触发,在 Phase 1 中执行:
 
 1. **验证插入点**:确保不截断在 assistant(tool_calls) 和 tool response 之间
-2. **标记 Messages**:sequence > cutoff 的 messages 标记 `status="abandoned"`
-3. **处理 Goals**:已完成且消息均在插入点之前的保留,其余 abandon
-4. **记录事件**:events.jsonl 追加 `rewind` 事件
-5. **更新 Trace**:status 改回 running
+2. **快照 GoalTree**:将当前完整 GoalTree 存入 `events.jsonl`(rewind 事件的 `goal_tree_snapshot` 字段)
+3. **重建 GoalTree**:保留 rewind 点之前已 completed 的 goals,丢弃其余,清空 `current_id`
+4. **设置 parent_sequence**:新消息的 `parent_sequence` 指向 rewind 点,旧消息自动脱离主路径
+5. **更新 Trace**:`head_sequence` 更新为新消息的 sequence,status 改回 running
 
-新消息的 sequence 从 `max(all_sequences) + 1` 开始,不复用被 abandon 的序号
+新消息的 sequence 从 `last_sequence + 1` 开始(全局递增,不复用)。旧消息无需标记 abandoned,通过消息树结构自然隔离
 
 ### 调用接口
 
@@ -238,23 +238,53 @@ async for item in runner.run(
     config=RunConfig(trace_id="existing-trace-id", insert_after=5),
 ):
     ...
+
+# 重新生成:回溯后不插入新消息,直接基于已有消息重跑
+async for item in runner.run(
+    messages=[],
+    config=RunConfig(trace_id="existing-trace-id", insert_after=5),
+):
+    ...
 ```
 
 `insert_after` 的值是 message 的 `sequence` 号,可通过 `GET /api/traces/{trace_id}/messages` 查看。如果指定的 sequence 是一条带 `tool_calls` 的 assistant 消息,系统会自动将截断点扩展到其所有对应的 tool response 之后(安全截断)。
 
+**停止运行**:
+
+```python
+# 停止正在运行的 Trace
+await runner.stop(trace_id)
+```
+
+调用后 agent loop 在下一个检查点退出,Trace 状态置为 `stopped`。
+
 - `run(messages, config)`:**核心方法**,流式返回 `AsyncIterator[Union[Trace, Message]]`
 - `run_result(messages, config)`:便利方法,内部消费 `run()`,返回结构化结果。主要用于 `agent`/`evaluate` 工具内部
 
 ### REST API
 
-操作型端点(需在 `api_server.py` 中配置 Runner)。执行在后台异步进行,通过 WebSocket 监听进度。
+#### 查询端点
 
 | 方法 | 路径 | 说明 |
 |------|------|------|
-| POST | `/api/traces` | 新建 Trace 并执行 |
-| POST | `/api/traces/{id}/continue` | 续跑 |
-| POST | `/api/traces/{id}/rewind` | 回溯重放 |
+| GET  | `/api/traces` | 列出 Traces |
+| GET  | `/api/traces/{id}` | 获取 Trace 详情(含 GoalTree、Sub-Traces) |
+| GET  | `/api/traces/{id}/messages` | 获取 Messages |
 | GET  | `/api/traces/running` | 列出正在运行的 Trace |
+| WS   | `/api/traces/{id}/watch` | 实时事件推送 |
+
+**实现**:`agent/trace/api.py`, `agent/trace/websocket.py`
+
+#### 控制端点
+
+需在 `api_server.py` 中配置 Runner。执行在后台异步进行,通过 WebSocket 监听进度。
+
+| 方法 | 路径 | 说明 |
+|------|------|------|
+| POST | `/api/traces` | 新建 Trace 并执行 |
+| POST | `/api/traces/{id}/run` | 运行(统一续跑 + 回溯) |
+| POST | `/api/traces/{id}/stop` | 停止运行中的 Trace |
+| POST | `/api/traces/{id}/reflect` | 触发反思,从执行历史中提取经验 |
 
 ```bash
 # 新建
@@ -262,16 +292,35 @@ curl -X POST http://localhost:8000/api/traces \
   -H "Content-Type: application/json" \
   -d '{"messages": [{"role": "user", "content": "分析项目架构"}], "model": "gpt-4o"}'
 
-# 续跑
-curl -X POST http://localhost:8000/api/traces/{trace_id}/continue \
+# 续跑(insert_after 为 null 或省略)
+curl -X POST http://localhost:8000/api/traces/{trace_id}/run \
   -d '{"messages": [{"role": "user", "content": "继续深入分析"}]}'
 
 # 回溯:从 sequence 5 处截断,插入新消息重新执行
-curl -X POST http://localhost:8000/api/traces/{trace_id}/rewind \
+curl -X POST http://localhost:8000/api/traces/{trace_id}/run \
   -d '{"insert_after": 5, "messages": [{"role": "user", "content": "换一个方案"}]}'
+
+# 重新生成:回溯到 sequence 5,不插入新消息,直接重跑
+curl -X POST http://localhost:8000/api/traces/{trace_id}/run \
+  -d '{"insert_after": 5, "messages": []}'
+
+# 停止
+curl -X POST http://localhost:8000/api/traces/{trace_id}/stop
+
+# 反思:追加反思 prompt 运行,结果追加到 experiences 文件
+curl -X POST http://localhost:8000/api/traces/{trace_id}/reflect \
+  -d '{"focus": "为什么第三步选择了错误的方案"}'
 ```
 
-响应立即返回 `{"trace_id": "...", "mode": "rewind", "status": "started"}`,通过 `WS /api/traces/{trace_id}/watch` 监听实时事件。
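+新建和 run 端点的响应立即返回 `{"trace_id": "...", "status": "started"}`,通过 `WS /api/traces/{trace_id}/watch` 监听实时事件;stop 返回 `{"trace_id": "...", "status": "stopping" | "not_running"}`;reflect 会等待反思运行结束后才返回 `{"trace_id": "...", "reflection": "..."}`。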
+
+**实现**:`agent/trace/run_api.py`
+
+#### 经验端点
+
+| 方法 | 路径 | 说明 |
+|------|------|------|
+| GET  | `/api/experiences` | 读取经验文件内容 |
 
 **实现**:`agent/trace/run_api.py`
 
@@ -301,7 +350,7 @@ class Trace:
     parent_goal_id: Optional[str] = None     # 哪个 Goal 启动的
 
     # 状态
-    status: Literal["running", "completed", "failed"] = "running"
+    status: Literal["running", "completed", "failed", "stopped"] = "running"
 
     # 统计
     total_messages: int = 0
@@ -312,7 +361,8 @@ class Trace:
     total_duration_ms: int = 0
 
     # 进度追踪
-    last_sequence: int = 0                   # 最新 message 的 sequence
+    last_sequence: int = 0                   # 最新 message 的 sequence(全局递增,不复用)
+    head_sequence: int = 0                   # 当前主路径的头节点 sequence(用于 build_llm_messages)
     last_event_id: int = 0                   # 最新事件 ID(用于 WS 续传)
 
     # 配置
@@ -386,7 +436,7 @@ class Goal:
 
 ### Message(执行消息)
 
-对应 LLM API 的消息,每条 Message 关联一个 Goal。
+对应 LLM API 的消息,每条 Message 关联一个 Goal。消息通过 `parent_sequence` 形成树结构。
 
 ```python
 @dataclass
@@ -394,8 +444,8 @@ class Message:
     message_id: str                          # 格式:{trace_id}-{sequence:04d}
     trace_id: str
     role: Literal["system", "user", "assistant", "tool"]
-    sequence: int                            # 全局顺序
-    status: Literal["active", "abandoned"] = "active"  # 回溯时后续消息标记为 abandoned
+    sequence: int                            # 全局顺序(递增,不复用)
+    parent_sequence: Optional[int] = None    # 父消息的 sequence(构成消息树)
     goal_id: Optional[str] = None            # 关联的 Goal ID(初始消息为 None,系统会按需自动创建 root goal 兜底)
     description: str = ""                    # 系统自动生成的摘要
     tool_call_id: Optional[str] = None
@@ -411,15 +461,29 @@ class Message:
     finish_reason: Optional[str] = None
 
     created_at: datetime
-    abandoned_at: Optional[datetime] = None  # 回溯标记时间
+
+    # [已弃用] 由 parent_sequence 树结构替代
+    status: Literal["active", "abandoned"] = "active"
+    abandoned_at: Optional[datetime] = None
+```
+
+**消息树(Message Tree)**:
+
+消息通过 `parent_sequence` 形成树。主路径 = 从 `trace.head_sequence` 沿 parent chain 回溯到 root。
+
+```
+正常对话:1 → 2 → 3 → 4 → 5       (每条的 parent 指向前一条)
+Rewind 到 3:3 → 6(parent=3) → 7   (新主路径,4-5 自动脱离)
+压缩 1-3:   8(summary, parent=None) → 6 → 7  (summary 跳过被压缩的消息)
+反思分支:   5 → 9(reflect, parent=5) → 10     (侧枝,不在主路径上)
 ```
 
+`build_llm_messages` = 从 head 沿 parent_sequence 链回溯到 root,反转后返回。
+
 Message 提供格式转换方法:
 - `to_llm_dict()` → OpenAI 格式 Dict(用于 LLM 调用)
 - `from_llm_dict(d, trace_id, sequence, goal_id)` → 从 OpenAI 格式创建 Message
 
-加载 messages 时,默认只返回 `status="active"` 的消息。
-
 **实现**:`agent/trace/models.py`
 
 ---
@@ -662,62 +726,94 @@ agent/memory/skills/
 
 从执行历史中提取的经验规则,用于指导未来任务。
 
-### 数据结构
+### 存储
 
-```python
-@dataclass
-class Experience:
-    id: str
-    scope: str           # "agent:executor" 或 "user:123"
-    condition: str       # "当遇到数据库连接超时"
-    rule: str            # "增加重试次数到5次"
-    evidence: Dict       # 证据(trace_ids)
-    confidence: float
-    usage_count: int
-    success_rate: float
-    embedding: List[float]  # 向量,用于检索
+经验以 Markdown 文件存储(默认 `./cache/experiences.md`),人类可读、可编辑、可版本控制。
+
+文件格式:
+
+```markdown
+---
+
+## trace-id-xxx (2026-02-12 15:30)
+
+- 当遇到 X 情况时,应该先 Y 再 Z
+- 分析代码前应先读取项目结构
+
+---
+
+## trace-id-yyy (2026-02-12 16:00)
+
+- 执行 bash 命令前应检查路径是否存在
 ```
 
-### 检索和注入
+### 反思机制(Reflect)
 
-```python
-# 1. 检索相关 Experiences
-experiences = await db.query(
-    "SELECT * FROM experiences WHERE scope = $1 ORDER BY embedding <-> $2 LIMIT 10",
-    f"agent:{agent_type}", embed(task)
-)
+通过 `POST /api/traces/{id}/reflect` 触发:
+
+1. 在 trace 末尾追加一条 user message(内置反思 prompt),**作为侧枝**(parent_sequence 分叉,不在主路径上)
+2. Agent 回顾整个执行过程,生成经验总结
+3. 将 assistant 的反思内容追加到 `./cache/experiences.md`
 
-# 2. 注入到 system prompt
-system_prompt += "\n# Learned Experiences\n" + format_experiences(experiences)
+反思消息不影响主对话路径。正常 continue/rewind 时看不到反思消息。
+
+反思 prompt 引导 Agent 关注:人工干预说明做错了什么、走了哪些弯路、哪些决策是对的。
+
+**实现**:`agent/trace/run_api.py:reflect_trace`
+
+### 注入
+
+新建 Trace 时,Runner 自动读取 `./cache/experiences.md` 并追加到第一条 user message 末尾:
+
+```python
+# _build_history 中(仅新建模式):
+if not config.trace_id:
+    experiences_text = self._load_experiences()  # 读取文件
+    if experiences_text:
+        first_user_msg["content"] += f"\n\n## 参考经验\n\n{experiences_text}"
 ```
 
-**存储**:PostgreSQL + pgvector
+后续 continue/rewind 不重新注入(经验已在初始消息中)。
 
-**实现**:`agent/memory/stores.py:ExperienceStore`
+**实现**:`agent/core/runner.py:AgentRunner._build_history`
 
 ---
 
 ## Context 压缩
 
-### 压缩时机
+### 两级压缩策略
 
-Goal 完成(done)或放弃(abandon)时,将详细 Messages 替换为 Summary Message。
+#### Level 1:GoalTree 过滤(确定性,零成本)
 
-### 压缩策略
+每轮 agent loop 构建 `llm_messages` 时自动执行:
+- 始终保留:system prompt、第一条 user message(含 GoalTree 精简视图)、当前 focus goal 的消息
+- 跳过 completed/abandoned goals 的消息(信息已在 GoalTree summary 中)
+- 通过 Message Tree 的 parent_sequence 实现跳过
 
-```
-Goal 状态变化
-    ↓
-收集该 Goal 下的所有 Messages
-    ↓
-生成 Summary(由 LLM 提供)
-    ↓
-替换原始 Messages 为单条 Summary Message
-    ↓
-更新统计信息
-```
+大多数情况下 Level 1 足够。
 
-**实现**:`agent/trace/compaction.py`
+#### Level 2:LLM 总结(仅在 Level 1 后仍超限时触发)
+
+触发条件:Level 1 之后 token 数仍超过阈值(默认 `max_tokens × 0.8`)。
+
+流程:
+1. **经验提取**:先在消息列表末尾追加反思 prompt → 主模型回复 → 追加到 `./cache/experiences.md`。反思消息为侧枝(parent_sequence 分叉,不在主路径上)
+2. **压缩**:在消息列表末尾追加压缩 prompt(含 GoalTree 完整视图) → 主模型回复 → summary 存为新消息,其 `parent_sequence` 跳过被压缩的范围
+
+### GoalTree 双视图
+
+`to_prompt()` 支持两种模式:
+- `include_summary=False`(默认):精简视图,用于日常周期性注入
+- `include_summary=True`:含所有 completed goals 的 summary,用于 Level 2 压缩时提供上下文
+
+### 压缩存储
+
+- 原始消息永远保留在 `messages/`
+- 压缩 summary 作为普通 Message 存储
+- 通过 `parent_sequence` 树结构实现跳过,无需 compression events 或 skip list
+- Rewind 到压缩区域内时,summary 脱离主路径,原始消息自动恢复
+
+**实现**:`agent/trace/compaction.py`, `agent/trace/goal_models.py`
 
 **详细文档**:[Context 管理](./context-management.md)
 
@@ -732,11 +828,13 @@ class TraceStore(Protocol):
     async def update_trace(self, trace_id: str, **updates) -> None: ...
     async def add_message(self, message: Message) -> None: ...
     async def get_trace_messages(self, trace_id: str, include_abandoned: bool = False) -> List[Message]: ...
+    async def get_main_path_messages(self, trace_id: str, head_sequence: int) -> List[Message]: ...
     async def get_messages_by_goal(self, trace_id: str, goal_id: str) -> List[Message]: ...
-    async def abandon_messages_after(self, trace_id: str, cutoff_sequence: int) -> List[str]: ...
     async def append_event(self, trace_id: str, event_type: str, payload: Dict) -> int: ...
 ```
 
+`get_main_path_messages` 从 `head_sequence` 沿 `parent_sequence` 链回溯,返回主路径上的有序消息列表。
+
 **实现**:
 - 协议定义:`agent/trace/protocols.py`
 - 文件存储:`agent/trace/store.py:FileSystemTraceStore`

+ 263 - 0
docs/decisions.md

@@ -836,4 +836,267 @@ MessageContent = Union[str, List[Dict[str, str]]]     # content 字段(文本
 
 **实现**:`agent/tools/builtin/subagent.py`, `agent/trace/models.py`, `agent/tools/schema.py`
 
+---
+
+## 19. 前端控制 API:统一 run + stop + reflect
+
+**日期**: 2026-02-12
+
+### 问题
+
+需要从前端控制 Agent 的创建、启动(含从任意位置重放)、插入用户消息、打断运行。原有 API 将 `continue` 和 `rewind` 拆分为两个独立端点,但它们本质上是同一操作(在某个位置运行),仅 `insert_after` 是否为 null 的区别。此外,缺少停止和反思机制。
+
+### 决策
+
+#### 19a. 合并 `continue` + `rewind` → 统一 `run` 端点
+
+```
+POST /api/traces/{id}/run
+{
+  "messages": [...],
+  "insert_after": null | int
+}
+```
+
+- `insert_after: null` → 从末尾续跑(原 continue)
+- `insert_after: N` → 回溯到 sequence N 后运行(原 rewind)
+- `messages: []` + `insert_after: N` → 重新生成(从 N 处重跑,不插入新消息)
+
+删除 `POST /{id}/continue` 和 `POST /{id}/rewind` 两个端点。
+
+#### 19b. 新增 `stop` 端点 + Runner 取消机制
+
+```
+POST /api/traces/{id}/stop
+```
+
+Runner 内部维护 `_cancel_events: Dict[str, asyncio.Event]`,agent loop 在每次 LLM 调用前检查。`stop()` 方法设置事件,loop 退出,Trace 状态置为 `stopped`。
+
+Trace.status 新增 `"stopped"` 值。
+
+#### 19c. 新增 `reflect` 端点 — 追加反思 prompt 运行
+
+```
+POST /api/traces/{id}/reflect
+{
+  "focus": "optional, 反思重点"
+}
+```
+
+在 trace 末尾追加一条内置反思 prompt 的 user message,以续跑方式运行 agent。Agent 回顾整个执行过程后生成经验总结,结果自动追加到 `./cache/experiences.md`。
+
+不单独调用 LLM、不解析结构化数据——反思就是一次普通的 agent 运行,只是 user message 是预置的反思 prompt。
+
+#### 19d. 经验存储简化为文件
+
+经验存储从 MemoryStore(内存/数据库)简化为 `./cache/experiences.md` 文件:
+- 人类可读可编辑(Markdown)
+- 可版本控制(git)
+- 新建 Trace 时由 Runner 读取并注入到第一条 user message 末尾
+- `GET /api/experiences` 直接读取文件内容返回
+
+### 最终 API 设计
+
+```
+控制类(3 个端点,替代原来的 3 个):
+  POST /api/traces              → 创建并运行(不变)
+  POST /api/traces/{id}/run     → 运行(合并 continue + rewind)
+  POST /api/traces/{id}/stop    → 停止(新增)
+
+学习类(2 个端点,全新):
+  POST /api/traces/{id}/reflect → 追加反思 prompt 运行,结果追加到 experiences 文件
+  GET  /api/experiences         → 读取经验文件内容
+```
+
+### 理由
+
+1. **控制端点不增加**:`continue` 和 `rewind` 合并为 `run` 后,即使新增 `stop`,控制类端点仍是 3 个(3 → 3);另外新增 2 个学习类端点
+2. **概念统一**:`run` 就是"在某个位置运行",`insert_after` 自然区分续跑和回溯,与 `RunConfig` 设计一致
+3. **前端简化**:`sendMessage()` 直接透传 `branchPoint` 作为 `insert_after`,无需判断调哪个 API
+4. **停止机制**:asyncio.Event 轻量可靠,每次 LLM 调用前检查,不会在工具执行中途被打断
+5. **反思闭环**:Run → Observe → Intervene → Reflect → Run,形成完整的学习循环
+6. **经验存储极简**:一个 Markdown 文件,不需要数据库,人类可读可编辑可版本控制
+
+### 变更范围
+
+- `agent/trace/models.py` — Trace.status 增加 `"stopped"`
+- `agent/core/runner.py` — `_cancel_events` 字典,`stop()` 方法,agent loop 检查取消;`experiences_path` 参数,`_load_experiences()` 方法,新建时注入经验到 user message
+- `agent/trace/run_api.py` — 合并 `continue`/`rewind` 为 `run`,新增 `stop`/`reflect` 端点,`GET /api/experiences` 读取文件
+- `api_server.py` — 注入 experiences_router
+
+**实现**:`agent/trace/run_api.py`, `agent/core/runner.py`, `agent/trace/models.py`
+
+---
+
+## 20. Message Tree:用 parent_sequence 构建消息树
+
+**日期**: 2026-02-13
+
+### 问题
+
+原有的消息管理使用线性列表 + `status=abandoned` 标记,导致:
+1. 压缩需要独立的 compression events + skip list 来标记跳过哪些消息
+2. 反思消息掺入主对话列表,需要额外过滤
+3. Rewind 需要标记 abandoned + 维护 GoalTree 快照
+4. `build_llm_messages` 逻辑复杂(过滤 abandoned + 应用 skip + 排除反思)
+
+### 决策
+
+**选择:Message 新增 `parent_sequence` 字段,消息形成树结构**
+
+核心规则:**`build_llm_messages` = 从 head 沿 parent_sequence 链回溯到 root**。
+
+**压缩**:summary 的 `parent_sequence` 指向压缩范围起点的前一条消息,旧消息自然脱离主路径。
+
+```
+压缩前主路径:1 → 2 → 3 → ... → 41 → 42 → ...
+压缩后:
+  1 → 2 → 3 → ... → 41 (旧路径,脱离主路径)
+       ↓
+  2 → 45(summary, parent=2) → 46 → ...  (新主路径)
+```
+
+**反思**:反思消息从当前消息分出侧枝,不汇入主路径,天然隔离。
+
+**Rewind**:新消息的 `parent_sequence` 指向 rewind 点,旧路径自动变成死胡同。
+
+```
+Rewind 到 seq 20:
+  主路径原本:1 → ... → 20 → 21 → ... → 50
+  Rewind 后:20 → 51(新, parent=20) → 52 → ...
+  新主路径:1 → ... → 20 → 51 → 52 → ...
+  旧消息 21-50 脱离主路径,无需标记 abandoned
+```
+
+**build_llm_messages**:
+
+```python
+def build_llm_messages(head_sequence, messages_by_seq):
+    path = []
+    seq = head_sequence
+    while seq is not None:
+        msg = messages_by_seq[seq]
+        path.append(msg)
+        seq = msg.parent_sequence
+    path.reverse()
+    return [m.to_llm_dict() for m in path]
+```
+
+### 不再需要的机制
+
+- ~~Message.status (abandoned)~~ → 树结构替代
+- ~~Message.abandoned_at~~ → 树结构替代
+- ~~compression events in events.jsonl~~ → summary.parent_sequence 替代
+- ~~abandon_messages_after()~~ → 新消息设 parent_sequence 即可
+- ~~skip list / 过滤逻辑~~ → parent chain 遍历替代
+
+### 变更范围
+
+- `agent/trace/models.py` — Message 新增 `parent_sequence`,`status`/`abandoned_at` 保留但标记弃用
+- `agent/trace/store.py` — 新增 `get_main_path_messages()`,Trace 追踪 `head_sequence`
+- `agent/trace/protocols.py` — 新增 `get_main_path_messages()` 接口
+- `agent/core/runner.py` — agent loop 中设置 parent_sequence,rewind 使用新模型
+
+**实现**:`agent/trace/models.py`, `agent/trace/store.py`, `agent/core/runner.py`
+
+---
+
+## 21. GoalTree Rewind:快照 + 重建
+
+**日期**: 2026-02-13
+
+### 问题
+
+Message Tree 解决了消息层面的分支问题,但 GoalTree 是独立的状态,不适合从消息树派生(压缩会使目标创建消息脱离主路径,但目标应该保留)。
+
+### 决策
+
+**选择:GoalTree 保持独立管理,rewind 时快照 + 重建**
+
+**Rewind 流程**:
+1. 把当前完整 GoalTree 快照存入 `events.jsonl`(rewind 事件的 `goal_tree_snapshot` 字段)
+2. 重建干净的 GoalTree:保留 rewind 点之前已 completed 的 goals,丢弃其余
+3. 清空 `current_id`,让 Agent 重新选择焦点
+
+**快照用途**:仅用于非运行态下查看历史版本,运行时和前端展示只使用当前干净的 goal.json。
+
+**Agent 自主废弃**:Agent 调用 `goal(abandon=...)` 时,abandoned goals 正常保留在 GoalTree 中,前端逐一收到事件,可以展示废弃的分支。
+
+**用户 Rewind**:不展示废弃的分支。GoalTree 被清理为只包含存活 goals,用户可通过"历史版本"页面查看快照。
+
+### 理由
+
+1. GoalTree 和 Messages 的生命周期不同——压缩可以移除消息但不能移除目标
+2. 快照 + 重建逻辑简单可靠,不需要 event sourcing
+3. 干净的 goal.json 让运行时和前端展示始终一致
+
+### 变更范围
+
+- `agent/core/runner.py:_rewind()` — 快照旧树到事件,重建干净树
+- `agent/trace/store.py` — rewind 事件增加 `goal_tree_snapshot`
+
+**实现**:`agent/core/runner.py`
+
+---
+
+## 22. Context 压缩:GoalTree 双视图 + 两级压缩
+
+**日期**: 2026-02-13
+
+### 问题
+
+长时间运行的 Agent 会累积大量 messages,超出 LLM 上下文窗口。需要在保留关键信息的前提下压缩历史。
+
+### 决策
+
+**选择:Level 1 确定性过滤 + Level 2 LLM 总结,压缩不修改存储**
+
+#### 22a. GoalTree 双视图
+
+`to_prompt()` 支持两种模式:
+- `include_summary=False`(默认):精简视图,用于日常周期性注入
+- `include_summary=True`:含所有 completed goals 的 summary,用于压缩时提供上下文
+
+压缩视图追加到第一条 user message 末尾(构建 `llm_messages` 时的内存操作,不修改存储)。
+
+#### 22b. Level 1:GoalTree 过滤(确定性,零成本)
+
+每轮 agent loop 构建 `llm_messages` 时:
+- 始终保留:system prompt、第一条 user message、focus goal 的消息
+- 跳过 completed/abandoned goals 的消息(信息已在 GoalTree summary 中)
+- 通过 Message Tree 的 parent_sequence 实现(压缩 summary 的 parent 跳过被压缩的消息)
+
+大多数情况下 Level 1 足够。
+
+#### 22c. Level 2:LLM 总结(仅在 Level 1 后仍超限时触发)
+
+触发条件:Level 1 之后 token 数仍超过阈值(默认 max_tokens × 0.8)。
+
+做法:在当前消息列表末尾追加压缩 prompt → 主模型回复 → summary 作为新消息存入 messages/,其 parent_sequence 跳过被压缩的范围。
+
+不使用 utility_llm,就用主模型。压缩和反思都是"在消息列表末尾追加 prompt,主模型回复"。
+
+#### 22d. 压缩前经验提取
+
+触发 Level 2 压缩之前,先在消息列表末尾追加反思 prompt → 主模型回复 → 结果追加到 `./cache/experiences.md`。反思消息为侧枝(parent_sequence 分叉,不在主路径上)。
+
+#### 22e. 压缩不修改存储
+
+- `messages/` 始终保留原始消息
+- 压缩结果(summary)作为新消息存入 messages/
+- 通过 parent_sequence 树结构实现"跳过",不需要 compression events 或 skip list
+- Rewind 到压缩区域内时,原始消息自动恢复到主路径(summary 脱离新主路径)
+
+#### 22f. 多次压缩的恢复
+
+每次压缩的 summary 消息通过 parent_sequence 跳过被压缩的范围。Rewind 时,如果 rewind 点在某次压缩之后,该压缩的 summary 仍在主路径上,压缩保持生效;如果 rewind 点在压缩之前,summary 脱离新主路径,原始消息自动恢复。无需特殊恢复逻辑。
+
+### 变更范围
+
+- `agent/trace/goal_models.py` — `to_prompt(include_summary)` 双视图
+- `agent/trace/compaction.py` — 压缩触发逻辑、Level 1/Level 2 实现
+- `agent/core/runner.py` — agent loop 中集成压缩
+
+**实现**:`agent/trace/compaction.py`, `agent/trace/goal_models.py`, `agent/core/runner.py`
+
 ---

+ 108 - 18
docs/trace-api.md

@@ -6,16 +6,20 @@
 
 ## 架构概览
 
-**职责定位**:`agent/execution` 模块负责所有 Trace/Message 相关功能
+**职责定位**:`agent/trace` 模块负责所有 Trace/Message 相关功能
 
 ```
-agent/execution/
+agent/trace/
 ├── models.py          # Trace/Message 数据模型
+├── goal_models.py     # Goal/GoalTree 数据模型
 ├── protocols.py       # TraceStore 存储接口
-├── fs_store.py        # 文件系统存储实现
+├── store.py           # 文件系统存储实现
 ├── trace_id.py        # Trace ID 生成工具
-├── api.py             # RESTful API
-└── websocket.py       # WebSocket 实时推送
+├── api.py             # RESTful 查询 API
+├── run_api.py         # 控制 API(run/stop/reflect)
+├── websocket.py       # WebSocket 实时推送
+├── goal_tool.py       # goal 工具(计划管理)
+└── compaction.py      # Context 压缩
 ```
 
 **设计原则**:
@@ -54,22 +58,23 @@ trace.task            # 任务描述
 trace.parent_trace_id # 父 Trace ID(Sub-Trace 专用)
 trace.parent_goal_id  # 触发的父 Goal ID(Sub-Trace 专用)
 trace.agent_type      # Agent 类型:explore, delegate 等
-trace.status          # "running" | "completed" | "failed"
+trace.status          # "running" | "completed" | "failed" | "stopped"
 trace.total_messages  # Message 总数
 trace.total_tokens    # Token 总数
 trace.total_cost      # 总成本
 trace.current_goal_id # 当前焦点 goal
+trace.head_sequence   # 当前主路径头节点 sequence(用于 build_llm_messages)
 ```
 
 **Trace ID 格式**:
 - **主 Trace**:标准 UUID,例如 `2f8d3a1c-4b6e-4f9a-8c2d-1e5b7a9f3c4d`
 - **Sub-Trace**:`{parent_uuid}@{mode}-{timestamp}-{seq}`,例如 `2f8d3a1c...@explore-20260204220012-001`
 
-**实现**:`agent/execution/models.py:Trace`
+**实现**:`agent/trace/models.py:Trace`
 
 ### Message - 执行消息
 
-对应 LLM API 消息,加上元数据。通过 `goal_id` 关联 GoalTree 中的目标。
+对应 LLM API 消息,加上元数据。通过 `goal_id` 关联 GoalTree 中的目标。通过 `parent_sequence` 形成消息树。
 
 ```python
 # assistant 消息(模型返回,可能含 text + tool_calls)
@@ -78,6 +83,7 @@ assistant_msg = Message.create(
     role="assistant",
     goal_id="3",                    # Goal ID(Trace 内部自增)
     content={"text": "...", "tool_calls": [...]},
+    parent_sequence=5,              # 父消息的 sequence
 )
 
 # tool 消息
@@ -87,14 +93,17 @@ tool_msg = Message.create(
     goal_id="5",
     tool_call_id="call_abc123",
     content="工具执行结果",
+    parent_sequence=6,
 )
 ```
 
+**parent_sequence**:指向父消息的 sequence,构成消息树。主路径 = 从 `trace.head_sequence` 沿 parent chain 回溯到 root。
+
 **description 字段**(系统自动生成):
 - `assistant` 消息:优先取 content 中的 text,若无 text 则生成 "tool call: XX, XX"
 - `tool` 消息:使用 tool name
 
-**实现**:`agent/execution/models.py:Message`
+**实现**:`agent/trace/models.py:Message`
 
 ---
 
@@ -120,6 +129,7 @@ class TraceStore(Protocol):
     async def add_message(self, message: Message) -> str: ...
     async def get_message(self, message_id: str) -> Optional[Message]: ...
     async def get_trace_messages(self, trace_id: str) -> List[Message]: ...
+    async def get_main_path_messages(self, trace_id: str, head_sequence: int) -> List[Message]: ...
     async def get_messages_by_goal(self, trace_id: str, goal_id: str) -> List[Message]: ...
     async def update_message(self, message_id: str, **updates) -> None: ...
 
@@ -128,12 +138,12 @@ class TraceStore(Protocol):
     async def append_event(self, trace_id: str, event_type: str, payload: Dict) -> int: ...
 ```
 
-**实现**:`agent/execution/protocols.py`
+**实现**:`agent/trace/protocols.py`
 
 ### FileSystemTraceStore
 
 ```python
-from agent.execution import FileSystemTraceStore
+from agent.trace import FileSystemTraceStore
 
 store = FileSystemTraceStore(base_path=".trace")
 ```
@@ -164,13 +174,15 @@ store = FileSystemTraceStore(base_path=".trace")
 - ✅ 每个 Sub-Trace 是顶层独立目录
 - ✅ Sub-Trace 有完整的 Trace 结构(meta + goal + messages + events)
 
-**实现**:`agent/execution/fs_store.py`
+**实现**:`agent/trace/store.py`
 
 ---
 
 ## REST API 端点
 
-### 1. 列出 Traces
+### 查询端点
+
+#### 1. 列出 Traces
 
 ```http
 GET /api/traces?mode=agent&status=running&limit=20
@@ -178,7 +190,7 @@ GET /api/traces?mode=agent&status=running&limit=20
 
 返回所有 Traces(包括主 Trace 和 Sub-Traces)。
 
-### 2. 获取 Trace + GoalTree + Sub-Traces
+#### 2. 获取 Trace + GoalTree + Sub-Traces
 
 ```http
 GET /api/traces/{trace_id}
@@ -189,7 +201,7 @@ GET /api/traces/{trace_id}
 - GoalTree(该 Trace 的完整 Goal 树)
 - Sub-Traces 元数据(查询所有 `parent_trace_id == trace_id` 的 Traces)
 
-### 3. 获取 Messages
+#### 3. 获取 Messages
 
 ```http
 GET /api/traces/{trace_id}/messages?goal_id=3
@@ -197,7 +209,85 @@ GET /api/traces/{trace_id}/messages?goal_id=3
 
 返回指定 Trace 的 Messages,可选按 Goal 过滤。
 
-**实现**:`agent/execution/api.py`
+**实现**:`agent/trace/api.py`
+
+### 控制端点
+
+需在 `api_server.py` 中配置 Runner。执行在后台异步进行,通过 WebSocket 监听进度。
+
+#### 4. 新建 Trace 并执行
+
+```http
+POST /api/traces
+Content-Type: application/json
+
+{
+  "messages": [{"role": "user", "content": "分析项目架构"}],
+  "model": "gpt-4o",
+  "temperature": 0.3,
+  "max_iterations": 200,
+  "system_prompt": null,
+  "tools": null,
+  "name": "任务名称",
+  "uid": "user_id"
+}
+```
+
+#### 5. 运行(统一续跑 + 回溯)
+
+```http
+POST /api/traces/{trace_id}/run
+Content-Type: application/json
+
+{
+  "messages": [{"role": "user", "content": "..."}],
+  "insert_after": null
+}
+```
+
+- `insert_after: null`(或省略) → 从末尾续跑
+- `insert_after: N` → 回溯到 sequence N,在其之后插入 `messages` 并运行
+- `messages: []` + `insert_after: N` → 不插入新消息,从 sequence N 之后重新生成
+
+#### 6. 停止运行中的 Trace
+
+```http
+POST /api/traces/{trace_id}/stop
+```
+
+设置取消信号,agent loop 在下一个检查点退出,Trace 状态置为 `stopped`。
+
+#### 7. 列出正在运行的 Trace
+
+```http
+GET /api/traces/running
+```
+
+#### 8. 反思(提取经验)
+
+```http
+POST /api/traces/{trace_id}/reflect
+Content-Type: application/json
+
+{
+  "focus": "可选,反思重点"
+}
+```
+
+在 trace 末尾追加一条包含反思 prompt 的 user message,以续跑方式运行 agent。
+Agent 回顾整个执行过程后生成经验总结,结果自动追加到 `./cache/experiences.md`。
+
+### 经验端点
+
+#### 9. 读取经验文件
+
+```http
+GET /api/experiences
+```
+
+返回 `./cache/experiences.md` 的文件内容。
+
+**实现**:`agent/trace/run_api.py`
 
 ---
 
@@ -235,7 +325,7 @@ ws://localhost:8000/api/traces/{trace_id}/watch?since_event_id=0
 2. 自动设置父 Goal 的 `status = "completed"`
 3. 在 `goal_updated` 事件的 `affected_goals` 中包含级联完成的父节点
 
-**实现**:`agent/execution/websocket.py`
+**实现**:`agent/trace/websocket.py`
 
 ---
 
@@ -288,7 +378,7 @@ result = await delegate_tool(
 
 ```python
 from agent import AgentRunner
-from agent.execution import FileSystemTraceStore
+from agent.trace import FileSystemTraceStore
 
 store = FileSystemTraceStore(base_path=".trace")
 runner = AgentRunner(trace_store=store, llm_call=my_llm_fn)