Talegorithm 2 недель назад
Родитель
Сommit
5f21dfade3
3 измененных файлов с 208 добавлено и 32 удалено
  1. 143 25
      agent/core/runner.py
  2. 53 6
      agent/llm/yescode.py
  3. 12 1
      docs/README.md

+ 143 - 25
agent/core/runner.py

@@ -520,10 +520,10 @@ class AgentRunner:
                     trace_id, trace_obj.head_sequence
                 )
 
-                # 清理尾部不完整的 tool_call/tool_result 对
-                # 当 agent 被 stop 时,可能恰好存了 assistant(tool_calls) 但还没存 tool results,
-                # 直接发给 LLM 会报 400(tool_use without tool_result)
-                main_path = self._trim_dangling_tool_calls(main_path)
+                # 修复 orphaned tool_calls(中断导致的 tool_call 无 tool_result)
+                main_path, sequence = await self._heal_orphaned_tool_calls(
+                    main_path, trace_id, goal_tree, sequence,
+                )
 
                 history = [msg.to_llm_dict() for msg in main_path]
                 if main_path:
@@ -911,36 +911,154 @@ class AgentRunner:
 
         return cutoff
 
-    @staticmethod
-    def _trim_dangling_tool_calls(messages: List[Message]) -> List[Message]:
+    async def _heal_orphaned_tool_calls(
+        self,
+        messages: List[Message],
+        trace_id: str,
+        goal_tree: Optional[GoalTree],
+        sequence: int,
+    ) -> tuple:
         """
-        从消息列表尾部移除不完整的 tool_call/tool_result 对。
+        检测并修复消息历史中的 orphaned tool_calls。
+
+        当 agent 被 stop/crash 中断时,可能有 assistant 的 tool_calls 没有对应的
+        tool results(包括多 tool_call 部分完成的情况)。直接发给 LLM 会导致 400。
 
-        当 agent 被 stop 中断时,可能最后一条消息是带 tool_calls 的 assistant,
-        但对应的 tool results 尚未存储。直接发给 LLM 会导致 400 错误。
-        此方法从尾部向前回退,直到最后一条消息不是悬空的 tool_call。
+        修复策略:为每个缺失的 tool_result 插入合成的"中断通知"消息,而非裁剪。
+        - 普通工具:简短中断提示
+        - agent/evaluate:包含 sub_trace_id、执行统计、continue_from 指引
+
+        合成消息持久化到 store,确保幂等(下次续跑不再触发)。
+
+        Returns:
+            (healed_messages, next_sequence)
         """
         if not messages:
-            return messages
+            return messages, sequence
 
-        while messages:
-            last = messages[-1]
-            if last.role != "assistant":
-                break
+        # 收集所有 tool_call IDs → (assistant_msg, tool_call_dict)
+        tc_map: Dict[str, tuple] = {}
+        result_ids: set = set()
 
-            content = last.content
-            if not isinstance(content, dict) or not content.get("tool_calls"):
-                break
+        for msg in messages:
+            if msg.role == "assistant":
+                content = msg.content
+                if isinstance(content, dict) and content.get("tool_calls"):
+                    for tc in content["tool_calls"]:
+                        tc_id = tc.get("id")
+                        if tc_id:
+                            tc_map[tc_id] = (msg, tc)
+            elif msg.role == "tool" and msg.tool_call_id:
+                result_ids.add(msg.tool_call_id)
+
+        orphaned_ids = [tc_id for tc_id in tc_map if tc_id not in result_ids]
+        if not orphaned_ids:
+            return messages, sequence
+
+        logger.info(
+            "检测到 %d 个 orphaned tool_calls,生成合成中断通知",
+            len(orphaned_ids),
+        )
+
+        healed = list(messages)
+        head_seq = messages[-1].sequence
+
+        for tc_id in orphaned_ids:
+            assistant_msg, tc = tc_map[tc_id]
+            tool_name = tc.get("function", {}).get("name", "unknown")
+
+            if tool_name in ("agent", "evaluate"):
+                result_text = self._build_agent_interrupted_result(
+                    tc, goal_tree, assistant_msg,
+                )
+            else:
+                result_text = (
+                    f"⚠️ 工具 {tool_name} 执行被中断(进程异常退出),"
+                    "未获得执行结果。请根据需要重新调用。"
+                )
 
-            # 最后一条是 assistant + tool_calls,检查 tool_results 是否齐全
-            # 既然它是最后一条,后面没有 tool results → 悬空,需要移除
-            logger.info(
-                "移除尾部悬空的 tool_call 消息 (sequence=%d)",
-                last.sequence,
+            synthetic_msg = Message.create(
+                trace_id=trace_id,
+                role="tool",
+                sequence=sequence,
+                goal_id=assistant_msg.goal_id,
+                parent_sequence=head_seq,
+                tool_call_id=tc_id,
+                content={"tool_name": tool_name, "result": result_text},
+            )
+
+            if self.trace_store:
+                await self.trace_store.add_message(synthetic_msg)
+
+            healed.append(synthetic_msg)
+            head_seq = sequence
+            sequence += 1
+
+        # 更新 trace head/last sequence
+        if self.trace_store:
+            await self.trace_store.update_trace(
+                trace_id,
+                head_sequence=head_seq,
+                last_sequence=max(head_seq, sequence - 1),
+            )
+
+        return healed, sequence
+
+    def _build_agent_interrupted_result(
+        self,
+        tc: Dict,
+        goal_tree: Optional[GoalTree],
+        assistant_msg: Message,
+    ) -> str:
+        """为中断的 agent/evaluate 工具调用构建合成结果(对齐正常返回值格式)"""
+        args_str = tc.get("function", {}).get("arguments", "{}")
+        try:
+            args = json.loads(args_str) if isinstance(args_str, str) else args_str
+        except json.JSONDecodeError:
+            args = {}
+
+        task = args.get("task", "未知任务")
+        if isinstance(task, list):
+            task = "; ".join(task)
+
+        tool_name = tc.get("function", {}).get("name", "agent")
+        mode = "evaluate" if tool_name == "evaluate" else "delegate"
+
+        # 从 goal_tree 查找 sub_trace 信息
+        sub_trace_id = None
+        stats = None
+        if goal_tree and assistant_msg.goal_id:
+            goal = goal_tree.find(assistant_msg.goal_id)
+            if goal and goal.sub_trace_ids:
+                first = goal.sub_trace_ids[0]
+                if isinstance(first, dict):
+                    sub_trace_id = first.get("trace_id")
+                elif isinstance(first, str):
+                    sub_trace_id = first
+                if goal.cumulative_stats:
+                    s = goal.cumulative_stats
+                    if s.message_count > 0:
+                        stats = {
+                            "message_count": s.message_count,
+                            "total_tokens": s.total_tokens,
+                            "total_cost": round(s.total_cost, 4),
+                        }
+
+        result: Dict[str, Any] = {
+            "mode": mode,
+            "status": "interrupted",
+            "summary": "⚠️ 子Agent执行被中断(进程异常退出)",
+            "task": task,
+        }
+        if sub_trace_id:
+            result["sub_trace_id"] = sub_trace_id
+            result["hint"] = (
+                f'使用 continue_from="{sub_trace_id}" 可继续执行,保留已有进度'
             )
-            messages = messages[:-1]
+        if stats:
+            result["stats"] = stats
 
-        return messages
+        return json.dumps(result, ensure_ascii=False, indent=2)
 
     # ===== 上下文注入 =====
 

+ 53 - 6
agent/llm/yescode.py

@@ -37,15 +37,62 @@ _RETRYABLE_EXCEPTIONS = (
     ConnectionError,
 )
 
-# 简短模型名 -> Anthropic API 模型名
-MODEL_MAP = {
+# 模糊匹配规则:(关键词, 目标模型名),从精确到宽泛排序
+# 精确匹配走 MODEL_EXACT,不命中则按顺序尝试关键词匹配
+MODEL_EXACT = {
+    "claude-sonnet-4-5-20250929": "claude-sonnet-4-5-20250929",
+    "claude-sonnet-4-5": "claude-sonnet-4-5-20250929",
     "claude-sonnet-4.5": "claude-sonnet-4-5-20250929",
+    "claude-opus-4-6": "claude-opus-4-6",
+    "claude-opus-4-5-20251101": "claude-opus-4-5-20251101",
+    "claude-opus-4-5": "claude-opus-4-5-20251101",
+    "claude-opus-4-1-20250805": "claude-opus-4-1-20250805",
+    "claude-opus-4-1": "claude-opus-4-1-20250805",
+    "claude-haiku-4-5-20251001": "claude-haiku-4-5-20251001",
+    "claude-haiku-4-5": "claude-haiku-4-5-20251001",
 }
 
+MODEL_FUZZY = [
+    # 版本+家族(精确)
+    ("sonnet-4-5", "claude-sonnet-4-5-20250929"),
+    ("sonnet-4.5", "claude-sonnet-4-5-20250929"),
+    ("opus-4-6", "claude-opus-4-6"),
+    ("opus-4.6", "claude-opus-4-6"),
+    ("opus-4-5", "claude-opus-4-5-20251101"),
+    ("opus-4.5", "claude-opus-4-5-20251101"),
+    ("opus-4-1", "claude-opus-4-1-20250805"),
+    ("opus-4.1", "claude-opus-4-1-20250805"),
+    ("haiku-4-5", "claude-haiku-4-5-20251001"),
+    ("haiku-4.5", "claude-haiku-4-5-20251001"),
+    # 仅家族名 → 最新版本
+    ("sonnet", "claude-sonnet-4-5-20250929"),
+    ("opus", "claude-opus-4-6"),
+    ("haiku", "claude-haiku-4-5-20251001"),
+]
+
 
 def _resolve_model(model: str) -> str:
-    """将简短模型名映射为实际 API 模型名"""
-    return MODEL_MAP.get(model, model)
+    """将任意格式的模型名映射为 Yescode API 接受的模型名。
+    支持:OpenRouter 前缀(anthropic/xxx)、带点号(4.5)、纯家族名(sonnet)等。
+    """
+    # 1. 剥离 provider 前缀
+    if "/" in model:
+        model = model.split("/", 1)[1]
+
+    # 2. 精确匹配
+    if model in MODEL_EXACT:
+        return MODEL_EXACT[model]
+
+    # 3. 模糊匹配(大小写不敏感)
+    model_lower = model.lower()
+    for keyword, target in MODEL_FUZZY:
+        if keyword in model_lower:
+            logger.info("模型名模糊匹配: %s → %s", model, target)
+            return target
+
+    # 4. 兜底:原样返回,让 API 报错
+    logger.warning("未能匹配模型名: %s, 原样传递", model)
+    return model
 
 
 def _normalize_tool_call_ids(messages: List[Dict[str, Any]], target_prefix: str) -> List[Dict[str, Any]]:
@@ -298,7 +345,7 @@ async def yescode_llm_call(
     }
 
     # 调用 API(带重试)
-    max_retries = 3
+    max_retries = 5
     last_exception = None
     for attempt in range(max_retries):
         async with httpx.AsyncClient(timeout=300.0) as client:
@@ -311,7 +358,7 @@ async def yescode_llm_call(
             except httpx.HTTPStatusError as e:
                 error_body = e.response.text
                 status = e.response.status_code
-                if status in (429, 500, 502, 503, 504, 529) and attempt < max_retries - 1:
+                if status in (429, 500, 502, 503, 504, 524, 529) and attempt < max_retries - 1:
                     wait = 2 ** attempt * 2
                     logger.warning(
                         "[Yescode] HTTP %d (attempt %d/%d), retrying in %ds: %s",

+ 12 - 1
docs/README.md

@@ -290,7 +290,18 @@ await runner.stop(trace_id)
 
 调用后 agent loop 在下一个检查点退出,Trace 状态置为 `stopped`,同时保存当前 `head_sequence`(确保续跑时能正确加载完整历史)。
 
-**消息完整性保护**:续跑/回溯加载历史时,`_build_history` 自动裁剪尾部悬空的 tool_call 消息(`_trim_dangling_tool_calls`)。当 agent 被 stop 中断时,可能最后一条消息是带 `tool_calls` 的 assistant,但对应的 tool results 尚未存储;直接发给 LLM 会导致 400 错误,此机制在框架层自动处理。
+**消息完整性保护(orphaned tool_call 修复)**:续跑加载历史时,`_build_history` 自动检测并修复 orphaned tool_calls(`_heal_orphaned_tool_calls`)。当 agent 被 stop/crash 中断时,可能存在 assistant 的 tool_calls 没有对应的 tool results(包括部分完成的情况:3 个 tool_call 只有 1 个 tool_result)。直接发给 LLM 会导致 400 错误。
+
+修复策略:为每个缺失的 tool_result **插入合成的中断通知**(而非裁剪 assistant 消息):
+
+| 工具类型 | 合成 tool_result 内容 |
+|----------|---------------------|
+| 普通工具 | 简短中断提示,建议重新调用 |
+| agent/evaluate | 结构化中断信息,包含 `sub_trace_id`、执行统计、`continue_from` 用法指引 |
+
+agent 工具的合成结果对齐正常返回值格式(含 `sub_trace_id` 字段),主 Agent 可直接使用 `agent(task=..., continue_from=sub_trace_id)` 续跑被中断的子 Agent。合成消息持久化存储,确保幂等。
+
+**实现**:`agent/core/runner.py:AgentRunner._heal_orphaned_tool_calls`
 
 - `run(messages, config)`:**核心方法**,流式返回 `AsyncIterator[Union[Trace, Message]]`
 - `run_result(messages, config)`:便利方法,内部消费 `run()`,返回结构化结果。主要用于 `agent`/`evaluate` 工具内部