Просмотр исходного кода

Merge branch 'main' of https://git.yishihui.com/howard/Agent

max_liu 1 неделя назад
Родитель
Commit
f5b1b2b73b

+ 42 - 24
agent/core/runner.py

@@ -475,6 +475,10 @@ class AgentRunner:
             raise ValueError(f"Trace not found: {config.trace_id}")
 
         goal_tree = await self.trace_store.get_goal_tree(config.trace_id)
+        if goal_tree is None:
+            # 防御性兜底:trace 存在但 goal.json 丢失时,创建空树
+            goal_tree = GoalTree(mission=trace_obj.task or "Agent task")
+            await self.trace_store.update_goal_tree(config.trace_id, goal_tree)
 
         # 自动判断行为:after_sequence 为 None 或 == head → 续跑;< head → 回溯
         after_seq = config.after_sequence
@@ -549,25 +553,38 @@ class AgentRunner:
                 if main_path:
                     head_seq = main_path[-1].sequence
 
-        # 2. 构建 system prompt(如果历史中没有 system message)
+        # 2. 构建/注入 skills 到 system prompt
         has_system = any(m.get("role") == "system" for m in history)
         has_system_in_new = any(m.get("role") == "system" for m in new_messages)
 
-        if not has_system and not has_system_in_new:
-            system_prompt = await self._build_system_prompt(config)
-            if system_prompt:
-                history = [{"role": "system", "content": system_prompt}] + history
+        if not has_system:
+            if has_system_in_new:
+                # 入参消息已含 system,将 skills 注入其中(在 step 4 持久化之前)
+                augmented = []
+                for msg in new_messages:
+                    if msg.get("role") == "system":
+                        base = msg.get("content") or ""
+                        enriched = await self._build_system_prompt(config, base_prompt=base)
+                        augmented.append({**msg, "content": enriched or base})
+                    else:
+                        augmented.append(msg)
+                new_messages = augmented
+            else:
+                # 没有 system,自动构建并插入历史
+                system_prompt = await self._build_system_prompt(config)
+                if system_prompt:
+                    history = [{"role": "system", "content": system_prompt}] + history
 
-                if self.trace_store:
-                    system_msg = Message.create(
-                        trace_id=trace_id, role="system", sequence=sequence,
-                        goal_id=None, content=system_prompt,
-                        parent_sequence=None,  # system message 是 root
-                    )
-                    await self.trace_store.add_message(system_msg)
-                    created_messages.append(system_msg)
-                    head_seq = sequence
-                    sequence += 1
+                    if self.trace_store:
+                        system_msg = Message.create(
+                            trace_id=trace_id, role="system", sequence=sequence,
+                            goal_id=None, content=system_prompt,
+                            parent_sequence=None,  # system message 是 root
+                        )
+                        await self.trace_store.add_message(system_msg)
+                        created_messages.append(system_msg)
+                        head_seq = sequence
+                        sequence += 1
 
         # 3. 新建时:在第一条 user message 末尾注入当前经验
         if not config.trace_id:  # 新建模式
@@ -615,11 +632,6 @@ class AgentRunner:
         # 当前主路径头节点的 sequence(用于设置 parent_sequence)
         head_seq = trace.head_sequence
 
-        # 设置 goal_tree 到 goal 工具
-        if goal_tree and self.trace_store:
-            from agent.trace.goal_tool import set_goal_tree
-            set_goal_tree(goal_tree)
-
         for iteration in range(config.max_iterations):
             # 检查取消信号
             cancel_event = self._cancel_events.get(trace_id)
@@ -729,8 +741,8 @@ class AgentRunner:
                     )
                     goal_tree.focus(goal_tree.goals[0].id)
                     if self.trace_store:
-                        await self.trace_store.update_goal_tree(trace_id, goal_tree)
                         await self.trace_store.add_goal(trace_id, goal_tree.goals[0])
+                        await self.trace_store.update_goal_tree(trace_id, goal_tree)
                     logger.info(f"自动创建 root goal: {goal_tree.goals[0].id}")
 
             # 获取当前 goal_id
@@ -812,6 +824,7 @@ class AgentRunner:
                             "trace_id": trace_id,
                             "goal_id": current_goal_id,
                             "runner": self,
+                            "goal_tree": goal_tree,
                         }
                     )
 
@@ -843,7 +856,8 @@ class AgentRunner:
                         goal_id=current_goal_id,
                         parent_sequence=head_seq,
                         tool_call_id=tc["id"],
-                        content={"tool_name": tool_name, "result": tool_result_text},
+                        # 存储完整内容:有图片时保留 list(含 image_url),纯文本时存字符串
+                        content={"tool_name": tool_name, "result": tool_content_for_llm},
                     )
 
                     if self.trace_store:
@@ -1433,17 +1447,21 @@ class AgentRunner:
     # 默认 system prompt 前缀(当 config.system_prompt 和前端都未提供 system message 时使用)
     DEFAULT_SYSTEM_PREFIX = "你是最顶尖的AI助手,可以拆分并调用工具逐步解决复杂问题。"
 
-    async def _build_system_prompt(self, config: RunConfig) -> Optional[str]:
+    async def _build_system_prompt(self, config: RunConfig, base_prompt: Optional[str] = None) -> Optional[str]:
         """构建 system prompt(注入 skills)
 
         优先级:
         1. config.skills 显式指定 → 按名称过滤
         2. config.skills 为 None → 查 preset 的默认 skills 列表
         3. preset 也无 skills(None)→ 加载全部(向后兼容)
+
+        Args:
+            base_prompt: 已有 system 内容(来自消息或 config.system_prompt),
+                         None 时使用 config.system_prompt
         """
         from agent.core.presets import AGENT_PRESETS
 
-        system_prompt = config.system_prompt
+        system_prompt = base_prompt if base_prompt is not None else config.system_prompt
 
         # 确定要加载哪些 skills
         skills_filter: Optional[List[str]] = config.skills

+ 411 - 10
agent/llm/openrouter.py

@@ -2,7 +2,11 @@
 OpenRouter Provider
 
 使用 OpenRouter API 调用各种模型(包括 Claude Sonnet 4.5)
-支持 OpenAI 兼容的 API 格式
+
+路由策略:
+- Claude 模型:走 OpenRouter 的 Anthropic 原生端点(/api/v1/messages),
+  使用自包含的格式转换逻辑,确保多模态工具结果(截图等)正确传递。
+- 其他模型:走 OpenAI 兼容端点(/api/v1/chat/completions)。
 
 OpenRouter 转发多种模型,需要根据实际模型处理不同的 usage 格式:
 - OpenAI 模型: prompt_tokens, completion_tokens, completion_tokens_details.reasoning_tokens
@@ -15,6 +19,7 @@ import json
 import asyncio
 import logging
 import httpx
+from pathlib import Path
 from typing import List, Dict, Any, Optional
 
 from .usage import TokenUsage, create_usage_from_response
@@ -34,6 +39,289 @@ _RETRYABLE_EXCEPTIONS = (
 )
 
 
+# ── OpenRouter Anthropic endpoint: model name mapping ──────────────────────
+# Local copy of yescode's model tables so this module is self-contained.
+_OR_MODEL_EXACT = {
+    "claude-sonnet-4-6": "claude-sonnet-4-6",
+    "claude-sonnet-4.6": "claude-sonnet-4-6",
+    "claude-sonnet-4-5-20250929": "claude-sonnet-4-5-20250929",
+    "claude-sonnet-4-5": "claude-sonnet-4-5-20250929",
+    "claude-sonnet-4.5": "claude-sonnet-4-5-20250929",
+    "claude-opus-4-6": "claude-opus-4-6",
+    "claude-opus-4-5-20251101": "claude-opus-4-5-20251101",
+    "claude-opus-4-5": "claude-opus-4-5-20251101",
+    "claude-opus-4-1-20250805": "claude-opus-4-1-20250805",
+    "claude-opus-4-1": "claude-opus-4-1-20250805",
+    "claude-haiku-4-5-20251001": "claude-haiku-4-5-20251001",
+    "claude-haiku-4-5": "claude-haiku-4-5-20251001",
+}
+
+_OR_MODEL_FUZZY = [
+    ("sonnet-4-6", "claude-sonnet-4-6"),
+    ("sonnet-4.6", "claude-sonnet-4-6"),
+    ("sonnet-4-5", "claude-sonnet-4-5-20250929"),
+    ("sonnet-4.5", "claude-sonnet-4-5-20250929"),
+    ("opus-4-6", "claude-opus-4-6"),
+    ("opus-4.6", "claude-opus-4-6"),
+    ("opus-4-5", "claude-opus-4-5-20251101"),
+    ("opus-4.5", "claude-opus-4-5-20251101"),
+    ("opus-4-1", "claude-opus-4-1-20250805"),
+    ("opus-4.1", "claude-opus-4-1-20250805"),
+    ("haiku-4-5", "claude-haiku-4-5-20251001"),
+    ("haiku-4.5", "claude-haiku-4-5-20251001"),
+    ("sonnet", "claude-sonnet-4-6"),
+    ("opus", "claude-opus-4-6"),
+    ("haiku", "claude-haiku-4-5-20251001"),
+]
+
+
+def _resolve_openrouter_model(model: str) -> str:
+    """Normalize a model name for OpenRouter's Anthropic endpoint.
+
+    Strips ``anthropic/`` prefix, resolves aliases / dot-notation,
+    and re-prepends ``anthropic/`` for OpenRouter routing.
+    """
+    # 1. Strip provider prefix
+    bare = model.split("/", 1)[1] if "/" in model else model
+
+    # 2. Exact match
+    if bare in _OR_MODEL_EXACT:
+        return f"anthropic/{_OR_MODEL_EXACT[bare]}"
+
+    # 3. Fuzzy keyword match (case-insensitive)
+    bare_lower = bare.lower()
+    for keyword, target in _OR_MODEL_FUZZY:
+        if keyword in bare_lower:
+            logger.info("[OpenRouter] Model fuzzy match: %s → anthropic/%s", model, target)
+            return f"anthropic/{target}"
+
+    # 4. Fallback – return as-is (let API report the error)
+    logger.warning("[OpenRouter] Could not resolve model name: %s, passing as-is", model)
+    return model
+
+
+# ── OpenRouter Anthropic endpoint: format conversion helpers ───────────────
+
+def _to_anthropic_content(content: Any) -> Any:
+    """Convert OpenAI-style *content* (string or block list) to Anthropic format.
+
+    Handles ``image_url`` blocks → Anthropic ``image`` blocks (base64 or url).
+    Passes through ``text`` blocks and ``cache_control`` unchanged.
+    """
+    if not isinstance(content, list):
+        return content
+
+    result = []
+    for block in content:
+        if not isinstance(block, dict):
+            result.append(block)
+            continue
+
+        if block.get("type") == "image_url":
+            image_url_obj = block.get("image_url", {})
+            url = image_url_obj.get("url", "") if isinstance(image_url_obj, dict) else str(image_url_obj)
+            if url.startswith("data:"):
+                header, _, data = url.partition(",")
+                media_type = header.split(":")[1].split(";")[0] if ":" in header else "image/png"
+                result.append({
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": media_type,
+                        "data": data,
+                    },
+                })
+            else:
+                # 检测本地文件路径,自动转 base64
+                local_path = Path(url)
+                if local_path.exists() and local_path.is_file():
+                    import base64 as b64mod
+                    import mimetypes
+                    mime_type, _ = mimetypes.guess_type(str(local_path))
+                    mime_type = mime_type or "image/png"
+                    raw = local_path.read_bytes()
+                    b64_data = b64mod.b64encode(raw).decode("ascii")
+                    logger.info(f"[OpenRouter] 本地图片自动转 base64: {url} ({len(raw)} bytes)")
+                    result.append({
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": mime_type,
+                            "data": b64_data,
+                        },
+                    })
+                else:
+                    result.append({
+                        "type": "image",
+                        "source": {"type": "url", "url": url},
+                    })
+        else:
+            result.append(block)
+    return result
+
+
def _to_anthropic_messages(messages: List[Dict[str, Any]]) -> tuple:
    """Convert an OpenAI-format message list to Anthropic Messages API format.

    Returns ``(system_prompt, anthropic_messages)`` where *system_prompt* is
    ``None`` or a string extracted from ``role=system`` messages, and
    *anthropic_messages* is the converted list.

    Notes:
        - If several system messages are present, only the LAST one is kept
          (each assignment overwrites the previous value).
        - Messages with any other role are silently dropped.
        - Tool results are emitted as ``user`` messages per the Anthropic
          protocol; consecutive tool results are merged into one user message.
    """
    system_prompt = None
    anthropic_messages: List[Dict[str, Any]] = []

    for msg in messages:
        role = msg.get("role", "")
        content = msg.get("content", "")

        if role == "system":
            # Anthropic takes the system prompt as a top-level field,
            # not as a message; last system message wins.
            system_prompt = content

        elif role == "user":
            anthropic_messages.append({
                "role": "user",
                "content": _to_anthropic_content(content),
            })

        elif role == "assistant":
            tool_calls = msg.get("tool_calls")
            if tool_calls:
                # Assistant turn with tool calls: text blocks first,
                # then one tool_use block per call.
                content_blocks: List[Dict[str, Any]] = []
                if content:
                    converted = _to_anthropic_content(content)
                    if isinstance(converted, list):
                        content_blocks.extend(converted)
                    elif isinstance(converted, str) and converted.strip():
                        content_blocks.append({"type": "text", "text": converted})
                for tc in tool_calls:
                    func = tc.get("function", {})
                    args_str = func.get("arguments", "{}")
                    try:
                        # Arguments arrive as a JSON string in OpenAI format;
                        # Anthropic wants a decoded object. Malformed JSON
                        # degrades to an empty input rather than failing.
                        args = json.loads(args_str) if isinstance(args_str, str) else args_str
                    except json.JSONDecodeError:
                        args = {}
                    content_blocks.append({
                        "type": "tool_use",
                        "id": tc.get("id", ""),
                        "name": func.get("name", ""),
                        "input": args,
                    })
                anthropic_messages.append({"role": "assistant", "content": content_blocks})
            else:
                anthropic_messages.append({"role": "assistant", "content": content})

        elif role == "tool":
            # Split tool result into text-only tool_result + sibling image blocks.
            # Images nested inside tool_result.content are not reliably passed
            # through by all proxies (e.g. OpenRouter).  Placing them as sibling
            # content blocks in the same user message is more compatible.
            converted = _to_anthropic_content(content)
            text_parts: List[Dict[str, Any]] = []
            image_parts: List[Dict[str, Any]] = []
            if isinstance(converted, list):
                for block in converted:
                    if isinstance(block, dict) and block.get("type") == "image":
                        image_parts.append(block)
                    else:
                        text_parts.append(block)
            elif isinstance(converted, str):
                text_parts = [{"type": "text", "text": converted}] if converted else []

            # tool_result keeps only text content; a single text block is
            # collapsed to a plain string for compactness.
            tool_result_block: Dict[str, Any] = {
                "type": "tool_result",
                "tool_use_id": msg.get("tool_call_id", ""),
            }
            if len(text_parts) == 1 and text_parts[0].get("type") == "text":
                tool_result_block["content"] = text_parts[0]["text"]
            elif text_parts:
                tool_result_block["content"] = text_parts
            # (omit content key entirely when empty – Anthropic accepts this)

            # Build the blocks to append: tool_result first, then any images
            new_blocks = [tool_result_block] + image_parts

            # Merge consecutive tool results into one user message: if the
            # previous converted message is already a tool-result user
            # message, extend it instead of appending a new one.
            if (anthropic_messages
                    and anthropic_messages[-1].get("role") == "user"
                    and isinstance(anthropic_messages[-1].get("content"), list)
                    and anthropic_messages[-1]["content"]
                    and anthropic_messages[-1]["content"][0].get("type") == "tool_result"):
                anthropic_messages[-1]["content"].extend(new_blocks)
            else:
                anthropic_messages.append({
                    "role": "user",
                    "content": new_blocks,
                })

    return system_prompt, anthropic_messages
+
+
+def _to_anthropic_tools(tools: List[Dict]) -> List[Dict]:
+    """Convert OpenAI tool definitions to Anthropic format."""
+    anthropic_tools = []
+    for tool in tools:
+        if tool.get("type") == "function":
+            func = tool["function"]
+            anthropic_tools.append({
+                "name": func.get("name", ""),
+                "description": func.get("description", ""),
+                "input_schema": func.get("parameters", {"type": "object", "properties": {}}),
+            })
+    return anthropic_tools
+
+
def _parse_anthropic_response(result: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize an Anthropic Messages API response into the unified format.

    Extracts ``text`` and ``tool_use`` content blocks, maps Anthropic stop
    reasons onto OpenAI-style finish reasons, and wraps the token counts in
    a :class:`TokenUsage`.

    Returns:
        dict with keys: content, tool_calls, finish_reason, usage.
    """
    texts: List[str] = []
    calls: List[Dict[str, Any]] = []
    for blk in result.get("content", []):
        kind = blk.get("type")
        if kind == "text":
            texts.append(blk.get("text", ""))
        elif kind == "tool_use":
            # tool_use → OpenAI tool_call shape; input is re-serialized to
            # a JSON string because the unified format expects "arguments".
            calls.append({
                "id": blk.get("id", ""),
                "type": "function",
                "function": {
                    "name": blk.get("name", ""),
                    "arguments": json.dumps(blk.get("input", {}), ensure_ascii=False),
                },
            })

    # Anthropic stop_reason → OpenAI-style finish_reason; unknown reasons
    # pass through unchanged.
    finish_map = {
        "end_turn": "stop",
        "tool_use": "tool_calls",
        "max_tokens": "length",
        "stop_sequence": "stop",
    }
    stop_reason = result.get("stop_reason", "end_turn")

    raw_usage = result.get("usage", {})
    return {
        "content": "\n".join(texts),
        "tool_calls": calls or None,
        "finish_reason": finish_map.get(stop_reason, stop_reason),
        "usage": TokenUsage(
            input_tokens=raw_usage.get("input_tokens", 0),
            output_tokens=raw_usage.get("output_tokens", 0),
            cache_creation_tokens=raw_usage.get("cache_creation_input_tokens", 0),
            cache_read_tokens=raw_usage.get("cache_read_input_tokens", 0),
        ),
    }
+
+
+# ── Provider detection / usage parsing ─────────────────────────────────────
+
 def _detect_provider_from_model(model: str) -> str:
     """根据模型名称检测提供商"""
     model_lower = model.lower()
@@ -139,6 +427,122 @@ def _normalize_tool_call_ids(messages: List[Dict[str, Any]], target_prefix: str)
     return result
 
 
async def _openrouter_anthropic_call(
    messages: List[Dict[str, Any]],
    model: str,
    tools: Optional[List[Dict]],
    api_key: str,
    **kwargs,
) -> Dict[str, Any]:
    """
    Call a Claude model through OpenRouter's Anthropic-native endpoint.

    Uses the Anthropic Messages API format (/api/v1/messages) with the
    self-contained conversion helpers in this module so that multimodal
    content (screenshots etc.) is passed through correctly.

    Args:
        messages: Conversation history in OpenAI chat format.
        model: Requested model name (resolved/aliased before sending).
        tools: Optional OpenAI-format tool definitions.
        api_key: OpenRouter API key (sent as a Bearer token).
        **kwargs: Optional ``max_tokens`` (default 16384) and ``temperature``.

    Returns:
        dict in the unified format: content, tool_calls, token counters,
        finish_reason, cost, usage.

    Raises:
        httpx.HTTPStatusError: Non-retryable HTTP error, or retries exhausted.
    """
    endpoint = "https://openrouter.ai/api/v1/messages"

    # Resolve model name for OpenRouter (e.g. "claude-sonnet-4.5" → "anthropic/claude-sonnet-4-5-20250929")
    resolved_model = _resolve_openrouter_model(model)
    logger.info("[OpenRouter/Anthropic] model: %s → %s", model, resolved_model)

    # When resuming a trace across providers, rewrite incompatible
    # tool_call_ids to the Anthropic-style "toolu_" prefix.
    messages = _normalize_tool_call_ids(messages, "toolu")

    # OpenAI message format → Anthropic Messages format.
    system_prompt, anthropic_messages = _to_anthropic_messages(messages)

    # Diagnostic: count image blocks in the payload
    _img_count = 0
    for _m in anthropic_messages:
        if isinstance(_m.get("content"), list):
            for _b in _m["content"]:
                if isinstance(_b, dict) and _b.get("type") == "image":
                    _img_count += 1
    if _img_count:
        # print() is deliberate here so the diagnostic shows in console output.
        logger.info("[OpenRouter/Anthropic] payload contains %d image block(s)", _img_count)
        print(f"[OpenRouter/Anthropic] payload contains {_img_count} image block(s)")

    payload: Dict[str, Any] = {
        "model": resolved_model,
        "messages": anthropic_messages,
        "max_tokens": kwargs.get("max_tokens", 16384),
    }
    if system_prompt is not None:
        payload["system"] = system_prompt
    if tools:
        payload["tools"] = _to_anthropic_tools(tools)
    if "temperature" in kwargs:
        payload["temperature"] = kwargs["temperature"]

    headers = {
        "Authorization": f"Bearer {api_key}",
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
        "HTTP-Referer": "https://github.com/your-repo",
        "X-Title": "Agent Framework",
    }

    # Retry loop: exponential backoff (2s, 4s, 8s) on retryable HTTP
    # statuses and transient transport errors; anything else re-raises.
    max_retries = 3
    last_exception = None
    for attempt in range(max_retries):
        async with httpx.AsyncClient(timeout=300.0) as client:
            try:
                response = await client.post(endpoint, json=payload, headers=headers)
                response.raise_for_status()
                result = response.json()
                break

            except httpx.HTTPStatusError as e:
                status = e.response.status_code
                error_body = e.response.text
                if status in (429, 500, 502, 503, 504) and attempt < max_retries - 1:
                    wait = 2 ** attempt * 2
                    logger.warning(
                        "[OpenRouter/Anthropic] HTTP %d (attempt %d/%d), retrying in %ds: %s",
                        status, attempt + 1, max_retries, wait, error_body[:200],
                    )
                    await asyncio.sleep(wait)
                    last_exception = e
                    continue
                # Log AND print error body so it is visible in console output
                logger.error("[OpenRouter/Anthropic] HTTP %d error body: %s", status, error_body)
                print(f"[OpenRouter/Anthropic] API Error {status}: {error_body[:500]}")
                raise

            except _RETRYABLE_EXCEPTIONS as e:
                last_exception = e
                if attempt < max_retries - 1:
                    wait = 2 ** attempt * 2
                    logger.warning(
                        "[OpenRouter/Anthropic] %s (attempt %d/%d), retrying in %ds",
                        type(e).__name__, attempt + 1, max_retries, wait,
                    )
                    await asyncio.sleep(wait)
                    continue
                raise
    else:
        # NOTE(review): defensive only — every loop iteration either breaks
        # or raises (the final attempt re-raises in both except branches),
        # so this for/else arm is unreachable in practice.
        raise last_exception  # type: ignore[misc]

    # Parse the Anthropic response into the unified format.
    parsed = _parse_anthropic_response(result)
    usage = parsed["usage"]
    # Cost is computed from the ORIGINAL model name, not resolved_model —
    # presumably calculate_cost keys on the caller-facing name; verify.
    cost = calculate_cost(model, usage)

    return {
        "content": parsed["content"],
        "tool_calls": parsed["tool_calls"],
        "prompt_tokens": usage.input_tokens,
        "completion_tokens": usage.output_tokens,
        "reasoning_tokens": usage.reasoning_tokens,
        "cache_creation_tokens": usage.cache_creation_tokens,
        "cache_read_tokens": usage.cache_read_tokens,
        "finish_reason": parsed["finish_reason"],
        "cost": cost,
        "usage": usage,
    }
+
+
 async def openrouter_llm_call(
     messages: List[Dict[str, Any]],
     model: str = "anthropic/claude-sonnet-4.5",
@@ -168,6 +572,12 @@ async def openrouter_llm_call(
     if not api_key:
         raise ValueError("OPEN_ROUTER_API_KEY environment variable not set")
 
+    # Claude 模型走 Anthropic 原生端点,其余走 OpenAI 兼容端点
+    provider = _detect_provider_from_model(model)
+    if provider == "anthropic":
+        logger.debug("[OpenRouter] Routing Claude model to Anthropic native endpoint")
+        return await _openrouter_anthropic_call(messages, model, tools, api_key, **kwargs)
+
     base_url = "https://openrouter.ai/api/v1"
     endpoint = f"{base_url}/chat/completions"
 
@@ -189,15 +599,6 @@ async def openrouter_llm_call(
     if "max_tokens" in kwargs:
         payload["max_tokens"] = kwargs["max_tokens"]
 
-    # 对于 Anthropic 模型,锁定 provider 以确保缓存生效
-    if "anthropic" in model.lower() or "claude" in model.lower():
-        payload["provider"] = {
-            "only": ["Anthropic"],
-            "allow_fallbacks": False,
-            "require_parameters": True
-        }
-        logger.debug("[OpenRouter] Locked provider to Anthropic for caching support")
-
     # OpenRouter 特定参数
     headers = {
         "Authorization": f"Bearer {api_key}",

+ 7 - 1
agent/llm/yescode.py

@@ -212,7 +212,13 @@ def _convert_messages_to_anthropic(messages: List[Dict[str, Any]]) -> tuple:
             if tool_calls:
                 content_blocks = []
                 if content:
-                    content_blocks.append({"type": "text", "text": content})
+                    # content 可能已被 _add_cache_control 转成 list(含 cache_control),
+                    # 也可能是普通字符串。两者都需要正确处理,避免产生 {"type":"text","text":[...]}
+                    converted = _convert_content_to_anthropic(content)
+                    if isinstance(converted, list):
+                        content_blocks.extend(converted)
+                    elif isinstance(converted, str) and converted.strip():
+                        content_blocks.append({"type": "text", "text": converted})
                 for tc in tool_calls:
                     func = tc.get("function", {})
                     args_str = func.get("arguments", "{}")

+ 101 - 9
agent/tools/builtin/file/read.py

@@ -11,9 +11,13 @@ Read Tool - 文件读取工具
 """
 
 import os
+import base64
 import mimetypes
 from pathlib import Path
 from typing import Optional
+from urllib.parse import urlparse
+
+import httpx
 
 from agent.tools import tool, ToolResult, ToolContext
 
@@ -23,7 +27,7 @@ MAX_LINE_LENGTH = 2000
 MAX_BYTES = 50 * 1024  # 50KB
 
 
-@tool(description="读取文件内容,支持文本文件、图片、PDF 等多种格式")
+@tool(description="读取文件内容,支持文本文件、图片、PDF 等多种格式,也支持 HTTP/HTTPS URL")
 async def read_file(
     file_path: str,
     offset: int = 0,
@@ -36,7 +40,7 @@ async def read_file(
     参考 OpenCode 实现
 
     Args:
-        file_path: 文件路径(绝对路径或相对路径
+        file_path: 文件路径(绝对路径、相对路径或 HTTP/HTTPS URL
         offset: 起始行号(从 0 开始)
         limit: 读取行数(默认 2000 行)
         context: 工具上下文
@@ -44,6 +48,11 @@ async def read_file(
     Returns:
         ToolResult: 文件内容
     """
+    # 检测是否为 HTTP/HTTPS URL
+    parsed = urlparse(file_path)
+    if parsed.scheme in ("http", "https"):
+        return await _read_from_url(file_path)
+
     # 解析路径
     path = Path(file_path)
     if not path.is_absolute():
@@ -79,13 +88,25 @@ async def read_file(
 
     # 图片文件(参考 opencode:66-91)
     if mime_type.startswith("image/") and mime_type not in ["image/svg+xml", "image/vnd.fastbidsheet"]:
-        # 注意:实际项目中需要实现图片的 base64 编码
-        # 这里简化处理
-        return ToolResult(
-            title=path.name,
-            output=f"图片文件: {path.name} (MIME: {mime_type})",
-            metadata={"mime_type": mime_type, "truncated": False}
-        )
+        try:
+            raw = path.read_bytes()
+            b64_data = base64.b64encode(raw).decode("ascii")
+            return ToolResult(
+                title=path.name,
+                output=f"图片文件: {path.name} (MIME: {mime_type}, {len(raw)} bytes)",
+                metadata={"mime_type": mime_type, "truncated": False},
+                images=[{
+                    "type": "base64",
+                    "media_type": mime_type,
+                    "data": b64_data,
+                }],
+            )
+        except Exception as e:
+            return ToolResult(
+                title=path.name,
+                output=f"图片文件读取失败: {path.name}: {e}",
+                error=str(e),
+            )
 
     # PDF 文件
     if mime_type == "application/pdf":
@@ -225,3 +246,74 @@ def _is_binary_file(path: Path) -> bool:
 
     except Exception:
         return False
+
+
async def _read_from_url(url: str) -> ToolResult:
    """
    Read a file from an HTTP/HTTPS URL.

    Mainly used for images and other media resources; image bytes are
    converted to base64 blocks so they can be forwarded to multimodal models.

    Args:
        url: The HTTP/HTTPS URL to fetch (redirects are followed, 30s timeout).

    Returns:
        ToolResult: image result (with base64 payload), full text content,
        or a binary-file summary; on failure, a ToolResult with ``error`` set
        instead of raising.
    """
    try:
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()

            content_type = response.headers.get("content-type", "")
            raw = response.content

            # Derive a display name from the URL path (urlparse is imported
            # at module level).
            parsed = urlparse(url)
            filename = Path(parsed.path).name or "downloaded_file"

            # Image: trust the Content-Type header first, fall back to the
            # file extension for servers that send a generic type.
            if content_type.startswith("image/") or any(url.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"]):
                mime_type = content_type.split(";")[0] if content_type else "image/jpeg"
                b64_data = base64.b64encode(raw).decode("ascii")
                return ToolResult(
                    title=filename,
                    # Fix: report the actual filename instead of a literal "(unknown)"
                    output=f"图片文件: {filename} (URL: {url}, MIME: {mime_type}, {len(raw)} bytes)",
                    metadata={"mime_type": mime_type, "url": url, "truncated": False},
                    images=[{
                        "type": "base64",
                        "media_type": mime_type,
                        "data": b64_data,
                    }],
                )

            # Text / JSON: decode (replacing invalid bytes) and return in full,
            # with a 20-line preview in metadata.
            if content_type.startswith("text/") or content_type == "application/json":
                text = raw.decode("utf-8", errors="replace")
                lines = text.split("\n")
                preview = "\n".join(lines[:20])
                return ToolResult(
                    title=filename,
                    output=f"<file>\n{text}\n</file>",
                    metadata={
                        "preview": preview,
                        "url": url,
                        "mime_type": content_type,
                        "total_lines": len(lines),
                    }
                )

            # Other binary payloads: report name/size only, no content.
            return ToolResult(
                title=filename,
                output=f"二进制文件: {filename} (URL: {url}, {len(raw)} bytes)",
                metadata={"url": url, "mime_type": content_type, "size": len(raw)}
            )

    except httpx.HTTPStatusError as e:
        return ToolResult(
            title="HTTP 错误",
            output=f"无法下载文件: {url}\nHTTP {e.response.status_code}: {e.response.reason_phrase}",
            error=str(e)
        )
    except Exception as e:
        return ToolResult(
            title="下载失败",
            output=f"无法从 URL 读取文件: {url}\n错误: {str(e)}",
            error=str(e)
        )

+ 0 - 4
agent/trace/__init__.py

@@ -14,7 +14,6 @@ from .goal_models import Goal, GoalTree, GoalStatus, GoalType, GoalStats
 from .protocols import TraceStore
 from .store import FileSystemTraceStore
 from .trace_id import generate_trace_id, generate_sub_trace_id, parse_parent_trace_id
-from .goal_tool import set_goal_tree, get_goal_tree
 
 __all__ = [
     # Models
@@ -32,7 +31,4 @@ __all__ = [
     "generate_trace_id",
     "generate_sub_trace_id",
     "parse_parent_trace_id",
-    # Goal tool
-    "set_goal_tree",
-    "get_goal_tree",
 ]

+ 8 - 27
agent/trace/goal_tool.py

@@ -13,22 +13,6 @@ if TYPE_CHECKING:
     from .protocols import TraceStore
 
 
-# ===== 全局 GoalTree 状态管理 =====
-
-_current_goal_tree = None
-
-
-def set_goal_tree(tree):
-    """设置当前 GoalTree(由 AgentRunner 调用)"""
-    global _current_goal_tree
-    _current_goal_tree = tree
-
-
-def get_goal_tree():
-    """获取当前 GoalTree"""
-    return _current_goal_tree
-
-
 # ===== LLM 可调用的 goal 工具 =====
 
 @tool(description="管理执行计划,添加/完成/放弃目标,切换焦点")
@@ -53,12 +37,13 @@ async def goal(
         done: 完成当前目标,值为 summary
         abandon: 放弃当前目标,值为原因
         focus: 切换焦点到指定 ID
-        context: 工具执行上下文(包含 store 和 trace_id
+        context: 工具执行上下文(包含 store、trace_id、goal_tree
 
     Returns:
         str: 更新后的计划状态文本
     """
-    tree = get_goal_tree()
+    # GoalTree 从 context 获取,每个 agent 实例独立,不再依赖全局变量
+    tree = context.get("goal_tree") if context else None
     if tree is None:
         return "错误:GoalTree 未初始化"
 
@@ -130,10 +115,7 @@ async def goal_tool(
 
         # 推送事件
         if store and trace_id:
-            print(f"[DEBUG] goal_tool: calling store.update_goal for done: goal_id={goal.id}")
             await store.update_goal(trace_id, goal.id, status="completed", summary=done)
-        else:
-            print(f"[DEBUG] goal_tool: skip event push (store={store}, trace_id={trace_id})")
 
         # 检查是否有级联完成的父目标(complete方法已经处理,这里只需要记录)
         if goal.parent_id:
@@ -163,10 +145,7 @@ async def goal_tool(
 
         # 推送事件
         if store and trace_id:
-            print(f"[DEBUG] goal_tool: calling store.update_goal for abandon: goal_id={goal.id}")
             await store.update_goal(trace_id, goal.id, status="abandoned", summary=abandon)
-        else:
-            print(f"[DEBUG] goal_tool: skip event push (store={store}, trace_id={trace_id})")
 
     # 4. 处理 add
     if add is not None:
@@ -218,11 +197,8 @@ async def goal_tool(
 
             # 推送事件
             if store and trace_id:
-                print(f"[DEBUG] goal_tool: calling store.add_goal for {len(new_goals)} new goals")
                 for goal in new_goals:
                     await store.add_goal(trace_id, goal)
-            else:
-                print(f"[DEBUG] goal_tool: skip event push (store={store}, trace_id={trace_id})")
 
             # 如果没有焦点且添加了目标,自动 focus 到第一个新目标
             if not tree.current_id and new_goals:
@@ -230,6 +206,11 @@ async def goal_tool(
                 display_id = tree._generate_display_id(new_goals[0])
                 changes.append(f"自动切换焦点: {display_id}")
 
+    # 将完整内存树状态(含 current_id)同步到存储,
+    # 因为 store.add_goal / update_goal 各自从磁盘加载,不包含 focus 等内存变更
+    if store and trace_id and changes:
+        await store.update_goal_tree(trace_id, tree)
+
     # 返回当前状态
     result = []
     if changes:

+ 4 - 2
agent/trace/models.py

@@ -200,12 +200,14 @@ class Message:
         msg: Dict[str, Any] = {"role": self.role}
 
         if self.role == "tool":
-            # tool message: tool_call_id + name + content(string)
+            # tool message: tool_call_id + name + content
             if self.tool_call_id:
                 msg["tool_call_id"] = self.tool_call_id
                 msg["name"] = self.description or "unknown"
             if isinstance(self.content, dict):
-                msg["content"] = str(self.content.get("result", self.content))
+                result = self.content.get("result", self.content)
+                # result 可能是 list(含图片的多模态内容)或字符串
+                msg["content"] = result if isinstance(result, list) else str(result)
             else:
                 msg["content"] = str(self.content) if self.content is not None else ""
 

+ 38 - 9
docs/README.md

@@ -60,8 +60,10 @@ agent/
 │   ├── protocols.py       # MemoryStore 接口
 │   ├── stores.py          # 存储实现
 │   ├── skill_loader.py    # Skill 加载器
-│   └── skills/            # 内置 Skills
-│       └── core.md        # Core Skill(自动加载)
+│   └── skills/            # 内置 Skills(自动注入 system prompt)
+│       ├── planning.md    # 计划与 Goal 工具使用
+│       ├── research.md    # 搜索与内容研究
+│       └── browser.md     # 浏览器自动化
 ├── llm/                   # LLM 集成
 │   ├── gemini.py          # Gemini Provider
@@ -167,6 +169,7 @@ class RunConfig:
     agent_type: str = "default"
     uid: Optional[str] = None
     system_prompt: Optional[str] = None        # None = 从 skills 自动构建
+    skills: Optional[List[str]] = None         # 注入 system prompt 的 skill 名称列表;None = 按 preset 决定
     enable_memory: bool = True
     auto_execute_tools: bool = True
     name: Optional[str] = None                 # 显示名称(空则由 utility_llm 自动生成)
@@ -304,7 +307,7 @@ agent 工具的合成结果对齐正常返回值格式(含 `sub_trace_id` 字
 **实现**:`agent/core/runner.py:AgentRunner._heal_orphaned_tool_calls`
 
 - `run(messages, config)`:**核心方法**,流式返回 `AsyncIterator[Union[Trace, Message]]`
-- `run_result(messages, config)`:便利方法,内部消费 `run()`,返回结构化结果。主要用于 `agent`/`evaluate` 工具内部
+- `run_result(messages, config, on_event=None)`:便利方法,内部消费 `run()`,返回结构化结果。`on_event` 回调可实时接收每个 Trace/Message 事件(用于调试时输出子 Agent 执行过程)。主要用于 `agent`/`evaluate` 工具内部
 
 ### REST API
 
@@ -544,19 +547,24 @@ class AgentPreset:
     denied_tools: Optional[List[str]] = None   # 黑名单
     max_iterations: int = 30
     temperature: Optional[float] = None
+    skills: Optional[List[str]] = None         # 注入 system prompt 的 skill 名称列表;None = 加载全部
     description: Optional[str] = None
 
 
+_DEFAULT_SKILLS = ["planning", "research", "browser"]
+
 AGENT_PRESETS = {
     "default": AgentPreset(
         allowed_tools=None,
         max_iterations=30,
+        skills=_DEFAULT_SKILLS,
         description="默认 Agent,拥有全部工具权限",
     ),
     "explore": AgentPreset(
         allowed_tools=["read", "glob", "grep", "list_files"],
         denied_tools=["write", "edit", "bash", "task"],
         max_iterations=15,
+        skills=["planning"],
         description="探索型 Agent,只读权限,用于代码分析",
     ),
     "analyst": AgentPreset(
@@ -564,6 +572,7 @@ AGENT_PRESETS = {
         denied_tools=["write", "edit", "bash", "task"],
         temperature=0.3,
         max_iterations=25,
+        skills=["planning", "research"],
         description="分析型 Agent,用于深度分析和研究",
     ),
 }
@@ -571,7 +580,7 @@ AGENT_PRESETS = {
 
 **实现**:`agent/core/presets.py`
 
-**用户自定义**:项目级配置 `.agent/presets.json` 可覆盖或添加预设。
+**用户自定义**:项目级配置文件(如 `examples/how/presets.json`)可通过 `register_preset()` 注册额外预设。项目专用的 Agent 类型建议放在项目目录下,而非内置预设。
 
 ---
 
@@ -589,10 +598,15 @@ async def agent(
     task: Union[str, List[str]],
     messages: Optional[Union[Messages, List[Messages]]] = None,
     continue_from: Optional[str] = None,
+    agent_type: Optional[str] = None,
+    skills: Optional[List[str]] = None,
     context: Optional[dict] = None,
 ) -> Dict[str, Any]:
 ```
 
+- `agent_type`: 子 Agent 类型,决定工具权限和默认 skills(对应 `AgentPreset` 名称,如 `"deconstruct"`)
+- `skills`: 覆盖 preset 默认值,显式指定注入 system prompt 的 skill 列表
+
 **单任务(delegate)**:`task: str`
 - 创建单个 Sub-Trace
 - 完整工具权限(除 agent/evaluate 外,防止递归)
@@ -748,17 +762,32 @@ ToolResult(
 
 | 类型 | 加载位置 | 加载时机 |
 |------|---------|---------|
-| **Core Skill** | System Prompt | Agent 启动时自动加载 |
+| **内置 Skill** | System Prompt | Agent 启动时自动注入 |
+| **项目 Skill** | System Prompt | Agent 启动时按 preset/call-site 过滤后注入 |
 | **普通 Skill** | 对话消息 | 模型调用 `skill` 工具时 |
 
 ### 目录结构
 
 ```
-agent/memory/skills/
-├── core.md              # Core Skill(自动加载到 System Prompt)
-└── browser_use/         # 普通 Skill(按需加载)
+agent/memory/skills/         # 内置 Skills(始终加载)
+├── planning.md              # 计划与 Goal 工具使用
+├── research.md              # 搜索与内容研究
+└── browser.md               # 浏览器自动化
+
+./skills/                    # 项目自定义 Skills
+```
 
-./skills/                # 项目自定义 Skills(按需加载)
+### Skills 过滤(call-site 选择)
+
+不同 Agent 类型所需的 skills 不同。过滤优先级:
+
+1. `agent()` 工具的 `skills` 参数(显式指定,最高优先级)
+2. `AgentPreset.skills`(preset 默认值)
+3. `None`(加载全部,向后兼容)
+
+示例:调用子 Agent 时只注入解构相关 skill:
+```python
+agent(task="...", agent_type="deconstruct", skills=["planning", "deconstruct"])
 ```
 
 **实现**:`agent/memory/skill_loader.py`

+ 12 - 0
examples/how/README.md

@@ -0,0 +1,12 @@
+
+## 运行方法
+1. 输入:将原始帖子内容放到 `examples/how/input/` 文件夹下
+2. 运行:在项目根目录下运行 `python examples/how/run.py`
+3. 输出:在 `examples/how/output` 中查看
+
+## prompt调试
+- 主Agent(调度与评估):修改 `examples/how/production.prompt`
+    - 原始输入/参考资料等等都可以在这里输入文件路径
+    - 可以在这里指定各类输出的保存路径
+- 制作表解构Agent:修改 `examples/how/skills/deconstruct.md` , 这部分内容会在主Agent创建子Agent时作为子Agent的system prompt
+- 还原Agent:修改 `examples/how/skills/construct.md` , 这部分内容会在主Agent创建子Agent时作为子Agent的system prompt

+ 2 - 19
examples/how/input/《秋日际遇》写生油画.json

@@ -1,26 +1,9 @@
 {
-  "channel_content_id": "616192600000000021034642",
-  "link": "https://www.xiaohongshu.com/explore/616192600000000021034642",
-  "comment_count": 0,
   "images": [
     "examples/how/input/1.jpeg",
-    "examples/how/input/2.jpeg",
     "examples/how/input/3.jpeg",
-    "examples/how/input/4.jpeg",
-    "examples/how/input/5.jpeg",
-    "examples/how/input/6.jpeg",    
-    "examples/how/input/7.jpeg",
-    "examples/how/input/8.jpeg",
-    "examples/how/input/9.jpeg"
+    "examples/how/input/7.jpeg"
   ],
-  "like_count": 411,
   "body_text": "听闻秋日是倒放的春天\n于是我心中有一座秋日的花园\n栽种着一簇簇淡却温暖的花\n风沿着远边的山吹来\n热情的阳光里秋风微凉\n与颜料一起酝酿出的画面\n白裙是一抹无暇\n迎着光绘画出\n那片在我心上开满\n限定的浪漫\n被画架支起\n绿草坪还驻留了匆匆而过的热闹\n再添一笔白\n为我画一枝玫瑰的奇遇\n———@万淮 #草地拍照[话题]##画画[话题]#",
-  "title": "《秋日际遇》写生油画",
-  "collect_count": 181,
-  "channel_account_id": "584fc4a36a6a693eef600ec3",
-  "channel_account_name": "糯米和Kilala",
-  "content_type": "note",
-  "video": "",
-  "publish_timestamp": 1633784416000,
-  "publish_time": "2021-10-09 21:00:16"
+  "title": "《秋日际遇》写生油画"
 }

+ 10 - 6
examples/how/production.prompt

@@ -11,14 +11,17 @@ $system$
 ## 工作流程
 
 **第一轮**:
-1. 调用 deconstruct agent,传入原帖的完整多模态内容(文字+所有图片),获取 制作表
+1. 调用 deconstruct agent,传入原帖的完整多模态内容,获取 制作表;注意:
+    - 你可以直接给deconstruct agent输入文件夹路径
+    - 它会自动加载如何解构内容的skill:examples/how/skills/deconstruct.md作为system prompt
+    - 指定解构结果的保存路径
 2. 调用 construct agent,传入解构产物 制作表,得到生成内容
 3. 对比建构结果与原帖,做出评估
 
 **后续迭代**(如有必要):
 4. 根据建构 agent 的执行报告和你的对比观察,判断解构哪里不够准确或不够完整,或者建构做的不够好
-5. 带着具体的修改意见再次调用解构 agent(通过 `continue_from` 复用已有 trace,或重新调用并说明改进方向)
-6. 重复建构→评估,直到满意
+5. 带着具体的修改意见再次调用解构或建构 agent(通过 `continue_from` 复用已有 trace,或重新调用并说明改进方向)
+6. 评估结果,并重复以上环节,直到满意
 
 ## 评估标准
 
@@ -35,10 +38,11 @@ $system$
 - 差距来自建构工具能力上限,而非解构质量问题
 - 迭代超过 3 轮且边际改善明显收窄
 
-## 最终输出
+## 输出
 
-输出最终解构产物 制作表JSON,并附上一段简短的研究备注(这篇内容的核心创作规律是什么,迭代过程中发现了什么)。
+注意输出过程中的制作表和还原产物,每一轮次的结果应该输出到examples/how/output中的一个子文件夹。
+输出最终解构产物 制作表JSON 和相关特征 以及 还原结果,保存到examples/how/output/final,并附上一段简短的研究备注(这篇内容的核心创作规律是什么,迭代过程中发现了什么)。
 
 $user$
 请对下面这篇社交媒体帖子进行解构-建构-评估迭代,产出高质量解构产物。
-(原始帖子信息放在了这个目录下:examples/how/input)
+原始帖子信息:examples/how/input/《秋日际遇》写生油画.json

+ 26 - 0
examples/how/resource/input_cloud_archive/《秋日际遇》写生油画.json

@@ -0,0 +1,26 @@
+{
+  "channel_content_id": "616192600000000021034642",
+  "link": "https://www.xiaohongshu.com/explore/616192600000000021034642",
+  "comment_count": 0,
+  "images": [
+    "http://res.cybertogether.net/crawler/image/5b94399f3bdef0a80b98e2734e110ca2.jpeg",
+    "http://res.cybertogether.net/crawler/image/6d80c193ccd0b047e0f3354ed6aca355.jpeg",
+    "http://res.cybertogether.net/crawler/image/2ba333062a7370ce229696fc36b9a060.jpeg",
+    "http://res.cybertogether.net/crawler/image/8187a1ad4e56295ab13d881d0ef7c934.jpeg",
+    "http://res.cybertogether.net/crawler/image/16fc8596b7c12031e910eb517859045c.jpeg",
+    "http://res.cybertogether.net/crawler/image/15a29cb486344bc10e90402371e21c92.jpeg",
+    "http://res.cybertogether.net/crawler/image/e70bbea964cfcf0225744da00e8e7939.jpeg",
+    "http://res.cybertogether.net/crawler/image/d20b73ad445c7dce64983159bc6cdae0.jpeg",
+    "http://res.cybertogether.net/crawler/image/c4c73c1b32f8066cc40a43ce61f61364.jpeg"
+  ],
+  "like_count": 411,
+  "body_text": "听闻秋日是倒放的春天\n于是我心中有一座秋日的花园\n栽种着一簇簇淡却温暖的花\n风沿着远边的山吹来\n热情的阳光里秋风微凉\n与颜料一起酝酿出的画面\n白裙是一抹无暇\n迎着光绘画出\n那片在我心上开满\n限定的浪漫\n被画架支起\n绿草坪还驻留了匆匆而过的热闹\n再添一笔白\n为我画一枝玫瑰的奇遇\n———@万淮 #草地拍照[话题]##画画[话题]#",
+  "title": "《秋日际遇》写生油画",
+  "collect_count": 181,
+  "channel_account_id": "584fc4a36a6a693eef600ec3",
+  "channel_account_name": "糯米和Kilala",
+  "content_type": "note",
+  "video": "",
+  "publish_timestamp": 1633784416000,
+  "publish_time": "2021-10-09 21:00:16"
+}

BIN
examples/how/resource/input_local_archive/1.jpeg


+ 0 - 0
examples/how/input/2.jpeg → examples/how/resource/input_local_archive/2.jpeg


BIN
examples/how/resource/input_local_archive/3.jpeg


+ 0 - 0
examples/how/input/4.jpeg → examples/how/resource/input_local_archive/4.jpeg


+ 0 - 0
examples/how/input/5.jpeg → examples/how/resource/input_local_archive/5.jpeg


+ 0 - 0
examples/how/input/6.jpeg → examples/how/resource/input_local_archive/6.jpeg


BIN
examples/how/resource/input_local_archive/7.jpeg


+ 0 - 0
examples/how/input/8.jpeg → examples/how/resource/input_local_archive/8.jpeg


+ 0 - 0
examples/how/input/9.jpeg → examples/how/resource/input_local_archive/9.jpeg


+ 26 - 0
examples/how/resource/input_local_archive/《秋日际遇》写生油画.json

@@ -0,0 +1,26 @@
+{
+  "channel_content_id": "616192600000000021034642",
+  "link": "https://www.xiaohongshu.com/explore/616192600000000021034642",
+  "comment_count": 0,
+  "images": [
+    "examples/how/input/1.jpeg",
+    "examples/how/input/2.jpeg",
+    "examples/how/input/3.jpeg",
+    "examples/how/input/4.jpeg",
+    "examples/how/input/5.jpeg",
+    "examples/how/input/6.jpeg",    
+    "examples/how/input/7.jpeg",
+    "examples/how/input/8.jpeg",
+    "examples/how/input/9.jpeg"
+  ],
+  "like_count": 411,
+  "body_text": "听闻秋日是倒放的春天\n于是我心中有一座秋日的花园\n栽种着一簇簇淡却温暖的花\n风沿着远边的山吹来\n热情的阳光里秋风微凉\n与颜料一起酝酿出的画面\n白裙是一抹无暇\n迎着光绘画出\n那片在我心上开满\n限定的浪漫\n被画架支起\n绿草坪还驻留了匆匆而过的热闹\n再添一笔白\n为我画一枝玫瑰的奇遇\n———@万淮 #草地拍照[话题]##画画[话题]#",
+  "title": "《秋日际遇》写生油画",
+  "collect_count": 181,
+  "channel_account_id": "584fc4a36a6a693eef600ec3",
+  "channel_account_name": "糯米和Kilala",
+  "content_type": "note",
+  "video": "",
+  "publish_timestamp": 1633784416000,
+  "publish_time": "2021-10-09 21:00:16"
+}

+ 32 - 12
examples/how/run.py

@@ -39,24 +39,40 @@ from agent.trace import (
     Message,
 )
 from agent.llm import create_openrouter_llm_call
+from agent.tools import get_tool_registry
 
 
 # ===== 非阻塞 stdin 检测 =====
+if sys.platform == 'win32':
+    import msvcrt
 
 def check_stdin() -> str | None:
     """
-    非阻塞检查 stdin 是否有输入。
-
-    使用 select 轮询,不开后台线程,因此不会与交互菜单的 input() 抢 stdin。
+    跨平台非阻塞检查 stdin 输入。
+    Windows: 使用 msvcrt.kbhit()
+    macOS/Linux: 使用 select.select()
     """
-    ready, _, _ = select.select([sys.stdin], [], [], 0)
-    if ready:
-        line = sys.stdin.readline().strip().lower()
-        if line in ('p', 'pause'):
-            return 'pause'
-        if line in ('q', 'quit'):
-            return 'quit'
-    return None
+    if sys.platform == 'win32':
+        # 检查是否有按键按下
+        if msvcrt.kbhit():
+            # 读取按下的字符(msvcrt.getwch 是非阻塞读取宽字符)
+            ch = msvcrt.getwch().lower()
+            if ch == 'p':
+                return 'pause'
+            if ch == 'q':
+                return 'quit'
+            # 如果是其他按键,可以选择消耗掉或者忽略
+        return None
+    else:
+        # Unix/Mac 逻辑
+        ready, _, _ = select.select([sys.stdin], [], [], 0)
+        if ready:
+            line = sys.stdin.readline().strip().lower()
+            if line in ('p', 'pause'):
+                return 'pause'
+            if line in ('q', 'quit'):
+                return 'quit'
+        return None
 
 
 # ===== 交互菜单 =====
@@ -248,6 +264,10 @@ async def main():
     print(f"   - Skills 目录: {skills_dir}")
     print(f"   - 模型: {prompt.config.get('model', 'sonnet-4.5')}")
 
+    # 加载自定义工具
+    print("   - 加载自定义工具: nanobanana")
+    import examples.how.tool  # 导入自定义工具模块,触发 @tool 装饰器注册
+
     store = FileSystemTraceStore(base_path=".trace")
     runner = AgentRunner(
         trace_store=store,
@@ -294,7 +314,7 @@ async def main():
                 model=f"claude-{prompt.config.get('model', 'sonnet-4.5')}",
                 temperature=float(prompt.config.get('temperature', 0.3)),
                 max_iterations=1000,
-                name="mcp/skills 发现、获取、评价 分析任务",
+                name="社交媒体内容解构、建构、评估任务",
             )
 
         while not should_exit:

+ 5 - 0
examples/how/skills/construct.md

@@ -9,6 +9,11 @@ description: 建构社交媒体帖子内容
 
 ---
 
+## 主要工具
+你可以使用 `nanobanana` 工具生成图片。
+
+---
+
 ## 输出
 
 将最终的生成内容组织到输出文件夹中。不同版本的输出应该分别是一个子文件夹。

+ 2 - 1
examples/how/skills/deconstruct.md

@@ -55,7 +55,7 @@ description: 从制作层解构社交媒体帖子,提取视觉制作决策
 
 ## 输出格式
 
-输出一个 JSON。**只填写对这篇帖子有意义的字段**,不强制填写所有字段,不强制填满每个层级。
+输出一个 JSON,并将其保存到指定输出目录下。**只填写对这篇帖子有意义的字段**,不强制填写所有字段,不强制填满每个层级。
 
 特征文件保存至 `./features/<元素名>/`,制作表中以路径引用。
 
@@ -113,6 +113,7 @@ description: 从制作层解构社交媒体帖子,提取视觉制作决策
 
 ## 原则
 
+- **亲自读图**:你应该直接读取我们需要解构的内容中的多模态内容,仅在后续缺乏特征提取能力的情况下再继续使用其他工具来处理多模态内容
 - **选择性而非穷举**:只记录对还原质量有实质影响的信息                                          
 - **泛化描述**:描述创作规律,而非内容细节("主体特写,背景虚化"优于"穿蓝衣服的女生")          
 - **制作视角**:从"如何制作出这个效果"出发,而非"这是什么内容"                                  

+ 7 - 0
examples/how/tool/__init__.py

@@ -0,0 +1,7 @@
+"""
+How 示例的自定义工具
+"""
+
+from examples.how.tool.nanobanana import nanobanana
+
+__all__ = ["nanobanana"]

+ 572 - 0
examples/how/tool/nanobanana.py

@@ -0,0 +1,572 @@
+"""
+NanoBanana Tool - 图像特征提取与图像生成
+
+该工具可以提取图片中的特征,也可以根据描述生成图片。
+支持通过 OpenRouter 调用多模态模型,提取结构化的图像特征并保存为 JSON,
+或基于输入图像生成新的图像。
+"""
+
+import base64
+import json
+import mimetypes
+import os
+import re
+from pathlib import Path
+from typing import Optional, Dict, Any, List, Tuple
+
+import httpx
+from dotenv import load_dotenv
+
+from agent.tools import tool, ToolResult
+
+OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+DEFAULT_TIMEOUT = 120.0
+
+DEFAULT_EXTRACTION_PROMPT = (
+    "请从这张图像中提取跨场景相对稳定、可复用的视觉不变特征。"
+    "输出严格 JSON,字段包含:identity_features、pose_features、appearance_features、"
+    "material_features、style_features、uncertainty、notes。"
+    "每个字段给出简洁要点,避免臆测。"
+)
+
+DEFAULT_IMAGE_PROMPT = (
+    "基于输入图像生成一张保留主体身份与关键视觉特征的新图像。"
+    "保持人物核心特征一致,同时提升清晰度与可用性。"
+)
+
+DEFAULT_IMAGE_MODEL_CANDIDATES = [
+    "google/gemini-2.5-flash-image",
+    "google/gemini-3-pro-image-preview",
+    "black-forest-labs/flux.2-flex",
+    "black-forest-labs/flux.2-pro",
+]
+
+
+def _resolve_api_key() -> Optional[str]:
+    """优先读取环境变量,缺失时尝试从 .env 加载。"""
+    api_key = os.getenv("OPENROUTER_API_KEY") or os.getenv("OPEN_ROUTER_API_KEY")
+    if api_key:
+        return api_key
+
+    load_dotenv()
+    return os.getenv("OPENROUTER_API_KEY") or os.getenv("OPEN_ROUTER_API_KEY")
+
+
+def _image_to_data_url(image_path: Path) -> str:
+    """将图片文件编码为 data URL。"""
+    mime_type = mimetypes.guess_type(str(image_path))[0] or "application/octet-stream"
+    raw = image_path.read_bytes()
+    b64 = base64.b64encode(raw).decode("utf-8")
+    return f"data:{mime_type};base64,{b64}"
+
+
+def _safe_json_parse(content: str) -> Dict[str, Any]:
+    """尽量从模型文本中提取 JSON。"""
+    try:
+        return json.loads(content)
+    except json.JSONDecodeError:
+        start = content.find("{")
+        end = content.rfind("}")
+        if start != -1 and end != -1 and end > start:
+            candidate = content[start:end + 1]
+            return json.loads(candidate)
+        raise
+
+
+def _extract_data_url_images(message: Dict[str, Any]) -> List[Tuple[str, str]]:
+    """
+    从 OpenRouter 响应消息中提取 data URL 图片。
+
+    Returns:
+        List[(mime_type, base64_data)]
+    """
+    extracted: List[Tuple[str, str]] = []
+
+    # 官方文档中的主要位置:message.images[]
+    for img in message.get("images", []) or []:
+        if not isinstance(img, dict):
+            continue
+        if img.get("type") != "image_url":
+            continue
+        data_url = ((img.get("image_url") or {}).get("url") or "").strip()
+        if not data_url.startswith("data:"):
+            continue
+        m = re.match(r"^data:([^;]+);base64,(.+)$", data_url, flags=re.DOTALL)
+        if not m:
+            continue
+        extracted.append((m.group(1), m.group(2)))
+
+    # 兼容某些模型可能把 image_url 放在 content 数组中
+    content = message.get("content")
+    if isinstance(content, list):
+        for part in content:
+            if not isinstance(part, dict):
+                continue
+            if part.get("type") != "image_url":
+                continue
+            data_url = ((part.get("image_url") or {}).get("url") or "").strip()
+            if not data_url.startswith("data:"):
+                continue
+            m = re.match(r"^data:([^;]+);base64,(.+)$", data_url, flags=re.DOTALL)
+            if not m:
+                continue
+            extracted.append((m.group(1), m.group(2)))
+
+    return extracted
+
+
+def _extract_image_refs(choice: Dict[str, Any], message: Dict[str, Any]) -> List[Dict[str, str]]:
+    """
+    尝试从不同响应格式中提取图片引用。
+
+    返回格式:
+    - {"kind": "data_url", "value": "data:image/png;base64,..."}
+    - {"kind": "base64", "value": "...", "mime_type": "image/png"}
+    - {"kind": "url", "value": "https://..."}
+    """
+    refs: List[Dict[str, str]] = []
+
+    # 1) 标准 message.images
+    for img in message.get("images", []) or []:
+        if not isinstance(img, dict):
+            continue
+        # image_url 结构
+        data_url = ((img.get("image_url") or {}).get("url") or "").strip()
+        if data_url.startswith("data:"):
+            refs.append({"kind": "data_url", "value": data_url})
+            continue
+        if data_url.startswith("http"):
+            refs.append({"kind": "url", "value": data_url})
+            continue
+
+        # 兼容 base64 字段
+        b64 = (img.get("b64_json") or img.get("base64") or "").strip()
+        if b64:
+            refs.append({"kind": "base64", "value": b64, "mime_type": img.get("mime_type", "image/png")})
+
+    # 2) 某些格式可能在 choice.images
+    for img in choice.get("images", []) or []:
+        if not isinstance(img, dict):
+            continue
+        data_url = ((img.get("image_url") or {}).get("url") or "").strip()
+        if data_url.startswith("data:"):
+            refs.append({"kind": "data_url", "value": data_url})
+            continue
+        if data_url.startswith("http"):
+            refs.append({"kind": "url", "value": data_url})
+            continue
+        b64 = (img.get("b64_json") or img.get("base64") or "").strip()
+        if b64:
+            refs.append({"kind": "base64", "value": b64, "mime_type": img.get("mime_type", "image/png")})
+
+    # 3) content 数组里的 image_url
+    content = message.get("content")
+    if isinstance(content, list):
+        for part in content:
+            if not isinstance(part, dict):
+                continue
+            if part.get("type") != "image_url":
+                continue
+            url = ((part.get("image_url") or {}).get("url") or "").strip()
+            if url.startswith("data:"):
+                refs.append({"kind": "data_url", "value": url})
+            elif url.startswith("http"):
+                refs.append({"kind": "url", "value": url})
+
+    # 4) 极端兼容:文本中可能出现 data:image 或 http 图片 URL
+    if isinstance(content, str):
+        # data URL
+        for m in re.finditer(r"(data:image\/[a-zA-Z0-9.+-]+;base64,[A-Za-z0-9+/=]+)", content):
+            refs.append({"kind": "data_url", "value": m.group(1)})
+        # http(s) 图片链接
+        for m in re.finditer(r"(https?://\S+\.(?:png|jpg|jpeg|webp))", content, flags=re.IGNORECASE):
+            refs.append({"kind": "url", "value": m.group(1)})
+
+    return refs
+
+
+def _mime_to_ext(mime_type: str) -> str:
+    """MIME 类型映射到扩展名。"""
+    mapping = {
+        "image/png": ".png",
+        "image/jpeg": ".jpg",
+        "image/webp": ".webp",
+    }
+    return mapping.get(mime_type.lower(), ".png")
+
+
+def _normalize_model_id(model_id: str) -> str:
+    """
+    规范化常见误写模型 ID,减少无效重试。
+    """
+    if not model_id:
+        return model_id
+    m = model_id.strip()
+    # 常见误写:gemini/gemini-xxx -> google/gemini-xxx
+    if m.startswith("gemini/"):
+        m = "google/" + m.split("/", 1)[1]
+    # 常见顺序误写:preview-image -> image
+    if "gemini-2.5-flash-preview-image" in m:
+        m = m.replace("gemini-2.5-flash-preview-image", "gemini-2.5-flash-image")
+    # 兼容旧 ID 到当前可用 ID
+    if "gemini-2.5-flash-image-preview" in m:
+        m = m.replace("gemini-2.5-flash-image-preview", "gemini-2.5-flash-image")
+    return m
+
+
+@tool(description="可以提取图片中的特征,也可以根据描述生成图片")
+async def nanobanana(
+    image_path: str = "",
+    image_paths: Optional[List[str]] = None,
+    output_file: Optional[str] = None,
+    prompt: Optional[str] = None,
+    model: Optional[str] = None,
+    max_tokens: int = 1200,
+    generate_image: bool = False,
+    image_output_path: Optional[str] = None,
+) -> ToolResult:
+    """
+    可以提取图片中的特征,也可以根据描述生成图片。
+
+    Args:
+        image_path: 输入图片路径(单图模式,可选)
+        image_paths: 输入图片路径列表(多图整体模式,可选)
+        output_file: 输出 JSON 文件路径(可选,用于特征提取模式)
+        prompt: 自定义提取指令或生成描述(可选)
+        model: OpenRouter 模型名(可选,默认读取 NANOBANANA_MODEL 或使用 Gemini 视觉模型)
+        max_tokens: 最大输出 token
+        generate_image: 是否生成图片(False=提取特征,True=生成图片)
+        image_output_path: 生成图片保存路径(generate_image=True 时可选)
+
+    Returns:
+        ToolResult: 包含结构化特征和输出文件路径,或生成的图片路径
+    """
+    raw_paths: List[str] = []
+    if image_paths:
+        raw_paths.extend(image_paths)
+    if image_path:
+        raw_paths.append(image_path)
+    if not raw_paths:
+        return ToolResult(
+            title="NanoBanana 提取失败",
+            output="",
+            error="未提供输入图片,请传入 image_path 或 image_paths",
+        )
+
+    # 去重并检查路径
+    unique_raw: List[str] = []
+    seen = set()
+    for p in raw_paths:
+        if p and p not in seen:
+            unique_raw.append(p)
+            seen.add(p)
+
+    input_paths: List[Path] = [Path(p) for p in unique_raw]
+    invalid = [str(p) for p in input_paths if (not p.exists() or not p.is_file())]
+    if invalid:
+        return ToolResult(
+            title="NanoBanana 提取失败",
+            output="",
+            error=f"以下图片不存在或不可读: {invalid}",
+        )
+
+    api_key = _resolve_api_key()
+    if not api_key:
+        return ToolResult(
+            title="NanoBanana 提取失败",
+            output="",
+            error="未找到 OpenRouter API Key,请设置 OPENROUTER_API_KEY 或 OPEN_ROUTER_API_KEY",
+        )
+
+    if generate_image:
+        user_prompt = prompt or DEFAULT_IMAGE_PROMPT
+    else:
+        chosen_model = model or os.getenv("NANOBANANA_MODEL") or "google/gemini-2.5-flash"
+        user_prompt = prompt or DEFAULT_EXTRACTION_PROMPT
+
+    try:
+        image_data_urls = [_image_to_data_url(p) for p in input_paths]
+    except Exception as e:
+        return ToolResult(
+            title="NanoBanana 提取失败",
+            output="",
+            error=f"图片编码失败: {e}",
+        )
+
+    user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_prompt}]
+    for u in image_data_urls:
+        user_content.append({"type": "image_url", "image_url": {"url": u}})
+
+    payload: Dict[str, Any] = {
+        "messages": [
+            {
+                "role": "system",
+                "content": (
+                    "你是视觉助手。"
+                    "当任务为特征提取时输出 JSON 对象,不要输出 markdown。"
+                    "当任务为图像生成时请返回图像。"
+                ),
+            },
+            {
+                "role": "user",
+                "content": user_content,
+            },
+        ],
+        "temperature": 0.2,
+        "max_tokens": max_tokens,
+    }
+    if generate_image:
+        payload["modalities"] = ["image", "text"]
+
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+        "HTTP-Referer": "https://local-agent",
+        "X-Title": "Agent NanoBanana Tool",
+    }
+
+    endpoint = f"{OPENROUTER_BASE_URL}/chat/completions"
+
+    # 图像生成模式:自动尝试多个可用模型,减少 404/invalid model 影响
+    if generate_image:
+        candidates: List[str] = []
+        if model:
+            candidates.append(_normalize_model_id(model))
+        if env_model := os.getenv("NANOBANANA_IMAGE_MODEL"):
+            candidates.append(_normalize_model_id(env_model))
+        candidates.extend([_normalize_model_id(x) for x in DEFAULT_IMAGE_MODEL_CANDIDATES])
+        # 去重并保持顺序
+        dedup: List[str] = []
+        seen = set()
+        for m in candidates:
+            if m and m not in seen:
+                dedup.append(m)
+                seen.add(m)
+        candidates = dedup
+    else:
+        candidates = [chosen_model]
+
+    data: Optional[Dict[str, Any]] = None
+    used_model: Optional[str] = None
+    errors: List[Dict[str, Any]] = []
+
+    for cand in candidates:
+        modality_attempts: List[Optional[List[str]]] = [None]
+        if generate_image:
+            modality_attempts = [["image", "text"], ["image"], None]
+
+        for mods in modality_attempts:
+            trial_payload = dict(payload)
+            trial_payload["model"] = cand
+
+            if mods is None:
+                trial_payload.pop("modalities", None)
+            else:
+                trial_payload["modalities"] = mods
+
+            try:
+                async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
+                    resp = await client.post(endpoint, json=trial_payload, headers=headers)
+                    resp.raise_for_status()
+                    data = resp.json()
+                    used_model = cand
+                    break
+            except httpx.HTTPStatusError as e:
+                errors.append({
+                    "model": cand,
+                    "modalities": mods,
+                    "status_code": e.response.status_code,
+                    "body": e.response.text[:600],
+                })
+                continue
+            except Exception as e:
+                errors.append({
+                    "model": cand,
+                    "modalities": mods,
+                    "status_code": None,
+                    "body": str(e)[:600],
+                })
+                continue
+
+        if data is not None:
+            break
+
+    if data is None:
+        title = "NanoBanana 生成失败" if generate_image else "NanoBanana 提取失败"
+        return ToolResult(
+            title=title,
+            output=json.dumps({"attempted_models": candidates, "errors": errors}, ensure_ascii=False, indent=2),
+            long_term_memory="All candidate models failed for this request",
+            metadata={"attempted_models": candidates, "errors": errors},
+        )
+
+    chosen_model = used_model or candidates[0]
+
+    choices = data.get("choices") or []
+    message = choices[0].get("message", {}) if choices else {}
+
+    # 图像生成分支
+    if generate_image:
+        refs = _extract_image_refs(choices[0] if choices else {}, message)
+        if not refs:
+            content = message.get("content")
+            preview = ""
+            if isinstance(content, str):
+                preview = content[:500]
+            elif isinstance(content, list):
+                preview = json.dumps(content[:3], ensure_ascii=False)[:500]
+
+            return ToolResult(
+                title="NanoBanana 生成失败",
+                output=json.dumps(data, ensure_ascii=False, indent=2),
+                error="模型未返回可解析图片(未在 message.images/choice.images/content 中发现图片)",
+                metadata={
+                    "model": chosen_model,
+                    "choice_keys": list((choices[0] if choices else {}).keys()),
+                    "message_keys": list(message.keys()) if isinstance(message, dict) else [],
+                    "content_preview": preview,
+                },
+            )
+
+        output_paths: List[str] = []
+        if image_output_path:
+            base_path = Path(image_output_path)
+        else:
+            if len(input_paths) > 1:
+                base_path = input_paths[0].parent / "set_generated.png"
+            else:
+                base_path = input_paths[0].parent / f"{input_paths[0].stem}_generated.png"
+        base_path.parent.mkdir(parents=True, exist_ok=True)
+
+        for idx, ref in enumerate(refs):
+            kind = ref.get("kind", "")
+            mime_type = "image/png"
+            raw_bytes: Optional[bytes] = None
+
+            if kind == "data_url":
+                m = re.match(r"^data:([^;]+);base64,(.+)$", ref.get("value", ""), flags=re.DOTALL)
+                if not m:
+                    continue
+                mime_type = m.group(1)
+                raw_bytes = base64.b64decode(m.group(2))
+            elif kind == "base64":
+                mime_type = ref.get("mime_type", "image/png")
+                raw_bytes = base64.b64decode(ref.get("value", ""))
+            elif kind == "url":
+                url = ref.get("value", "")
+                try:
+                    with httpx.Client(timeout=DEFAULT_TIMEOUT) as client:
+                        r = client.get(url)
+                        r.raise_for_status()
+                        raw_bytes = r.content
+                        mime_type = r.headers.get("content-type", "image/png").split(";")[0]
+                except Exception:
+                    continue
+            else:
+                continue
+
+            if not raw_bytes:
+                continue
+
+            ext = _mime_to_ext(mime_type)
+            if len(refs) == 1:
+                target = base_path
+                if target.suffix.lower() not in [".png", ".jpg", ".jpeg", ".webp"]:
+                    target = target.with_suffix(ext)
+            else:
+                stem = base_path.stem
+                target = base_path.with_name(f"{stem}_{idx+1}{ext}")
+            try:
+                target.write_bytes(raw_bytes)
+                output_paths.append(str(target))
+            except Exception as e:
+                return ToolResult(
+                    title="NanoBanana 生成失败",
+                    output="",
+                    error=f"写入生成图片失败: {e}",
+                    metadata={"model": chosen_model},
+                )
+
+        if not output_paths:
+            return ToolResult(
+                title="NanoBanana 生成失败",
+                output=json.dumps(data, ensure_ascii=False, indent=2),
+                error="检测到图片引用但写入失败(可能是无效 base64 或 URL 不可访问)",
+                metadata={"model": chosen_model, "ref_count": len(refs)},
+            )
+
+        usage = data.get("usage", {})
+        prompt_tokens = usage.get("prompt_tokens") or usage.get("input_tokens", 0)
+        completion_tokens = usage.get("completion_tokens") or usage.get("output_tokens", 0)
+        summary = {
+            "model": chosen_model,
+            "input_images": [str(p) for p in input_paths],
+            "input_count": len(input_paths),
+            "generated_images": output_paths,
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+        }
+        return ToolResult(
+            title="NanoBanana 图片生成完成",
+            output=json.dumps({"summary": summary}, ensure_ascii=False, indent=2),
+            long_term_memory=f"Generated {len(output_paths)} image(s) from {len(input_paths)} input image(s) using {chosen_model}",
+            attachments=output_paths,
+            metadata=summary,
+        )
+
+    content = message.get("content") or ""
+    if not content:
+        return ToolResult(
+            title="NanoBanana 提取失败",
+            output=json.dumps(data, ensure_ascii=False, indent=2),
+            error="模型未返回内容",
+        )
+
+    try:
+        parsed = _safe_json_parse(content)
+    except Exception as e:
+        return ToolResult(
+            title="NanoBanana 提取失败",
+            output=content,
+            error=f"模型返回非 JSON 内容,解析失败: {e}",
+            metadata={"model": chosen_model},
+        )
+
+    if output_file:
+        out_path = Path(output_file)
+    else:
+        if len(input_paths) > 1:
+            out_path = input_paths[0].parent / "set_invariant_features.json"
+        else:
+            out_path = input_paths[0].parent / f"{input_paths[0].stem}_invariant_features.json"
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(parsed, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    usage = data.get("usage", {})
+    prompt_tokens = usage.get("prompt_tokens") or usage.get("input_tokens", 0)
+    completion_tokens = usage.get("completion_tokens") or usage.get("output_tokens", 0)
+
+    summary = {
+        "model": chosen_model,
+        "input_images": [str(p) for p in input_paths],
+        "input_count": len(input_paths),
+        "output_file": str(out_path),
+        "prompt_tokens": prompt_tokens,
+        "completion_tokens": completion_tokens,
+    }
+
+    return ToolResult(
+        title="NanoBanana 不变特征提取完成",
+        output=json.dumps(
+            {
+                "summary": summary,
+                "features": parsed,
+            },
+            ensure_ascii=False,
+            indent=2,
+        ),
+        long_term_memory=f"Extracted invariant features from {len(input_paths)} input image(s) using {chosen_model}",
+        attachments=[str(out_path)],
+        metadata=summary,
+    )