Przeglądaj źródła

feat(read_images): batch image tool with adaptive grid + shared image utils

Problem: when the agent needs to analyze many local images (pick the best
photo, compare candidates, batch-judge), reading them one at a time via
read_file blows up tokens — each image carries structural overhead per
message block, and there is no way to see them side-by-side for
comparison.

read_images solves this:
- Loads 1-12 images concurrently (local paths or URLs, mixable)
- Downscales every image to max_dimension (default 1024px) to control
  per-image token cost
- Two layouts:
  - grid (default): stitches N images into one index-numbered (1,2,3...)
    grid image so the LLM sees one picture with all candidates and can
    refer to them by index. Auto-picks columns/thumb_size based on count
    (2 imgs -> 2x1 @500px, 12 imgs -> 4x3 @320px), so final canvas stays
    within ~1400px on its long edge and no cell gets too small to read
    after the LLM's internal resize
  - separate: returns N independent downscaled images for tools that
    really need per-image attention
- Output text maps index -> full original path so the LLM can reference
  "image 3" and resolve it to the source file for downstream edits

Grid mode caps at 12 images per call. Beyond that, each cell becomes too
small to be useful after the LLM's internal image resize (~1568px long
edge). Caller must batch in chunks.

Shared utilities (agent/tools/utils/image.py):
- load_image / load_images: async local+URL loader
- downscale: aspect-preserving resize
- build_image_grid: parameterized grid builder with scaled index boxes
  (index_box = thumb_size // 5, font = box * 0.65, so visual proportions
  stay constant across different thumb_size)
- encode_base64: PIL -> base64 JPEG for tool result images

Fixes a latent font bug at the same time: PingFang.ttc on macOS Sequoia
cannot be opened by PIL/FreeType (cryptic "cannot open resource"), so
search.py and crawler.py were silently rendering collages with the tiny
default bitmap font — Chinese titles showed as near-invisible dots. The
new font candidate list prioritizes Hiragino Sans GB and STHeiti Medium,
both of which PIL can actually read.

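Refactor search.py and crawler.py to call build_image_grid instead of
maintaining their own ~120-line duplicate collage implementations. No
intended behavior change besides the font fix; note that grid cells are now
numbered by their position among the successfully loaded images rather than
by each item's original position in the result list.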

read_file.py: add a docstring note pointing at read_images for batch use
so the LLM can pick the right tool.
Talegorithm 3 dni temu
rodzic
commit
efea909f3b

+ 2 - 0
agent/tools/builtin/__init__.py

@@ -8,6 +8,7 @@
 """
 
 from agent.tools.builtin.file.read import read_file
+from agent.tools.builtin.file.read_images import read_images
 from agent.tools.builtin.file.edit import edit_file
 from agent.tools.builtin.file.write import write_file
 from agent.tools.builtin.glob_tool import glob_files
@@ -33,6 +34,7 @@ import agent.tools.builtin.im
 __all__ = [
     # 文件操作
     "read_file",
+    "read_images",
     "edit_file",
     "write_file",
     "glob_files",

+ 21 - 119
agent/tools/builtin/crawler.py

@@ -4,11 +4,7 @@
 提供 YouTube、X (Twitter) 和微信/通用链接的搜索和详情查询功能。
 """
 
-import asyncio
-import base64
-import io
 import json
-import math
 import os
 import subprocess
 import tempfile
@@ -16,9 +12,9 @@ from pathlib import Path
 from typing import Optional, List, Dict, Any
 
 import httpx
-from PIL import Image, ImageDraw, ImageFont
 
 from agent.tools import tool, ToolResult
+from agent.tools.utils.image import build_image_grid, encode_base64, load_images
 
 
 # API 配置
@@ -26,47 +22,22 @@ CRAWLER_BASE_URL = "http://crawler.aiddit.com/crawler"
 AIGC_BASE_URL = "http://aigc-channel.aiddit.com/aigc/channel"
 DEFAULT_TIMEOUT = 60.0
 
-# 拼接图配置
-THUMB_WIDTH = 250
-THUMB_HEIGHT = 250
-TEXT_HEIGHT = 80
-GRID_COLS = 5
-PADDING = 12
-BG_COLOR = (255, 255, 255)
-TEXT_COLOR = (30, 30, 30)
-INDEX_COLOR = (220, 60, 60)
-
 # 视频处理相关配置
 VIDEO_DOWNLOAD_DIR = Path(tempfile.gettempdir()) / "youtube_videos"
 VIDEO_DOWNLOAD_DIR.mkdir(exist_ok=True)
 
 
-# ── 辅助函数 ──
-
-def _truncate_text(text: str, max_len: int = 14) -> str:
-    """截断文本,超出部分用省略号"""
-    return text[:max_len] + "..." if len(text) > max_len else text
-
-
-async def _download_image(client: httpx.AsyncClient, url: str) -> Optional[Image.Image]:
-    """下载单张图片,失败返回 None"""
-    try:
-        resp = await client.get(url, timeout=15.0)
-        resp.raise_for_status()
-        return Image.open(io.BytesIO(resp.content)).convert("RGB")
-    except Exception:
-        return None
-
-
 async def _build_video_collage(videos: List[Dict[str, Any]]) -> Optional[str]:
     """
     将视频缩略图+序号+标题拼接成网格图,返回 base64 编码的 PNG。
+    复用 agent.tools.utils.image 中的共享拼图逻辑。
     """
     if not videos:
         return None
 
-    items = []
-    for idx, video in enumerate(videos):
+    urls: List[str] = []
+    titles: List[str] = []
+    for video in videos:
         thumbnail = None
         if "thumbnails" in video and isinstance(video["thumbnails"], list) and video["thumbnails"]:
             thumbnail = video["thumbnails"][0].get("url")
@@ -77,96 +48,27 @@ async def _build_video_collage(videos: List[Dict[str, Any]]) -> Optional[str]:
 
         title = video.get("title", "") or video.get("text", "")
         if thumbnail:
-            items.append({"url": thumbnail, "title": title, "index": idx + 1})
-    if not items:
+            urls.append(thumbnail)
+            titles.append(title)
+
+    if not urls:
         return None
 
-    async with httpx.AsyncClient() as client:
-        tasks = [_download_image(client, item["url"]) for item in items]
-        downloaded = await asyncio.gather(*tasks)
+    loaded = await load_images(urls)
+
+    valid_images = []
+    valid_labels = []
+    for (_, img), title in zip(loaded, titles):
+        if img is not None:
+            valid_images.append(img)
+            valid_labels.append(title)
 
-    valid = [(item, img) for item, img in zip(items, downloaded) if img is not None]
-    if not valid:
+    if not valid_images:
         return None
 
-    cols = min(GRID_COLS, len(valid))
-    rows = math.ceil(len(valid) / cols)
-    cell_w = THUMB_WIDTH + PADDING
-    cell_h = THUMB_HEIGHT + TEXT_HEIGHT + PADDING
-    canvas_w = cols * cell_w + PADDING
-    canvas_h = rows * cell_h + PADDING
-
-    canvas = Image.new("RGB", (canvas_w, canvas_h), BG_COLOR)
-    draw = ImageDraw.Draw(canvas)
-
-    font_title = None
-    font_index = None
-    font_candidates = [
-        "msyh.ttc", "simhei.ttf", "simsun.ttc",
-        "/System/Library/Fonts/PingFang.ttc",
-        "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf",
-        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
-        "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
-    ]
-    for font_path in font_candidates:
-        try:
-            font_title = ImageFont.truetype(font_path, 16)
-            font_index = ImageFont.truetype(font_path, 32)
-            break
-        except Exception:
-            continue
-    if not font_title:
-        font_title = ImageFont.load_default()
-        font_index = font_title
-
-    for item, img in valid:
-        idx = item["index"]
-        col = (idx - 1) % cols
-        row = (idx - 1) // cols
-        x = PADDING + col * cell_w
-        y = PADDING + row * cell_h
-
-        scale = min(THUMB_WIDTH / img.width, THUMB_HEIGHT / img.height)
-        new_w = int(img.width * scale)
-        new_h = int(img.height * scale)
-        thumb = img.resize((new_w, new_h), Image.LANCZOS)
-        offset_x = x + (THUMB_WIDTH - new_w) // 2
-        offset_y = y + (THUMB_HEIGHT - new_h) // 2
-        canvas.paste(thumb, (offset_x, offset_y))
-
-        index_text = str(idx)
-        idx_x = offset_x
-        idx_y = offset_y + 4
-        box_size = 52
-        draw.rectangle([idx_x, idx_y, idx_x + box_size, idx_y + box_size], fill=INDEX_COLOR)
-        bbox = draw.textbbox((0, 0), index_text, font=font_index)
-        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
-        text_x = idx_x + (box_size - tw) // 2
-        text_y = idx_y + (box_size - th) // 2
-        draw.text((text_x, text_y), index_text, fill=(255, 255, 255), font=font_index)
-
-        title = item["title"] or ""
-        if title:
-            words = list(title)
-            lines = []
-            current_line = ""
-            for ch in words:
-                test_line = current_line + ch
-                bbox_line = draw.textbbox((0, 0), test_line, font=font_title)
-                if bbox_line[2] - bbox_line[0] > THUMB_WIDTH:
-                    if current_line:
-                        lines.append(current_line)
-                    current_line = ch
-                else:
-                    current_line = test_line
-            if current_line:
-                lines.append(current_line)
-            for line_i, line in enumerate(lines):
-                draw.text((x, y + THUMB_HEIGHT + 6 + line_i * 22), line, fill=TEXT_COLOR, font=font_title)
-
-    buf = io.BytesIO()
-    canvas.save(buf, format="PNG")
-    return base64.b64encode(buf.getvalue()).decode("utf-8")
+    grid = build_image_grid(images=valid_images, labels=valid_labels)
+    b64, _ = encode_base64(grid, format="PNG")
+    return b64
 
 
 def _parse_srt_to_outline(srt_content: str) -> List[Dict[str, str]]:

+ 5 - 2
agent/tools/builtin/file/read.py

@@ -27,7 +27,7 @@ MAX_LINE_LENGTH = 2000
 MAX_BYTES = 50 * 1024  # 50KB
 
 
-@tool(description="读取文件内容,支持文本文件、图片、PDF 等多种格式,也支持 HTTP/HTTPS URL", hidden_params=["context"])
+@tool(description="读取单个文件内容,支持文本文件、图片、PDF 等多种格式,也支持 HTTP/HTTPS URL", hidden_params=["context"])
 async def read_file(
     file_path: str,
     offset: int = 0,
@@ -35,7 +35,10 @@ async def read_file(
     context: Optional[ToolContext] = None
 ) -> ToolResult:
     """
-    读取文件内容
+    读取单个文件内容
+
+    用于读取一个文本文件、PDF 或一张图片。如需批量读取多张图片(2 张以上)
+    并做对比/选图,请使用 read_images 工具,它支持自动降采样和网格拼图。
 
     参考 OpenCode 实现
 

+ 320 - 0
agent/tools/builtin/file/read_images.py

@@ -0,0 +1,320 @@
+"""
+Read Images Tool - 批量读取图片工具
+
+为"批量读取 + 多图分析"场景设计的工具,与单文件的 read_file 分工:
+- read_file: 单个文件(文本 / PDF / 单张图片)
+- read_images: 2 张以上图片,支持网格拼图和降采样
+
+核心能力:
+1. 并发批量加载本地路径或 URL
+2. 自动降采样,防止 token 爆炸
+3. 可选拼图(grid 模式),把 N 张图合成一张带索引编号的网格图,
+   适合 LLM 横向对比、选图、批量判断场景
+4. 自适应布局 + 硬上限,保证拼图即使经过 LLM 内部缩放也能保持可辨
+"""
+
+from typing import Any, Dict, List, Literal, Optional, Tuple
+
+from agent.tools import tool, ToolResult, ToolContext
+from agent.tools.utils.image import (
+    build_image_grid,
+    downscale,
+    encode_base64,
+    load_images,
+)
+
+
+# Grid 模式的硬上限:超过此数量必须分批调用
+# 理由:12 张可排成 4x3 网格,每格 ~320px,人物/场景细节清晰可辨。
+# 再多格子就太小,分辨不出内容,失去对比价值。
+MAX_GRID_IMAGES = 12
+
+
+def _adaptive_layout(count: int) -> Tuple[int, int]:
+    """Pick (columns, thumb_size) for the grid from the number of images.
+
+    Goal: keep the stitched grid's edge within ~1400px while every thumbnail stays >= 320px so it remains legible.
+
+    Returns:
+        (columns, thumb_size)
+    """
+    if count <= 2:
+        return 2, 500   # 2x1
+    if count <= 4:
+        return 2, 450   # 2x2
+    if count <= 6:
+        return 3, 400   # 3x2
+    if count <= 9:
+        return 3, 380   # 3x3
+    # 10-12 (any count above 9 falls through to this layout)
+    return 4, 320       # 4x3
+
+
+@tool(
+    description="批量读取多张图片,支持自动降采样和网格拼图(用于横向对比/选图场景)",
+    hidden_params=["context"],
+    display={
+        "zh": {
+            "name": "批量读取图片",
+            "params": {
+                "paths": "图片路径列表",
+                "layout": "布局模式",
+                "max_dimension": "每张图最大边长",
+            },
+        },
+        "en": {
+            "name": "Read Images",
+            "params": {
+                "paths": "Image paths",
+                "layout": "Layout mode",
+                "max_dimension": "Max dimension per image",
+            },
+        },
+    },
+)
+async def read_images(
+    paths: List[str],
+    layout: Literal["grid", "separate"] = "grid",
+    max_dimension: int = 1024,
+    context: Optional[ToolContext] = None,
+) -> ToolResult:
+    """批量读取图片并返回给 LLM,支持自动降采样和网格拼图
+
+    为 **2 张以上** 的图片批量分析场景设计。单张图片请用 `read_file`。
+
+    ⚠️ **grid 模式最多 12 张**。超过请分批调用:第一次传前 12 张,第二次传后续,
+    以此类推。再多每格就太小,分辨不出内容。
+
+    两种布局模式:
+
+    - **grid**(默认):把所有图片拼成一张只带索引编号的网格图(1,2,3…)。
+      LLM 只看到 1 张拼图,大幅减少结构开销 token。索引对应的原始路径见
+      返回文本的对照表,LLM 可以用"第 3 张"来引用具体图片。
+      **自适应布局**:根据图片数量自动选择列数和缩略图尺寸,小批量时每张图更清晰:
+        * 1-2 张:2 列 × 500px
+        * 3-4 张:2 列 × 450px
+        * 5-6 张:3 列 × 400px
+        * 7-9 张:3 列 × 380px
+        * 10-12 张:4 列 × 320px
+      适合:从多张候选图中挑选、横向对比质量/风格、批量判断。
+
+    - **separate**:把每张图独立返回(仍然降采样)。无数量限制,但每张图都有
+      独立的结构开销 token。适合:
+        * 需要逐张做独立的精细分析
+        * 每张图之间没有对比关系
+
+    自动降采样:无论哪种模式,每张图都会先降采样到 max_dimension(默认 1024px)
+    的最大边长,防止高分辨率图片炸掉 token 预算。
+
+    Args:
+        paths: 图片路径列表,支持本地路径和 HTTP(S) URL,可混用。
+               grid 模式下不超过 12 张,超过必须分批调用。
+        layout: 布局模式,"grid" 拼图(默认)/ "separate" 多张独立
+        max_dimension: 每张图的最大边长(等比降采样到不超过此值),默认 1024
+        context: 工具上下文(框架注入,无需手动传)
+
+    Returns:
+        ToolResult:images 字段包含图片数据(grid 模式 1 张拼图,separate 模式 N 张),
+        output 字段包含每张图的索引和来源路径对照表
+    """
+    if not paths:
+        return ToolResult(
+            title="批量读图失败",
+            output="",
+            error="paths 不能为空",
+        )
+
+    # Hard cap check (grid mode only)
+    if layout == "grid" and len(paths) > MAX_GRID_IMAGES:
+        return ToolResult(
+            title="批量读图失败",
+            output="",
+            error=(
+                f"grid 模式最多支持 {MAX_GRID_IMAGES} 张图片,当前传入 {len(paths)} 张。"
+                f"请分批调用:每次最多 {MAX_GRID_IMAGES} 张。"
+                f"或者使用 layout='separate' 模式(无数量限制但 token 开销更高)。"
+            ),
+        )
+
+    if len(paths) == 1:
+        hint = "(只有 1 张图片,建议用 read_file 更合适)"
+    else:
+        hint = ""
+
+    # 1. Load all images concurrently
+    loaded = await load_images(paths)
+
+    # 2. Split successes from failures
+    successes: List[tuple] = []  # [(path, PIL.Image), ...]
+    failures: List[str] = []     # [path, ...]
+    for source, img in loaded:
+        if img is None:
+            failures.append(source)
+        else:
+            successes.append((source, img))
+
+    if not successes:
+        return ToolResult(
+            title="批量读图失败",
+            output="",
+            error=f"所有 {len(paths)} 张图片均加载失败",
+            metadata={"failed": failures},
+        )
+
+    # 3. Downscale each image (NOTE(review): max_dimension is not validated here; confirm callers never pass <= 0)
+    processed = [(src, downscale(img, max_dimension)) for src, img in successes]
+
+    # 4. Build the index -> path table (full paths, so the LLM can reference or reuse them later)
+    index_lines = [f"{i}. {src}" for i, (src, _) in enumerate(processed, start=1)]
+    summary_parts = [f"共加载 {len(processed)}/{len(paths)} 张图片"]
+    if hint:
+        summary_parts.append(hint)
+    if failures:
+        summary_parts.append(f",失败 {len(failures)} 张")
+    summary = "".join(summary_parts)
+
+    output_lines = [summary, ""] + index_lines
+    if failures:
+        output_lines.append("")
+        output_lines.append("加载失败的路径:")
+        output_lines.extend(f"  - {p}" for p in failures)
+    output_text = "\n".join(output_lines)
+
+    # 5. Build the images field according to layout
+    images_for_llm = []
+    if layout == "grid":
+        cols, thumb_size = _adaptive_layout(len(processed))
+        # The grid shows index numbers only, no file names; the index -> path mapping is in output_text built above
+        grid = build_image_grid(
+            images=[img for _, img in processed],
+            labels=None,
+            columns=cols,
+            thumb_size=thumb_size,
+        )
+        # The grid is always encoded as JPEG to save tokens
+        b64, media_type = encode_base64(grid, format="JPEG", quality=80)
+        images_for_llm.append({
+            "type": "base64",
+            "media_type": media_type,
+            "data": b64,
+        })
+    else:  # separate (any value other than "grid" lands here)
+        for _, img in processed:
+            b64, media_type = encode_base64(img, format="JPEG", quality=80)
+            images_for_llm.append({
+                "type": "base64",
+                "media_type": media_type,
+                "data": b64,
+            })
+
+    return ToolResult(
+        title=f"批量读图成功({layout} 模式,{len(processed)} 张)",
+        output=output_text,
+        long_term_memory=f"Read {len(processed)} images via {layout} layout",
+        images=images_for_llm,
+        metadata={
+            "count": len(processed),
+            "failed_count": len(failures),
+            "layout": layout,
+        },
+    )
+
+
+# ── CLI 入口:图片拼图工具 ──
+#
+# 这个 CLI 的语义是**拼图工具**,不是"读图工具"——Claude Code 这样的调用方
+# 本身就能读单张图(用 Read 工具),真正稀缺的能力是把 N 张图合成一张
+# 带索引编号的网格图,让一次 Read 就能横向对比多张。
+#
+# 因此 CLI 只支持 grid 模式;如果你需要单张图,直接用 Read 工具即可。
+#
+# 用法:
+#   python agent/tools/builtin/file/read_images.py --out=<path> <img1> <img2> ...
+#
+# 必填参数:
+#   --out=/path/grid.jpg     拼图保存路径(必须显式指定,避免污染 /tmp)
+#
+# 可选参数:
+#   --max_dimension=1024     每张图预先降采样的最大边长(默认 1024)
+#
+# 示例:
+#   python agent/tools/builtin/file/read_images.py \
+#     --out=/tmp/compare.jpg \
+#     ~/Downloads/a.jpg ~/Downloads/b.jpg ~/Downloads/c.jpg
+#
+# 输出:一行 JSON,包含 out_path、index_map(索引→原始路径对照表)、
+# text(文字摘要)。调用方拿到 out_path 后用 Read 工具查看拼图即可。
+
+if __name__ == "__main__":
+    import base64
+    import json
+    import sys
+    from pathlib import Path as _Path
+
+    def _print_usage():
+        print("用法: python read_images.py --out=<path> <img1> <img2> ...")
+        print("     --out=/path/grid.jpg   拼图输出路径(必填)")
+        print("     --max_dimension=1024   每张图最大边长(可选,默认 1024)")
+        print(f"最多 {MAX_GRID_IMAGES} 张图片")
+
+    if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"):
+        _print_usage()
+        sys.exit(0)
+
+    # Parse arguments: --key=value pairs are options, everything else is an image path
+    cli_paths: List[str] = []
+    cli_out: Optional[str] = None
+    cli_max_dim: int = 1024
+    for arg in sys.argv[1:]:
+        if arg.startswith("--") and "=" in arg:
+            k, v = arg.split("=", 1)
+            k = k.lstrip("-").replace("-", "_")
+            if k == "out":
+                cli_out = v
+            elif k == "max_dimension":
+                cli_max_dim = int(v)  # NOTE(review): a non-integer value raises ValueError here
+            else:
+                print(f"警告: 未知参数 {k}", file=sys.stderr)
+        else:
+            cli_paths.append(arg)
+
+    if not cli_paths:
+        print("错误: 至少提供一个图片路径", file=sys.stderr)
+        _print_usage()
+        sys.exit(1)
+
+    if not cli_out:
+        print("错误: 必须显式指定 --out=<path>", file=sys.stderr)
+        _print_usage()
+        sys.exit(1)
+
+    import asyncio
+    result = asyncio.run(read_images(
+        paths=cli_paths,
+        layout="grid",
+        max_dimension=cli_max_dim,
+    ))
+
+    if result.error:
+        print(json.dumps({"error": result.error}, ensure_ascii=False, indent=2))
+        sys.exit(1)
+
+    # Write the grid image to disk (grid mode returns exactly one image)
+    out_p = _Path(cli_out)
+    out_p.parent.mkdir(parents=True, exist_ok=True)
+    out_p.write_bytes(base64.b64decode(result.images[0]["data"]))
+
+    # Recover the index -> source path table by re-parsing the "N. path" lines of the output text
+    index_map: List[Dict[str, Any]] = []
+    for line in result.output.split("\n"):
+        if line and line[0].isdigit() and ". " in line:
+            idx_str, src = line.split(". ", 1)
+            if idx_str.isdigit():
+                index_map.append({"index": int(idx_str), "source": src})
+
+    print(json.dumps({
+        "out_path": str(out_p.resolve()),
+        "count": result.metadata.get("count", 0) if result.metadata else 0,
+        "index_map": index_map,
+        "text": result.output,
+    }, ensure_ascii=False, indent=2))

+ 24 - 139
agent/tools/builtin/search.py

@@ -9,19 +9,14 @@
 3. get_search_suggestions - 获取平台的搜索补全建议词
 """
 
-import asyncio
-import base64
-import io
 import json
-import math
-import textwrap
 from enum import Enum
 from typing import Any, Dict, List, Optional
 
 import httpx
-from PIL import Image, ImageDraw, ImageFont
 
 from agent.tools import tool, ToolResult
+from agent.tools.utils.image import build_image_grid, encode_base64, load_images
 
 
 # API 基础配置
@@ -31,155 +26,45 @@ DEFAULT_TIMEOUT = 60.0
 # 搜索结果缓存,以序号为 key
 _search_cache: Dict[int, Dict[str, Any]] = {}
 
-# 拼接图配置
-THUMB_WIDTH = 250
-THUMB_HEIGHT = 250
-TEXT_HEIGHT = 80
-GRID_COLS = 5
-PADDING = 12
-BG_COLOR = (255, 255, 255)
-TEXT_COLOR = (30, 30, 30)
-INDEX_COLOR = (220, 60, 60)
-
-
-def _truncate_text(text: str, max_len: int = 14) -> str:
-    """截断文本,超出部分用省略号"""
-    return text[:max_len] + "..." if len(text) > max_len else text
-
-
-async def _download_image(client: httpx.AsyncClient, url: str) -> Optional[Image.Image]:
-    """下载单张图片,失败返回 None"""
-    try:
-        resp = await client.get(url, timeout=15.0)
-        resp.raise_for_status()
-        return Image.open(io.BytesIO(resp.content)).convert("RGB")
-    except Exception:
-        return None
-
 
 async def _build_collage(posts: List[Dict[str, Any]]) -> Optional[str]:
     """
     将帖子封面图+序号+标题拼接成网格图,返回 base64 编码的 PNG。
-    每个格子:序号 + 封面图 + 标题
+    复用 agent.tools.utils.image 中的共享拼图逻辑。
     """
     if not posts:
         return None
 
-    # 收集有封面图的帖子,记录原始序号
-    items = []
-    for idx, post in enumerate(posts):
+    # 收集有封面图的帖子
+    urls: List[str] = []
+    titles: List[str] = []
+    for post in posts:
         imgs = post.get("images", [])
         cover_url = imgs[0] if imgs else None
         if cover_url:
-            items.append({
-                "url": cover_url,
-                "title": post.get("title", "") or "",
-                "index": idx + 1,
-            })
-    if not items:
+            urls.append(cover_url)
+            titles.append(post.get("title", "") or "")
+
+    if not urls:
         return None
 
-    # 并发下载封面图
-    async with httpx.AsyncClient() as client:
-        tasks = [_download_image(client, item["url"]) for item in items]
-        downloaded = await asyncio.gather(*tasks)
+    # 并发加载图片
+    loaded = await load_images(urls)
+
+    # 过滤加载失败的(保持 url 和 title 对齐)
+    valid_images = []
+    valid_labels = []
+    for (_, img), title in zip(loaded, titles):
+        if img is not None:
+            valid_images.append(img)
+            valid_labels.append(title)
 
-    # 过滤下载失败的
-    valid = [(item, img) for item, img in zip(items, downloaded) if img is not None]
-    if not valid:
+    if not valid_images:
         return None
 
-    cols = min(GRID_COLS, len(valid))
-    rows = math.ceil(len(valid) / cols)
-    cell_w = THUMB_WIDTH + PADDING
-    cell_h = THUMB_HEIGHT + TEXT_HEIGHT + PADDING
-    canvas_w = cols * cell_w + PADDING
-    canvas_h = rows * cell_h + PADDING
-
-    canvas = Image.new("RGB", (canvas_w, canvas_h), BG_COLOR)
-    draw = ImageDraw.Draw(canvas)
-
-    # 尝试加载字体(跨平台中文支持)
-    font_title = None
-    font_index = None
-
-    # 按优先级尝试不同平台的中文字体
-    font_candidates = [
-        "msyh.ttc",           # Windows 微软雅黑
-        "simhei.ttf",         # Windows 黑体
-        "simsun.ttc",         # Windows 宋体
-        "/System/Library/Fonts/PingFang.ttc",  # macOS 苹方
-        "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf",  # Linux
-        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",  # Linux WenQuanYi
-        "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",  # Linux Noto
-    ]
-
-    for font_path in font_candidates:
-        try:
-            font_title = ImageFont.truetype(font_path, 16)
-            font_index = ImageFont.truetype(font_path, 32)
-            break
-        except Exception:
-            continue
-
-    # 如果都失败,使用默认字体(可能不支持中文)
-    if not font_title:
-        font_title = ImageFont.load_default()
-        font_index = font_title
-
-    for item, img in valid:
-        idx = item["index"]
-        col = (idx - 1) % cols
-        row = (idx - 1) // cols
-        x = PADDING + col * cell_w
-        y = PADDING + row * cell_h
-
-        # 等比缩放封面图,保持原始比例,居中放置
-        scale = min(THUMB_WIDTH / img.width, THUMB_HEIGHT / img.height)
-        new_w = int(img.width * scale)
-        new_h = int(img.height * scale)
-        thumb = img.resize((new_w, new_h), Image.LANCZOS)
-        offset_x = x + (THUMB_WIDTH - new_w) // 2
-        offset_y = y + (THUMB_HEIGHT - new_h) // 2
-        canvas.paste(thumb, (offset_x, offset_y))
-
-        # 左上角写序号(带背景),固定大小,跟随图片位置
-        index_text = str(idx)
-        idx_x = offset_x
-        idx_y = offset_y + 4
-        box_size = 52
-        draw.rectangle([idx_x, idx_y, idx_x + box_size, idx_y + box_size], fill=INDEX_COLOR)
-        # 序号居中绘制
-        bbox = draw.textbbox((0, 0), index_text, font=font_index)
-        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
-        text_x = idx_x + (box_size - tw) // 2
-        text_y = idx_y + (box_size - th) // 2
-        draw.text((text_x, text_y), index_text, fill=(255, 255, 255), font=font_index)
-
-        # 写标题(完整显示,按像素宽度自动换行)
-        title = item["title"] or ""
-        if title:
-            words = list(title)  # 逐字符拆分,兼容中英文
-            lines = []
-            current_line = ""
-            for ch in words:
-                test_line = current_line + ch
-                bbox_line = draw.textbbox((0, 0), test_line, font=font_title)
-                if bbox_line[2] - bbox_line[0] > THUMB_WIDTH:
-                    if current_line:
-                        lines.append(current_line)
-                    current_line = ch
-                else:
-                    current_line = test_line
-            if current_line:
-                lines.append(current_line)
-            for line_i, line in enumerate(lines):
-                draw.text((x, y + THUMB_HEIGHT + 6 + line_i * 22), line, fill=TEXT_COLOR, font=font_title)
-
-    # 转 base64
-    buf = io.BytesIO()
-    canvas.save(buf, format="PNG")
-    return base64.b64encode(buf.getvalue()).decode("utf-8")
+    grid = build_image_grid(images=valid_images, labels=valid_labels)
+    b64, _ = encode_base64(grid, format="PNG")
+    return b64
 
 
 class PostSearchChannel(str, Enum):

+ 1 - 0
agent/tools/utils/__init__.py

@@ -0,0 +1 @@
+"""工具辅助模块 - 供多个工具共享的底层逻辑"""

+ 289 - 0
agent/tools/utils/image.py

@@ -0,0 +1,289 @@
+"""
+图片处理共享工具
+
+提供批量读图、降采样、网格拼图等通用逻辑。供 read_images、search_posts、
+youtube_search 等工具共享,避免代码重复。
+
+核心函数:
+- load_image: 从本地路径或 URL 加载为 PIL Image
+- downscale: 等比降采样到指定最大边长
+- build_image_grid: 将多张图片拼成带索引编号 + 标题的网格图
+- encode_base64: PIL Image → base64 字符串(默认 JPEG 以节省 token)
+"""
+
+import asyncio
+import base64
+import io
+import math
+from pathlib import Path
+from typing import List, Optional, Sequence, Tuple
+
+import httpx
+from PIL import Image, ImageDraw, ImageFont
+
+
+# ── 网格拼图默认参数 ──
+DEFAULT_THUMB_SIZE = 250         # 每格缩略图边长
+DEFAULT_TEXT_HEIGHT = 80          # 每格下方文字区高度
+DEFAULT_GRID_COLS = 5             # 每行几格
+DEFAULT_PADDING = 12
+DEFAULT_BG_COLOR = (255, 255, 255)
+DEFAULT_TEXT_COLOR = (30, 30, 30)
+DEFAULT_INDEX_COLOR = (220, 60, 60)
+
+# ── 字体候选(跨平台中文支持) ──
+# 注意:macOS 的 PingFang.ttc 因为格式原因 PIL/FreeType 无法读取,
+# 必须使用 Hiragino 或 STHeiti 等其他中文字体。
+_FONT_CANDIDATES = [
+    # macOS(按优先级)
+    "/System/Library/Fonts/Hiragino Sans GB.ttc",   # 冬青黑体,macOS 自带
+    "/System/Library/Fonts/STHeiti Medium.ttc",     # 华文黑体
+    "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
+    # Linux
+    "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
+    "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
+    "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf",
+    # Windows
+    "msyh.ttc",           # 微软雅黑
+    "simhei.ttf",         # 黑体
+    "simsun.ttc",         # 宋体
+]
+
+
+def _load_fonts(title_size: int = 16, index_size: int = 32):
+    """Return (title_font, index_font) from the first usable entry in _FONT_CANDIDATES; fall back to the default font."""
+    for path in _FONT_CANDIDATES:
+        try:
+            return (
+                ImageFont.truetype(path, title_size),
+                ImageFont.truetype(path, index_size),
+            )
+        except Exception:
+            continue
+    default = ImageFont.load_default()
+    return default, default
+
+
+# ── 加载图片 ──
+
+async def _load_image_from_url(client: httpx.AsyncClient, url: str) -> Optional[Image.Image]:
+    """Download a single image and convert it to RGB; return None on any failure."""
+    try:
+        resp = await client.get(url, timeout=15.0)
+        resp.raise_for_status()
+        return Image.open(io.BytesIO(resp.content)).convert("RGB")
+    except Exception:
+        return None
+
+
+def _load_image_from_path(path: str) -> Optional[Image.Image]:
+    """Load an image from a local path and convert it to RGB; return None on any failure."""
+    try:
+        return Image.open(path).convert("RGB")  # NOTE(review): the opened image is never closed explicitly; confirm that is acceptable
+    except Exception:
+        return None
+
+
+async def load_image(source: str, client: Optional[httpx.AsyncClient] = None) -> Optional[Image.Image]:
+    """
+    Load an image from either an HTTP(S) URL or a local file path.
+
+    Args:
+        source: HTTP(S) URL or local file path
+        client: optional httpx client to reuse for URL loads; a temporary one is created when omitted
+
+    Returns:
+        PIL Image in RGB mode, or None if loading failed
+    """
+    if source.startswith(("http://", "https://")):
+        if client is not None:
+            return await _load_image_from_url(client, source)
+        async with httpx.AsyncClient() as c:
+            return await _load_image_from_url(c, source)
+    else:
+        # Local path: decode in the default executor so the event loop is not blocked.
+        loop = asyncio.get_running_loop()  # preferred over get_event_loop() inside a coroutine
+        return await loop.run_in_executor(None, _load_image_from_path, source)
+
+
+async def load_images(sources: Sequence[str]) -> List[Tuple[str, Optional[Image.Image]]]:
+    """
+    Load a batch of images concurrently.
+
+    Returns:
+        [(source, image_or_none), ...] in the same order as sources; failed items carry None
+    """
+    async with httpx.AsyncClient() as client:  # one client shared by every load_image call
+        tasks = [load_image(src, client) for src in sources]
+        images = await asyncio.gather(*tasks)
+    return list(zip(sources, images))
+
+
+# ── 降采样 ──
+
+def downscale(image: Image.Image, max_dimension: int) -> Image.Image:
+    """
+    Shrink image proportionally so its longer edge is at most max_dimension.
+    An image that already fits is returned as-is; each output edge is clamped to at least 1px.
+    """
+    if max(image.width, image.height) <= max_dimension:
+        return image
+    scale = max_dimension / max(image.width, image.height)
+    new_size = (max(1, int(image.width * scale)), max(1, int(image.height * scale)))
+    return image.resize(new_size, Image.LANCZOS)
+
+
+# ── 网格拼图 ──
+
+def build_image_grid(
+    images: Sequence[Image.Image],
+    labels: Optional[Sequence[str]] = None,
+    columns: int = DEFAULT_GRID_COLS,
+    thumb_size: int = DEFAULT_THUMB_SIZE,
+    text_height: int = DEFAULT_TEXT_HEIGHT,
+    padding: int = DEFAULT_PADDING,
+    show_index: bool = True,
+) -> Image.Image:
+    """
+    Stitch several images into one grid with an index badge and an optional title per cell.
+
+    Each cell contains:
+      - a white-on-red index number (1, 2, 3...) at the thumbnail's top-left corner
+      - the thumbnail, scaled proportionally and centered in its slot
+      - an optional title underneath, wrapped by pixel width
+
+    Args:
+        images: PIL Images to stitch
+        labels: one title per image (same length as images); None hides titles
+        columns: cells per row (values below 1 are treated as 1)
+        thumb_size: edge length of each square thumbnail slot
+        text_height: height of the title area under each cell (forced to 0 when labels is None)
+        padding: gap between cells and around the canvas
+        show_index: whether to draw the index badge
+
+    Returns:
+        The stitched PIL Image; raises ValueError if images is empty or labels has a different length
+    """
+    if not images:
+        raise ValueError("images 不能为空")
+
+    if labels is None:
+        labels = [""] * len(images)
+        text_height = 0
+    elif len(labels) != len(images):
+        raise ValueError(f"labels 长度 {len(labels)} 与 images {len(images)} 不匹配")
+
+    count = len(images)
+    cols = max(1, min(columns, count))  # clamp so columns <= 0 cannot cause a division by zero below
+    rows = math.ceil(count / cols)
+
+    cell_w = thumb_size + padding
+    cell_h = thumb_size + text_height + padding
+    canvas_w = cols * cell_w + padding
+    canvas_h = rows * cell_h + padding
+
+    canvas = Image.new("RGB", (canvas_w, canvas_h), DEFAULT_BG_COLOR)
+    draw = ImageDraw.Draw(canvas)
+
+    # The index badge scales with thumb_size (about 20% of the slot, never below 40px).
+    index_box_size = max(40, thumb_size // 5)
+    index_font_size = int(index_box_size * 0.65)
+    # The title font scales loosely with thumb_size, with a 14px floor so small thumbnails stay readable.
+    title_font_size = max(14, thumb_size // 18)
+    font_title, font_index = _load_fonts(
+        title_size=title_font_size,
+        index_size=index_font_size,
+    )
+
+    for idx, (img, label) in enumerate(zip(images, labels), start=1):
+        col = (idx - 1) % cols
+        row = (idx - 1) // cols
+        x = padding + col * cell_w
+        y = padding + row * cell_h
+
+        # Scale proportionally and center; clamp to 1px so extreme aspect ratios cannot yield a zero-sized resize.
+        scale = min(thumb_size / img.width, thumb_size / img.height)
+        new_w = max(1, int(img.width * scale))
+        new_h = max(1, int(img.height * scale))
+        thumb = img.resize((new_w, new_h), Image.LANCZOS)
+        offset_x = x + (thumb_size - new_w) // 2
+        offset_y = y + (thumb_size - new_h) // 2
+        canvas.paste(thumb, (offset_x, offset_y))
+
+        # Index badge: anchored to the pasted thumbnail's corner, sized proportionally.
+        if show_index:
+            index_text = str(idx)
+            idx_x = offset_x
+            idx_y = offset_y
+            draw.rectangle(
+                [idx_x, idx_y, idx_x + index_box_size, idx_y + index_box_size],
+                fill=DEFAULT_INDEX_COLOR,
+            )
+            bbox = draw.textbbox((0, 0), index_text, font=font_index)
+            tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
+            # Subtract the bbox origin when centering: a font's bbox left/top may be non-zero.
+            text_x = idx_x + (index_box_size - tw) // 2 - bbox[0]
+            text_y = idx_y + (index_box_size - th) // 2 - bbox[1]
+            draw.text((text_x, text_y), index_text, fill=(255, 255, 255), font=font_index)
+
+        # Title below the thumbnail; line step follows the font size (22px at the default 14px font).
+        if label and text_height > 0:
+            lines = _wrap_text_by_pixel(label, font_title, thumb_size, draw)
+            for line_i, line in enumerate(lines):
+                draw.text(
+                    (x, y + thumb_size + 6 + line_i * (title_font_size + 8)),
+                    line,
+                    fill=DEFAULT_TEXT_COLOR,
+                    font=font_title,
+                )
+
+    return canvas
+
+
+def _wrap_text_by_pixel(text: str, font, max_width: int, draw: ImageDraw.ImageDraw) -> List[str]:
+    """Wrap text to max_width pixels, deciding per character so mixed Chinese/English text works."""
+    lines = []
+    current = ""
+    for ch in text:
+        test = current + ch
+        bbox = draw.textbbox((0, 0), test, font=font)
+        if bbox[2] - bbox[0] > max_width:
+            if current:
+                lines.append(current)
+            current = ch
+        else:
+            current = test
+    if current:
+        lines.append(current)
+    return lines
+
+
+# ── 编码为 base64 ──
+
+def encode_base64(image: Image.Image, format: str = "JPEG", quality: int = 75) -> Tuple[str, str]:
+    """
+    Encode a PIL Image as a base64 string.
+
+    Args:
+        image: PIL Image object
+        format: "JPEG" or "PNG". JPEG is smaller and recommended for multimodal LLM input
+        quality: JPEG quality (1-100); not passed to the encoder for other formats
+
+    Returns:
+        (base64_data, media_type) tuple, e.g. ("iVBOR...", "image/png")
+    """
+    buf = io.BytesIO()
+    save_kwargs = {"format": format}
+    if format.upper() == "JPEG":
+        # JPEG cannot store transparency, so alpha/palette modes are converted to RGB first
+        if image.mode in ("RGBA", "LA", "P"):
+            image = image.convert("RGB")
+        save_kwargs["quality"] = quality
+        save_kwargs["optimize"] = True
+    image.save(buf, **save_kwargs)
+
+    data = base64.b64encode(buf.getvalue()).decode("utf-8")
+    media_type = f"image/{format.lower()}"
+    if format.upper() == "JPEG":
+        media_type = "image/jpeg"  # same value the f-string already produced; kept explicit
+    return data, media_type