|
@@ -0,0 +1,589 @@
|
|
|
|
|
+#!/usr/bin/env python3
|
|
|
|
|
+"""
|
|
|
|
|
+语义相似度分析模块
|
|
|
|
|
+使用 AI Agent 判断两个短语之间的语义相似度
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+from agents import Agent, Runner, ModelSettings
|
|
|
|
|
+from lib.client import get_model
|
|
|
|
|
+from lib.utils import parse_json_from_text
|
|
|
|
|
+from typing import Dict, Any, Optional
|
|
|
|
|
+import hashlib
|
|
|
|
|
+import json
|
|
|
|
|
+import os
|
|
|
|
|
+from datetime import datetime
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Default prompt template: ask the model to score the semantic similarity
# of two phrases on a 0-1 scale and answer in JSON.
# Fix: removed the trailing comma after "相似度": 0.0 — it made the example
# invalid JSON and encouraged trailing-comma output that strict JSON
# parsers (json.loads) reject.
DEFAULT_PROMPT_TEMPLATE = """
从语意角度,判断【{phrase_a}】和【{phrase_b}】的相似度,从0-1打分,输出json格式
```json
{{
    "说明": "简明扼要说明理由",
    "相似度": 0.0
}}
```
""".strip()
|
|
|
|
|
+
|
|
|
|
|
# Default directory for on-disk result cache files.
DEFAULT_CACHE_DIR = "cache/semantic_similarity"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _generate_cache_key(
|
|
|
|
|
+ phrase_a: str,
|
|
|
|
|
+ phrase_b: str,
|
|
|
|
|
+ model_name: str,
|
|
|
|
|
+ temperature: float,
|
|
|
|
|
+ max_tokens: int,
|
|
|
|
|
+ prompt_template: str,
|
|
|
|
|
+ instructions: str = None,
|
|
|
|
|
+ tools: str = "[]"
|
|
|
|
|
+) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 生成缓存键(哈希值)
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ phrase_a: 第一个短语
|
|
|
|
|
+ phrase_b: 第二个短语
|
|
|
|
|
+ model_name: 模型名称
|
|
|
|
|
+ temperature: 温度参数
|
|
|
|
|
+ max_tokens: 最大token数
|
|
|
|
|
+ prompt_template: 提示词模板
|
|
|
|
|
+ instructions: Agent 系统指令
|
|
|
|
|
+ tools: 工具列表的 JSON 字符串
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 32位MD5哈希值
|
|
|
|
|
+ """
|
|
|
|
|
+ # 创建包含所有参数的字符串
|
|
|
|
|
+ cache_string = f"{phrase_a}||{phrase_b}||{model_name}||{temperature}||{max_tokens}||{prompt_template}||{instructions}||{tools}"
|
|
|
|
|
+
|
|
|
|
|
+ # 生成MD5哈希
|
|
|
|
|
+ return hashlib.md5(cache_string.encode('utf-8')).hexdigest()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _sanitize_for_filename(text: str, max_length: int = 30) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 将文本转换为安全的文件名部分
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ text: 原始文本
|
|
|
|
|
+ max_length: 最大长度
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 安全的文件名字符串
|
|
|
|
|
+ """
|
|
|
|
|
+ import re
|
|
|
|
|
+ # 移除特殊字符,只保留中文、英文、数字、下划线
|
|
|
|
|
+ sanitized = re.sub(r'[^\w\u4e00-\u9fff]', '_', text)
|
|
|
|
|
+ # 移除连续的下划线
|
|
|
|
|
+ sanitized = re.sub(r'_+', '_', sanitized)
|
|
|
|
|
+ # 截断到最大长度
|
|
|
|
|
+ if len(sanitized) > max_length:
|
|
|
|
|
+ sanitized = sanitized[:max_length]
|
|
|
|
|
+ return sanitized.strip('_')
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _get_cache_filepath(
    cache_key: str,
    phrase_a: str,
    phrase_b: str,
    model_name: str,
    temperature: float,
    cache_dir: str = DEFAULT_CACHE_DIR
) -> Path:
    """
    Build a human-readable cache file path.

    File name layout:
        {phrase_a}_vs_{phrase_b}_{model}_t{temp}_{hash[:8]}.json
    Phrases and the model name are sanitized for file-system use, and the
    model name keeps only the part after the last '/'.

    Args:
        cache_key: Cache key (hash digest).
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model name (e.g. 'openai/gpt-4.1-mini').
        temperature: Sampling temperature.
        cache_dir: Cache directory.

    Returns:
        Full path of the cache file.
    """
    part_a = _sanitize_for_filename(phrase_a, max_length=20)
    part_b = _sanitize_for_filename(phrase_b, max_length=20)
    # e.g. openai/gpt-4.1-mini -> gpt-4.1-mini, then sanitized.
    model_part = _sanitize_for_filename(model_name.split('/')[-1], max_length=20)
    file_name = f"{part_a}_vs_{part_b}_{model_part}_t{temperature:.1f}_{cache_key[:8]}.json"
    return Path(cache_dir) / file_name
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _load_from_cache(
    cache_key: str,
    phrase_a: str,
    phrase_b: str,
    model_name: str,
    temperature: float,
    cache_dir: str = DEFAULT_CACHE_DIR
) -> Optional[str]:
    """
    Look up a cached raw result.

    Tries the exact readable file name first; if that file is missing,
    falls back to any file in *cache_dir* whose name ends with the first
    8 characters of *cache_key*.

    Args:
        cache_key: Cache key.
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model name.
        temperature: Sampling temperature.
        cache_dir: Cache directory.

    Returns:
        The cached raw result string, or None when nothing usable exists.
    """
    cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)

    if not cache_file.exists():
        # Exact name miss: fall back to a hash-suffix match.
        cache_path = Path(cache_dir)
        if not cache_path.exists():
            return None
        candidates = list(cache_path.glob(f"*_{cache_key[:8]}.json"))
        if not candidates:
            return None
        cache_file = candidates[0]

    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            return json.load(f)['output']['raw']
    except (json.JSONDecodeError, IOError, KeyError):
        # Corrupt or unreadable cache entry counts as a miss.
        return None
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _save_to_cache(
    cache_key: str,
    phrase_a: str,
    phrase_b: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
    prompt_template: str,
    instructions: str,
    tools: str,
    result: str,
    cache_dir: str = DEFAULT_CACHE_DIR
) -> None:
    """
    Persist one similarity result to the on-disk cache.

    The JSON file records the full input parameters, the raw model
    response plus its parsed form, and bookkeeping metadata. Write
    failures are swallowed so caching never breaks the main flow.

    Args:
        cache_key: Cache key.
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model name.
        temperature: Sampling temperature.
        max_tokens: Maximum generated tokens.
        prompt_template: Prompt template text.
        instructions: Agent system instructions.
        tools: Tool list as a JSON string.
        result: Raw result string to cache.
        cache_dir: Cache directory.
    """
    cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
    # Make sure the cache directory exists before writing.
    cache_file.parent.mkdir(parents=True, exist_ok=True)

    payload = {
        "input": {
            "phrase_a": phrase_a,
            "phrase_b": phrase_b,
            "model_name": model_name,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "prompt_template": prompt_template,
            "instructions": instructions,
            "tools": tools
        },
        "output": {
            # Keep the raw response alongside its parsed JSON form.
            "raw": result,
            "parsed": parse_json_from_text(result)
        },
        "metadata": {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "cache_key": cache_key,
            "cache_file": str(cache_file.name)
        }
    }

    try:
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
    except IOError:
        # Silent failure by design: caching is best-effort.
        pass
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
async def _difference_between_phrases(
    phrase_a: str,
    phrase_b: str,
    model_name: str = 'openai/gpt-4.1-mini',
    temperature: float = 0.0,
    max_tokens: int = 65536,
    prompt_template: Optional[str] = None,
    instructions: Optional[str] = None,
    tools: Optional[list] = None,
    name: str = "Semantic Similarity Analyzer",
    use_cache: bool = True,
    cache_dir: str = DEFAULT_CACHE_DIR
) -> str:
    """
    Judge the semantic similarity of two phrases; return the raw model output.

    Checks the on-disk cache first (keyed on every behavior-affecting
    parameter except `name`); on a miss, runs an Agent with the formatted
    prompt and caches the raw response.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model to use, e.g.:
            - 'google/gemini-2.5-pro'
            - 'anthropic/claude-sonnet-4.5'
            - 'google/gemini-2.0-flash-001'
            - 'openai/gpt-5-mini'
            - 'anthropic/claude-haiku-4.5'
            - 'openai/gpt-4.1-mini' (default)
        temperature: Sampling temperature; 0.0 (default) for deterministic output.
        max_tokens: Maximum generated tokens, default 65536.
        prompt_template: Custom prompt template using {phrase_a} and {phrase_b}
            placeholders; None selects the default template.
        instructions: Agent system instructions, default None.
        tools: Tools available to the agent, default [].
        name: Agent name, default "Semantic Similarity Analyzer"
            (deliberately NOT part of the cache key).
        use_cache: Whether to read/write the cache, default True.
        cache_dir: Cache directory, default 'cache/semantic_similarity'.

    Returns:
        The similarity analysis as a JSON-formatted string (raw model output).
    """
    # Use the caller's template, or fall back to the default one.
    if prompt_template is None:
        prompt_template = DEFAULT_PROMPT_TEMPLATE

    # Tools default to an empty list.
    if tools is None:
        tools = []

    # Build the cache key (tools serialized to a JSON string so they hash).
    tools_str = json.dumps(tools, sort_keys=True) if tools else "[]"
    cache_key = _generate_cache_key(
        phrase_a, phrase_b, model_name, temperature, max_tokens, prompt_template, instructions, tools_str
    )

    # Try the cache first.
    if use_cache:
        cached_result = _load_from_cache(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
        if cached_result is not None:
            return cached_result

    # Cache miss: call the model through an agent.
    agent = Agent(
        name=name,
        model=get_model(model_name),
        model_settings=ModelSettings(
            temperature=temperature,
            max_tokens=max_tokens,
        ),
        instructions=instructions,
        tools=tools,
    )

    # Fill in the prompt template.
    prompt = prompt_template.format(phrase_a=phrase_a, phrase_b=phrase_b)

    result = await Runner.run(agent, input=prompt)
    final_output = result.final_output

    # Store the fresh result for next time.
    if use_cache:
        _save_to_cache(
            cache_key, phrase_a, phrase_b, model_name,
            temperature, max_tokens, prompt_template,
            instructions, tools_str, final_output, cache_dir
        )

    return final_output
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
async def _difference_between_phrases_parsed(
    phrase_a: str,
    phrase_b: str,
    model_name: str = 'openai/gpt-4.1-mini',
    temperature: float = 0.0,
    max_tokens: int = 65536,
    prompt_template: str = None,
    instructions: str = None,
    tools: list = None,
    name: str = "Semantic Similarity Analyzer",
    use_cache: bool = True,
    cache_dir: str = DEFAULT_CACHE_DIR
) -> Dict[str, Any]:
    """
    Judge two phrases' semantic similarity and parse the result into a dict.

    Delegates to _difference_between_phrases() for the raw model output
    (parameters are forwarded unchanged; see that function for their
    meaning), then extracts the JSON payload.

    Returns:
        Parsed dict, normally containing:
        - 说明: reasoning behind the score
        - 相似度: float between 0 and 1
        On parse failure, an error dict with 相似度 0.0 and the raw
        response under "raw_response".
    """
    raw_result = await _difference_between_phrases(
        phrase_a, phrase_b, model_name, temperature, max_tokens,
        prompt_template, instructions, tools, name, use_cache, cache_dir
    )

    parsed_result = parse_json_from_text(raw_result)
    if parsed_result:
        return parsed_result

    # Could not extract valid JSON: surface an error-shaped dict that
    # still carries the raw model output for debugging.
    return {
        "说明": "解析失败: 无法从响应中提取有效的 JSON",
        "相似度": 0.0,
        "raw_response": raw_result
    }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# ========== V1 version (default) ==========

# Public API - V1
|
|
|
|
|
async def compare_phrases(
    phrase_a: str,
    phrase_b: str,
    model_name: str = 'openai/gpt-4.1-mini',
    temperature: float = 0.0,
    max_tokens: int = 65536,
    prompt_template: str = None,
    instructions: str = None,
    tools: list = None,
    name: str = "Semantic Similarity Analyzer",
    use_cache: bool = True,
    cache_dir: str = DEFAULT_CACHE_DIR
) -> Dict[str, Any]:
    """
    Compare the semantic similarity of two phrases (the public entry point).

    Thin wrapper around _difference_between_phrases_parsed().

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model to use.
        temperature: Sampling temperature, default 0.0 (deterministic).
        max_tokens: Maximum generated tokens, default 65536.
        prompt_template: Custom template with {phrase_a}/{phrase_b} placeholders.
        instructions: Agent system instructions, default None.
        tools: Agent tool list, default [].
        name: Agent name, default "Semantic Similarity Analyzer".
        use_cache: Whether to use the cache, default True.
        cache_dir: Cache directory, default 'cache/semantic_similarity'.

    Returns:
        Parsed result dict.
    """
    # Forward by keyword so the mapping to the worker stays unambiguous.
    return await _difference_between_phrases_parsed(
        phrase_a=phrase_a,
        phrase_b=phrase_b,
        model_name=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
        prompt_template=prompt_template,
        instructions=instructions,
        tools=tools,
        name=name,
        use_cache=use_cache,
        cache_dir=cache_dir,
    )
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+if __name__ == "__main__":
|
|
|
|
|
+ import asyncio
|
|
|
|
|
+
|
|
|
|
|
+ async def main():
|
|
|
|
|
+ """示例使用"""
|
|
|
|
|
+ # 示例 1: 基本使用(使用缓存)
|
|
|
|
|
+ print("示例 1: 基本使用")
|
|
|
|
|
+ result = await compare_phrases("宿命感", "余华的小说")
|
|
|
|
|
+ print(f"相似度: {result.get('相似度')}")
|
|
|
|
|
+ print(f"说明: {result.get('说明')}")
|
|
|
|
|
+ print()
|
|
|
|
|
+
|
|
|
|
|
+ # 示例 2: 再次调用相同参数(应该从缓存读取)
|
|
|
|
|
+ print("示例 2: 测试缓存")
|
|
|
|
|
+ result = await compare_phrases("宿命感", "余华的小说")
|
|
|
|
|
+ print(f"相似度: {result.get('相似度')}")
|
|
|
|
|
+ print()
|
|
|
|
|
+
|
|
|
|
|
+ # 示例 3: 自定义温度
|
|
|
|
|
+ print("示例 3: 自定义温度(创意性输出)")
|
|
|
|
|
+ result = await compare_phrases(
|
|
|
|
|
+ "创意写作", "AI生成",
|
|
|
|
|
+ temperature=0.7
|
|
|
|
|
+ )
|
|
|
|
|
+ print(f"相似度: {result.get('相似度')}")
|
|
|
|
|
+ print(f"说明: {result.get('说明')}")
|
|
|
|
|
+ print()
|
|
|
|
|
+
|
|
|
|
|
+ # 示例 4: 自定义 Agent 名称
|
|
|
|
|
+ print("示例 4: 自定义 Agent 名称")
|
|
|
|
|
+ result = await compare_phrases(
|
|
|
|
|
+ "人工智能", "机器学习",
|
|
|
|
|
+ name="AI语义分析专家"
|
|
|
|
|
+ )
|
|
|
|
|
+ print(f"相似度: {result.get('相似度')}")
|
|
|
|
|
+ print(f"说明: {result.get('说明')}")
|
|
|
|
|
+ print()
|
|
|
|
|
+
|
|
|
|
|
+ # 示例 5: 使用不同的模型
|
|
|
|
|
+ print("示例 5: 使用 Claude 模型")
|
|
|
|
|
+ result = await compare_phrases(
|
|
|
|
|
+ "深度学习", "神经网络",
|
|
|
|
|
+ model_name='anthropic/claude-haiku-4.5'
|
|
|
|
|
+ )
|
|
|
|
|
+ print(f"相似度: {result.get('相似度')}")
|
|
|
|
|
+ print(f"说明: {result.get('说明')}")
|
|
|
|
|
+
|
|
|
|
|
+ asyncio.run(main())
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# ========== V2 version (example: detailed-analysis variant) ==========

# V2 default prompt template (asks for a more detailed analysis).
DEFAULT_PROMPT_TEMPLATE_V2 = """
请深入分析【{phrase_a}】和【{phrase_b}】的语义关系,包括:
1. 语义相似度(0-1)
2. 关系类型(如:包含、相关、对立、无关等)
3. 详细说明

输出格式:
```json
{{
    "相似度": 0.0,
    "关系类型": "相关/包含/对立/无关",
    "详细说明": "详细分析两者的语义关系...",
    "应用场景": "该关系在实际应用中的意义..."
}}
```
""".strip()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Public API - V2
async def compare_phrases_v2(
    phrase_a: str,
    phrase_b: str,
    model_name: str = 'anthropic/claude-sonnet-4.5',  # V2 defaults to a stronger model
    temperature: float = 0.0,
    max_tokens: int = 65536,
    prompt_template: str = None,
    instructions: str = None,
    tools: list = None,
    name: str = "Advanced Semantic Analyzer",
    use_cache: bool = True,
    cache_dir: str = DEFAULT_CACHE_DIR
) -> Dict[str, Any]:
    """
    Compare two phrases' semantic similarity - V2 (detailed analysis).

    V2 differences from compare_phrases():
    - Defaults to a stronger model (Claude Sonnet 4.5).
    - Richer output: relation type and application scenarios in addition
      to the similarity score.
    - Suited to scenarios that need an in-depth analysis.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model to use, default 'anthropic/claude-sonnet-4.5'.
        temperature: Sampling temperature, default 0.0.
        max_tokens: Maximum generated tokens, default 65536.
        prompt_template: Custom template; None selects the V2 detailed template.
        instructions: Agent system instructions, default None.
        tools: Agent tool list, default [].
        name: Agent name, default "Advanced Semantic Analyzer".
        use_cache: Whether to use the cache, default True.
        cache_dir: Cache directory, default 'cache/semantic_similarity'.

    Returns:
        Parsed dict expected to contain:
        - 相似度: float between 0 and 1
        - 关系类型: relation category
        - 详细说明: detailed analysis
        - 应用场景: practical implications
    """
    # Fall back to the V2 detailed template when none is supplied.
    template = DEFAULT_PROMPT_TEMPLATE_V2 if prompt_template is None else prompt_template

    return await _difference_between_phrases_parsed(
        phrase_a, phrase_b, model_name, temperature, max_tokens,
        template, instructions, tools, name, use_cache, cache_dir
    )
|