vor 1 Woche · fa506b4586
--- a/lib/README_semantic_similarity.md
+++ b/lib/README_semantic_similarity.md
@@ -0,0 +1,293 @@
 
				+# 语义相似度分析模块
			
 
				+
			
 
				+## 功能概述
			
 
				+
			
 
				+提供基于 AI Agent 的语义相似度分析功能，支持缓存机制以提高性能和降低 API 调用成本。
			
 
				+
			
 
				+## 主要功能
			
 
				+
			
 
				+### 1. 核心函数
			
 
				+
			
 
				+- `difference_between_phrases()` - 返回原始 AI 响应
			
 
				+- `difference_between_phrases_parsed()` - 返回解析后的 JSON 字典
			
 
				+- `compare_phrases()` - `difference_between_phrases_parsed()` 的别名
			
 
				+
			
 
				+### 2. 缓存系统设计
			
 
				+
			
 
				+#### 缓存文件名设计
			
 
				+
			
 
				+**方案：可读文件名 + 哈希后缀**
			
 
				+
			
 
				+```
			
 
				+cache/semantic_similarity/
			
 
				+├── 宿命感_vs_余华的小说_gpt-4.1-mini_t0.0_a7f3e2d9.json
			
 
				+├── 人工智能_vs_机器学习_claude-sonnet-4.5_t0.0_b8e4f3e0.json
			
 
				+├── 深度学习_vs_神经网络_gemini-2.5-pro_t0.2_c9f5a4b1.json
			
 
				+└── 创意写作_vs_AI生成_gpt-4.1-mini_t0.7_d0a6b5c2.json
			
 
				+```
			
 
				+
			
 
				+**文件名格式：**
			
 
				+```
			
 
				+{phrase_a}_vs_{phrase_b}_{model}_t{temp}_{hash[:8]}.json
			
 
				+```
			
 
				+
			
 
				+- `phrase_a`: 第一个短语（最长20字符，特殊字符转为下划线）
			
 
				+- `phrase_b`: 第二个短语（最长20字符，特殊字符转为下划线）
			
 
				+- `model`: 模型简称（提取 `/` 后部分，最长20字符）
			
 
				+- `t{temp}`: 温度参数（格式化为1位小数，如 t0.0, t0.2, t0.7）
			
 
				+- `hash[:8]`: 完整哈希的前8位
			
 
				+
			
 
				+**哈希生成逻辑：**
			
 
				+- 基于所有影响结果的参数生成唯一 MD5 哈希：
			
 
				+  - `phrase_a` - 第一个短语
			
 
				+  - `phrase_b` - 第二个短语
			
 
				+  - `model_name` - 模型名称
			
 
				+  - `temperature` - 温度参数
			
 
				+  - `max_tokens` - 最大 token 数
			
 
				+  - `prompt_template` - 提示词模板
				+  - `instructions` - Agent 系统指令
				+  - `tools` - 工具列表（序列化为 JSON 字符串）
			
 
				+
			
 
				+**缓存文件格式（结构化 JSON）：**
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "input": {
			
 
				+    "phrase_a": "宿命感",
			
 
				+    "phrase_b": "余华的小说",
			
 
				+    "model_name": "openai/gpt-4.1-mini",
			
 
				+    "temperature": 0.0,
			
 
				+    "max_tokens": 65536,
			
 
				+    "prompt_template": "从语意角度,判断【{phrase_a}】和【{phrase_b}】..."
			
 
				+  },
			
 
				+  "output": {
			
 
				+    "raw": "{\n  \"说明\": \"...\",\n  \"相似度\": 0.75\n}",
				+    "parsed": {"说明": "...", "相似度": 0.75}
			
 
				+  },
			
 
				+  "metadata": {
			
 
				+    "timestamp": "2025-11-19 14:30:45",
			
 
				+    "cache_key": "a7f3e2d9c1b4a5f8e6d7c9b2a1f3e5d7",
			
 
				+    "cache_file": "宿命感_vs_余华的小说_gpt-4.1-mini_t0.0_a7f3e2d9.json"
			
 
				+  }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+#### 缓存特性
			
 
				+
			
 
				+1. **自动缓存**：默认启用，首次调用保存结果
			
 
				+2. **智能匹配**：相同参数自动从缓存读取
			
 
				+3. **可控性**：支持 `use_cache=False` 强制重新请求
			
 
				+4. **可追溯**：缓存文件包含完整元数据和时间戳
			
 
				+5. **自定义目录**：支持通过 `cache_dir` 参数自定义缓存位置
			
 
				+
			
 
				+## 使用示例
			
 
				+
			
 
				+### 基本使用（自动缓存）
			
 
				+
			
 
				+```python
			
 
				+from lib.semantic_similarity import compare_phrases
			
 
				+
			
 
				+# 第一次调用 - 请求 API 并缓存
			
 
				+result = await compare_phrases("宿命感", "余华的小说")
			
 
				+# 输出: ✓ 已缓存: 宿命感_vs_余华的小说_gpt-4.1-mini_t0.0_a7f3e2d9.json
			
 
				+
			
 
				+# 第二次调用相同参数 - 从缓存读取
			
 
				+result = await compare_phrases("宿命感", "余华的小说")
			
 
				+# 输出: ✓ 使用缓存: 宿命感_vs_余华的小说_gpt-4.1-mini_t0.0_a7f3e2d9.json
			
 
				+
			
 
				+print(result['相似度'])  # 0.3
			
 
				+print(result['说明'])    # "两个概念..."
			
 
				+```
			
 
				+
			
 
				+### 禁用缓存
			
 
				+
			
 
				+```python
			
 
				+# 强制重新请求 API
			
 
				+result = await compare_phrases(
			
 
				+    "人工智能",
			
 
				+    "机器学习",
			
 
				+    use_cache=False
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### 自定义缓存目录
			
 
				+
			
 
				+```python
			
 
				+# 使用自定义缓存目录
			
 
				+result = await compare_phrases(
			
 
				+    "深度学习",
			
 
				+    "神经网络",
			
 
				+    cache_dir="my_cache/similarity"
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### 自定义提示词模板
			
 
				+
			
 
				+```python
			
 
				+custom_template = """
			
 
				+请详细分析【{phrase_a}】和【{phrase_b}】的语义关系
			
 
				+输出格式：
			
 
				+```json
			
 
				+{{
			
 
				+  "说明": "详细分析",
			
 
				+  "相似度": 0.5,
			
 
				+  "关系类型": "相关/包含/对立/无关"
			
 
				+}}
			
 
				+```
			
 
				+"""
			
 
				+
			
 
				+result = await compare_phrases(
			
 
				+    "机器学习",
			
 
				+    "深度学习",
			
 
				+    prompt_template=custom_template
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### 配置不同模型
			
 
				+
			
 
				+```python
			
 
				+# 使用 Claude 模型
			
 
				+result = await compare_phrases(
			
 
				+    "人工智能",
			
 
				+    "深度学习",
			
 
				+    model_name='anthropic/claude-sonnet-4.5',
			
 
				+    temperature=0.2
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+## 缓存管理
			
 
				+
			
 
				+### 查看缓存
			
 
				+
			
 
				+```bash
			
 
				+# 查看缓存目录
			
 
				+ls cache/semantic_similarity/
			
 
				+
			
 
				+# 查看特定缓存文件
			
 
				+cat cache/semantic_similarity/*_a7f3e2d9.json
			
 
				+```
			
 
				+
			
 
				+### 清理缓存
			
 
				+
			
 
				+```bash
			
 
				+# 清理所有缓存
			
 
				+rm -rf cache/semantic_similarity/
			
 
				+
			
 
				+# 清理特定缓存文件
			
 
				+rm cache/semantic_similarity/*_a7f3e2d9.json
			
 
				+```
			
 
				+
			
 
				+### 缓存统计
			
 
				+
			
 
				+```python
			
 
				+from pathlib import Path
			
 
				+import json
			
 
				+
			
 
				+cache_dir = Path("cache/semantic_similarity")
			
 
				+cache_files = list(cache_dir.glob("*.json"))
			
 
				+
			
 
				+print(f"缓存文件总数: {len(cache_files)}")
			
 
				+
			
 
				+# 统计各模型使用情况
			
 
				+model_stats = {}
			
 
				+for file in cache_files:
			
 
				+    with open(file, 'r') as f:
			
 
				+        data = json.load(f)
			
 
				+        model = data.get('input', {}).get('model_name', 'unknown')
			
 
				+        model_stats[model] = model_stats.get(model, 0) + 1
			
 
				+
			
 
				+print("各模型缓存数量:")
			
 
				+for model, count in model_stats.items():
			
 
				+    print(f"  {model}: {count}")
			
 
				+```
			
 
				+
			
 
				+## 参数说明
			
 
				+
			
 
				+### 所有函数共享参数
			
 
				+
			
 
				+| 参数 | 类型 | 默认值 | 说明 |
			
 
				+|------|------|--------|------|
			
 
				+| `phrase_a` | str | 必填 | 第一个短语 |
			
 
				+| `phrase_b` | str | 必填 | 第二个短语 |
			
 
				+| `model_name` | str | `'openai/gpt-4.1-mini'` | 使用的 AI 模型 |
			
 
				+| `temperature` | float | `0.0` | 温度参数（0.0-1.0） |
			
 
				+| `max_tokens` | int | `65536` | 最大生成 token 数 |
			
 
				+| `prompt_template` | str | `None` | 自定义提示词模板 |
			
 
				+| `use_cache` | bool | `True` | 是否启用缓存 |
			
 
				+| `cache_dir` | str | `'cache/semantic_similarity'` | 缓存目录路径 |
			
 
				+
			
 
				+### 支持的模型
			
 
				+
			
 
				+- `'google/gemini-2.5-pro'`
			
 
				+- `'anthropic/claude-sonnet-4.5'`
			
 
				+- `'google/gemini-2.0-flash-001'`
			
 
				+- `'openai/gpt-5-mini'`
			
 
				+- `'anthropic/claude-haiku-4.5'`
			
 
				+- `'openai/gpt-4.1-mini'` (默认)
			
 
				+
			
 
				+## 性能优化
			
 
				+
			
 
				+### 缓存命中率优化
			
 
				+
			
 
				+1. **参数标准化**：确保相同语义使用相同参数
			
 
				+2. **批量处理**：对相同短语对只调用一次
			
 
				+3. **预热缓存**：提前为常用短语对生成缓存
			
 
				+
			
 
				+### 示例：批量处理
			
 
				+
			
 
				+```python
			
 
				+phrase_pairs = [
			
 
				+    ("宿命感", "余华的小说"),
			
 
				+    ("人工智能", "机器学习"),
			
 
				+    ("深度学习", "神经网络"),
			
 
				+]
			
 
				+
			
 
				+for phrase_a, phrase_b in phrase_pairs:
			
 
				+    result = await compare_phrases(phrase_a, phrase_b)
			
 
				+    print(f"{phrase_a} vs {phrase_b}: {result['相似度']}")
			
 
				+```
			
 
				+
			
 
				+## 注意事项
			
 
				+
			
 
				+1. **参数敏感性**：任何参数变化都会导致新的缓存键
			
 
				+2. **存储空间**：长期使用可能积累大量缓存文件
			
 
				+3. **缓存一致性**：模型更新后建议清理旧缓存
			
 
				+4. **并发安全**：当前实现不支持并发写入同一缓存文件
			
 
				+
			
 
				+## 故障排查
			
 
				+
			
 
				+### 缓存未命中
			
 
				+
			
 
				+**问题**：相同参数调用但未使用缓存
			
 
				+
			
 
				+**可能原因**：
			
 
				+- 参数细微差异（如空格、换行）
			
 
				+- `prompt_template` 不一致
			
 
				+- 缓存文件损坏或被删除
			
 
				+
			
 
				+**解决方案**：
			
 
				+```python
			
 
				+# 检查缓存键
			
 
				+from lib.semantic_similarity import _generate_cache_key, DEFAULT_PROMPT_TEMPLATE
			
 
				+
			
 
				+key = _generate_cache_key(
			
 
				+    "宿命感", "余华的小说",
			
 
				+    "openai/gpt-4.1-mini", 0.0, 65536,
			
 
				+    DEFAULT_PROMPT_TEMPLATE
			
 
				+)
			
 
				+print(f"缓存键: {key}")
			
 
				+```
			
 
				+
			
 
				+### 缓存损坏
			
 
				+
			
 
				+**问题**：缓存文件存在但无法加载
			
 
				+
			
 
				+**解决方案**：
			
 
				+```bash
			
 
				+# 删除损坏的缓存文件
			
 
				+rm cache/semantic_similarity/*_{cache_key前8位}.json
			
 
				+```
			
 
				+
			
 
				+## 版本历史
			
 
				+
			
 
				+- **v1.0** - 初始版本，支持基本语义相似度分析
			
 
				+- **v1.1** - 添加缓存系统
			
 
				+- **v1.2** - 支持自定义提示词模板
			
 
				+- **v1.3** - 优化缓存文件格式，添加元数据
			
--- a/lib/semantic_similarity.py
+++ b/lib/semantic_similarity.py
@@ -0,0 +1,589 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+语义相似度分析模块
			
 
				+使用 AI Agent 判断两个短语之间的语义相似度
			
 
				+"""
			
 
				+
			
 
				+from agents import Agent, Runner, ModelSettings
			
 
				+from lib.client import get_model
			
 
				+from lib.utils import parse_json_from_text
			
 
				+from typing import Dict, Any, Optional
			
 
				+import hashlib
			
 
				+import json
			
 
				+import os
			
 
				+from datetime import datetime
			
 
				+from pathlib import Path
			
 
				+
			
 
				+
			
 
# Default prompt template. `{phrase_a}` / `{phrase_b}` are filled in with
# str.format(), so the doubled braces render as literal `{` / `}`.
# NOTE(review): the JSON example below ends with a trailing comma after the
# "相似度" entry, which is not strict JSON -- confirm this is intended. The
# template text is hashed into the cache key, so editing it invalidates
# every existing cache entry.
DEFAULT_PROMPT_TEMPLATE = """
从语意角度,判断【{phrase_a}】和【{phrase_b}】的相似度,从0-1打分，输出json格式
```json
{{
  "说明": "简明扼要说明理由",
  "相似度": 0.0,
}}
```
""".strip()


# Default cache directory (relative path).
DEFAULT_CACHE_DIR = "cache/semantic_similarity"
			
 
				+
			
 
				+
			
 
				+def _generate_cache_key(
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    model_name: str,
			
 
				+    temperature: float,
			
 
				+    max_tokens: int,
			
 
				+    prompt_template: str,
			
 
				+    instructions: str = None,
			
 
				+    tools: str = "[]"
			
 
				+) -> str:
			
 
				+    """
			
 
				+    生成缓存键（哈希值）
			
 
				+
			
 
				+    Args:
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        model_name: 模型名称
			
 
				+        temperature: 温度参数
			
 
				+        max_tokens: 最大token数
			
 
				+        prompt_template: 提示词模板
			
 
				+        instructions: Agent 系统指令
			
 
				+        tools: 工具列表的 JSON 字符串
			
 
				+
			
 
				+    Returns:
			
 
				+        32位MD5哈希值
			
 
				+    """
			
 
				+    # 创建包含所有参数的字符串
			
 
				+    cache_string = f"{phrase_a}||{phrase_b}||{model_name}||{temperature}||{max_tokens}||{prompt_template}||{instructions}||{tools}"
			
 
				+
			
 
				+    # 生成MD5哈希
			
 
				+    return hashlib.md5(cache_string.encode('utf-8')).hexdigest()
			
 
				+
			
 
				+
			
 
				+def _sanitize_for_filename(text: str, max_length: int = 30) -> str:
			
 
				+    """
			
 
				+    将文本转换为安全的文件名部分
			
 
				+
			
 
				+    Args:
			
 
				+        text: 原始文本
			
 
				+        max_length: 最大长度
			
 
				+
			
 
				+    Returns:
			
 
				+        安全的文件名字符串
			
 
				+    """
			
 
				+    import re
			
 
				+    # 移除特殊字符，只保留中文、英文、数字、下划线
			
 
				+    sanitized = re.sub(r'[^\w\u4e00-\u9fff]', '_', text)
			
 
				+    # 移除连续的下划线
			
 
				+    sanitized = re.sub(r'_+', '_', sanitized)
			
 
				+    # 截断到最大长度
			
 
				+    if len(sanitized) > max_length:
			
 
				+        sanitized = sanitized[:max_length]
			
 
				+    return sanitized.strip('_')
			
 
				+
			
 
				+
			
 
				+def _get_cache_filepath(
			
 
				+    cache_key: str,
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    model_name: str,
			
 
				+    temperature: float,
			
 
				+    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+) -> Path:
			
 
				+    """
			
 
				+    获取缓存文件路径（可读文件名）
			
 
				+
			
 
				+    Args:
			
 
				+        cache_key: 缓存键（哈希值）
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        model_name: 模型名称
			
 
				+        temperature: 温度参数
			
 
				+        cache_dir: 缓存目录
			
 
				+
			
 
				+    Returns:
			
 
				+        缓存文件的完整路径
			
 
				+
			
 
				+    文件名格式: {phrase_a}_vs_{phrase_b}_{model}_t{temp}_{hash[:8]}.json
			
 
				+    示例: 宿命感_vs_余华的小说_gpt-4.1-mini_t0.0_a7f3e2d9.json
			
 
				+    """
			
 
				+    # 清理短语和模型名
			
 
				+    clean_a = _sanitize_for_filename(phrase_a, max_length=20)
			
 
				+    clean_b = _sanitize_for_filename(phrase_b, max_length=20)
			
 
				+
			
 
				+    # 简化模型名（提取关键部分）
			
 
				+    model_short = model_name.split('/')[-1]  # 例如: openai/gpt-4.1-mini -> gpt-4.1-mini
			
 
				+    model_short = _sanitize_for_filename(model_short, max_length=20)
			
 
				+
			
 
				+    # 格式化温度参数
			
 
				+    temp_str = f"t{temperature:.1f}"
			
 
				+
			
 
				+    # 使用哈希的前8位
			
 
				+    hash_short = cache_key[:8]
			
 
				+
			
 
				+    # 组合文件名
			
 
				+    filename = f"{clean_a}_vs_{clean_b}_{model_short}_{temp_str}_{hash_short}.json"
			
 
				+
			
 
				+    return Path(cache_dir) / filename
			
 
				+
			
 
				+
			
 
				+def _load_from_cache(
			
 
				+    cache_key: str,
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    model_name: str,
			
 
				+    temperature: float,
			
 
				+    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+) -> Optional[str]:
			
 
				+    """
			
 
				+    从缓存加载数据
			
 
				+
			
 
				+    Args:
			
 
				+        cache_key: 缓存键
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        model_name: 模型名称
			
 
				+        temperature: 温度参数
			
 
				+        cache_dir: 缓存目录
			
 
				+
			
 
				+    Returns:
			
 
				+        缓存的结果字符串，如果不存在则返回 None
			
 
				+    """
			
 
				+    cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
			
 
				+
			
 
				+    # 如果文件不存在，尝试通过哈希匹配查找
			
 
				+    if not cache_file.exists():
			
 
				+        # 查找所有以该哈希结尾的文件
			
 
				+        cache_path = Path(cache_dir)
			
 
				+        if cache_path.exists():
			
 
				+            hash_short = cache_key[:8]
			
 
				+            matching_files = list(cache_path.glob(f"*_{hash_short}.json"))
			
 
				+            if matching_files:
			
 
				+                cache_file = matching_files[0]
			
 
				+            else:
			
 
				+                return None
			
 
				+        else:
			
 
				+            return None
			
 
				+
			
 
				+    try:
			
 
				+        with open(cache_file, 'r', encoding='utf-8') as f:
			
 
				+            cached_data = json.load(f)
			
 
				+            return cached_data['output']['raw']
			
 
				+    except (json.JSONDecodeError, IOError, KeyError):
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def _save_to_cache(
			
 
				+    cache_key: str,
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    model_name: str,
			
 
				+    temperature: float,
			
 
				+    max_tokens: int,
			
 
				+    prompt_template: str,
			
 
				+    instructions: str,
			
 
				+    tools: str,
			
 
				+    result: str,
			
 
				+    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+) -> None:
			
 
				+    """
			
 
				+    保存数据到缓存
			
 
				+
			
 
				+    Args:
			
 
				+        cache_key: 缓存键
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        model_name: 模型名称
			
 
				+        temperature: 温度参数
			
 
				+        max_tokens: 最大token数
			
 
				+        prompt_template: 提示词模板
			
 
				+        instructions: Agent 系统指令
			
 
				+        tools: 工具列表的 JSON 字符串
			
 
				+        result: 结果数据（原始字符串）
			
 
				+        cache_dir: 缓存目录
			
 
				+    """
			
 
				+    cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
			
 
				+
			
 
				+    # 确保缓存目录存在
			
 
				+    cache_file.parent.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    # 尝试解析 result 为 JSON
			
 
				+    parsed_result = parse_json_from_text(result)
			
 
				+
			
 
				+    # 准备缓存数据（包含完整的输入输出信息）
			
 
				+    cache_data = {
			
 
				+        "input": {
			
 
				+            "phrase_a": phrase_a,
			
 
				+            "phrase_b": phrase_b,
			
 
				+            "model_name": model_name,
			
 
				+            "temperature": temperature,
			
 
				+            "max_tokens": max_tokens,
			
 
				+            "prompt_template": prompt_template,
			
 
				+            "instructions": instructions,
			
 
				+            "tools": tools
			
 
				+        },
			
 
				+        "output": {
			
 
				+            "raw": result,              # 保留原始响应
			
 
				+            "parsed": parsed_result     # 解析后的JSON对象
			
 
				+        },
			
 
				+        "metadata": {
			
 
				+            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
			
 
				+            "cache_key": cache_key,
			
 
				+            "cache_file": str(cache_file.name)
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    try:
			
 
				+        with open(cache_file, 'w', encoding='utf-8') as f:
			
 
				+            json.dump(cache_data, f, ensure_ascii=False, indent=2)
			
 
				+    except IOError:
			
 
				+        pass  # 静默失败，不影响主流程
			
 
				+
			
 
				+
			
 
				+async def _difference_between_phrases(
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    model_name: str = 'openai/gpt-4.1-mini',
			
 
				+    temperature: float = 0.0,
			
 
				+    max_tokens: int = 65536,
			
 
				+    prompt_template: str = None,
			
 
				+    instructions: str = None,
			
 
				+    tools: list = None,
			
 
				+    name: str = "Semantic Similarity Analyzer",
			
 
				+    use_cache: bool = True,
			
 
				+    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+) -> str:
			
 
				+    """
			
 
				+    从语义角度判断两个短语的相似度
			
 
				+
			
 
				+    Args:
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        model_name: 使用的模型名称，可选值：
			
 
				+            - 'google/gemini-2.5-pro'
			
 
				+            - 'anthropic/claude-sonnet-4.5'
			
 
				+            - 'google/gemini-2.0-flash-001'
			
 
				+            - 'openai/gpt-5-mini'
			
 
				+            - 'anthropic/claude-haiku-4.5'
			
 
				+            - 'openai/gpt-4.1-mini' (默认)
			
 
				+        temperature: 模型温度参数，控制输出随机性，默认 0.0（确定性输出）
			
 
				+        max_tokens: 最大生成token数，默认 65536
			
 
				+        prompt_template: 自定义提示词模板，使用 {phrase_a} 和 {phrase_b} 作为占位符
			
 
				+                        如果为 None，使用默认模板
			
 
				+        instructions: Agent 的系统指令，默认为 None
			
 
				+        tools: Agent 可用的工具列表，默认为 []
			
 
				+        name: Agent 的名称，默认为 "Semantic Similarity Analyzer"（不参与缓存key构建）
			
 
				+        use_cache: 是否使用缓存，默认 True
			
 
				+        cache_dir: 缓存目录，默认 'cache/semantic_similarity'
			
 
				+
			
 
				+    Returns:
			
 
				+        JSON 格式的相似度分析结果字符串
			
 
				+
			
 
				+    Examples:
			
 
				+        >>> # 使用默认模板和缓存
			
 
				+        >>> result = await difference_between_phrases("宿命感", "余华的小说")
			
 
				+        >>> print(result)
			
 
				+        {
			
 
				+          "说明": "简明扼要说明理由",
			
 
				+          "相似度": 0.0
			
 
				+        }
			
 
				+
			
 
				+        >>> # 禁用缓存
			
 
				+        >>> result = await difference_between_phrases(
			
 
				+        ...     "宿命感", "余华的小说",
			
 
				+        ...     use_cache=False
			
 
				+        ... )
			
 
				+
			
 
				+        >>> # 使用自定义模板
			
 
				+        >>> custom_template = '''
			
 
				+        ... 请分析【{phrase_a}】和【{phrase_b}】的语义关联度
			
 
				+        ... 输出格式：{{"score": 0.0, "reason": "..."}}
			
 
				+        ... '''
			
 
				+        >>> result = await difference_between_phrases(
			
 
				+        ...     "宿命感", "余华的小说",
			
 
				+        ...     prompt_template=custom_template
			
 
				+        ... )
			
 
				+    """
			
 
				+    # 使用自定义模板或默认模板
			
 
				+    if prompt_template is None:
			
 
				+        prompt_template = DEFAULT_PROMPT_TEMPLATE
			
 
				+
			
 
				+    # 默认tools为空列表
			
 
				+    if tools is None:
			
 
				+        tools = []
			
 
				+
			
 
				+    # 生成缓存键（tools转为JSON字符串以便哈希）
			
 
				+    tools_str = json.dumps(tools, sort_keys=True) if tools else "[]"
			
 
				+    cache_key = _generate_cache_key(
			
 
				+        phrase_a, phrase_b, model_name, temperature, max_tokens, prompt_template, instructions, tools_str
			
 
				+    )
			
 
				+
			
 
				+    # 尝试从缓存加载
			
 
				+    if use_cache:
			
 
				+        cached_result = _load_from_cache(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
			
 
				+        if cached_result is not None:
			
 
				+            return cached_result
			
 
				+
			
 
				+    # 缓存未命中，调用 API
			
 
				+    agent = Agent(
			
 
				+        name=name,
			
 
				+        model=get_model(model_name),
			
 
				+        model_settings=ModelSettings(
			
 
				+            temperature=temperature,
			
 
				+            max_tokens=max_tokens,
			
 
				+        ),
			
 
				+        instructions=instructions,
			
 
				+        tools=tools,
			
 
				+    )
			
 
				+
			
 
				+    # 格式化提示词
			
 
				+    prompt = prompt_template.format(phrase_a=phrase_a, phrase_b=phrase_b)
			
 
				+
			
 
				+    result = await Runner.run(agent, input=prompt)
			
 
				+    final_output = result.final_output
			
 
				+
			
 
				+    # 保存到缓存
			
 
				+    if use_cache:
			
 
				+        _save_to_cache(
			
 
				+            cache_key, phrase_a, phrase_b, model_name,
			
 
				+            temperature, max_tokens, prompt_template,
			
 
				+            instructions, tools_str, final_output, cache_dir
			
 
				+        )
			
 
				+
			
 
				+    return final_output
			
 
				+
			
 
				+
			
 
				+async def _difference_between_phrases_parsed(
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    model_name: str = 'openai/gpt-4.1-mini',
			
 
				+    temperature: float = 0.0,
			
 
				+    max_tokens: int = 65536,
			
 
				+    prompt_template: str = None,
			
 
				+    instructions: str = None,
			
 
				+    tools: list = None,
			
 
				+    name: str = "Semantic Similarity Analyzer",
			
 
				+    use_cache: bool = True,
			
 
				+    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    从语义角度判断两个短语的相似度，并解析返回结果为字典
			
 
				+
			
 
				+    Args:
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        model_name: 使用的模型名称
			
 
				+        temperature: 模型温度参数，控制输出随机性，默认 0.0（确定性输出）
			
 
				+        max_tokens: 最大生成token数，默认 65536
			
 
				+        prompt_template: 自定义提示词模板，使用 {phrase_a} 和 {phrase_b} 作为占位符
			
 
				+        instructions: Agent 的系统指令，默认为 None
			
 
				+        tools: Agent 可用的工具列表，默认为 []
			
 
				+        name: Agent 的名称，默认为 "Semantic Similarity Analyzer"
			
 
				+        use_cache: 是否使用缓存，默认 True
			
 
				+        cache_dir: 缓存目录，默认 'cache/semantic_similarity'
			
 
				+
			
 
				+    Returns:
			
 
				+        解析后的字典，包含：
			
 
				+        - 说明: 相似度判断的理由
			
 
				+        - 相似度: 0-1之间的浮点数
			
 
				+
			
 
				+    Examples:
			
 
				+        >>> result = await difference_between_phrases_parsed("宿命感", "余华的小说")
			
 
				+        >>> print(result['相似度'])
			
 
				+        0.3
			
 
				+        >>> print(result['说明'])
			
 
				+        "两个概念有一定关联..."
			
 
				+    """
			
 
				+    raw_result = await _difference_between_phrases(
			
 
				+        phrase_a, phrase_b, model_name, temperature, max_tokens,
			
 
				+        prompt_template, instructions, tools, name, use_cache, cache_dir
			
 
				+    )
			
 
				+
			
 
				+    # 使用 utils.parse_json_from_text 解析结果
			
 
				+    parsed_result = parse_json_from_text(raw_result)
			
 
				+
			
 
				+    # 如果解析失败（返回空字典），返回带错误信息的结果
			
 
				+    if not parsed_result:
			
 
				+        return {
			
 
				+            "说明": "解析失败: 无法从响应中提取有效的 JSON",
			
 
				+            "相似度": 0.0,
			
 
				+            "raw_response": raw_result
			
 
				+        }
			
 
				+
			
 
				+    return parsed_result
			
 
				+
			
 
				+
			
 
				+# ========== V1 版本（默认版本） ==========
			
 
				+
			
 
				+# 对外接口 - V1
			
 
				+async def compare_phrases(
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    model_name: str = 'openai/gpt-4.1-mini',
			
 
				+    temperature: float = 0.0,
			
 
				+    max_tokens: int = 65536,
			
 
				+    prompt_template: str = None,
			
 
				+    instructions: str = None,
			
 
				+    tools: list = None,
			
 
				+    name: str = "Semantic Similarity Analyzer",
			
 
				+    use_cache: bool = True,
			
 
				+    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    比较两个短语的语义相似度（对外唯一接口）
			
 
				+
			
 
				+    Args:
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        model_name: 使用的模型名称
			
 
				+        temperature: 模型温度参数，控制输出随机性，默认 0.0（确定性输出）
			
 
				+        max_tokens: 最大生成token数，默认 65536
			
 
				+        prompt_template: 自定义提示词模板，使用 {phrase_a} 和 {phrase_b} 作为占位符
			
 
				+        instructions: Agent 的系统指令，默认为 None
			
 
				+        tools: Agent 可用的工具列表，默认为 []
			
 
				+        name: Agent 的名称，默认为 "Semantic Similarity Analyzer"
			
 
				+        use_cache: 是否使用缓存，默认 True
			
 
				+        cache_dir: 缓存目录，默认 'cache/semantic_similarity'
			
 
				+
			
 
				+    Returns:
			
 
				+        解析后的字典
			
 
				+    """
			
 
				+    return await _difference_between_phrases_parsed(
			
 
				+        phrase_a, phrase_b, model_name, temperature, max_tokens,
			
 
				+        prompt_template, instructions, tools, name, use_cache, cache_dir
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    import asyncio
			
 
				+
			
 
				+    async def main():
			
 
				+        """示例使用"""
			
 
				+        # 示例 1: 基本使用（使用缓存）
			
 
				+        print("示例 1: 基本使用")
			
 
				+        result = await compare_phrases("宿命感", "余华的小说")
			
 
				+        print(f"相似度: {result.get('相似度')}")
			
 
				+        print(f"说明: {result.get('说明')}")
			
 
				+        print()
			
 
				+
			
 
				+        # 示例 2: 再次调用相同参数（应该从缓存读取）
			
 
				+        print("示例 2: 测试缓存")
			
 
				+        result = await compare_phrases("宿命感", "余华的小说")
			
 
				+        print(f"相似度: {result.get('相似度')}")
			
 
				+        print()
			
 
				+
			
 
				+        # 示例 3: 自定义温度
			
 
				+        print("示例 3: 自定义温度（创意性输出）")
			
 
				+        result = await compare_phrases(
			
 
				+            "创意写作", "AI生成",
			
 
				+            temperature=0.7
			
 
				+        )
			
 
				+        print(f"相似度: {result.get('相似度')}")
			
 
				+        print(f"说明: {result.get('说明')}")
			
 
				+        print()
			
 
				+
			
 
				+        # 示例 4: 自定义 Agent 名称
			
 
				+        print("示例 4: 自定义 Agent 名称")
			
 
				+        result = await compare_phrases(
			
 
				+            "人工智能", "机器学习",
			
 
				+            name="AI语义分析专家"
			
 
				+        )
			
 
				+        print(f"相似度: {result.get('相似度')}")
			
 
				+        print(f"说明: {result.get('说明')}")
			
 
				+        print()
			
 
				+
			
 
				+        # 示例 5: 使用不同的模型
			
 
				+        print("示例 5: 使用 Claude 模型")
			
 
				+        result = await compare_phrases(
			
 
				+            "深度学习", "神经网络",
			
 
				+            model_name='anthropic/claude-haiku-4.5'
			
 
				+        )
			
 
				+        print(f"相似度: {result.get('相似度')}")
			
 
				+        print(f"说明: {result.get('说明')}")
			
 
				+
			
 
				+    asyncio.run(main())
			
 
				+
			
 
				+
			
 
				+# ========== V2 版本（示例：详细分析版本） ==========
			
 
				+
			
 
# Default prompt template for V2 (asks for a more detailed analysis).
# `{phrase_a}` / `{phrase_b}` are placeholders; the doubled braces are
# literal `{` / `}` once the template is formatted.
DEFAULT_PROMPT_TEMPLATE_V2 = """
请深入分析【{phrase_a}】和【{phrase_b}】的语义关系，包括：
1. 语义相似度（0-1）
2. 关系类型（如：包含、相关、对立、无关等）
3. 详细说明

输出格式：
```json
{{
  "相似度": 0.0,
  "关系类型": "相关/包含/对立/无关",
  "详细说明": "详细分析两者的语义关系...",
  "应用场景": "该关系在实际应用中的意义..."
}}
```
""".strip()
			
 
				+
			
 
				+
			
 
				+# 对外接口 - V2
			
 
				+async def compare_phrases_v2(
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    model_name: str = 'anthropic/claude-sonnet-4.5',  # V2 默认使用更强的模型
			
 
				+    temperature: float = 0.0,
			
 
				+    max_tokens: int = 65536,
			
 
				+    prompt_template: str = None,
			
 
				+    instructions: str = None,
			
 
				+    tools: list = None,
			
 
				+    name: str = "Advanced Semantic Analyzer",
			
 
				+    use_cache: bool = True,
			
 
				+    cache_dir: str = DEFAULT_CACHE_DIR
			
 
				+) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    比较两个短语的语义相似度 - V2 版本（详细分析）
			
 
				+
			
 
				+    V2 特点：
			
 
				+    - 默认使用更强的模型（Claude Sonnet 4.5）
			
 
				+    - 更详细的分析输出（包含关系类型和应用场景）
			
 
				+    - 适合需要深入分析的场景
			
 
				+
			
 
				+    Args:
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        model_name: 使用的模型名称，默认 'anthropic/claude-sonnet-4.5'
			
 
				+        temperature: 模型温度参数，默认 0.0
			
 
				+        max_tokens: 最大生成token数，默认 65536
			
 
				+        prompt_template: 自定义提示词模板，默认使用 V2 详细模板
			
 
				+        instructions: Agent 的系统指令，默认为 None
			
 
				+        tools: Agent 可用的工具列表，默认为 []
			
 
				+        name: Agent 的名称，默认 "Advanced Semantic Analyzer"
			
 
				+        use_cache: 是否使用缓存，默认 True
			
 
				+        cache_dir: 缓存目录，默认 'cache/semantic_similarity'
			
 
				+
			
 
				+    Returns:
			
 
				+        解析后的字典，包含：
			
 
				+        - 相似度: 0-1之间的浮点数
			
 
				+        - 关系类型: 关系分类
			
 
				+        - 详细说明: 详细分析
			
 
				+        - 应用场景: 应用建议
			
 
				+
			
 
				+    Examples:
			
 
				+        >>> result = await compare_phrases_v2("深度学习", "神经网络")
			
 
				+        >>> print(result['相似度'])
			
 
				+        0.9
			
 
				+        >>> print(result['关系类型'])
			
 
				+        "包含"
			
 
				+        >>> print(result['详细说明'])
			
 
				+        "深度学习是基于人工神经网络的机器学习方法..."
			
 
				+    """
			
 
				+    # 使用 V2 默认模板（如果未指定）
			
 
				+    if prompt_template is None:
			
 
				+        prompt_template = DEFAULT_PROMPT_TEMPLATE_V2
			
 
				+
			
 
				+    return await _difference_between_phrases_parsed(
			
 
				+        phrase_a, phrase_b, model_name, temperature, max_tokens,
			
 
				+        prompt_template, instructions, tools, name, use_cache, cache_dir
			
 
				+    )