#!/usr/bin/env python3
# yangxiaohui / how
"""
语义相似度分析模块
使用 AI Agent 判断两个短语之间的语义相似度
"""

from agents import Agent, Runner, ModelSettings
from lib.client import get_model
from lib.utils import parse_json_from_text
from lib.config import get_cache_dir
from typing import Dict, Any, Optional
import hashlib
import json
import os
from datetime import datetime
from pathlib import Path


# Default prompt template (V1). `{phrase_a}` / `{phrase_b}` are filled in via
# str.format, which is why the literal JSON braces are doubled.
# NOTE(review): the example JSON ends its last field with a trailing comma,
# which is not valid JSON. The template text is hashed into every cache key,
# so correcting it would invalidate existing cache entries — confirm before
# changing.
DEFAULT_PROMPT_TEMPLATE = "\n".join((
    "从语意角度,判断【{phrase_a}】和【{phrase_b}】的相似度,从0-1打分，输出json格式",
    "```json",
    "{{",
    '  "说明": "简明扼要说明理由",',
    '  "相似度": 0.0,',
    "}}",
    "```",
))


def _get_default_cache_dir() -> str:
    """Return the cache directory that `get_cache_dir` reports for "semantic_similarity"."""
    default_dir = get_cache_dir("semantic_similarity")
    return default_dir


def _generate_cache_key(
    phrase_a: str,
    phrase_b: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
    prompt_template: str,
    instructions: str = None,
    tools: str = "[]"
) -> str:
    """
    Build the cache key (an MD5 hex digest) for one comparison request.

    Every argument is rendered to text and the pieces are joined with "||"
    in parameter order; the UTF-8 encoding of that string is then hashed.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model identifier.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.
        prompt_template: Prompt template text.
        instructions: Agent system instructions (rendered as "None" when absent).
        tools: Tool list already serialized to a JSON string.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    fields = (
        phrase_a, phrase_b, model_name, temperature,
        max_tokens, prompt_template, instructions, tools,
    )
    # f-string rendering keeps the exact text form the key has always used.
    fingerprint = "||".join(f"{field}" for field in fields)

    digest = hashlib.md5()
    digest.update(fingerprint.encode('utf-8'))
    return digest.hexdigest()


def _sanitize_for_filename(text: str, max_length: int = 30) -> str:
    """
    Turn arbitrary text into a fragment that is safe to embed in a file name.

    Characters matched by neither `\\w` nor the CJK range in the pattern become
    underscores, runs of underscores collapse to one, the result is cut to
    `max_length` characters, and leading/trailing underscores are removed.

    Args:
        text: Original text.
        max_length: Maximum length of the fragment before underscores are
            stripped from its ends.

    Returns:
        The sanitized fragment (possibly empty).
    """
    import re

    # Step 1: every disallowed character becomes "_".
    underscored = re.sub(r'[^\w\u4e00-\u9fff]', '_', text)
    # Step 2: squeeze consecutive underscores into a single one.
    collapsed = re.sub(r'_+', '_', underscored)
    # Slicing is a no-op when the string is already short enough.
    return collapsed[:max_length].strip('_')


def _get_cache_filepath(
    cache_key: str,
    phrase_a: str,
    phrase_b: str,
    model_name: str,
    temperature: float,
    cache_dir: Optional[str] = None
) -> Path:
    """
    Compose the human-readable path of the cache file for one request.

    File name layout:
        {phrase_a}_vs_{phrase_b}_{model}_t{temp}_{hash[:8]}.json

    Both phrases and the model name go through `_sanitize_for_filename`
    (20 characters max each), so punctuation such as "-" and "." ends up as
    underscores. Only the part of the model name after the last "/" is used.
    Example: 宿命感_vs_余华的小说_gpt_4_1_mini_t0.0_a7f3e2d9.json

    Args:
        cache_key: Cache key (hash); its first 8 characters end the file name.
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model identifier, e.g. "openai/gpt-4.1-mini".
        temperature: Sampling temperature, rendered with one decimal place.
        cache_dir: Cache directory; the configured default is used when None.

    Returns:
        Full path of the cache file.
    """
    base_dir = _get_default_cache_dir() if cache_dir is None else cache_dir

    name_parts = [
        _sanitize_for_filename(phrase_a, max_length=20),
        "vs",
        _sanitize_for_filename(phrase_b, max_length=20),
        # Drop any provider prefix: openai/gpt-4.1-mini -> gpt-4.1-mini
        _sanitize_for_filename(model_name.split('/')[-1], max_length=20),
        f"t{temperature:.1f}",
        cache_key[:8],
    ]

    return Path(base_dir) / ("_".join(name_parts) + ".json")


def _load_from_cache(
    cache_key: str,
    phrase_a: str,
    phrase_b: str,
    model_name: str,
    temperature: float,
    cache_dir: Optional[str] = None
) -> Optional[str]:
    """
    Load a previously cached raw response, or report a cache miss.

    The readable file name is tried first. If that file does not exist, every
    file in the cache directory whose name ends with the first 8 characters of
    `cache_key` is tried in sorted order.

    A candidate is accepted only when:
      * it can be read and parsed as JSON,
      * it contains `output.raw` as a string, and
      * its `metadata.cache_key`, when present, equals `cache_key` (this guards
        against two different requests sharing the same 8-character prefix).

    Any unreadable, malformed or mismatching file is treated as a miss rather
    than raising.

    Args:
        cache_key: Full cache key.
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model identifier.
        temperature: Sampling temperature.
        cache_dir: Cache directory; the configured default is used when None.

    Returns:
        The cached raw response string, or None when nothing usable is found.
    """
    if cache_dir is None:
        cache_dir = _get_default_cache_dir()

    cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)

    if cache_file.exists():
        candidates = [cache_file]
    else:
        # Fall back to matching on the hash suffix alone.
        cache_path = Path(cache_dir)
        if not cache_path.is_dir():
            return None
        # Sorted so the choice among several matches is deterministic.
        candidates = sorted(cache_path.glob(f"*_{cache_key[:8]}.json"))

    for candidate in candidates:
        try:
            with open(candidate, 'r', encoding='utf-8') as f:
                cached_data = json.load(f)
            raw = cached_data['output']['raw']
            metadata = cached_data.get('metadata')
        except (OSError, ValueError, KeyError, TypeError):
            # ValueError covers JSON and text-decoding errors; TypeError covers
            # files whose JSON has an unexpected shape (e.g. a list or null).
            continue

        stored_key = metadata.get('cache_key') if isinstance(metadata, dict) else None
        if stored_key is not None and stored_key != cache_key:
            # Same 8-character prefix but a different request: not ours.
            continue

        if isinstance(raw, str):
            return raw

    return None


def _save_to_cache(
    cache_key: str,
    phrase_a: str,
    phrase_b: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
    prompt_template: str,
    instructions: str,
    tools: str,
    result: str,
    cache_dir: Optional[str] = None
) -> None:
    """
    Persist one request/response pair to the cache (best effort).

    The cache file records the full input, the raw response together with its
    parsed form, and metadata (timestamp, cache key, file name). Failure to
    serialize or write is logged as a warning and otherwise ignored so that
    caching problems never break the caller.

    Args:
        cache_key: Cache key.
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model identifier.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.
        prompt_template: Prompt template text.
        instructions: Agent system instructions.
        tools: Tool list serialized to a JSON string.
        result: Raw response string to cache.
        cache_dir: Cache directory; the configured default is used when None.
    """
    if cache_dir is None:
        cache_dir = _get_default_cache_dir()

    cache_file = _get_cache_filepath(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)

    # Store the parsed form next to the raw text for easier inspection.
    parsed_result = parse_json_from_text(result)

    cache_data = {
        "input": {
            "phrase_a": phrase_a,
            "phrase_b": phrase_b,
            "model_name": model_name,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "prompt_template": prompt_template,
            "instructions": instructions,
            "tools": tools
        },
        "output": {
            "raw": result,              # original response, untouched
            "parsed": parsed_result     # JSON object extracted from it
        },
        "metadata": {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "cache_key": cache_key,
            "cache_file": str(cache_file.name)
        }
    }

    try:
        # Serialize fully before touching the disk so a serialization error
        # cannot leave a truncated cache file behind.
        payload = json.dumps(cache_data, ensure_ascii=False, indent=2)
        # Directory creation is part of the best-effort section: a read-only
        # or otherwise unusable cache location must not crash the caller.
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(payload, encoding='utf-8')
    except (OSError, TypeError, ValueError) as exc:
        import logging

        logging.getLogger(__name__).warning(
            "Could not write cache file %s: %s", cache_file, exc
        )


async def _difference_between_phrases(
    phrase_a: str,
    phrase_b: str,
    model_name: str = 'openai/gpt-4.1-mini',
    temperature: float = 0.0,
    max_tokens: int = 65536,
    prompt_template: str = None,
    instructions: str = None,
    tools: list = None,
    name: str = "Semantic Similarity Analyzer",
    use_cache: bool = True,
    cache_dir: Optional[str] = None
) -> str:
    """
    Ask an agent how semantically similar two phrases are and return its raw output.

    This function only *reads* the cache. It never writes to it, so that
    responses which later fail to parse are not persisted; writing is left to
    the caller once parsing has succeeded.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model identifier passed to `get_model`, for example:
            - 'google/gemini-2.5-pro'
            - 'anthropic/claude-sonnet-4.5'
            - 'google/gemini-2.0-flash-001'
            - 'openai/gpt-5-mini'
            - 'anthropic/claude-haiku-4.5'
            - 'openai/gpt-4.1-mini' (default)
        temperature: Sampling temperature, default 0.0.
        max_tokens: Maximum number of tokens to generate, default 65536.
        prompt_template: Custom template using {phrase_a} and {phrase_b} as
            placeholders; the default template is used when None.
        instructions: Agent system instructions, default None.
        tools: Tools made available to the agent; None means no tools.
        name: Agent name (not part of the cache key).
        use_cache: Whether to look the request up in the cache first.
        cache_dir: Cache directory; the configured default is used when None.

    Returns:
        The cached raw response on a cache hit, otherwise the final output of
        the agent run. The text is expected to contain JSON but is not
        validated here.

    Examples:
        >>> # Default template, cache lookup enabled
        >>> result = await _difference_between_phrases("宿命感", "余华的小说")
        >>> print(result)
        {
          "说明": "简明扼要说明理由",
          "相似度": 0.0
        }

        >>> # Skip the cache lookup
        >>> result = await _difference_between_phrases(
        ...     "宿命感", "余华的小说",
        ...     use_cache=False
        ... )

        >>> # Custom template
        >>> custom_template = '''
        ... 请分析【{phrase_a}】和【{phrase_b}】的语义关联度
        ... 输出格式：{{"score": 0.0, "reason": "..."}}
        ... '''
        >>> result = await _difference_between_phrases(
        ...     "宿命感", "余华的小说",
        ...     prompt_template=custom_template
        ... )
    """
    if prompt_template is None:
        prompt_template = DEFAULT_PROMPT_TEMPLATE

    if tools is None:
        tools = []

    if use_cache:
        def _tool_fingerprint(tool: Any) -> str:
            # Tool objects are generally not JSON-serializable; use a stable
            # textual stand-in instead of letting json.dumps raise TypeError.
            # Assumes tools expose a `name` attribute — TODO confirm; without
            # one only the type name is used.
            return f"{type(tool).__name__}:{getattr(tool, 'name', '')}"

        # The key is only needed for the lookup, so it is built only here.
        # `default=` is consulted solely for non-JSON values, which keeps keys
        # for plain JSON tool descriptions unchanged.
        tools_str = json.dumps(tools, sort_keys=True, default=_tool_fingerprint) if tools else "[]"
        cache_key = _generate_cache_key(
            phrase_a, phrase_b, model_name, temperature, max_tokens, prompt_template, instructions, tools_str
        )

        cached_result = _load_from_cache(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
        if cached_result is not None:
            return cached_result

    # Cache miss (or cache disabled): call the model.
    agent = Agent(
        name=name,
        model=get_model(model_name),
        model_settings=ModelSettings(
            temperature=temperature,
            max_tokens=max_tokens,
        ),
        instructions=instructions,
        tools=tools,
    )

    prompt = prompt_template.format(phrase_a=phrase_a, phrase_b=phrase_b)

    result = await Runner.run(agent, input=prompt)

    # Deliberately not cached here: the caller caches only after the
    # response has been parsed successfully.
    return result.final_output


async def _difference_between_phrases_parsed(
    phrase_a: str,
    phrase_b: str,
    model_name: str = 'openai/gpt-4.1-mini',
    temperature: float = 0.0,
    max_tokens: int = 65536,
    prompt_template: str = None,
    instructions: str = None,
    tools: list = None,
    name: str = "Semantic Similarity Analyzer",
    use_cache: bool = True,
    cache_dir: Optional[str] = None
) -> Dict[str, Any]:
    """
    Judge the semantic similarity of two phrases and return the parsed result.

    With caching enabled, a cached response is returned when it exists and
    parses. Otherwise the model is called, and the fresh response is written
    to the cache only after it has been parsed successfully.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model identifier.
        temperature: Sampling temperature, default 0.0.
        max_tokens: Maximum number of tokens to generate, default 65536.
        prompt_template: Custom template using {phrase_a} and {phrase_b} as
            placeholders; the default template is used when None.
        instructions: Agent system instructions, default None.
        tools: Tools made available to the agent; None means no tools.
        name: Agent name, default "Semantic Similarity Analyzer".
        use_cache: Whether to read from and write to the cache.
        cache_dir: Cache directory; the configured default is used when None.

    Returns:
        The object produced by `parse_json_from_text`. With the default
        template the model is asked for the keys:
        - 说明: the reasoning behind the score
        - 相似度: a score between 0 and 1

    Raises:
        ValueError: When the model response cannot be parsed; the message
            contains the inputs, the formatted prompt and the raw response.

    Examples:
        >>> result = await _difference_between_phrases_parsed("宿命感", "余华的小说")
        >>> print(result['相似度'])
        0.3
        >>> print(result['说明'])
        "两个概念有一定关联..."
    """
    if prompt_template is None:
        prompt_template = DEFAULT_PROMPT_TEMPLATE

    if tools is None:
        tools = []

    cache_key = None
    tools_str = "[]"

    if use_cache:
        def _tool_fingerprint(tool: Any) -> str:
            # Tool objects are generally not JSON-serializable; use a stable
            # textual stand-in instead of letting json.dumps raise TypeError.
            # Assumes tools expose a `name` attribute — TODO confirm; without
            # one only the type name is used.
            return f"{type(tool).__name__}:{getattr(tool, 'name', '')}"

        # The key is needed for the lookup and the later save, both of which
        # only happen when caching is on. `default=` is consulted solely for
        # non-JSON values, so keys for plain JSON tool descriptions are
        # unchanged.
        tools_str = json.dumps(tools, sort_keys=True, default=_tool_fingerprint) if tools else "[]"
        cache_key = _generate_cache_key(
            phrase_a, phrase_b, model_name, temperature, max_tokens, prompt_template, instructions, tools_str
        )

        cached_result = _load_from_cache(cache_key, phrase_a, phrase_b, model_name, temperature, cache_dir)
        if cached_result is not None:
            parsed_result = parse_json_from_text(cached_result)
            if parsed_result:
                return parsed_result
            # An unparseable cache entry falls through to a fresh API call.

    # Fetch the raw response; caching is handled here, so the inner lookup
    # is switched off.
    raw_result = await _difference_between_phrases(
        phrase_a, phrase_b, model_name, temperature, max_tokens,
        prompt_template, instructions, tools, name, use_cache=False, cache_dir=cache_dir
    )

    parsed_result = parse_json_from_text(raw_result)

    # An empty parse result means failure: raise with full diagnostic context.
    if not parsed_result:
        formatted_prompt = prompt_template.format(phrase_a=phrase_a, phrase_b=phrase_b)

        error_msg = f"""
JSON解析失败！
================================================================================
短语A: {phrase_a}
短语B: {phrase_b}
模型: {model_name}
温度: {temperature}
================================================================================
Prompt:
{formatted_prompt}
================================================================================
AI响应 (长度: {len(raw_result)}):
{raw_result}
================================================================================
"""
        raise ValueError(error_msg)

    # Cache only responses that parsed successfully.
    if use_cache:
        _save_to_cache(
            cache_key, phrase_a, phrase_b, model_name,
            temperature, max_tokens, prompt_template,
            instructions, tools_str, raw_result, cache_dir
        )

    return parsed_result


# ========== V1 版本（默认版本） ==========

# Public interface - V1
async def compare_phrases(
    phrase_a: str,
    phrase_b: str,
    model_name: str = 'openai/gpt-4.1-mini',
    temperature: float = 0.0,
    max_tokens: int = 65536,
    prompt_template: str = None,
    instructions: str = None,
    tools: list = None,
    name: str = "Semantic Similarity Analyzer",
    use_cache: bool = True,
    cache_dir: Optional[str] = None
) -> Dict[str, Any]:
    """
    Compare the semantic similarity of two phrases (public V1 entry point).

    All arguments are forwarded unchanged to
    `_difference_between_phrases_parsed`.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model identifier.
        temperature: Sampling temperature, default 0.0.
        max_tokens: Maximum number of tokens to generate, default 65536.
        prompt_template: Custom template using {phrase_a} and {phrase_b} as
            placeholders.
        instructions: Agent system instructions, default None.
        tools: Tools made available to the agent.
        name: Agent name, default "Semantic Similarity Analyzer".
        use_cache: Whether to use the cache, default True.
        cache_dir: Cache directory; the configured default is used when None.

    Returns:
        The parsed result dictionary.
    """
    parsed = await _difference_between_phrases_parsed(
        phrase_a=phrase_a,
        phrase_b=phrase_b,
        model_name=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
        prompt_template=prompt_template,
        instructions=instructions,
        tools=tools,
        name=name,
        use_cache=use_cache,
        cache_dir=cache_dir,
    )
    return parsed


if __name__ == "__main__":
    import asyncio

    async def main():
        """Run the usage examples one after another."""
        # (title, phrases, keyword overrides, whether to print the explanation)
        demos = [
            # Example 1: basic usage (goes through the cache)
            ("示例 1: 基本使用", ("宿命感", "余华的小说"), {}, True),
            # Example 2: same arguments again (expected to be served from cache)
            ("示例 2: 测试缓存", ("宿命感", "余华的小说"), {}, False),
            # Example 3: custom temperature
            ("示例 3: 自定义温度（创意性输出）", ("创意写作", "AI生成"), {"temperature": 0.7}, True),
            # Example 4: custom agent name
            ("示例 4: 自定义 Agent 名称", ("人工智能", "机器学习"), {"name": "AI语义分析专家"}, True),
            # Example 5: a different model
            ("示例 5: 使用 Claude 模型", ("深度学习", "神经网络"), {"model_name": 'anthropic/claude-haiku-4.5'}, True),
        ]

        final_index = len(demos) - 1
        for index, (title, phrases, overrides, show_reason) in enumerate(demos):
            print(title)
            outcome = await compare_phrases(*phrases, **overrides)
            print(f"相似度: {outcome.get('相似度')}")
            if show_reason:
                print(f"说明: {outcome.get('说明')}")
            # A blank line separates examples; none after the last one.
            if index != final_index:
                print()

    asyncio.run(main())


# ========== V2 版本（示例：详细分析版本） ==========

# Default prompt template for V2 (asks for a more detailed analysis).
# `{phrase_a}` / `{phrase_b}` are filled in via str.format, which is why the
# literal JSON braces are doubled.
DEFAULT_PROMPT_TEMPLATE_V2 = "\n".join((
    "请深入分析【{phrase_a}】和【{phrase_b}】的语义关系，包括：",
    "1. 语义相似度（0-1）",
    "2. 关系类型（如：包含、相关、对立、无关等）",
    "3. 详细说明",
    "",
    "输出格式：",
    "```json",
    "{{",
    '  "相似度": 0.0,',
    '  "关系类型": "相关/包含/对立/无关",',
    '  "详细说明": "详细分析两者的语义关系...",',
    '  "应用场景": "该关系在实际应用中的意义..."',
    "}}",
    "```",
))


# Public interface - V2
async def compare_phrases_v2(
    phrase_a: str,
    phrase_b: str,
    model_name: str = 'anthropic/claude-sonnet-4.5',  # V2 defaults to a stronger model
    temperature: float = 0.0,
    max_tokens: int = 65536,
    prompt_template: str = None,
    instructions: str = None,
    tools: list = None,
    name: str = "Advanced Semantic Analyzer",
    use_cache: bool = True,
    cache_dir: Optional[str] = None
) -> Dict[str, Any]:
    """
    Compare the semantic similarity of two phrases - V2 (detailed analysis).

    Differences from V1:
    - The default model is 'anthropic/claude-sonnet-4.5'.
    - The default prompt is `DEFAULT_PROMPT_TEMPLATE_V2`, which additionally
      asks for a relation type and an application scenario.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        model_name: Model identifier, default 'anthropic/claude-sonnet-4.5'.
        temperature: Sampling temperature, default 0.0.
        max_tokens: Maximum number of tokens to generate, default 65536.
        prompt_template: Custom template; the V2 template is used when None.
        instructions: Agent system instructions, default None.
        tools: Tools made available to the agent.
        name: Agent name, default "Advanced Semantic Analyzer".
        use_cache: Whether to use the cache, default True.
        cache_dir: Cache directory; the configured default is used when None.

    Returns:
        The parsed result dictionary. With the V2 template the model is asked
        for the keys:
        - 相似度: a score between 0 and 1
        - 关系类型: relation category
        - 详细说明: detailed analysis
        - 应用场景: practical significance

    Examples:
        >>> result = await compare_phrases_v2("深度学习", "神经网络")
        >>> print(result['相似度'])
        0.9
        >>> print(result['关系类型'])
        "包含"
        >>> print(result['详细说明'])
        "深度学习是基于人工神经网络的机器学习方法..."
    """
    # Fall back to the V2 template unless the caller supplied one.
    effective_template = DEFAULT_PROMPT_TEMPLATE_V2 if prompt_template is None else prompt_template

    parsed = await _difference_between_phrases_parsed(
        phrase_a=phrase_a,
        phrase_b=phrase_b,
        model_name=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
        prompt_template=effective_template,
        instructions=instructions,
        tools=tools,
        name=name,
        use_cache=use_cache,
        cache_dir=cache_dir,
    )
    return parsed