yangxiaohui
/
how


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
							#!/usr/bin/env python3
"""
混合相似度计算模块
结合向量模型（text_embedding）和LLM模型（semantic_similarity）的结果

提供2种接口：
1. compare_phrases() - 单对计算
2. compare_phrases_cartesian() - 笛卡尔积批量计算 (M×N)
"""

from typing import Dict, Any, Optional, List
import asyncio
import numpy as np
from lib.text_embedding import compare_phrases as compare_phrases_embedding
from lib.text_embedding_api import compare_phrases_cartesian as compare_phrases_cartesian_api
from lib.semantic_similarity import compare_phrases as compare_phrases_semantic
from lib.semantic_similarity import compare_phrases_cartesian as compare_phrases_cartesian_semantic
from lib.config import get_cache_dir


async def compare_phrases(
    phrase_a: str,
    phrase_b: str,
    weight_embedding: float = 0.5,
    weight_semantic: float = 0.5,
    embedding_model: str = "chinese",
    semantic_model: str = 'openai/gpt-4.1-mini',
    use_cache: bool = True,
    cache_dir_embedding: Optional[str] = None,
    cache_dir_semantic: Optional[str] = None,
    **semantic_kwargs
) -> Dict[str, Any]:
    """
    混合相似度计算：同时使用向量模型和LLM模型，按权重组合结果

    Args:
        phrase_a: 第一个短语
        phrase_b: 第二个短语
        weight_embedding: 向量模型权重，默认 0.5
        weight_semantic: LLM模型权重，默认 0.5
        embedding_model: 向量模型名称，默认 "chinese"
        semantic_model: LLM模型名称，默认 'openai/gpt-4.1-mini'
        use_cache: 是否使用缓存，默认 True
        cache_dir_embedding: 向量模型缓存目录，默认从配置读取
        cache_dir_semantic: LLM模型缓存目录，默认从配置读取
        **semantic_kwargs: 其他传递给semantic_similarity的参数
            - temperature: 温度参数，默认 0.0
            - max_tokens: 最大token数，默认 65536
            - prompt_template: 自定义提示词模板
            - instructions: Agent系统指令
            - tools: Agent工具列表
            - name: Agent名称

    Returns:
        {
            "相似度": float,           # 加权平均后的相似度 (0-1)
            "说明": str               # 综合说明（包含各模型的分数和说明）
        }

    Examples:
        >>> # 使用默认权重 (0.5:0.5)
        >>> result = await compare_phrases("深度学习", "神经网络")
        >>> print(result['相似度'])  # 加权平均后的相似度
        0.82

        >>> # 自定义权重，更倾向向量模型
        >>> result = await compare_phrases(
        ...     "深度学习", "神经网络",
        ...     weight_embedding=0.7,
        ...     weight_semantic=0.3
        ... )

        >>> # 使用不同的模型
        >>> result = await compare_phrases(
        ...     "深度学习", "神经网络",
        ...     embedding_model="multilingual",
        ...     semantic_model="anthropic/claude-sonnet-4.5"
        ... )
    """
    # 验证权重
    total_weight = weight_embedding + weight_semantic
    if abs(total_weight - 1.0) > 0.001:
        raise ValueError(f"权重之和必须为1.0，当前为: {total_weight}")

    # 使用配置的缓存目录（如果未指定）
    if cache_dir_embedding is None:
        cache_dir_embedding = get_cache_dir("text_embedding")
    if cache_dir_semantic is None:
        cache_dir_semantic = get_cache_dir("semantic_similarity")

    # 并发调用两个模型
    embedding_task = asyncio.to_thread(
        compare_phrases_embedding,
        phrase_a=phrase_a,
        phrase_b=phrase_b,
        model_name=embedding_model,
        use_cache=use_cache,
        cache_dir=cache_dir_embedding
    )

    semantic_task = compare_phrases_semantic(
        phrase_a=phrase_a,
        phrase_b=phrase_b,
        model_name=semantic_model,
        use_cache=use_cache,
        cache_dir=cache_dir_semantic,
        **semantic_kwargs
    )

    # 等待两个任务完成
    embedding_result, semantic_result = await asyncio.gather(
        embedding_task,
        semantic_task
    )

    # 提取相似度分数
    score_embedding = embedding_result.get("相似度", 0.0)
    score_semantic = semantic_result.get("相似度", 0.0)

    # 计算加权平均
    final_score = (
        score_embedding * weight_embedding +
        score_semantic * weight_semantic
    )

    # 生成综合说明（格式化为清晰的结构）
    explanation = (
        f"【混合相似度】{final_score:.3f}（向量模型权重{weight_embedding}，LLM模型权重{weight_semantic}）\n\n"
        f"【向量模型】相似度={score_embedding:.3f}\n"
        f"{embedding_result.get('说明', 'N/A')}\n\n"
        f"【LLM模型】相似度={score_semantic:.3f}\n"
        f"{semantic_result.get('说明', 'N/A')}"
    )

    # 构建返回结果（与原接口完全一致）
    return {
        "相似度": final_score,
        "说明": explanation
    }


async def compare_phrases_cartesian(
    phrases_a: List[str],
    phrases_b: List[str],
    max_concurrent: int = 50,
    progress_callback: Optional[callable] = None
) -> List[List[Dict[str, Any]]]:
    """
    混合相似度笛卡尔积批量计算：M×N矩阵

    结合向量模型API笛卡尔积（快速）和LLM并发调用（已优化）
    使用默认权重：向量0.5，LLM 0.5

    Args:
        phrases_a: 第一组短语列表（M个）
        phrases_b: 第二组短语列表（N个）
        max_concurrent: 最大并发数，默认50（控制LLM调用并发）
        progress_callback: 进度回调函数，每完成一个LLM任务时调用

    Returns:
        嵌套列表 List[List[Dict]]，每个Dict包含完整结果
        results[i][j] = {
            "相似度": float,  # 混合相似度
            "说明": str       # 包含向量和LLM的详细说明
        }

    Examples:
        >>> results = await compare_phrases_cartesian(
        ...     ["深度学习"],
        ...     ["神经网络", "Python"]
        ... )
        >>> print(results[0][0]['相似度'])  # 混合相似度
        >>> print(results[0][1]['说明'])    # 完整说明

        >>> # 使用进度回调
        >>> def on_progress(count):
        ...     print(f"完成 {count} 个任务")
        >>> results = await compare_phrases_cartesian(
        ...     ["深度学习"],
        ...     ["神经网络", "Python"],
        ...     max_concurrent=100,
        ...     progress_callback=on_progress
        ... )
    """
    # 参数验证
    if not phrases_a or not phrases_b:
        return [[]]

    M, N = len(phrases_a), len(phrases_b)

    # 默认权重
    weight_embedding = 0.5
    weight_semantic = 0.5

    # 串行执行两个任务（向量模型快，先执行；避免并发死锁）
    # 1. 向量模型：使用API笛卡尔积（一次调用获取M×N完整结果，通常1-2秒）
    import time
    start_time = time.time()
    embedding_results = await asyncio.to_thread(
        compare_phrases_cartesian_api,
        phrases_a,
        phrases_b
    )
    elapsed = time.time() - start_time
    # print(f"✓ 向量模型完成，耗时: {elapsed:.1f}秒")  # 调试用

    # 2. LLM模型：使用并发调用（M×N个任务，受max_concurrent控制）
    semantic_results = await compare_phrases_cartesian_semantic(
        phrases_a,
        phrases_b,
        max_concurrent,
        progress_callback  # 传递进度回调
    )
    # embedding_results[i][j] = {"相似度": float, "说明": str}
    # semantic_results[i][j] = {"相似度": float, "说明": str}

    # 构建嵌套列表，包含完整信息（带子模型详细说明）
    nested_results = []
    for i in range(M):
        row_results = []
        for j in range(N):
            # 获取子模型的完整结果
            embedding_result = embedding_results[i][j]
            semantic_result = semantic_results[i][j]

            score_embedding = embedding_result.get("相似度", 0.0)
            score_semantic = semantic_result.get("相似度", 0.0)

            # 计算加权平均
            final_score = (
                score_embedding * weight_embedding +
                score_semantic * weight_semantic
            )

            # 生成完整说明（包含子模型的详细说明）
            explanation = (
                f"【混合相似度】{final_score:.3f}（向量模型权重{weight_embedding}，LLM模型权重{weight_semantic}）\n\n"
                f"【向量模型】相似度={score_embedding:.3f}\n"
                f"{embedding_result.get('说明', 'N/A')}\n\n"
                f"【LLM模型】相似度={score_semantic:.3f}\n"
                f"{semantic_result.get('说明', 'N/A')}"
            )

            row_results.append({
                "相似度": final_score,
                "说明": explanation
            })
        nested_results.append(row_results)

    return nested_results


def compare_phrases_sync(
    phrase_a: str,
    phrase_b: str,
    weight_embedding: float = 0.5,
    weight_semantic: float = 0.5,
    **kwargs
) -> Dict[str, Any]:
    """
    混合相似度计算的同步版本（内部创建事件循环）

    Args:
        phrase_a: 第一个短语
        phrase_b: 第二个短语
        weight_embedding: 向量模型权重，默认 0.5
        weight_semantic: LLM模型权重，默认 0.5
        **kwargs: 其他参数（同 compare_phrases）

    Returns:
        同 compare_phrases

    Examples:
        >>> result = compare_phrases_sync("深度学习", "神经网络")
        >>> print(result['相似度'])
    """
    return asyncio.run(
        compare_phrases(
            phrase_a=phrase_a,
            phrase_b=phrase_b,
            weight_embedding=weight_embedding,
            weight_semantic=weight_semantic,
            **kwargs
        )
    )


if __name__ == "__main__":
    async def main():
        print("=" * 80)
        print("混合相似度计算示例")
        print("=" * 80)
        print()

        # 示例 1: 默认权重 (0.5:0.5)
        print("示例 1: 默认权重 (0.5:0.5)")
        print("-" * 80)
        result = await compare_phrases("深度学习", "神经网络")
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()

        # 示例 2: 不相关的短语
        print("示例 2: 不相关的短语")
        print("-" * 80)
        result = await compare_phrases("编程", "吃饭")
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()

        # 示例 3: 自定义权重，更倾向向量模型
        print("示例 3: 自定义权重 (向量:0.7, LLM:0.3)")
        print("-" * 80)
        result = await compare_phrases(
            "人工智能", "机器学习",
            weight_embedding=0.7,
            weight_semantic=0.3
        )
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()

        # 示例 4: 完整输出示例
        print("示例 4: 完整输出示例")
        print("-" * 80)
        result = await compare_phrases("宿命感", "余华的小说")
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()

        # 示例 5: 同步版本
        print("示例 5: 同步版本调用")
        print("-" * 80)
        result = compare_phrases_sync("Python", "编程语言")
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()

        print("=" * 80)

    asyncio.run(main())