#!/usr/bin/env python3 """ 混合相似度计算模块 结合向量模型(text_embedding)和LLM模型(semantic_similarity)的结果 提供2种接口: 1. compare_phrases() - 单对计算 2. compare_phrases_cartesian() - 笛卡尔积批量计算 (M×N) """ from typing import Dict, Any, Optional, List import asyncio import numpy as np from lib.text_embedding import compare_phrases as compare_phrases_embedding from lib.text_embedding_api import compare_phrases_cartesian as compare_phrases_cartesian_api from lib.semantic_similarity import compare_phrases as compare_phrases_semantic from lib.semantic_similarity import compare_phrases_cartesian as compare_phrases_cartesian_semantic from lib.config import get_cache_dir async def compare_phrases( phrase_a: str, phrase_b: str, weight_embedding: float = 0.5, weight_semantic: float = 0.5, embedding_model: str = "chinese", semantic_model: str = 'openai/gpt-4.1-mini', use_cache: bool = True, cache_dir_embedding: Optional[str] = None, cache_dir_semantic: Optional[str] = None, **semantic_kwargs ) -> Dict[str, Any]: """ 混合相似度计算:同时使用向量模型和LLM模型,按权重组合结果 Args: phrase_a: 第一个短语 phrase_b: 第二个短语 weight_embedding: 向量模型权重,默认 0.5 weight_semantic: LLM模型权重,默认 0.5 embedding_model: 向量模型名称,默认 "chinese" semantic_model: LLM模型名称,默认 'openai/gpt-4.1-mini' use_cache: 是否使用缓存,默认 True cache_dir_embedding: 向量模型缓存目录,默认从配置读取 cache_dir_semantic: LLM模型缓存目录,默认从配置读取 **semantic_kwargs: 其他传递给semantic_similarity的参数 - temperature: 温度参数,默认 0.0 - max_tokens: 最大token数,默认 65536 - prompt_template: 自定义提示词模板 - instructions: Agent系统指令 - tools: Agent工具列表 - name: Agent名称 Returns: { "相似度": float, # 加权平均后的相似度 (0-1) "说明": str # 综合说明(包含各模型的分数和说明) } Examples: >>> # 使用默认权重 (0.5:0.5) >>> result = await compare_phrases("深度学习", "神经网络") >>> print(result['相似度']) # 加权平均后的相似度 0.82 >>> # 自定义权重,更倾向向量模型 >>> result = await compare_phrases( ... "深度学习", "神经网络", ... weight_embedding=0.7, ... weight_semantic=0.3 ... ) >>> # 使用不同的模型 >>> result = await compare_phrases( ... "深度学习", "神经网络", ... embedding_model="multilingual", ... semantic_model="anthropic/claude-sonnet-4.5" ... ) """ # 验证权重 total_weight = weight_embedding + weight_semantic if abs(total_weight - 1.0) > 0.001: raise ValueError(f"权重之和必须为1.0,当前为: {total_weight}") # 使用配置的缓存目录(如果未指定) if cache_dir_embedding is None: cache_dir_embedding = get_cache_dir("text_embedding") if cache_dir_semantic is None: cache_dir_semantic = get_cache_dir("semantic_similarity") # 并发调用两个模型 embedding_task = asyncio.to_thread( compare_phrases_embedding, phrase_a=phrase_a, phrase_b=phrase_b, model_name=embedding_model, use_cache=use_cache, cache_dir=cache_dir_embedding ) semantic_task = compare_phrases_semantic( phrase_a=phrase_a, phrase_b=phrase_b, model_name=semantic_model, use_cache=use_cache, cache_dir=cache_dir_semantic, **semantic_kwargs ) # 等待两个任务完成 embedding_result, semantic_result = await asyncio.gather( embedding_task, semantic_task ) # 提取相似度分数 score_embedding = embedding_result.get("相似度", 0.0) score_semantic = semantic_result.get("相似度", 0.0) # 计算加权平均 final_score = ( score_embedding * weight_embedding + score_semantic * weight_semantic ) # 生成综合说明(格式化为清晰的结构) explanation = ( f"【混合相似度】{final_score:.3f}(向量模型权重{weight_embedding},LLM模型权重{weight_semantic})\n\n" f"【向量模型】相似度={score_embedding:.3f}\n" f"{embedding_result.get('说明', 'N/A')}\n\n" f"【LLM模型】相似度={score_semantic:.3f}\n" f"{semantic_result.get('说明', 'N/A')}" ) # 构建返回结果(与原接口完全一致) return { "相似度": final_score, "说明": explanation } async def compare_phrases_cartesian( phrases_a: List[str], phrases_b: List[str], max_concurrent: int = 50, progress_callback: Optional[callable] = None ) -> List[List[Dict[str, Any]]]: """ 混合相似度笛卡尔积批量计算:M×N矩阵 结合向量模型API笛卡尔积(快速)和LLM并发调用(已优化) 使用默认权重:向量0.5,LLM 0.5 Args: phrases_a: 第一组短语列表(M个) phrases_b: 第二组短语列表(N个) max_concurrent: 最大并发数,默认50(控制LLM调用并发) progress_callback: 进度回调函数,每完成一个LLM任务时调用 Returns: 嵌套列表 List[List[Dict]],每个Dict包含完整结果 results[i][j] = { "相似度": float, # 混合相似度 "说明": str # 包含向量和LLM的详细说明 } Examples: >>> results = await compare_phrases_cartesian( ... ["深度学习"], ... ["神经网络", "Python"] ... ) >>> print(results[0][0]['相似度']) # 混合相似度 >>> print(results[0][1]['说明']) # 完整说明 >>> # 使用进度回调 >>> def on_progress(count): ... print(f"完成 {count} 个任务") >>> results = await compare_phrases_cartesian( ... ["深度学习"], ... ["神经网络", "Python"], ... max_concurrent=100, ... progress_callback=on_progress ... ) """ # 参数验证 if not phrases_a or not phrases_b: return [[]] M, N = len(phrases_a), len(phrases_b) # 默认权重 weight_embedding = 0.5 weight_semantic = 0.5 # 串行执行两个任务(向量模型快,先执行;避免并发死锁) # 1. 向量模型:使用API笛卡尔积(一次调用获取M×N完整结果,通常1-2秒) import time start_time = time.time() embedding_results = await asyncio.to_thread( compare_phrases_cartesian_api, phrases_a, phrases_b ) elapsed = time.time() - start_time # print(f"✓ 向量模型完成,耗时: {elapsed:.1f}秒") # 调试用 # 2. LLM模型:使用并发调用(M×N个任务,受max_concurrent控制) semantic_results = await compare_phrases_cartesian_semantic( phrases_a, phrases_b, max_concurrent, progress_callback # 传递进度回调 ) # embedding_results[i][j] = {"相似度": float, "说明": str} # semantic_results[i][j] = {"相似度": float, "说明": str} # 构建嵌套列表,包含完整信息(带子模型详细说明) nested_results = [] for i in range(M): row_results = [] for j in range(N): # 获取子模型的完整结果 embedding_result = embedding_results[i][j] semantic_result = semantic_results[i][j] score_embedding = embedding_result.get("相似度", 0.0) score_semantic = semantic_result.get("相似度", 0.0) # 计算加权平均 final_score = ( score_embedding * weight_embedding + score_semantic * weight_semantic ) # 生成完整说明(包含子模型的详细说明) explanation = ( f"【混合相似度】{final_score:.3f}(向量模型权重{weight_embedding},LLM模型权重{weight_semantic})\n\n" f"【向量模型】相似度={score_embedding:.3f}\n" f"{embedding_result.get('说明', 'N/A')}\n\n" f"【LLM模型】相似度={score_semantic:.3f}\n" f"{semantic_result.get('说明', 'N/A')}" ) row_results.append({ "相似度": final_score, "说明": explanation }) nested_results.append(row_results) return nested_results def compare_phrases_sync( phrase_a: str, phrase_b: str, weight_embedding: float = 0.5, weight_semantic: float = 0.5, **kwargs ) -> Dict[str, Any]: """ 混合相似度计算的同步版本(内部创建事件循环) Args: phrase_a: 第一个短语 phrase_b: 第二个短语 weight_embedding: 向量模型权重,默认 0.5 weight_semantic: LLM模型权重,默认 0.5 **kwargs: 其他参数(同 compare_phrases) Returns: 同 compare_phrases Examples: >>> result = compare_phrases_sync("深度学习", "神经网络") >>> print(result['相似度']) """ return asyncio.run( compare_phrases( phrase_a=phrase_a, phrase_b=phrase_b, weight_embedding=weight_embedding, weight_semantic=weight_semantic, **kwargs ) ) if __name__ == "__main__": async def main(): print("=" * 80) print("混合相似度计算示例") print("=" * 80) print() # 示例 1: 默认权重 (0.5:0.5) print("示例 1: 默认权重 (0.5:0.5)") print("-" * 80) result = await compare_phrases("深度学习", "神经网络") print(f"相似度: {result['相似度']:.3f}") print(f"说明:\n{result['说明']}") print() # 示例 2: 不相关的短语 print("示例 2: 不相关的短语") print("-" * 80) result = await compare_phrases("编程", "吃饭") print(f"相似度: {result['相似度']:.3f}") print(f"说明:\n{result['说明']}") print() # 示例 3: 自定义权重,更倾向向量模型 print("示例 3: 自定义权重 (向量:0.7, LLM:0.3)") print("-" * 80) result = await compare_phrases( "人工智能", "机器学习", weight_embedding=0.7, weight_semantic=0.3 ) print(f"相似度: {result['相似度']:.3f}") print(f"说明:\n{result['说明']}") print() # 示例 4: 完整输出示例 print("示例 4: 完整输出示例") print("-" * 80) result = await compare_phrases("宿命感", "余华的小说") print(f"相似度: {result['相似度']:.3f}") print(f"说明:\n{result['说明']}") print() # 示例 5: 同步版本 print("示例 5: 同步版本调用") print("-" * 80) result = compare_phrases_sync("Python", "编程语言") print(f"相似度: {result['相似度']:.3f}") print(f"说明:\n{result['说明']}") print() print("=" * 80) asyncio.run(main())