#!/usr/bin/env python3
"""
Hybrid similarity computation module.

Combines the results of the embedding model (text_embedding) and the LLM model
(semantic_similarity). Two interfaces are provided:
1. compare_phrases() - single-pair computation
2. compare_phrases_cartesian() - Cartesian-product batch computation (M×N)
"""
from typing import Dict, Any, Optional, List
import asyncio

from lib.text_embedding import compare_phrases as compare_phrases_embedding
from lib.text_embedding_api import compare_phrases_cartesian as compare_phrases_cartesian_api
from lib.semantic_similarity import compare_phrases as compare_phrases_semantic
from lib.semantic_similarity import compare_phrases_cartesian as compare_phrases_cartesian_semantic
from lib.config import get_cache_dir


async def compare_phrases(
    phrase_a: str,
    phrase_b: str,
    weight_embedding: float = 0.5,
    weight_semantic: float = 0.5,
    embedding_model: str = "chinese",
    semantic_model: str = 'openai/gpt-4.1-mini',
    use_cache: bool = True,
    cache_dir_embedding: Optional[str] = None,
    cache_dir_semantic: Optional[str] = None,
    **semantic_kwargs
) -> Dict[str, Any]:
    """
    Hybrid similarity: run the embedding model and the LLM model together and
    combine their scores as a weighted average.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        weight_embedding: Weight of the embedding model, default 0.5.
        weight_semantic: Weight of the LLM model, default 0.5.
        embedding_model: Embedding model name, default "chinese".
        semantic_model: LLM model name, default 'openai/gpt-4.1-mini'.
        use_cache: Whether to use the cache, default True.
        cache_dir_embedding: Cache directory for the embedding model; read from config if not given.
        cache_dir_semantic: Cache directory for the LLM model; read from config if not given.
        **semantic_kwargs: Extra keyword arguments forwarded to semantic_similarity:
            - temperature: Sampling temperature, default 0.0
            - max_tokens: Maximum number of tokens, default 65536
            - prompt_template: Custom prompt template
            - instructions: Agent system instructions
            - tools: Agent tool list
            - name: Agent name

    Returns:
        {
            "相似度": float,  # weighted-average similarity (0-1)
            "说明": str       # combined explanation (includes each model's score and explanation)
        }

    Examples:
        >>> # Default weights (0.5:0.5)
        >>> result = await compare_phrases("深度学习", "神经网络")
        >>> print(result['相似度'])  # weighted-average similarity
        0.82

        >>> # Custom weights, favoring the embedding model
        >>> result = await compare_phrases(
        ...     "深度学习", "神经网络",
        ...     weight_embedding=0.7,
        ...     weight_semantic=0.3
        ... )

        >>> # Use different models
        >>> result = await compare_phrases(
        ...     "深度学习", "神经网络",
        ...     embedding_model="multilingual",
        ...     semantic_model="anthropic/claude-sonnet-4.5"
        ... )
    """
    # Validate the weights
    total_weight = weight_embedding + weight_semantic
    if abs(total_weight - 1.0) > 0.001:
        raise ValueError(f"权重之和必须为1.0,当前为: {total_weight}")

    # Fall back to the configured cache directories when none are given
    if cache_dir_embedding is None:
        cache_dir_embedding = get_cache_dir("text_embedding")
    if cache_dir_semantic is None:
        cache_dir_semantic = get_cache_dir("semantic_similarity")

    # Call both models concurrently
    embedding_task = asyncio.to_thread(
        compare_phrases_embedding,
        phrase_a=phrase_a,
        phrase_b=phrase_b,
        model_name=embedding_model,
        use_cache=use_cache,
        cache_dir=cache_dir_embedding
    )
    semantic_task = compare_phrases_semantic(
        phrase_a=phrase_a,
        phrase_b=phrase_b,
        model_name=semantic_model,
        use_cache=use_cache,
        cache_dir=cache_dir_semantic,
        **semantic_kwargs
    )

    # Wait for both tasks to finish
    embedding_result, semantic_result = await asyncio.gather(
        embedding_task,
        semantic_task
    )

    # Extract the similarity scores
    score_embedding = embedding_result.get("相似度", 0.0)
    score_semantic = semantic_result.get("相似度", 0.0)

    # Weighted average
    final_score = (
        score_embedding * weight_embedding +
        score_semantic * weight_semantic
    )

    # Build the combined explanation (formatted for readability)
    explanation = (
        f"【混合相似度】{final_score:.3f}(向量模型权重{weight_embedding},LLM模型权重{weight_semantic})\n\n"
        f"【向量模型】相似度={score_embedding:.3f}\n"
        f"{embedding_result.get('说明', 'N/A')}\n\n"
        f"【LLM模型】相似度={score_semantic:.3f}\n"
        f"{semantic_result.get('说明', 'N/A')}"
    )

    # Build the return value (same shape as the single-model interfaces)
    return {
        "相似度": final_score,
        "说明": explanation
    }


async def compare_phrases_cartesian(
    phrases_a: List[str],
    phrases_b: List[str],
    max_concurrent: int = 50
) -> List[List[Dict[str, Any]]]:
    """
    Hybrid similarity over the Cartesian product: an M×N matrix.

    Combines the embedding API's Cartesian-product call (fast) with concurrent
    LLM calls (already optimized). Uses the default weights: embedding 0.5, LLM 0.5.

    Args:
        phrases_a: First list of phrases (M items).
        phrases_b: Second list of phrases (N items).
        max_concurrent: Maximum concurrency, default 50 (limits LLM calls).

    Returns:
        Nested list List[List[Dict]]; each Dict holds the full result:
        results[i][j] = {
            "相似度": float,  # hybrid similarity
            "说明": str       # detailed explanation from both models
        }

    Examples:
        >>> results = await compare_phrases_cartesian(
        ...     ["深度学习"],
        ...     ["神经网络", "Python"]
        ... )
        >>> print(results[0][0]['相似度'])  # hybrid similarity
        >>> print(results[0][1]['说明'])    # full explanation

        >>> # Custom concurrency limit
        >>> results = await compare_phrases_cartesian(
        ...     ["深度学习"],
        ...     ["神经网络", "Python"],
        ...     max_concurrent=100  # raise the concurrency
        ... )
    """
    # Validate arguments
    if not phrases_a or not phrases_b:
        return [[]]

    M, N = len(phrases_a), len(phrases_b)

    # Default weights
    weight_embedding = 0.5
    weight_semantic = 0.5

    # Run the two back ends concurrently
    # 1. Embedding model: one API Cartesian-product call returns the full M×N result
    embedding_task = asyncio.to_thread(
        compare_phrases_cartesian_api,
        phrases_a,
        phrases_b,
        max_concurrent  # passed for interface consistency; the API does not use it
    )
    # 2. LLM model: M×N concurrent calls, bounded by max_concurrent
    semantic_task = compare_phrases_cartesian_semantic(
        phrases_a,
        phrases_b,
        max_concurrent  # limits LLM call concurrency
    )

    # Wait for both tasks to finish
    embedding_results, semantic_results = await asyncio.gather(
        embedding_task,
        semantic_task
    )
    # embedding_results[i][j] = {"相似度": float, "说明": str}
    # semantic_results[i][j] = {"相似度": float, "说明": str}

    # Build the nested result list, including each model's detailed explanation
    nested_results = []
    for i in range(M):
        row_results = []
        for j in range(N):
            # Full results from each back end
            embedding_result = embedding_results[i][j]
            semantic_result = semantic_results[i][j]
            score_embedding = embedding_result.get("相似度", 0.0)
            score_semantic = semantic_result.get("相似度", 0.0)

            # Weighted average
            final_score = (
                score_embedding * weight_embedding +
                score_semantic * weight_semantic
            )

            # Combined explanation (includes each model's details)
            explanation = (
                f"【混合相似度】{final_score:.3f}(向量模型权重{weight_embedding},LLM模型权重{weight_semantic})\n\n"
                f"【向量模型】相似度={score_embedding:.3f}\n"
                f"{embedding_result.get('说明', 'N/A')}\n\n"
                f"【LLM模型】相似度={score_semantic:.3f}\n"
                f"{semantic_result.get('说明', 'N/A')}"
            )
            row_results.append({
                "相似度": final_score,
                "说明": explanation
            })
        nested_results.append(row_results)

    return nested_results


def compare_phrases_sync(
    phrase_a: str,
    phrase_b: str,
    weight_embedding: float = 0.5,
    weight_semantic: float = 0.5,
    **kwargs
) -> Dict[str, Any]:
    """
    Synchronous wrapper for the hybrid similarity computation (creates its own event loop).

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        weight_embedding: Weight of the embedding model, default 0.5.
        weight_semantic: Weight of the LLM model, default 0.5.
        **kwargs: Other arguments (same as compare_phrases).

    Returns:
        Same as compare_phrases.

    Examples:
        >>> result = compare_phrases_sync("深度学习", "神经网络")
        >>> print(result['相似度'])
    """
    return asyncio.run(
        compare_phrases(
            phrase_a=phrase_a,
            phrase_b=phrase_b,
            weight_embedding=weight_embedding,
            weight_semantic=weight_semantic,
            **kwargs
        )
    )


if __name__ == "__main__":
    async def main():
        print("=" * 80)
        print("混合相似度计算示例")
        print("=" * 80)
        print()

        # Example 1: default weights (0.5:0.5)
        print("示例 1: 默认权重 (0.5:0.5)")
        print("-" * 80)
        result = await compare_phrases("深度学习", "神经网络")
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()

        # Example 2: unrelated phrases
        print("示例 2: 不相关的短语")
        print("-" * 80)
        result = await compare_phrases("编程", "吃饭")
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()

        # Example 3: custom weights, favoring the embedding model
        print("示例 3: 自定义权重 (向量:0.7, LLM:0.3)")
        print("-" * 80)
        result = await compare_phrases(
            "人工智能", "机器学习",
            weight_embedding=0.7,
            weight_semantic=0.3
        )
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()

        # Example 4: full output example
        print("示例 4: 完整输出示例")
        print("-" * 80)
        result = await compare_phrases("宿命感", "余华的小说")
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()
        # Example 5: the synchronous wrapper.
        # compare_phrases_sync() calls asyncio.run() internally, which raises a
        # RuntimeError inside this already-running event loop, so run it in a
        # worker thread where it can create its own loop.
        print("示例 5: 同步版本调用")
        print("-" * 80)
        result = await asyncio.to_thread(compare_phrases_sync, "Python", "编程语言")
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()
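
        # Example 6: Cartesian-product batch computation. This is a minimal demo
        # sketch of the second interface exported by this module; the phrase lists
        # are the same sample data used in the compare_phrases_cartesian docstring,
        # and only the hybrid score of each cell is printed here.
        print("示例 6: 笛卡尔积批量计算 (M×N)")
        print("-" * 80)
        results = await compare_phrases_cartesian(
            ["深度学习"],
            ["神经网络", "Python"]
        )
        for i, row in enumerate(results):
            for j, cell in enumerate(row):
                print(f"[{i}][{j}] 相似度: {cell['相似度']:.3f}")
        print()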
- print("=" * 80)
- asyncio.run(main())