|
@@ -0,0 +1,216 @@
|
|
|
|
|
+#!/usr/bin/env python3
|
|
|
|
|
+"""
|
|
|
|
|
+混合相似度计算模块
|
|
|
|
|
+结合向量模型(text_embedding)和LLM模型(semantic_similarity)的结果
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+from typing import Dict, Any, Optional
|
|
|
|
|
+import asyncio
|
|
|
|
|
+from lib.text_embedding import compare_phrases as compare_phrases_embedding
|
|
|
|
|
+from lib.semantic_similarity import compare_phrases as compare_phrases_semantic
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
async def compare_phrases(
    phrase_a: str,
    phrase_b: str,
    weight_embedding: float = 0.5,
    weight_semantic: float = 0.5,
    embedding_model: str = "chinese",
    semantic_model: str = 'openai/gpt-4.1-mini',
    use_cache: bool = True,
    cache_dir_embedding: str = "cache/text_embedding",
    cache_dir_semantic: str = "cache/semantic_similarity",
    **semantic_kwargs
) -> Dict[str, Any]:
    """
    Compute a hybrid similarity: run the embedding model and the LLM model
    concurrently and combine their scores as a weighted sum.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        weight_embedding: Weight of the embedding-model score, default 0.5.
        weight_semantic: Weight of the LLM-model score, default 0.5.
        embedding_model: Embedding model name, default "chinese".
        semantic_model: LLM model name, default 'openai/gpt-4.1-mini'.
        use_cache: Whether to use the cache, default True. Passed to both
            backends.
        cache_dir_embedding: Cache directory for the embedding backend.
        cache_dir_semantic: Cache directory for the LLM backend.
        **semantic_kwargs: Extra keyword arguments forwarded unchanged to
            lib.semantic_similarity.compare_phrases. The original notes list
            the following (defaults are defined by that module, not here;
            verify against it):
            - temperature: sampling temperature (noted default 0.0)
            - max_tokens: maximum token count (noted default 65536)
            - prompt_template: custom prompt template
            - instructions: agent system instructions
            - tools: agent tool list
            - name: agent name

    Returns:
        A dict with exactly two keys:
        {
            "相似度": float,  # weighted combination of the two scores
            "说明": str       # combined explanation with each model's score
                              # and its own explanation text
        }

    Raises:
        ValueError: If the weights do not sum to 1.0 (tolerance 0.001), if
            either weight is NaN, or if either weight is negative.

    Examples:
        >>> # Default weights (0.5:0.5); the printed value is illustrative
        >>> result = await compare_phrases("深度学习", "神经网络")
        >>> print(result['相似度'])
        0.82

        >>> # Custom weights, favouring the embedding model
        >>> result = await compare_phrases(
        ...     "深度学习", "神经网络",
        ...     weight_embedding=0.7,
        ...     weight_semantic=0.3
        ... )

        >>> # Different models
        >>> result = await compare_phrases(
        ...     "深度学习", "神经网络",
        ...     embedding_model="multilingual",
        ...     semantic_model="anthropic/claude-sonnet-4.5"
        ... )
    """
    # Validate the weights. The comparison is phrased as "not (within
    # tolerance)" so that a NaN sum, for which every comparison is False,
    # is rejected instead of slipping through.
    total_weight = weight_embedding + weight_semantic
    if not abs(total_weight - 1.0) <= 0.001:
        raise ValueError(f"权重之和必须为1.0,当前为: {total_weight}")

    # A negative weight can still sum to 1.0 with the other one (-0.5 + 1.5)
    # but would no longer produce a weighted average of the two scores.
    if not (weight_embedding >= 0 and weight_semantic >= 0):
        raise ValueError(
            f"权重不能为负数,当前为: weight_embedding={weight_embedding}, "
            f"weight_semantic={weight_semantic}"
        )

    # Build the LLM coroutine first: if this call itself raises (for example
    # because semantic_kwargs repeats one of the explicit keywords), no
    # un-awaited to_thread coroutine is left behind.
    semantic_task = compare_phrases_semantic(
        phrase_a=phrase_a,
        phrase_b=phrase_b,
        model_name=semantic_model,
        use_cache=use_cache,
        cache_dir=cache_dir_semantic,
        **semantic_kwargs
    )

    # The embedding backend is a plain (synchronous) callable, so it is run
    # in a worker thread to let both backends progress concurrently.
    embedding_task = asyncio.to_thread(
        compare_phrases_embedding,
        phrase_a=phrase_a,
        phrase_b=phrase_b,
        model_name=embedding_model,
        use_cache=use_cache,
        cache_dir=cache_dir_embedding
    )

    # Wait for both; gather returns results in argument order.
    embedding_result, semantic_result = await asyncio.gather(
        embedding_task,
        semantic_task
    )

    # Extract the scores. A missing score and an explicit None are both
    # treated as 0.0 so the arithmetic and the ":.3f" formatting cannot fail.
    score_embedding = embedding_result.get("相似度")
    if score_embedding is None:
        score_embedding = 0.0
    score_semantic = semantic_result.get("相似度")
    if score_semantic is None:
        score_semantic = 0.0

    # Weighted combination of the two scores.
    final_score = (
        score_embedding * weight_embedding +
        score_semantic * weight_semantic
    )

    # Combined, human-readable explanation with one section per model.
    explanation = (
        f"【混合相似度】{final_score:.3f}(向量模型权重{weight_embedding},LLM模型权重{weight_semantic})\n\n"
        f"【向量模型】相似度={score_embedding:.3f}\n"
        f"{embedding_result.get('说明', 'N/A')}\n\n"
        f"【LLM模型】相似度={score_semantic:.3f}\n"
        f"{semantic_result.get('说明', 'N/A')}"
    )

    # Same result shape as the individual backends ("相似度" / "说明").
    return {
        "相似度": final_score,
        "说明": explanation
    }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def compare_phrases_sync(
    phrase_a: str,
    phrase_b: str,
    weight_embedding: float = 0.5,
    weight_semantic: float = 0.5,
    **kwargs
) -> Dict[str, Any]:
    """
    Synchronous version of the hybrid similarity computation.

    Runs compare_phrases to completion on a private event loop created with
    asyncio.run(). When the calling thread already has a running event loop
    (where asyncio.run() would raise RuntimeError), the private loop is run
    on a short-lived worker thread instead. In that case this call blocks
    the calling thread, and therefore the caller's loop, until the result is
    ready; asynchronous code should prefer ``await compare_phrases(...)``.

    Args:
        phrase_a: First phrase.
        phrase_b: Second phrase.
        weight_embedding: Weight of the embedding-model score, default 0.5.
        weight_semantic: Weight of the LLM-model score, default 0.5.
        **kwargs: Any other keyword arguments, forwarded unchanged to
            compare_phrases.

    Returns:
        Whatever compare_phrases returns.

    Raises:
        Any exception raised by compare_phrases is propagated to the caller.

    Examples:
        >>> result = compare_phrases_sync("深度学习", "神经网络")
        >>> print(result['相似度'])
    """
    from concurrent.futures import ThreadPoolExecutor

    def _run() -> Dict[str, Any]:
        # The coroutine is created here, on the thread that will actually
        # run it, so it is never left un-awaited.
        return asyncio.run(
            compare_phrases(
                phrase_a=phrase_a,
                phrase_b=phrase_b,
                weight_embedding=weight_embedding,
                weight_semantic=weight_semantic,
                **kwargs
            )
        )

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: asyncio.run() is safe here.
        return _run()

    # A loop is already running in this thread, so asyncio.run() cannot be
    # used here; give the computation its own thread and wait for it.
    with ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(_run).result()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
if __name__ == "__main__":
    def _show(result: Dict[str, Any]) -> None:
        """Print one result's score and explanation, then a blank line."""
        print(f"相似度: {result['相似度']:.3f}")
        print(f"说明:\n{result['说明']}")
        print()

    async def main():
        """Run the asynchronous demos (examples 1-4)."""
        print("=" * 80)
        print("混合相似度计算示例")
        print("=" * 80)
        print()

        # Example 1: default weights (0.5:0.5)
        print("示例 1: 默认权重 (0.5:0.5)")
        print("-" * 80)
        _show(await compare_phrases("深度学习", "神经网络"))

        # Example 2: unrelated phrases
        print("示例 2: 不相关的短语")
        print("-" * 80)
        _show(await compare_phrases("编程", "吃饭"))

        # Example 3: custom weights, favouring the embedding model
        print("示例 3: 自定义权重 (向量:0.7, LLM:0.3)")
        print("-" * 80)
        _show(
            await compare_phrases(
                "人工智能", "机器学习",
                weight_embedding=0.7,
                weight_semantic=0.3
            )
        )

        # Example 4: full output example
        print("示例 4: 完整输出示例")
        print("-" * 80)
        _show(await compare_phrases("宿命感", "余华的小说"))

    asyncio.run(main())

    # Example 5: synchronous version. compare_phrases_sync drives its own
    # event loop via asyncio.run(), so it is called here at top level, after
    # main()'s loop has finished, rather than from inside the coroutine.
    print("示例 5: 同步版本调用")
    print("-" * 80)
    _show(compare_phrases_sync("Python", "编程语言"))

    print("=" * 80)
|