#!/usr/bin/env python3
"""
混合相似度计算模块
结合向量模型(text_embedding)和LLM模型(semantic_similarity)的结果
提供2种接口:
1. compare_phrases() - 单对计算
2. compare_phrases_cartesian() - 笛卡尔积批量计算 (M×N)
"""
import asyncio
from typing import Any, Callable, Dict, List, Optional

import numpy as np

from lib.config import get_cache_dir
from lib.semantic_similarity import compare_phrases as compare_phrases_semantic
from lib.semantic_similarity import compare_phrases_cartesian as compare_phrases_cartesian_semantic
from lib.text_embedding import compare_phrases as compare_phrases_embedding
from lib.text_embedding_api import compare_phrases_cartesian as compare_phrases_cartesian_api
  17. async def compare_phrases(
  18. phrase_a: str,
  19. phrase_b: str,
  20. weight_embedding: float = 0.5,
  21. weight_semantic: float = 0.5,
  22. embedding_model: str = "chinese",
  23. semantic_model: str = 'openai/gpt-4.1-mini',
  24. use_cache: bool = True,
  25. cache_dir_embedding: Optional[str] = None,
  26. cache_dir_semantic: Optional[str] = None,
  27. **semantic_kwargs
  28. ) -> Dict[str, Any]:
  29. """
  30. 混合相似度计算:同时使用向量模型和LLM模型,按权重组合结果
  31. Args:
  32. phrase_a: 第一个短语
  33. phrase_b: 第二个短语
  34. weight_embedding: 向量模型权重,默认 0.5
  35. weight_semantic: LLM模型权重,默认 0.5
  36. embedding_model: 向量模型名称,默认 "chinese"
  37. semantic_model: LLM模型名称,默认 'openai/gpt-4.1-mini'
  38. use_cache: 是否使用缓存,默认 True
  39. cache_dir_embedding: 向量模型缓存目录,默认从配置读取
  40. cache_dir_semantic: LLM模型缓存目录,默认从配置读取
  41. **semantic_kwargs: 其他传递给semantic_similarity的参数
  42. - temperature: 温度参数,默认 0.0
  43. - max_tokens: 最大token数,默认 65536
  44. - prompt_template: 自定义提示词模板
  45. - instructions: Agent系统指令
  46. - tools: Agent工具列表
  47. - name: Agent名称
  48. Returns:
  49. {
  50. "相似度": float, # 加权平均后的相似度 (0-1)
  51. "说明": str # 综合说明(包含各模型的分数和说明)
  52. }
  53. Examples:
  54. >>> # 使用默认权重 (0.5:0.5)
  55. >>> result = await compare_phrases("深度学习", "神经网络")
  56. >>> print(result['相似度']) # 加权平均后的相似度
  57. 0.82
  58. >>> # 自定义权重,更倾向向量模型
  59. >>> result = await compare_phrases(
  60. ... "深度学习", "神经网络",
  61. ... weight_embedding=0.7,
  62. ... weight_semantic=0.3
  63. ... )
  64. >>> # 使用不同的模型
  65. >>> result = await compare_phrases(
  66. ... "深度学习", "神经网络",
  67. ... embedding_model="multilingual",
  68. ... semantic_model="anthropic/claude-sonnet-4.5"
  69. ... )
  70. """
  71. # 验证权重
  72. total_weight = weight_embedding + weight_semantic
  73. if abs(total_weight - 1.0) > 0.001:
  74. raise ValueError(f"权重之和必须为1.0,当前为: {total_weight}")
  75. # 使用配置的缓存目录(如果未指定)
  76. if cache_dir_embedding is None:
  77. cache_dir_embedding = get_cache_dir("text_embedding")
  78. if cache_dir_semantic is None:
  79. cache_dir_semantic = get_cache_dir("semantic_similarity")
  80. # 并发调用两个模型
  81. embedding_task = asyncio.to_thread(
  82. compare_phrases_embedding,
  83. phrase_a=phrase_a,
  84. phrase_b=phrase_b,
  85. model_name=embedding_model,
  86. use_cache=use_cache,
  87. cache_dir=cache_dir_embedding
  88. )
  89. semantic_task = compare_phrases_semantic(
  90. phrase_a=phrase_a,
  91. phrase_b=phrase_b,
  92. model_name=semantic_model,
  93. use_cache=use_cache,
  94. cache_dir=cache_dir_semantic,
  95. **semantic_kwargs
  96. )
  97. # 等待两个任务完成
  98. embedding_result, semantic_result = await asyncio.gather(
  99. embedding_task,
  100. semantic_task
  101. )
  102. # 提取相似度分数
  103. score_embedding = embedding_result.get("相似度", 0.0)
  104. score_semantic = semantic_result.get("相似度", 0.0)
  105. # 计算加权平均
  106. final_score = (
  107. score_embedding * weight_embedding +
  108. score_semantic * weight_semantic
  109. )
  110. # 生成综合说明(格式化为清晰的结构)
  111. explanation = (
  112. f"【混合相似度】{final_score:.3f}(向量模型权重{weight_embedding},LLM模型权重{weight_semantic})\n\n"
  113. f"【向量模型】相似度={score_embedding:.3f}\n"
  114. f"{embedding_result.get('说明', 'N/A')}\n\n"
  115. f"【LLM模型】相似度={score_semantic:.3f}\n"
  116. f"{semantic_result.get('说明', 'N/A')}"
  117. )
  118. # 构建返回结果(与原接口完全一致)
  119. return {
  120. "相似度": final_score,
  121. "说明": explanation
  122. }
  123. async def compare_phrases_cartesian(
  124. phrases_a: List[str],
  125. phrases_b: List[str],
  126. max_concurrent: int = 50,
  127. llm_progress_callback: Optional[callable] = None,
  128. embedding_progress_callback: Optional[callable] = None
  129. ) -> List[List[Dict[str, Any]]]:
  130. """
  131. 混合相似度笛卡尔积批量计算:M×N矩阵(带双进度回调)
  132. 结合向量模型API笛卡尔积(快速)和LLM并发调用(已优化)
  133. 使用默认权重:向量0.5,LLM 0.5
  134. Args:
  135. phrases_a: 第一组短语列表(M个)
  136. phrases_b: 第二组短语列表(N个)
  137. max_concurrent: 最大并发数,默认50(控制LLM调用并发)
  138. llm_progress_callback: LLM进度回调函数,每完成一个LLM任务调用一次
  139. embedding_progress_callback: 向量进度回调函数,每完成一个向量任务调用一次
  140. Returns:
  141. 嵌套列表 List[List[Dict]],每个Dict包含完整结果
  142. results[i][j] = {
  143. "相似度": float, # 混合相似度
  144. "说明": str # 包含向量和LLM的详细说明
  145. }
  146. Examples:
  147. >>> results = await compare_phrases_cartesian(
  148. ... ["深度学习"],
  149. ... ["神经网络", "Python"]
  150. ... )
  151. >>> print(results[0][0]['相似度']) # 混合相似度
  152. >>> print(results[0][1]['说明']) # 完整说明
  153. >>> # 自定义并发控制
  154. >>> results = await compare_phrases_cartesian(
  155. ... ["深度学习"],
  156. ... ["神经网络", "Python"],
  157. ... max_concurrent=100 # 提高并发数
  158. ... )
  159. """
  160. # 参数验证
  161. if not phrases_a or not phrases_b:
  162. return [[]]
  163. M, N = len(phrases_a), len(phrases_b)
  164. # 默认权重
  165. weight_embedding = 0.5
  166. weight_semantic = 0.5
  167. # 串行执行两个任务(向量模型快,先执行;避免并发死锁)
  168. # 1. 向量模型:使用API笛卡尔积(一次调用获取M×N完整结果,通常1-2秒)
  169. import time
  170. start_time = time.time()
  171. embedding_results = await asyncio.to_thread(
  172. compare_phrases_cartesian_api,
  173. phrases_a,
  174. phrases_b,
  175. max_concurrent,
  176. None # 不传递回调
  177. )
  178. elapsed = time.time() - start_time
  179. # print(f"✓ 向量模型完成,耗时: {elapsed:.1f}秒") # 调试用
  180. # 向量模型完成后,一次性批量更新进度(而不是循环25704次)
  181. if embedding_progress_callback:
  182. embedding_progress_callback(M * N) # 传递总数,一次更新
  183. # 2. LLM模型:使用并发调用(M×N个任务,受max_concurrent控制)
  184. semantic_results = await compare_phrases_cartesian_semantic(
  185. phrases_a,
  186. phrases_b,
  187. max_concurrent, # 传递并发参数控制LLM调用
  188. llm_progress_callback # 传递LLM进度回调
  189. )
  190. # embedding_results[i][j] = {"相似度": float, "说明": str}
  191. # semantic_results[i][j] = {"相似度": float, "说明": str}
  192. # 构建嵌套列表,包含完整信息(带子模型详细说明)
  193. nested_results = []
  194. for i in range(M):
  195. row_results = []
  196. for j in range(N):
  197. # 获取子模型的完整结果
  198. embedding_result = embedding_results[i][j]
  199. semantic_result = semantic_results[i][j]
  200. score_embedding = embedding_result.get("相似度", 0.0)
  201. score_semantic = semantic_result.get("相似度", 0.0)
  202. # 计算加权平均
  203. final_score = (
  204. score_embedding * weight_embedding +
  205. score_semantic * weight_semantic
  206. )
  207. # 生成完整说明(包含子模型的详细说明)
  208. explanation = (
  209. f"【混合相似度】{final_score:.3f}(向量模型权重{weight_embedding},LLM模型权重{weight_semantic})\n\n"
  210. f"【向量模型】相似度={score_embedding:.3f}\n"
  211. f"{embedding_result.get('说明', 'N/A')}\n\n"
  212. f"【LLM模型】相似度={score_semantic:.3f}\n"
  213. f"{semantic_result.get('说明', 'N/A')}"
  214. )
  215. row_results.append({
  216. "相似度": final_score,
  217. "说明": explanation
  218. })
  219. nested_results.append(row_results)
  220. return nested_results
  221. def compare_phrases_sync(
  222. phrase_a: str,
  223. phrase_b: str,
  224. weight_embedding: float = 0.5,
  225. weight_semantic: float = 0.5,
  226. **kwargs
  227. ) -> Dict[str, Any]:
  228. """
  229. 混合相似度计算的同步版本(内部创建事件循环)
  230. Args:
  231. phrase_a: 第一个短语
  232. phrase_b: 第二个短语
  233. weight_embedding: 向量模型权重,默认 0.5
  234. weight_semantic: LLM模型权重,默认 0.5
  235. **kwargs: 其他参数(同 compare_phrases)
  236. Returns:
  237. 同 compare_phrases
  238. Examples:
  239. >>> result = compare_phrases_sync("深度学习", "神经网络")
  240. >>> print(result['相似度'])
  241. """
  242. return asyncio.run(
  243. compare_phrases(
  244. phrase_a=phrase_a,
  245. phrase_b=phrase_b,
  246. weight_embedding=weight_embedding,
  247. weight_semantic=weight_semantic,
  248. **kwargs
  249. )
  250. )
  251. if __name__ == "__main__":
  252. async def main():
  253. print("=" * 80)
  254. print("混合相似度计算示例")
  255. print("=" * 80)
  256. print()
  257. # 示例 1: 默认权重 (0.5:0.5)
  258. print("示例 1: 默认权重 (0.5:0.5)")
  259. print("-" * 80)
  260. result = await compare_phrases("深度学习", "神经网络")
  261. print(f"相似度: {result['相似度']:.3f}")
  262. print(f"说明:\n{result['说明']}")
  263. print()
  264. # 示例 2: 不相关的短语
  265. print("示例 2: 不相关的短语")
  266. print("-" * 80)
  267. result = await compare_phrases("编程", "吃饭")
  268. print(f"相似度: {result['相似度']:.3f}")
  269. print(f"说明:\n{result['说明']}")
  270. print()
  271. # 示例 3: 自定义权重,更倾向向量模型
  272. print("示例 3: 自定义权重 (向量:0.7, LLM:0.3)")
  273. print("-" * 80)
  274. result = await compare_phrases(
  275. "人工智能", "机器学习",
  276. weight_embedding=0.7,
  277. weight_semantic=0.3
  278. )
  279. print(f"相似度: {result['相似度']:.3f}")
  280. print(f"说明:\n{result['说明']}")
  281. print()
  282. # 示例 4: 完整输出示例
  283. print("示例 4: 完整输出示例")
  284. print("-" * 80)
  285. result = await compare_phrases("宿命感", "余华的小说")
  286. print(f"相似度: {result['相似度']:.3f}")
  287. print(f"说明:\n{result['说明']}")
  288. print()
  289. # 示例 5: 同步版本
  290. print("示例 5: 同步版本调用")
  291. print("-" * 80)
  292. result = compare_phrases_sync("Python", "编程语言")
  293. print(f"相似度: {result['相似度']:.3f}")
  294. print(f"说明:\n{result['说明']}")
  295. print()
  296. print("=" * 80)
  297. asyncio.run(main())