1 週間前 · bc6f8c1123
--- a/lib/hybrid_similarity.py
+++ b/lib/hybrid_similarity.py
@@ -0,0 +1,216 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+混合相似度计算模块
			
 
				+结合向量模型（text_embedding）和LLM模型（semantic_similarity）的结果
			
 
				+"""
			
 
				+
			
 
				+from typing import Dict, Any, Optional
			
 
				+import asyncio
			
 
				+from lib.text_embedding import compare_phrases as compare_phrases_embedding
			
 
				+from lib.semantic_similarity import compare_phrases as compare_phrases_semantic
			
 
				+
			
 
				+
			
 
				+async def compare_phrases(
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    weight_embedding: float = 0.5,
			
 
				+    weight_semantic: float = 0.5,
			
 
				+    embedding_model: str = "chinese",
			
 
				+    semantic_model: str = 'openai/gpt-4.1-mini',
			
 
				+    use_cache: bool = True,
			
 
				+    cache_dir_embedding: str = "cache/text_embedding",
			
 
				+    cache_dir_semantic: str = "cache/semantic_similarity",
			
 
				+    **semantic_kwargs
			
 
				+) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    混合相似度计算：同时使用向量模型和LLM模型，按权重组合结果
			
 
				+
			
 
				+    Args:
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        weight_embedding: 向量模型权重，默认 0.5
			
 
				+        weight_semantic: LLM模型权重，默认 0.5
			
 
				+        embedding_model: 向量模型名称，默认 "chinese"
			
 
				+        semantic_model: LLM模型名称，默认 'openai/gpt-4.1-mini'
			
 
				+        use_cache: 是否使用缓存，默认 True
			
 
				+        cache_dir_embedding: 向量模型缓存目录
			
 
				+        cache_dir_semantic: LLM模型缓存目录
			
 
				+        **semantic_kwargs: 其他传递给semantic_similarity的参数
			
 
				+            - temperature: 温度参数，默认 0.0
			
 
				+            - max_tokens: 最大token数，默认 65536
			
 
				+            - prompt_template: 自定义提示词模板
			
 
				+            - instructions: Agent系统指令
			
 
				+            - tools: Agent工具列表
			
 
				+            - name: Agent名称
			
 
				+
			
 
				+    Returns:
			
 
				+        {
			
 
				+            "相似度": float,           # 加权平均后的相似度 (0-1)
			
 
				+            "说明": str               # 综合说明（包含各模型的分数和说明）
			
 
				+        }
			
 
				+
			
 
				+    Examples:
			
 
				+        >>> # 使用默认权重 (0.5:0.5)
			
 
				+        >>> result = await compare_phrases("深度学习", "神经网络")
			
 
				+        >>> print(result['相似度'])  # 加权平均后的相似度
			
 
				+        0.82
			
 
				+
			
 
				+        >>> # 自定义权重，更倾向向量模型
			
 
				+        >>> result = await compare_phrases(
			
 
				+        ...     "深度学习", "神经网络",
			
 
				+        ...     weight_embedding=0.7,
			
 
				+        ...     weight_semantic=0.3
			
 
				+        ... )
			
 
				+
			
 
				+        >>> # 使用不同的模型
			
 
				+        >>> result = await compare_phrases(
			
 
				+        ...     "深度学习", "神经网络",
			
 
				+        ...     embedding_model="multilingual",
			
 
				+        ...     semantic_model="anthropic/claude-sonnet-4.5"
			
 
				+        ... )
			
 
				+    """
			
 
				+    # 验证权重
			
 
				+    total_weight = weight_embedding + weight_semantic
			
 
				+    if abs(total_weight - 1.0) > 0.001:
			
 
				+        raise ValueError(f"权重之和必须为1.0，当前为: {total_weight}")
			
 
				+
			
 
				+    # 并发调用两个模型
			
 
				+    embedding_task = asyncio.to_thread(
			
 
				+        compare_phrases_embedding,
			
 
				+        phrase_a=phrase_a,
			
 
				+        phrase_b=phrase_b,
			
 
				+        model_name=embedding_model,
			
 
				+        use_cache=use_cache,
			
 
				+        cache_dir=cache_dir_embedding
			
 
				+    )
			
 
				+
			
 
				+    semantic_task = compare_phrases_semantic(
			
 
				+        phrase_a=phrase_a,
			
 
				+        phrase_b=phrase_b,
			
 
				+        model_name=semantic_model,
			
 
				+        use_cache=use_cache,
			
 
				+        cache_dir=cache_dir_semantic,
			
 
				+        **semantic_kwargs
			
 
				+    )
			
 
				+
			
 
				+    # 等待两个任务完成
			
 
				+    embedding_result, semantic_result = await asyncio.gather(
			
 
				+        embedding_task,
			
 
				+        semantic_task
			
 
				+    )
			
 
				+
			
 
				+    # 提取相似度分数
			
 
				+    score_embedding = embedding_result.get("相似度", 0.0)
			
 
				+    score_semantic = semantic_result.get("相似度", 0.0)
			
 
				+
			
 
				+    # 计算加权平均
			
 
				+    final_score = (
			
 
				+        score_embedding * weight_embedding +
			
 
				+        score_semantic * weight_semantic
			
 
				+    )
			
 
				+
			
 
				+    # 生成综合说明（格式化为清晰的结构）
			
 
				+    explanation = (
			
 
				+        f"【混合相似度】{final_score:.3f}（向量模型权重{weight_embedding}，LLM模型权重{weight_semantic}）\n\n"
			
 
				+        f"【向量模型】相似度={score_embedding:.3f}\n"
			
 
				+        f"{embedding_result.get('说明', 'N/A')}\n\n"
			
 
				+        f"【LLM模型】相似度={score_semantic:.3f}\n"
			
 
				+        f"{semantic_result.get('说明', 'N/A')}"
			
 
				+    )
			
 
				+
			
 
				+    # 构建返回结果（与原接口完全一致）
			
 
				+    return {
			
 
				+        "相似度": final_score,
			
 
				+        "说明": explanation
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+def compare_phrases_sync(
			
 
				+    phrase_a: str,
			
 
				+    phrase_b: str,
			
 
				+    weight_embedding: float = 0.5,
			
 
				+    weight_semantic: float = 0.5,
			
 
				+    **kwargs
			
 
				+) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    混合相似度计算的同步版本（内部创建事件循环）
			
 
				+
			
 
				+    Args:
			
 
				+        phrase_a: 第一个短语
			
 
				+        phrase_b: 第二个短语
			
 
				+        weight_embedding: 向量模型权重，默认 0.5
			
 
				+        weight_semantic: LLM模型权重，默认 0.5
			
 
				+        **kwargs: 其他参数（同 compare_phrases）
			
 
				+
			
 
				+    Returns:
			
 
				+        同 compare_phrases
			
 
				+
			
 
				+    Examples:
			
 
				+        >>> result = compare_phrases_sync("深度学习", "神经网络")
			
 
				+        >>> print(result['相似度'])
			
 
				+    """
			
 
				+    return asyncio.run(
			
 
				+        compare_phrases(
			
 
				+            phrase_a=phrase_a,
			
 
				+            phrase_b=phrase_b,
			
 
				+            weight_embedding=weight_embedding,
			
 
				+            weight_semantic=weight_semantic,
			
 
				+            **kwargs
			
 
				+        )
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    async def main():
			
 
				+        print("=" * 80)
			
 
				+        print("混合相似度计算示例")
			
 
				+        print("=" * 80)
			
 
				+        print()
			
 
				+
			
 
				+        # 示例 1: 默认权重 (0.5:0.5)
			
 
				+        print("示例 1: 默认权重 (0.5:0.5)")
			
 
				+        print("-" * 80)
			
 
				+        result = await compare_phrases("深度学习", "神经网络")
			
 
				+        print(f"相似度: {result['相似度']:.3f}")
			
 
				+        print(f"说明:\n{result['说明']}")
			
 
				+        print()
			
 
				+
			
 
				+        # 示例 2: 不相关的短语
			
 
				+        print("示例 2: 不相关的短语")
			
 
				+        print("-" * 80)
			
 
				+        result = await compare_phrases("编程", "吃饭")
			
 
				+        print(f"相似度: {result['相似度']:.3f}")
			
 
				+        print(f"说明:\n{result['说明']}")
			
 
				+        print()
			
 
				+
			
 
				+        # 示例 3: 自定义权重，更倾向向量模型
			
 
				+        print("示例 3: 自定义权重 (向量:0.7, LLM:0.3)")
			
 
				+        print("-" * 80)
			
 
				+        result = await compare_phrases(
			
 
				+            "人工智能", "机器学习",
			
 
				+            weight_embedding=0.7,
			
 
				+            weight_semantic=0.3
			
 
				+        )
			
 
				+        print(f"相似度: {result['相似度']:.3f}")
			
 
				+        print(f"说明:\n{result['说明']}")
			
 
				+        print()
			
 
				+
			
 
				+        # 示例 4: 完整输出示例
			
 
				+        print("示例 4: 完整输出示例")
			
 
				+        print("-" * 80)
			
 
				+        result = await compare_phrases("宿命感", "余华的小说")
			
 
				+        print(f"相似度: {result['相似度']:.3f}")
			
 
				+        print(f"说明:\n{result['说明']}")
			
 
				+        print()
			
 
				+
			
 
				+        # 示例 5: 同步版本
			
 
				+        print("示例 5: 同步版本调用")
			
 
				+        print("-" * 80)
			
 
				+        result = compare_phrases_sync("Python", "编程语言")
			
 
				+        print(f"相似度: {result['相似度']:.3f}")
			
 
				+        print(f"说明:\n{result['说明']}")
			
 
				+        print()
			
 
				+
			
 
				+        print("=" * 80)
			
 
				+
			
 
				+    asyncio.run(main())
			
--- a/script/data_processing/match_inspiration_features.py
+++ b/script/data_processing/match_inspiration_features.py
@@ -18,7 +18,7 @@ from datetime import datetime
 
				 project_root = Path(__file__).parent.parent.parent
			
 
				 sys.path.insert(0, str(project_root))
			
 
				 
			
 
				-from lib.text_embedding import compare_phrases
			
 
				+from lib.hybrid_similarity import compare_phrases
			
 
				 
			
 
				 # 全局并发限制
			
 
				 MAX_CONCURRENT_REQUESTS = 100
			
@@ -119,11 +119,12 @@ async def match_single_pair(
 
				     global progress_tracker
			
 
				     sem = get_semaphore()
			
 
				     async with sem:
			
 
				-        # 使用 asyncio.to_thread 将同步函数转为异步执行
			
 
				-        similarity_result = await asyncio.to_thread(
			
 
				-            compare_phrases,
			
 
				+        # 使用混合相似度模型（异步调用）
			
 
				+        similarity_result = await compare_phrases(
			
 
				             phrase_a=feature_name,
			
 
				             phrase_b=persona_name,
			
 
				+            weight_embedding=0.5,
			
 
				+            weight_semantic=0.5
			
 
				         )
			
 
				 
			
 
				         # 更新进度
			
@@ -468,9 +469,9 @@ async def main():
 
				     with open(category_mapping_file, "r", encoding="utf-8") as f:
			
 
				         category_mapping = json.load(f)
			
 
				 
			
 
				-    # 预先加载模型（在主线程中，避免多线程冲突）
			
 
				-    print("\n预加载文本相似度模型...")
			
 
				-    await asyncio.to_thread(compare_phrases, "测试", "测试")
			
 
				+    # 预先加载模型（混合模型会自动处理）
			
 
				+    print("\n预加载混合相似度模型...")
			
 
				+    await compare_phrases("测试", "测试", weight_embedding=0.5, weight_semantic=0.5)
			
 
				     print("模型预加载完成！\n")
			
 
				 
			
 
				     # 获取任务列表
			
--- a/test_hybrid_case.py
+++ b/test_hybrid_case.py
@@ -0,0 +1,34 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+测试混合相似度计算：拟人 vs 形式
			
 
				+"""
			
 
				+
			
 
				+import asyncio
			
 
				+from lib.hybrid_similarity import compare_phrases
			
 
				+
			
 
				+
			
 
				+async def main():
			
 
				+    print("=" * 80)
			
 
				+    print("测试案例：拟人 vs 形式")
			
 
				+    print("=" * 80)
			
 
				+    print()
			
 
				+
			
 
				+    # 测试混合相似度
			
 
				+    result = await compare_phrases(
			
 
				+        phrase_a="拟人",
			
 
				+        phrase_b="形式",
			
 
				+        weight_embedding=0.5,
			
 
				+        weight_semantic=0.5
			
 
				+    )
			
 
				+
			
 
				+    print(f"相似度: {result['相似度']:.3f}")
			
 
				+    print()
			
 
				+    print("说明:")
			
 
				+    print("-" * 80)
			
 
				+    print(result['说明'])
			
 
				+    print()
			
 
				+    print("=" * 80)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    asyncio.run(main())