Переглянути джерело

refactor: 将特征匹配从relation_analyzer改为semantic_similarity

- match_inspiration_features.py: 使用compare_phrases替代analyze_relation
- visualize_how_results.py: 适配新的相似度数据格式
  - 从关系类型统计改为相似度区间统计(高/中/低)
  - 更新字段访问:relation/score/explanation → 相似度/说明
  - 注释掉旧的关系类型辅助函数

新的数据格式:
{
  "相似度": 0.75,
  "说明": "..."
}

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
yangxiaohui 1 тиждень тому
батько
коміт
69b3a1aea7

+ 11 - 5
script/data_processing/match_inspiration_features.py

@@ -17,7 +17,7 @@ import sys
 project_root = Path(__file__).parent.parent.parent
 sys.path.insert(0, str(project_root))
 
-from lib.relation_analyzer import analyze_relation
+from lib.semantic_similarity import compare_phrases
 
 # 全局并发限制
 MAX_CONCURRENT_REQUESTS = 20
@@ -46,20 +46,26 @@ async def match_single_pair(
         model_name: 使用的模型名称
 
     Returns:
-        单个匹配结果
+        单个匹配结果,格式:
+        {
+            "人设特征名称": "xxx",
+            "匹配结果": {
+                "相似度": 0.75,
+                "说明": "..."
+            }
+        }
     """
     sem = get_semaphore()
     async with sem:
         print(f"      匹配: {feature_name} <-> {persona_name}")
-        relation_result = await analyze_relation(
+        similarity_result = await compare_phrases(
             phrase_a=feature_name,
             phrase_b=persona_name,
-            model_name=model_name
         )
 
         return {
             "人设特征名称": persona_name,
-            "匹配结果": relation_result
+            "匹配结果": similarity_result
         }
 
 

+ 65 - 43
script/data_processing/visualize_how_results.py

@@ -20,32 +20,33 @@ project_root = Path(__file__).parent.parent.parent
 sys.path.insert(0, str(project_root))
 
 
-def get_relation_color(relation: str) -> str:
-    """根据关系类型返回对应的颜色"""
-    color_map = {
-        "same": "#10b981",           # 绿色 - 同义
-        "contains": "#3b82f6",       # 蓝色 - 包含
-        "contained_by": "#8b5cf6",   # 紫色 - 被包含
-        "coordinate": "#f59e0b",     # 橙色 - 同级
-        "overlap": "#ec4899",        # 粉色 - 部分重叠
-        "related": "#6366f1",        # 靛蓝 - 相关
-        "unrelated": "#9ca3af"       # 灰色 - 无关
-    }
-    return color_map.get(relation, "#9ca3af")
-
-
-def get_relation_label(relation: str) -> str:
-    """返回关系类型的中文标签"""
-    label_map = {
-        "same": "同义",
-        "contains": "包含",
-        "contained_by": "被包含",
-        "coordinate": "同级",
-        "overlap": "部分重叠",
-        "related": "相关",
-        "unrelated": "无关"
-    }
-    return label_map.get(relation, relation)
+# 注意:已改用基于相似度的显示方式,不再使用关系类型
+# def get_relation_color(relation: str) -> str:
+#     """根据关系类型返回对应的颜色"""
+#     color_map = {
+#         "same": "#10b981",           # 绿色 - 同义
+#         "contains": "#3b82f6",       # 蓝色 - 包含
+#         "contained_by": "#8b5cf6",   # 紫色 - 被包含
+#         "coordinate": "#f59e0b",     # 橙色 - 同级
+#         "overlap": "#ec4899",        # 粉色 - 部分重叠
+#         "related": "#6366f1",        # 靛蓝 - 相关
+#         "unrelated": "#9ca3af"       # 灰色 - 无关
+#     }
+#     return color_map.get(relation, "#9ca3af")
+#
+#
+# def get_relation_label(relation: str) -> str:
+#     """返回关系类型的中文标签"""
+#     label_map = {
+#         "same": "同义",
+#         "contains": "包含",
+#         "contained_by": "被包含",
+#         "coordinate": "同级",
+#         "overlap": "部分重叠",
+#         "related": "相关",
+#         "unrelated": "无关"
+#     }
+#     return label_map.get(relation, relation)
 
 
 def generate_historical_post_card_html(post_detail: Dict, inspiration_point: Dict) -> str:
@@ -266,21 +267,35 @@ def generate_match_results_html(how_steps: List[Dict], feature_idx: int, insp_id
     if category_mapping is None:
         category_mapping = {}
 
-    # 按分数排序
-    sorted_matches = sorted(match_results, key=lambda x: x.get("匹配结果", {}).get("score", 0), reverse=True)
+    # 按相似度排序
+    sorted_matches = sorted(match_results, key=lambda x: x.get("匹配结果", {}).get("相似度", 0), reverse=True)
 
-    # 统计匹配类型
-    relation_counts = {}
+    # 统计相似度分布(按区间统计)
+    similarity_ranges = {
+        "高相似 (≥0.7)": 0,
+        "中相似 (0.4-0.7)": 0,
+        "低相似 (<0.4)": 0
+    }
     for match in match_results:
-        relation = match.get("匹配结果", {}).get("relation", "unrelated")
-        relation_counts[relation] = relation_counts.get(relation, 0) + 1
+        similarity = match.get("匹配结果", {}).get("相似度", 0)
+        if similarity >= 0.7:
+            similarity_ranges["高相似 (≥0.7)"] += 1
+        elif similarity >= 0.4:
+            similarity_ranges["中相似 (0.4-0.7)"] += 1
+        else:
+            similarity_ranges["低相似 (<0.4)"] += 1
 
     # 生成统计信息
     stats_items = []
-    for relation, count in sorted(relation_counts.items(), key=lambda x: x[1], reverse=True):
-        label = get_relation_label(relation)
-        color = get_relation_color(relation)
-        stats_items.append(f'<span class="stat-badge" style="background: {color};">{label}: {count}</span>')
+    range_colors = {
+        "高相似 (≥0.7)": "#10b981",
+        "中相似 (0.4-0.7)": "#f59e0b",
+        "低相似 (<0.4)": "#9ca3af"
+    }
+    for range_name, count in similarity_ranges.items():
+        if count > 0:
+            color = range_colors[range_name]
+            stats_items.append(f'<span class="stat-badge" style="background: {color};">{range_name}: {count}</span>')
     stats_html = "".join(stats_items)
 
     # 生成匹配项
@@ -288,12 +303,19 @@ def generate_match_results_html(how_steps: List[Dict], feature_idx: int, insp_id
     for i, match in enumerate(sorted_matches):
         persona_name = match.get("人设特征名称", "")
         match_result = match.get("匹配结果", {})
-        relation = match_result.get("relation", "unrelated")
-        score = match_result.get("score", 0.0)
-        explanation = match_result.get("explanation", "")
-
-        color = get_relation_color(relation)
-        label = get_relation_label(relation)
+        similarity = match_result.get("相似度", 0.0)
+        explanation = match_result.get("说明", "")
+
+        # 根据相似度确定颜色
+        if similarity >= 0.7:
+            color = "#10b981"  # 绿色 - 高相似
+            label = "高相似"
+        elif similarity >= 0.4:
+            color = "#f59e0b"  # 橙色 - 中相似
+            label = "中相似"
+        else:
+            color = "#9ca3af"  # 灰色 - 低相似
+            label = "低相似"
 
         match_id = f"post-{post_idx}-insp-{insp_idx}-feat-{feature_idx}-match-{i}"
 
@@ -345,7 +367,7 @@ def generate_match_results_html(how_steps: List[Dict], feature_idx: int, insp_id
                     <span class="expand-icon" id="{match_id}-icon">▶</span>
                     <span class="persona-name">{categories_html} {html_module.escape(persona_name)}</span>
                     <span class="relation-badge" style="background: {color};">{label}</span>
-                    <span class="score-badge">分数: {score:.2f}</span>
+                    <span class="score-badge">相似度: {similarity:.2f}</span>
                 </div>
             </div>
             <div class="match-content" id="{match_id}-content" style="display: none;">