刘立冬 3 هفته پیش
والد
کامیت
6a279d9db4
1 فایلهای تغییر یافته به همراه 367 افزوده شده و 17 حذف شده
  1. 367 17
      visualize_stage78_with_deconstruction.py

+ 367 - 17
visualize_stage78_with_deconstruction.py

@@ -53,6 +53,65 @@ def load_persona_library(json_path: str) -> Dict[str, Any]:
         return json.load(f)
 
 
+def extract_all_features_from_how(how_json_path: str) -> Dict[str, Dict[str, Any]]:
+    """
+    从how解构结果中提取所有特征的完整信息
+
+    Returns:
+        Dict[特征名称, {
+            'similarity': float,    # 最高相似度
+            'weight': float,        # 权重
+            'dimension': str,       # 所属维度
+            'category': str         # 分类:'低相似度'(<0.4), '待搜索'(0.4~0.8), '高相似度'(>0.8);调用方可将'待搜索'更新为'已搜索'
+        }]
+    """
+    with open(how_json_path, 'r', encoding='utf-8') as f:
+        how_data = json.load(f)
+
+    features = {}
+
+    for dimension_key in ['灵感点列表', '目的点列表', '关键点列表']:
+        dimension_list = how_data.get('how解构结果', {}).get(dimension_key, [])
+        dimension_name = dimension_key.replace('列表', '')
+
+        for point in dimension_list:
+            feature_list = point.get('特征列表', [])
+
+            for feature in feature_list:
+                feature_name = feature.get('特征名称', '')
+                weight = feature.get('权重', 0)
+
+                # 查找最高相似度
+                max_similarity = 0
+                how_steps = point.get('how步骤列表', [])
+
+                for step in how_steps:
+                    step_features = step.get('特征列表', [])
+                    for step_feature in step_features:
+                        if step_feature.get('特征名称') == feature_name:
+                            matches = step_feature.get('匹配结果', [])
+                            for match in matches:
+                                similarity = match.get('匹配结果', {}).get('相似度', 0)
+                                max_similarity = max(max_similarity, similarity)
+
+                # 确定分类
+                if max_similarity < 0.4:
+                    category = '低相似度'
+                elif 0.4 <= max_similarity <= 0.8:
+                    category = '待搜索'  # 后续会根据Stage6数据更新为'已搜索'
+                else:
+                    category = '高相似度'  # >0.8,孤立点
+
+                features[feature_name] = {
+                    'similarity': max_similarity,
+                    'weight': weight,
+                    'dimension': dimension_name,
+                    'category': category
+                }
+
+    return features
+
+
 def is_category_or_feature(persona_name: str, persona_data: Dict[str, Any]) -> str:
     """
     递归判断人设项是特征还是分类
@@ -100,12 +159,16 @@ def is_category_or_feature(persona_name: str, persona_data: Dict[str, Any]) -> s
     return 'feature'
 
 
-def extract_relationship_data(how_json_path: str, persona_data: Dict[str, Any]) -> List[Dict[str, Any]]:
+def extract_relationship_data(how_json_path: str, persona_data: Dict[str, Any]) -> tuple:
     """
     从how解构结果中提取关系数据
 
     Returns:
-        List of relationship data with format:
+        Tuple of (relationships, all_high_matches):
+        - relationships: 最高相似度匹配的关系列表
+        - all_high_matches: 所有相似度>0.8的匹配列表
+
+        Data format:
         {
             'post_feature': str,  # 帖子特征名称
             'dimension': str,     # 所属维度(灵感点/目的点/关键点)
@@ -119,6 +182,7 @@ def extract_relationship_data(how_json_path: str, persona_data: Dict[str, Any])
         how_data = json.load(f)
 
     relationships = []
+    all_high_matches = []  # 所有相似度>0.8的匹配
 
     # 遍历how解构结果
     for dimension_key in ['灵感点列表', '目的点列表', '关键点列表']:
@@ -144,6 +208,21 @@ def extract_relationship_data(how_json_path: str, persona_data: Dict[str, Any])
                             matches = step_feature.get('匹配结果', [])
                             for match in matches:
                                 similarity = match.get('匹配结果', {}).get('相似度', 0)
+                                persona_name = match.get('人设特征名称', '')
+
+                                # 收集所有相似度>0.8的匹配
+                                if similarity > 0.8:
+                                    item_type = is_category_or_feature(persona_name, persona_data)
+                                    all_high_matches.append({
+                                        'post_feature': feature_name,
+                                        'dimension': dimension_name,
+                                        'weight': weight,
+                                        'persona_item': persona_name,
+                                        'similarity': similarity,
+                                        'item_type': item_type
+                                    })
+
+                                # 追踪最高相似度
                                 if similarity > max_similarity:
                                     max_similarity = similarity
                                     max_match = match
@@ -162,15 +241,17 @@ def extract_relationship_data(how_json_path: str, persona_data: Dict[str, Any])
                         'item_type': item_type
                     })
 
-    return relationships
+    return relationships, all_high_matches
 
 
-def generate_relationship_graph_html(relationships: List[Dict[str, Any]]) -> str:
+def generate_relationship_graph_html(relationships: List[Dict[str, Any]],
+                                    all_high_matches: List[Dict[str, Any]]) -> str:
     """
     生成关系图的SVG HTML代码
 
     Args:
-        relationships: extract_relationship_data返回的关系数据列表
+        relationships: 最高相似度匹配的关系数据列表
+        all_high_matches: 所有相似度>0.8的匹配列表
 
     Returns:
         HTML字符串(包含SVG)
@@ -178,8 +259,18 @@ def generate_relationship_graph_html(relationships: List[Dict[str, Any]]) -> str
     if not relationships:
         return '<div style="padding: 40px; text-align: center; color: #6b7280;">暂无关系数据</div>'
 
-    # 提取唯一的人设项和帖子特征
-    persona_items = list({(r['persona_item'], r['item_type'], r['similarity']) for r in relationships})
+    # 合并所有需要显示的人设项(去重)
+    all_personas = set()
+
+    # 添加relationships中的人设项
+    for r in relationships:
+        all_personas.add((r['persona_item'], r['item_type'], r['similarity']))
+
+    # 添加所有>0.8的人设项
+    for match in all_high_matches:
+        all_personas.add((match['persona_item'], match['item_type'], match['similarity']))
+
+    persona_items = list(all_personas)
 
     # 为每个帖子特征关联最高相似度
     post_feature_map = {}
@@ -266,13 +357,44 @@ def generate_relationship_graph_html(relationships: List[Dict[str, Any]]) -> str
                 </marker>
             </defs>
 
+            <!-- 左右标记 -->
+            <g class="labels">
+                <!-- 人设标记 -->
+                <text x="{left_margin}" y="30"
+                      font-size="18" font-weight="600" fill="#6b7280"
+                      text-anchor="middle">人设</text>
+
+                <!-- 帖子标记 -->
+                <text x="{left_margin + middle_space}" y="30"
+                      font-size="18" font-weight="600" fill="#6b7280"
+                      text-anchor="middle">帖子</text>
+            </g>
+
             <!-- 连接线 -->
             <g class="connections">
     ''']
 
+    # 合并所有需要绘制的连接(去重)
+    all_connections = []
+    connection_keys = set()
+
+    # 添加主要关系(最高相似度匹配)
+    for rel in relationships:
+        key = (rel['persona_item'], rel['post_feature'])
+        if key not in connection_keys:
+            all_connections.append(rel)
+            connection_keys.add(key)
+
+    # 添加额外的>0.8关系
+    for match in all_high_matches:
+        key = (match['persona_item'], match['post_feature'])
+        if key not in connection_keys:
+            all_connections.append(match)
+            connection_keys.add(key)
+
     # 绘制连接线
     line_idx = 0  # 计数器用于标签错开
-    for rel in relationships:
+    for rel in all_connections:
         persona_pos = persona_positions.get(rel['persona_item'])
         post_pos = post_positions.get(rel['post_feature'])
 
@@ -484,6 +606,8 @@ def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
 def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
                   stage7_mapping: Dict[str, Any], stage8_mapping: Dict[str, Any],
                   relationship_graph_html: str,
+                  all_features: Dict[str, Dict[str, Any]],
+                  low_similarity_features: List[Dict[str, Any]],
                   output_path: str):
     """生成HTML可视化页面"""
 
@@ -491,6 +615,8 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
     data_json = json.dumps(data, ensure_ascii=False, indent=2)
     stage7_json = json.dumps(stage7_mapping, ensure_ascii=False, indent=2)
     stage8_json = json.dumps(stage8_mapping, ensure_ascii=False, indent=2)
+    all_features_json = json.dumps(all_features, ensure_ascii=False, indent=2)
+    low_similarity_json = json.dumps(low_similarity_features, ensure_ascii=False, indent=2)
 
     html_content = f'''<!DOCTYPE html>
 <html lang="zh-CN">
@@ -2159,6 +2285,117 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
         .connection-line.dimmed {{
             opacity: 0.1 !important;
         }}
+
+        /* ========== 低相似度特征样式 ========== */
+        .low-similarity-section {{
+            margin-top: 20px;
+            border-top: 2px solid #fca5a5;
+            padding-top: 15px;
+        }}
+
+        .low-similarity-header {{
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            padding: 12px 15px;
+            background: linear-gradient(135deg, #fee2e2 0%, #fef2f2 100%);
+            border-radius: 8px;
+            cursor: pointer;
+            border: 2px solid #fca5a5;
+            transition: all 0.3s ease;
+        }}
+
+        .low-similarity-header:hover {{
+            background: linear-gradient(135deg, #fecaca 0%, #fee2e2 100%);
+            border-color: #f87171;
+        }}
+
+        .low-similarity-title {{
+            font-size: 14px;
+            font-weight: 600;
+            color: #dc2626;
+        }}
+
+        .low-similarity-count {{
+            font-size: 12px;
+            font-weight: 600;
+            color: #991b1b;
+            background: white;
+            padding: 2px 8px;
+            border-radius: 12px;
+        }}
+
+        .toggle-icon {{
+            font-size: 12px;
+            color: #dc2626;
+            transition: transform 0.3s ease;
+        }}
+
+        .toggle-icon.rotated {{
+            transform: rotate(180deg);
+        }}
+
+        .low-similarity-list {{
+            margin-top: 10px;
+        }}
+
+        .low-similarity-item {{
+            padding: 10px 12px;
+            margin: 8px 0;
+            background: white;
+            border: 2px solid #fecaca;
+            border-radius: 6px;
+            cursor: pointer;
+            transition: all 0.2s ease;
+        }}
+
+        .low-similarity-item:hover {{
+            border-color: #f87171;
+            background: #fef2f2;
+        }}
+
+        .low-feature-header {{
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            margin-bottom: 4px;
+        }}
+
+        .low-feature-name {{
+            font-size: 13px;
+            font-weight: 600;
+            color: #991b1b;
+            flex: 1;
+        }}
+
+        .low-feature-score {{
+            font-size: 14px;
+            font-weight: 700;
+            color: #dc2626;
+            background: #fee2e2;
+            padding: 2px 8px;
+            border-radius: 4px;
+        }}
+
+        .low-feature-meta {{
+            font-size: 11px;
+            color: #9ca3af;
+            margin-bottom: 6px;
+        }}
+
+        .low-feature-detail {{
+            padding: 8px;
+            background: #fef2f2;
+            border-radius: 4px;
+            border-left: 3px solid #f87171;
+            margin-top: 8px;
+        }}
+
+        .low-feature-reason {{
+            font-size: 12px;
+            color: #7f1d1d;
+            line-height: 1.6;
+        }}
     </style>
 </head>
 <body>
@@ -2241,7 +2478,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
     <!-- Tab导航 -->
     <div class="tab-navigation">
         <button class="tab-button active" onclick="switchTab('search')">📋 搜索结果</button>
-        <button class="tab-button" onclick="switchTab('relationship')">📊 关系图</button>
+        <button class="tab-button" onclick="switchTab('relationship')">📊 人设-帖子匹配</button>
     </div>
 
     <!-- Tab内容区域 -->
@@ -2261,7 +2498,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
         <!-- Tab2: 关系图 -->
         <div class="tab-pane" id="tab-relationship">
             <div class="relationship-section">
-                <div class="section-header">📊 帖子-人设关系图</div>
+                <div class="section-header">📊 人设-帖子关系图</div>
                 {relationship_graph_html}
             </div>
         </div>
@@ -2310,6 +2547,8 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
         const data = {data_json};
         const stage7Data = {stage7_json};
         const stage8Data = {stage8_json};
+        const allFeatures = {all_features_json};
+        const lowSimilarityFeatures = {low_similarity_json};
         let currentFilter = 'all';
 
         // Tab切换功能
@@ -2431,10 +2670,20 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
                     totalSearches += (group['top10_searches'] || []).length;
                 }});
 
+                // 从allFeatures获取特征信息
+                const featureName = feature['原始特征名称'];
+                const featureInfo = allFeatures[featureName] || {{}};
+                const similarity = featureInfo.similarity || 0;
+                const weight = featureInfo.weight || 0;
+                const dimension = featureInfo.dimension || '';
+
                 html += `
                     <div class="feature-group">
                         <div class="feature-header" onclick="toggleFeature(${{featureIdx}})" id="feature-header-${{featureIdx}}">
-                            <div class="feature-title post-target-word">📝 ${{feature['原始特征名称']}}</div>
+                            <div class="feature-title post-target-word">📝 ${{featureName}}</div>
+                            <div class="feature-meta" style="font-size:11px;color:#9ca3af;margin-top:4px;">
+                                相似度: ${{similarity.toFixed(2)}} · ${{dimension}}
+                            </div>
                         </div>
                         <div class="search-words-list" id="search-words-${{featureIdx}}">
                 `;
@@ -2455,7 +2704,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
                                 <div class="base-word-meta">相似度: ${{baseSimilarity.toFixed(2)}} · ${{searches.length}}个搜索词</div>
                             </div>
                             <div class="base-word-desc" id="base-word-desc-${{featureIdx}}-${{groupIdx}}">
-                                ${{relatedWordNames || '无相关词汇'}}
+                                <span style="color:#92400e;font-weight:600;">来源人设候选特征/分类: </span>${{relatedWordNames || '无相关词汇'}}
                             </div>
                             <div class="search-words-sublist" id="search-words-sublist-${{featureIdx}}-${{groupIdx}}">
                     `;
@@ -2533,6 +2782,49 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
                 `;
             }});
 
+            // 添加低相似度特征板块
+            if (lowSimilarityFeatures && lowSimilarityFeatures.length > 0) {{
+                html += `
+                    <div class="low-similarity-section">
+                        <div class="low-similarity-header" onclick="toggleLowSimilarity()">
+                            <div class="low-similarity-title">🔴 低相似度特征(未搜索)</div>
+                            <div class="low-similarity-count">${{lowSimilarityFeatures.length}}个</div>
+                            <div class="toggle-icon" id="low-similarity-toggle">▼</div>
+                        </div>
+                        <div class="low-similarity-list" id="low-similarity-list" style="display:none;">
+                `;
+
+                lowSimilarityFeatures.forEach((feature, idx) => {{
+                    const name = feature.name || '';
+                    const similarity = feature.similarity || 0;
+                    const weight = feature.weight || 0;
+                    const dimension = feature.dimension || '';
+
+                    html += `
+                        <div class="low-similarity-item" onclick="toggleLowFeatureDetail(${{idx}})" id="low-feature-${{idx}}">
+                            <div class="low-feature-header">
+                                <div class="low-feature-name">📝 ${{name}}</div>
+                                <div class="low-feature-score">${{similarity.toFixed(2)}}</div>
+                            </div>
+                            <div class="low-feature-meta">
+                                ${{dimension}}
+                            </div>
+                            <div class="low-feature-detail" id="low-feature-detail-${{idx}}" style="display:none;">
+                                <div class="low-feature-reason">
+                                    ❌ 相似度 ${{similarity.toFixed(2)}} &lt; 0.4<br>
+                                    低于搜索阈值,未执行搜索
+                                </div>
+                            </div>
+                        </div>
+                    `;
+                }});
+
+                html += `
+                        </div>
+                    </div>
+                `;
+            }}
+
             sidebar.innerHTML = html;
         }}
 
@@ -3194,6 +3486,35 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
             }}
         }}
 
+        // 切换低相似度特征板块展开/折叠
+        function toggleLowSimilarity() {{
+            const list = document.getElementById('low-similarity-list');
+            const toggle = document.getElementById('low-similarity-toggle');
+
+            if (list && toggle) {{
+                if (list.style.display === 'none') {{
+                    list.style.display = 'block';
+                    toggle.classList.add('rotated');
+                }} else {{
+                    list.style.display = 'none';
+                    toggle.classList.remove('rotated');
+                }}
+            }}
+        }}
+
+        // 切换单个低相似度特征的详情展开/折叠
+        function toggleLowFeatureDetail(idx) {{
+            const detail = document.getElementById(`low-feature-detail-${{idx}}`);
+
+            if (detail) {{
+                if (detail.style.display === 'none') {{
+                    detail.style.display = 'block';
+                }} else {{
+                    detail.style.display = 'none';
+                }}
+            }}
+        }}
+
         function filterNotes(category) {{
             currentFilter = category;
 
@@ -3459,7 +3780,7 @@ def main():
     print(f"  - 完全匹配: {stats['match_complete']} ({stats['complete_rate']}%)")
 
     # 生成关系图
-    print(f"\n📊 生成帖子-人设关系图...")
+    print(f"\n📊 生成人设-帖子关系图...")
     relationship_graph_html = ""
     try:
         print(f"📖 加载人设库: {persona_library_path}")
@@ -3467,10 +3788,11 @@ def main():
         print(f"✓ 加载人设库完成")
 
         print(f"📖 提取关系数据: {how_json_path}")
-        relationships = extract_relationship_data(how_json_path, persona_data)
-        print(f"✓ 提取了 {len(relationships)} 个关系")
+        relationships, all_high_matches = extract_relationship_data(how_json_path, persona_data)
+        print(f"✓ 提取了 {len(relationships)} 个最高匹配关系")
+        print(f"✓ 提取了 {len(all_high_matches)} 个高相似度(>0.8)匹配")
 
-        relationship_graph_html = generate_relationship_graph_html(relationships)
+        relationship_graph_html = generate_relationship_graph_html(relationships, all_high_matches)
         print(f"✓ 关系图生成完成")
     except FileNotFoundError as e:
         print(f"⚠️ 警告: 无法生成关系图 - {e}")
@@ -3479,8 +3801,36 @@ def main():
         print(f"⚠️ 警告: 生成关系图时出错 - {e}")
         relationship_graph_html = f'<div style="padding: 40px; text-align: center; color: #ef4444;">关系图生成失败: {e}</div>'
 
+    # 提取所有特征信息(包括低相似度特征)
+    print(f"\n📊 提取所有特征信息...")
+    all_features = {}
+    low_similarity_features = []
+    try:
+        all_features = extract_all_features_from_how(how_json_path)
+
+        # 标记Stage6中已搜索的特征
+        stage6_feature_names = set([item.get('原始特征名称') for item in data])
+        for feature_name in stage6_feature_names:
+            if feature_name in all_features and all_features[feature_name]['category'] == '待搜索':
+                all_features[feature_name]['category'] = '已搜索'
+
+        # 提取低相似度特征(<0.4)
+        low_similarity_features = [
+            {'name': name, **info}
+            for name, info in all_features.items()
+            if info['category'] == '低相似度'
+        ]
+        # 按相似度降序排序
+        low_similarity_features.sort(key=lambda x: x['similarity'], reverse=True)
+
+        print(f"✓ 提取了 {len(all_features)} 个特征")
+        print(f"  - 低相似度特征: {len(low_similarity_features)} 个")
+    except Exception as e:
+        print(f"⚠️ 警告: 提取特征信息时出错 - {e}")
+        low_similarity_features = []
+
     print(f"\n🎨 生成可视化页面...")
-    generate_html(data, stats, stage7_mapping, stage8_mapping, relationship_graph_html, output_path)
+    generate_html(data, stats, stage7_mapping, stage8_mapping, relationship_graph_html, all_features, low_similarity_features, output_path)
     print(f"✓ 生成完成: {output_path}")
 
     print(f"\n🌐 在浏览器中打开查看:")