刘立冬 committed 3 weeks ago
parent · commit e488de4152
4 files changed, 1468 additions and 231 deletions
  1. enhanced_search_v2.py (+371 -186)
  2. llm_evaluator.py (+201 -39)
  3. visualize_stage5_results.py (+818 -0)
  4. xiaohongshu_search.py (+78 -6)

enhanced_search_v2.py (+371 -186)

@@ -11,9 +11,11 @@ import copy
 import time
 import os
 import argparse
+import subprocess
 from typing import Dict, List, Any, Optional, Set, Tuple
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from itertools import combinations
 
 from openrouter_client import OpenRouterClient
 from llm_evaluator import LLMEvaluator
@@ -41,7 +43,10 @@ class EnhancedSearchV2:
         dimension_associations_path: str,
         optimized_clustered_data_path: str,
         openrouter_api_key: Optional[str] = None,
-        output_dir: str = "output_v2"
+        output_dir: str = "output_v2",
+        top_n: int = 10,
+        max_total_searches: Optional[int] = None,
+        search_max_workers: int = 3
     ):
         """
         初始化系统
@@ -52,11 +57,17 @@ class EnhancedSearchV2:
             optimized_clustered_data_path: 人设特征库路径
             openrouter_api_key: OpenRouter API密钥
             output_dir: 输出目录
+            top_n: 每个原始特征取评分最高的N个搜索词(默认10)
+            max_total_searches: 全局最大搜索次数限制(默认None不限制)
+            search_max_workers: 搜索并发数(默认3)
         """
         self.how_json_path = how_json_path
         self.dimension_associations_path = dimension_associations_path
         self.optimized_clustered_data_path = optimized_clustered_data_path
         self.output_dir = output_dir
+        self.top_n = top_n
+        self.max_total_searches = max_total_searches
+        self.search_max_workers = search_max_workers
 
         # 创建输出目录
         os.makedirs(output_dir, exist_ok=True)
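
For context, a hypothetical instantiation showing the three knobs this commit adds to the constructor; the paths and cap value below are placeholders, not the repo's actual data files:

```python
# Hedged usage sketch: paths and the search cap are illustrative only.
from enhanced_search_v2 import EnhancedSearchV2

system = EnhancedSearchV2(
    how_json_path="example_how.json",
    dimension_associations_path="dimension_associations.json",
    optimized_clustered_data_path="optimized_clustered_data.json",
    output_dir="output_v2",
    top_n=10,                 # keep the 10 best-scored search words per feature
    max_total_searches=100,   # global cap across all features (None = unlimited)
    search_max_workers=3,     # concurrent Xiaohongshu searches in stage 5
)
```
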
@@ -572,38 +583,95 @@ class EnhancedSearchV2:
             'sub_classifications': sub_classifications
         }
 
-    # ========== 阶段3:提取特征列表 ==========
+    # ========== 阶段3:筛选高相似度匹配(>0.8) ==========
 
-    def stage3_extract_features(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def stage3_filter_high_similarity_matches(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """
-        阶段3:从关联分类中提取特征列表
+        阶段3:筛选高相似度匹配(>0.8)
+
+        遍历how解构中的所有原始特征,找出匹配结果中相似度>0.8
+        且人设特征名称在Stage2关联范围内的高质量匹配
 
         Args:
             associations_data: 阶段2的关联数据
 
         Returns:
-            带特征列表的数据
+            带高相似度候选的数据
         """
         logger.info("=" * 60)
-        logger.info("阶段3:提取特征列表")
+        logger.info("阶段3:筛选高相似度匹配(>0.8)")
         logger.info("=" * 60)
 
         for idx, feature_result in enumerate(associations_data, 1):
-            logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {feature_result['原始特征名称']}")
+            original_feature_name = feature_result['原始特征名称']
+            logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {original_feature_name}")
 
-            for assoc in feature_result.get('找到的关联', []):
-                target_path = assoc['目标分类路径']
-                logger.info(f"  提取特征: {target_path}")
+            # 步骤1: 收集Stage2的关联范围(分类名+标签)
+            stage2_scope = self._collect_stage2_scope(feature_result)
+            logger.info(f"  Stage2范围包含 {len(stage2_scope)} 个分类/标签")
 
-                # 提取特征
-                features = self._find_features_by_path(target_path)
+            # 步骤2: 遍历how解构中的所有原始特征,找出高相似度匹配
+            high_sim_candidates = []
+            total_checked = 0
+            high_sim_found = 0
+
+            how_result = self.how_data.get('how解构结果', {})
+            for level_name, level_list in how_result.items():
+                if not isinstance(level_list, list):
+                    continue
 
-                # 添加到关联中
-                assoc['特征列表'] = features
-                logger.info(f"    找到 {len(features)} 个特征")
+                for item in level_list:
+                    for step in item.get('how步骤列表', []):
+                        for feature in step.get('特征列表', []):
+                            # 获取该特征的所有匹配
+                            matches = feature.get('匹配结果', [])
+                            total_checked += len(matches)
+
+                            # 筛选相似度>0.8且在Stage2范围内的匹配
+                            for match in matches:
+                                sim = match.get('匹配结果', {}).get('相似度', 0)
+                                persona_feature_name = match.get('人设特征名称', '')
+
+                                if sim > 0.8 and persona_feature_name in stage2_scope:
+                                    high_sim_found += 1
+                                    # 记录来源信息
+                                    high_sim_candidates.append({
+                                        '人设特征名称': persona_feature_name,
+                                        '相似度': sim,
+                                        '特征类型': match.get('特征类型', ''),
+                                        '特征分类': match.get('特征分类', []),
+                                        '人设特征层级': match.get('人设特征层级', ''),
+                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                        '来源原始特征': feature.get('特征名称', '')  # 记录来自哪个原始特征
+                                    })
+
+            logger.info(f"  检查了 {total_checked} 个匹配")
+            logger.info(f"  找到 {high_sim_found} 个相似度>0.8的匹配")
+
+            # 按相似度降序排序,并去重(同一个人设特征名称只保留最高分)
+            seen_names = set()
+            unique_candidates = []
+            high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
+
+            for candidate in high_sim_candidates:
+                name = candidate['人设特征名称']
+                if name not in seen_names:
+                    seen_names.add(name)
+                    unique_candidates.append(candidate)
+
+            # 添加到结果中
+            feature_result['高相似度候选'] = unique_candidates
+            logger.info(f"  去重后筛选出 {len(unique_candidates)} 个高相似度候选")
+
+            # 显示前5个
+            if unique_candidates:
+                logger.info(f"  Top 5:")
+                for c in unique_candidates[:5]:
+                    logger.info(f"    • {c['人设特征名称']} ({c['相似度']:.3f}) ← 来自\"{c['来源原始特征']}\"")
 
         # 保存结果
-        output_path = os.path.join(self.output_dir, "stage3_features.json")
+        output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
         self._save_json(associations_data, output_path)
 
         logger.info(f"\n" + "=" * 60)
@@ -612,6 +680,29 @@ class EnhancedSearchV2:
 
         return associations_data
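
The filtering above keeps only the best-scoring entry per 人设特征名称: the list is sorted by 相似度 descending first, so the first occurrence of each name is necessarily its highest score. A minimal standalone sketch of that sort-then-dedup idiom:

```python
# Self-contained sketch of the sort-then-dedup pattern used in stage 3
# (example values are illustrative).
candidates = [
    {"人设特征名称": "露营", "相似度": 0.82},
    {"人设特征名称": "徒步", "相似度": 0.91},
    {"人设特征名称": "露营", "相似度": 0.95},
]
candidates.sort(key=lambda c: c["相似度"], reverse=True)

seen, unique = set(), []
for c in candidates:
    if c["人设特征名称"] not in seen:
        seen.add(c["人设特征名称"])
        unique.append(c)

print(unique)
# [{'人设特征名称': '露营', '相似度': 0.95}, {'人设特征名称': '徒步', '相似度': 0.91}]
```
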
 
+
+    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+        """
+        收集Stage2找到的所有分类名和标签,形成范围集合
+
+        Args:
+            feature_result: 特征结果数据
+
+        Returns:
+            包含所有分类名和标签的集合
+        """
+        scope = set()
+
+        for assoc in feature_result.get('找到的关联', []):
+            # 添加分类名
+            scope.add(assoc['分类名称'])
+
+            # 添加所有标签
+            tags = assoc.get('标签列表', [])
+            scope.update(tags)
+
+        return scope
+
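
What `_collect_stage2_scope` produces for a hypothetical Stage 2 record (illustrative data, not from the repo):

```python
# Illustrative input: one 找到的关联 list with classification names and tags.
feature_result = {
    "找到的关联": [
        {"分类名称": "户外活动", "标签列表": ["露营", "徒步"]},
        {"分类名称": "穿搭", "标签列表": ["机能风"]},
    ]
}

scope = set()
for assoc in feature_result["找到的关联"]:
    scope.add(assoc["分类名称"])             # classification name
    scope.update(assoc.get("标签列表", []))   # all tags

print(scope)  # {'户外活动', '露营', '徒步', '穿搭', '机能风'} (set order varies)
```
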
     def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
         """
         根据路径查找特征列表
@@ -630,56 +721,63 @@ class EnhancedSearchV2:
         # 深拷贝
         return copy.deepcopy(features)
 
-    # ========== 阶段4:生成搜索词 + LLM评估质量 ==========
+    # ========== 阶段4:多词组合 + LLM评估 ==========
 
     def stage4_generate_and_evaluate_search_words(
         self,
-        features_data: List[Dict[str, Any]]
+        features_data: List[Dict[str, Any]],
+        max_workers: int = 4,
+        max_candidates: int = 20,
+        max_combo_length: int = 4
     ) -> List[Dict[str, Any]]:
         """
-        阶段4:生成搜索词并用LLM评估质量
+        阶段4:多词组合 + LLM评估
+
+        基于Stage1的基础词和Stage3的高相似度候选,
+        生成所有2-N词组合,通过LLM评估选出Top10
 
         Args:
-            features_data: 阶段3的特征数据
+            features_data: 阶段3的数据(包含高相似度候选)
+            max_workers: 并发评估的原始特征数(默认4)
+            max_candidates: 参与组合的最大候选词数(默认20)
+            max_combo_length: 最大组合词数(默认4,即基础词+3个候选)
 
         Returns:
             带LLM评估的数据
         """
         logger.info("=" * 60)
-        logger.info("阶段4:生成搜索词 + LLM评估质量")
+        logger.info("阶段4:多词组合 + LLM评估")
+        logger.info(f"  最大候选词数: {max_candidates}")
+        logger.info(f"  最大组合长度: {max_combo_length} 词")
+        logger.info(f"  并发数: {max_workers} 个原始特征")
         logger.info("=" * 60)
 
-        for idx, feature_result in enumerate(features_data, 1):
-            logger.info(f"\n[{idx}/{len(features_data)}] 处理: {feature_result['原始特征名称']}")
-
-            # 生成搜索词
-            self._add_search_words(feature_result)
-
-            # 收集所有搜索词
-            all_search_words = self._collect_all_search_words(feature_result)
-
-            if not all_search_words:
-                logger.info(f"  无搜索词,跳过")
-                continue
-
-            logger.info(f"  生成 {len(all_search_words)} 个搜索词")
-
-            # LLM分批评估(每10个一批)
-            logger.info(f"  开始LLM评估...")
-            original_feature = feature_result['原始特征名称']
-            evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
-                original_feature=original_feature,
-                search_words=[sw['search_word'] for sw in all_search_words],
-                batch_size=10
-            )
+        total_features = len(features_data)
 
-            # 将评估结果写回到特征节点
-            self._write_back_evaluations(feature_result, evaluated)
+        # 使用ThreadPoolExecutor并行处理不同的原始特征
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # 提交所有任务
+            futures = []
+            for idx, feature_result in enumerate(features_data, 1):
+                future = executor.submit(
+                    self._process_single_feature_combinations,
+                    idx,
+                    total_features,
+                    feature_result,
+                    max_candidates,
+                    max_combo_length
+                )
+                futures.append((future, feature_result))
 
-            logger.info(f"  评估完成,最高分: {evaluated[0]['score']:.3f}")
+            # 等待所有任务完成并收集结果
+            for future, feature_result in futures:
+                try:
+                    _ = future.result()  # 等待完成,结果已经写回到feature_result中
+                except Exception as e:
+                    logger.error(f"  评估失败: {feature_result['原始特征名称']}, 错误: {e}")
 
         # 保存结果
-        output_path = os.path.join(self.output_dir, "stage4_with_llm_scores.json")
+        output_path = os.path.join(self.output_dir, "stage4_combinations_evaluated.json")
         self._save_json(features_data, output_path)
 
         logger.info(f"\n" + "=" * 60)
@@ -688,95 +786,152 @@ class EnhancedSearchV2:
 
         return features_data
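
Stage 4's concurrency works because each worker mutates only its own `feature_result` dict, so no locking is needed and `future.result()` exists purely to surface worker exceptions. The pattern in isolation:

```python
# Self-contained sketch of the write-back-into-own-dict pattern.
from concurrent.futures import ThreadPoolExecutor

def process(task: dict) -> None:
    task["result"] = task["n"] * task["n"]  # each worker touches only its task

tasks = [{"n": n} for n in range(8)]
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(process, t) for t in tasks]
    for f in futures:
        f.result()  # re-raises any worker exception in the parent thread

print([t["result"] for t in tasks])  # [0, 1, 4, 9, 16, 25, 36, 49]
```
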
 
-    def _add_search_words(self, result: Dict[str, Any]):
+    def _process_single_feature_combinations(
+        self,
+        idx: int,
+        total: int,
+        feature_result: Dict[str, Any],
+        max_candidates: int,
+        max_combo_length: int
+    ) -> None:
         """
-        为结果项添加search_word字段(去重)
+        处理单个原始特征的组合生成和评估
+
+        Steps:
+        1. Get base_word from Stage1's 最高匹配信息
+        2. Get candidates from Stage3's 高相似度候选 (top max_candidates)
+        3. Generate 2-N word combinations
+        4. LLM batch evaluation
+        5. Select Top 10 and write back
 
         Args:
-            result: 单个结果项
+            idx: 特征索引
+            total: 总特征数
+            feature_result: 特征结果数据
+            max_candidates: 参与组合的最大候选词数
+            max_combo_length: 最大组合词数
         """
-        # 获取基础词(人设特征名称)
-        base_word = result.get('最高匹配信息', {}).get('人设特征名称', '')
+        original_feature = feature_result['原始特征名称']
+        logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")
 
+        # 步骤1: 获取基础词
+        base_word = feature_result.get('最高匹配信息', {}).get('人设特征名称', '')
         if not base_word:
+            logger.info(f"  无基础词,跳过")
+            feature_result['组合评估结果'] = []
             return
 
-        # 去重集合(在当前结果项范围内)
-        seen_words: Set[str] = set()
+        logger.info(f"  基础词: {base_word}")
 
-        # 遍历所有关联的特征列表
-        for assoc in result.get('找到的关联', []):
-            for feature in assoc.get('特征列表', []):
-                feature_name = feature.get('特征名称', '')
+        # 步骤2: 获取候选词(从高相似度候选中)
+        high_sim_candidates = feature_result.get('高相似度候选', [])
 
-                if not feature_name:
-                    feature['search_word'] = None
-                    continue
+        # 限制候选词数量
+        candidates = high_sim_candidates[:max_candidates]
+        candidate_words = [c['人设特征名称'] for c in candidates]
 
-                # 生成组合词
-                search_word = f"{base_word} {feature_name}"
+        if not candidate_words:
+            logger.info(f"  无候选词,跳过")
+            feature_result['组合评估结果'] = []
+            return
 
-                # 检查是否重复
-                if search_word not in seen_words:
-                    feature['search_word'] = search_word
-                    seen_words.add(search_word)
-                else:
-                    feature['search_word'] = None
+        logger.info(f"  候选词数量: {len(candidate_words)} (限制: {max_candidates})")
+
+        # 步骤3: 生成所有组合
+        all_combinations = []
+
+        # 生成1词到max_combo_length-1词的候选词组合(因为还要加上base_word)
+        for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
+            for combo in combinations(candidate_words, length):
+                # 组合成搜索词:基础词 + 候选词组合
+                search_phrase = base_word + ' ' + ' '.join(combo)
+                all_combinations.append({
+                    'search_word': search_phrase,
+                    'base_word': base_word,
+                    'candidate_words': list(combo),
+                    'combo_length': length + 1  # +1 因为包含base_word
+                })
 
-    def _collect_all_search_words(self, feature_result: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """
-        收集结果项中所有非空的search_word
+        logger.info(f"  生成 {len(all_combinations)} 个组合")
 
-        Args:
-            feature_result: 结果项
+        # 步骤4: LLM批量评估
+        logger.info(f"  开始LLM评估...")
+        evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
+            original_feature=original_feature,
+            search_words=[c['search_word'] for c in all_combinations],
+            batch_size=50
+        )
 
-        Returns:
-            搜索词列表,每个包含 search_word 和特征引用
-        """
-        search_words = []
-
-        for assoc_idx, assoc in enumerate(feature_result.get('找到的关联', [])):
-            for feat_idx, feature in enumerate(assoc.get('特征列表', [])):
-                sw = feature.get('search_word')
-                if sw and sw.strip():
-                    search_words.append({
-                        'search_word': sw,
-                        'assoc_idx': assoc_idx,
-                        'feat_idx': feat_idx,
-                        'feature_ref': feature  # 引用,方便写回
-                    })
+        # 步骤5: 选出Top 10
+        top_10 = evaluated[:10]
+
+        # 写回结果
+        feature_result['组合评估结果'] = top_10
 
-        return search_words
+        max_score = top_10[0]['score'] if top_10 else 0.0
+        logger.info(f"  评估完成,Top 10 最高分: {max_score:.3f}")
 
-    def _write_back_evaluations(
+    # ========== 阶段5:执行搜索 ==========
+
+    def _execute_single_search(
         self,
-        feature_result: Dict[str, Any],
-        evaluated: List[Dict[str, Any]]
-    ):
+        idx: int,
+        total: int,
+        search_word: str,
+        feature_ref: Dict[str, Any]
+    ) -> Dict[str, Any]:
         """
-        将LLM评估结果写回到特征节点
+        执行单个搜索任务(用于并发执行)
 
         Args:
-            feature_result: 结果项
-            evaluated: 评估结果列表
+            idx: 搜索索引
+            total: 总搜索数
+            search_word: 搜索词
+            feature_ref: 特征引用(用于写入结果)
+
+        Returns:
+            搜索结果信息
         """
-        # 创建查找映射
-        eval_map = {e['search_word']: e for e in evaluated}
+        logger.info(f"[{idx}/{total}] 搜索: {search_word}")
 
-        # 写回到特征节点
-        for assoc in feature_result.get('找到的关联', []):
-            for feature in assoc.get('特征列表', []):
-                sw = feature.get('search_word')
-                if sw and sw in eval_map:
-                    eval_result = eval_map[sw]
-                    feature['llm_evaluation'] = {
-                        'score': eval_result['score'],
-                        'rank': eval_result['rank'],
-                        'reasoning': eval_result['reasoning'],
-                        'original_feature': eval_result['original_feature']
-                    }
+        try:
+            result = self.search_client.search(
+                keyword=search_word,
+                content_type='不限',
+                sort_type='综合',
+                max_retries=3,
+                use_cache=True  # 启用搜索缓存
+            )
 
-    # ========== 阶段5:执行搜索 ==========
+            note_count = len(result.get('data', {}).get('data', []))
+            logger.info(f"  ✓ 成功,获取 {note_count} 条帖子")
+
+            # 写入结果
+            feature_ref['search_result'] = result
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'success',
+                'note_count': note_count,
+                'search_params': {
+                    'keyword': search_word,
+                    'content_type': '不限',
+                    'sort_type': '综合'
+                }
+            }
+
+            return {'status': 'success', 'search_word': search_word, 'note_count': note_count}
+
+        except Exception as e:
+            logger.error(f"  ✗ 失败: {e}")
+            feature_ref['search_result'] = None
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'failed',
+                'note_count': 0,
+                'error': str(e)
+            }
+
+            return {'status': 'failed', 'search_word': search_word, 'error': str(e)}
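
`use_cache=True` suggests the search client memoizes responses per keyword; the actual caching lives in xiaohongshu_search.py, whose hunk is not shown here, so the following is only a hedged sketch of the idea, not the real implementation:

```python
# Hypothetical keyword-keyed cache wrapper -- an illustration of what
# use_cache=True implies, not the actual xiaohongshu_search.py code.
from typing import Any, Callable, Dict

def with_keyword_cache(search_fn: Callable[[str], Dict[str, Any]]):
    cache: Dict[str, Dict[str, Any]] = {}

    def wrapper(keyword: str, use_cache: bool = True) -> Dict[str, Any]:
        if use_cache and keyword in cache:
            return cache[keyword]  # cache hit: skip the network call
        result = search_fn(keyword)
        cache[keyword] = result
        return result

    return wrapper
```
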
 
     def stage5_execute_searches(
         self,
@@ -799,7 +954,7 @@ class EnhancedSearchV2:
         logger.info("阶段5:执行小红书搜索")
         logger.info("=" * 60)
 
-        # 按原始特征分组收集搜索词
+        # 按原始特征分组收集搜索词(从Stage4的组合评估结果读取)
         feature_search_groups = {}
 
         for feature_result in features_data:
@@ -808,21 +963,19 @@ class EnhancedSearchV2:
             if original_feature not in feature_search_groups:
                 feature_search_groups[original_feature] = []
 
-            for assoc in feature_result.get('找到的关联', []):
-                for feature in assoc.get('特征列表', []):
-                    sw = feature.get('search_word')
-                    if not sw:
-                        continue
+            # 从Stage4的组合评估结果读取
+            for eval_item in feature_result.get('组合评估结果', []):
+                sw = eval_item.get('search_word')
+                if not sw:
+                    continue
 
-                    # 获取LLM评分
-                    llm_eval = feature.get('llm_evaluation', {})
-                    score = llm_eval.get('score', 0.0)
+                score = eval_item.get('score', 0.0)
 
-                    feature_search_groups[original_feature].append({
-                        'search_word': sw,
-                        'score': score,
-                        'feature_ref': feature
-                    })
+                feature_search_groups[original_feature].append({
+                    'search_word': sw,
+                    'score': score,
+                    'feature_ref': eval_item  # 引用评估项,用于写入搜索结果
+                })
 
         # 每组取Top N
         all_searches = []
@@ -844,52 +997,35 @@ class EnhancedSearchV2:
 
             logger.info(f"  {original_feature}: 从 {len(sorted_list)} 个搜索词中选择 Top {len(selected)} (过滤 {filtered} 个)")
 
-        logger.info(f"\n共 {len(all_searches)} 个搜索任务(过滤前: {total_before_filter}, 过滤掉: {total_filtered})")
-
-        # 执行搜索
-        for idx, item in enumerate(all_searches, 1):
-            sw = item['search_word']
-            feature = item['feature_ref']
+        # 应用全局搜索次数限制
+        if self.max_total_searches and len(all_searches) > self.max_total_searches:
+            logger.info(f"  应用全局限制:从 {len(all_searches)} 个减少到 {self.max_total_searches} 个")
+            all_searches = all_searches[:self.max_total_searches]
 
-            logger.info(f"[{idx}/{len(all_searches)}] 搜索: {sw}")
+        logger.info(f"\n共 {len(all_searches)} 个搜索任务(过滤前: {total_before_filter}, 过滤掉: {total_filtered})")
+        logger.info(f"  并发执行搜索(并发数: {self.search_max_workers})")
 
-            try:
-                result = self.search_client.search(
-                    keyword=sw,
-                    content_type='图文',
-                    sort_type='综合',
-                    max_retries=3
+        # 使用ThreadPoolExecutor并发执行搜索
+        with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
+            # 提交所有搜索任务
+            futures = []
+            for idx, item in enumerate(all_searches, 1):
+                future = executor.submit(
+                    self._execute_single_search,
+                    idx,
+                    len(all_searches),
+                    item['search_word'],
+                    item['feature_ref']
                 )
+                futures.append(future)
 
-                note_count = len(result.get('data', {}).get('data', []))
-                logger.info(f"  ✓ 成功,获取 {note_count} 条帖子")
-
-                # 写入结果
-                feature['search_result'] = result
-                feature['search_metadata'] = {
-                    'searched_at': datetime.now().isoformat(),
-                    'status': 'success',
-                    'note_count': note_count,
-                    'search_params': {
-                        'keyword': sw,
-                        'content_type': '图文',
-                        'sort_type': '综合'
-                    }
-                }
-
-            except Exception as e:
-                logger.error(f"  ✗ 失败: {e}")
-                feature['search_result'] = None
-                feature['search_metadata'] = {
-                    'searched_at': datetime.now().isoformat(),
-                    'status': 'failed',
-                    'note_count': 0,
-                    'error': str(e)
-                }
-
-            # 延迟
-            if idx < len(all_searches):
-                time.sleep(search_delay)
+            # 等待所有搜索完成
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                    # 结果已经写入feature_ref,无需额外处理
+                except Exception as e:
+                    logger.error(f"  搜索任务失败: {e}")
 
         # 保存结果
         output_path = os.path.join(self.output_dir, "stage5_with_search_results.json")
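
The executor loop above uses `as_completed`, so failures surface as soon as any search finishes rather than in submission order; ordering doesn't matter because each worker has already written its outcome into its `feature_ref`. The finish-order behavior in miniature:

```python
# as_completed yields futures in completion order, not submission order.
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_search(keyword: str, delay: float) -> str:
    time.sleep(delay)
    return keyword

with ThreadPoolExecutor(max_workers=3) as executor:
    futures = {executor.submit(fake_search, kw, d): kw
               for kw, d in [("a", 0.3), ("b", 0.1), ("c", 0.2)]}
    for future in as_completed(futures):
        print(futures[future], "done")  # prints: b, c, a
```
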
@@ -974,7 +1110,7 @@ class EnhancedSearchV2:
         feature_node: Dict[str, Any]
     ) -> Dict[str, Any]:
         """
-        评估单个搜索结果
+        评估单个搜索结果(使用并行评估)
 
         Args:
             original_feature: 原始特征
@@ -986,12 +1122,12 @@ class EnhancedSearchV2:
         search_word = feature_node.get('search_word', '')
         notes = feature_node['search_result'].get('data', {}).get('data', [])
 
-        return self.llm_evaluator.evaluate_search_results(
+        return self.llm_evaluator.evaluate_search_results_parallel(
             original_feature=original_feature,
             search_word=search_word,
             notes=notes,
             max_notes=20,
-            max_images_per_note=2
+            max_workers=20  # 最多20个并发,逐帖独立评估
         )
 
     # ========== 阶段7:扩展搜索 ==========
@@ -1052,9 +1188,10 @@ class EnhancedSearchV2:
             try:
                 result = self.search_client.search(
                     keyword=extended_kw,
-                    content_type='图文',
+                    content_type='不限',
                     sort_type='综合',
-                    max_retries=3
+                    max_retries=3,
+                    use_cache=True  # 启用搜索缓存
                 )
 
                 note_count = len(result.get('data', {}).get('data', []))
@@ -1121,26 +1258,53 @@ class EnhancedSearchV2:
             # 阶段2
             stage2_results = self.stage2_find_associations(stage1_results)
 
-            # 阶段3
-            stage3_results = self.stage3_extract_features(stage2_results)
+            # 阶段3 - 使用新方法:筛选高相似度匹配
+            stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
 
             # 阶段4
-            stage4_results = self.stage4_generate_and_evaluate_search_words(stage3_results)
+            stage4_results = self.stage4_generate_and_evaluate_search_words(
+                stage3_results,
+                max_workers=8,         # 提高并发从4到8
+                max_combo_length=3     # 降低组合长度从4到3
+            )
 
             # 阶段5
-            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=10)
+            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=self.top_n)
 
-            # 阶段6
-            stage6_results = self.stage6_evaluate_search_results(stage5_results)
+            # 阶段6 - 暂时跳过执行(代码保留)
+            # stage6_results = self.stage6_evaluate_search_results(stage5_results)
 
-            # 阶段7
-            final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
+            # 阶段7 - 暂时跳过执行(代码保留)
+            # final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
+
+            logger.info("\n" + "=" * 60)
+            logger.info("✓ 完整流程执行完成(Stage1-5)")
+            logger.info("=" * 60)
 
+            # 自动执行可视化
             logger.info("\n" + "=" * 60)
-            logger.info("✓ 完整流程执行完成")
+            logger.info("开始生成可视化...")
             logger.info("=" * 60)
 
-            return final_results
+            try:
+                result = subprocess.run(
+                    ['python3', 'visualize_stage5_results.py'],
+                    capture_output=True,
+                    text=True,
+                    timeout=60
+                )
+
+                if result.returncode == 0:
+                    logger.info("✓ 可视化生成成功")
+                    logger.info(result.stdout)
+                else:
+                    logger.error(f"可视化生成失败: {result.stderr}")
+            except subprocess.TimeoutExpired:
+                logger.error("可视化生成超时")
+            except Exception as e:
+                logger.error(f"可视化生成异常: {e}")
+
+            return stage5_results
 
         except Exception as e:
             logger.error(f"流程执行失败: {e}")
@@ -1152,7 +1316,7 @@ def main():
     parser = argparse.ArgumentParser(description='增强搜索系统V2')
     parser.add_argument(
         '--how-json',
-        default='69114f150000000007001f30_how.json',
+        default='69114f150000000007001f30_how copy.json',
         help='How解构文件路径'
     )
     parser.add_argument(
@@ -1175,6 +1339,24 @@ def main():
         default='output_v2',
         help='输出目录'
     )
+    parser.add_argument(
+        '--top-n',
+        type=int,
+        default=10,
+        help='每个原始特征取评分最高的N个搜索词(默认10)'
+    )
+    parser.add_argument(
+        '--max-total-searches',
+        type=int,
+        default=None,
+        help='全局最大搜索次数限制(默认None不限制)'
+    )
+    parser.add_argument(
+        '--search-workers',
+        type=int,
+        default=3,
+        help='搜索并发数(默认3)'
+    )
 
     args = parser.parse_args()
 
@@ -1184,7 +1366,10 @@ def main():
         dimension_associations_path=args.dimension_associations,
         optimized_clustered_data_path=args.optimized_clustered,
         openrouter_api_key=args.api_key,
-        output_dir=args.output_dir
+        output_dir=args.output_dir,
+        top_n=args.top_n,
+        max_total_searches=args.max_total_searches,
+        search_max_workers=args.search_workers
     )
 
     # 执行完整流程

llm_evaluator.py (+201 -39)

@@ -42,20 +42,24 @@ class LLMEvaluator:
         """
         prompt = f"""你是一个小红书内容分析专家。
 
-任务:评估搜索词能否找到包含目标特征的内容
+# 任务说明
+从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}")
 
-原始特征:"{original_feature}"
-组合搜索词:"{search_word}"
+## 可选词汇
+{search_word}
 
-评估标准:
-1. 这个搜索词在小红书上能否找到包含"{original_feature}"相关元素的帖子
-2. 搜索词的关键词组合是否合理、是否过于宽泛或过于具体
-3. 搜索词与原始特征的语义关联性
+## 要求
+1. 只能使用可选词汇中的词,可以进行以下变化:
+   - 直接使用原词或括号内的同义词
+   - 多个词组合
+   - 适当精简
+2. 不能添加可选词汇以外的新词
+3. 按推荐程度排序(越靠前越推荐)
 
-请仔细分析并返回JSON格式:
+## 输出格式(JSON)
 {{
-  "score": 0.75,  // 0.0-1.0,能找到相关内容的可能性
-  "reasoning": "详细的评估理由,说明为什么给出这个分数"
+  "score": 0.75,
+  "reasoning": "评估理由"
 }}
 
 注意:只返回JSON,不要其他内容。"""
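
`chat_json` belongs to `openrouter_client` (untouched by this commit) and evidently returns parsed JSON, or a falsy value on failure. As a hedged sketch, any such helper has to tolerate models wrapping their JSON in markdown fences:

```python
# Hypothetical defensive parse of raw LLM output -- illustrative only,
# not the actual openrouter_client implementation.
import json
import re
from typing import Any, Optional

def parse_llm_json(raw: str) -> Optional[Any]:
    # Strip ``` / ```json fences the model may wrap around its JSON.
    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip())
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return None

print(parse_llm_json('```json\n{"score": 0.75, "reasoning": "ok"}\n```'))
# {'score': 0.75, 'reasoning': 'ok'}
```
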
@@ -136,7 +140,7 @@ class LLMEvaluator:
         self,
         original_feature: str,
         search_words: List[str],
-        batch_size: int = 10
+        batch_size: int = 50
     ) -> List[Dict[str, Any]]:
         """
         分批评估搜索词(每批N个,减少API调用)
@@ -162,62 +166,71 @@ class LLMEvaluator:
 
             logger.info(f"  处理第 {batch_idx + 1}/{total_batches} 批({len(batch_words)} 个搜索词)")
 
-            # 构建包含多个搜索词的prompt
-            words_list = "\n".join([
-                f"{i+1}. {word}"
-                for i, word in enumerate(batch_words)
-            ])
+            # 从搜索词中提取所有独特的词作为可选词汇
+            available_words_set = set()
+            for word in batch_words:
+                # 分割搜索词,提取单个词
+                parts = word.split()
+                available_words_set.update(parts)
+
+            # 转换为列表并排序(保证稳定性)
+            available_words = sorted(list(available_words_set))
+
+            # 构建可选词汇字符串(顿号分隔)
+            available_words_str = "、".join(available_words)
 
-            prompt = f"""你是一个小红书内容分析专家。
+            prompt = f"""
 
-任务:评估以下搜索词在小红书上能否找到包含目标特征"{original_feature}"的内容
+# 任务说明
+从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}"二字)
 
-搜索词列表:
-{words_list}
+## 可选词汇
+{available_words_str}
 
-评估标准:
-1. 这个搜索词在小红书上能否找到包含"{original_feature}"相关元素的帖子
-2. 搜索词的关键词组合是否合理、是否过于宽泛或过于具体
-3. 搜索词与原始特征的语义关联性
+## 要求
+1. 只能使用可选词汇中的词,可以进行以下变化:
+   - 直接使用原词或括号内的同义词
+   - 多个词组合
+   - 适当精简
+2. 不能添加可选词汇以外的新词
+3. 按推荐程度排序(越靠前越推荐)
 
-请为每个搜索词返回评估结果,JSON数组格式:
+## 输出格式(JSON):
 [
   {{
     "index": 1,
-    "score": 0.75,
-    "reasoning": "详细的评估理由"
+    "search_word": "组合的搜索词",
+    "score": 0.85,
+    "reasoning": "推荐理由"
   }},
   {{
     "index": 2,
+    "search_word": "组合的搜索词",
     "score": 0.80,
-    "reasoning": "详细的评估理由"
+    "reasoning": "推荐理由"
   }}
 ]
-
-注意:
-- index 对应搜索词的编号(1-{len(batch_words)})
-- score 范围 0.0-1.0
 - 只返回JSON数组,不要其他内容"""
 
             # 调用LLM
             result = self.client.chat_json(prompt=prompt, max_retries=3)
 
             if result and isinstance(result, list):
-                # 处理结果
-                for item in result:
-                    idx = item.get("index", 0) - 1  # 转换为0-based索引
-                    if 0 <= idx < len(batch_words):
+                # 处理结果 - 新格式直接包含search_word
+                for idx, item in enumerate(result):
+                    search_word = item.get("search_word", "")
+                    if search_word:  # 确保有搜索词
                         all_results.append({
-                            "search_word": batch_words[idx],
+                            "search_word": search_word,
                             "score": item.get("score", 0.0),
                             "reasoning": item.get("reasoning", ""),
                             "original_feature": original_feature
                         })
                         logger.info(f"    [{start_idx + idx + 1}/{len(search_words)}] "
-                                   f"{batch_words[idx]}: {item.get('score', 0.0):.3f}")
+                                   f"{search_word}: {item.get('score', 0.0):.3f}")
             else:
                 logger.error(f"  第 {batch_idx + 1} 批评估失败,跳过")
-                # 为失败的批次添加默认结果
+                # 为失败的批次添加默认结果(使用原搜索词)
                 for word in batch_words:
                     all_results.append({
                         "search_word": word,
@@ -237,6 +250,155 @@ class LLMEvaluator:
 
         return all_results
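
Each batch now pools every unique token from its combined phrases into a single 可选词汇 list, sorted so the prompt stays stable across runs. The extraction in miniature:

```python
# Token pooling as done per batch: split on whitespace, union, sort.
batch_words = ["露营 徒步", "露营 机能风 冲锋衣", "徒步 冲锋衣"]

available = sorted({tok for phrase in batch_words for tok in phrase.split()})
print("、".join(available))  # 冲锋衣、徒步、机能风、露营
```
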
 
+    def evaluate_single_note(
+        self,
+        original_feature: str,
+        search_word: str,
+        note: Dict[str, Any],
+        note_index: int = 0
+    ) -> Dict[str, Any]:
+        """
+        评估单个帖子(阶段6,多模态)
+
+        Args:
+            original_feature: 原始特征
+            search_word: 搜索词
+            note: 单个帖子
+            note_index: 帖子索引
+
+        Returns:
+            单个帖子的评估结果
+        """
+        card = note.get("note_card", {})
+        title = card.get("display_title", "")
+        desc = card.get("desc", "")[:500]  # 限制长度
+        images = card.get("image_list", [])[:10]  # 最多10张图
+
+        prompt = f"""你是一个小红书内容分析专家。
+
+任务:评估这个帖子是否包含目标特征"{original_feature}"的元素
+
+原始特征:"{original_feature}"
+搜索词:"{search_word}"
+
+帖子内容:
+标题: {title}
+正文: {desc}
+
+请分析帖子的文字和图片内容,返回JSON格式:
+{{
+  "relevance": 0.85,  // 0.0-1.0,相关度
+  "matched_elements": ["元素1", "元素2"],  // 匹配的元素列表
+  "reasoning": "简短的匹配理由"
+}}
+
+只返回JSON,不要其他内容。"""
+
+        result = self.client.chat_json(
+            prompt=prompt,
+            images=images if images else None,
+            max_retries=3
+        )
+
+        if result:
+            return {
+                "note_index": note_index,
+                "relevance": result.get("relevance", 0.0),
+                "matched_elements": result.get("matched_elements", []),
+                "reasoning": result.get("reasoning", "")
+            }
+        else:
+            logger.error(f"  评估帖子 {note_index} 失败: {search_word}")
+            return {
+                "note_index": note_index,
+                "relevance": 0.0,
+                "matched_elements": [],
+                "reasoning": "评估失败"
+            }
+
+    def evaluate_search_results_parallel(
+        self,
+        original_feature: str,
+        search_word: str,
+        notes: List[Dict[str, Any]],
+        max_notes: int = 20,
+        max_workers: int = 20
+    ) -> Dict[str, Any]:
+        """
+        并行评估搜索结果(每个帖子独立评估)
+
+        Args:
+            original_feature: 原始特征
+            search_word: 搜索词
+            notes: 帖子列表
+            max_notes: 最多评估几条帖子
+            max_workers: 最大并发数
+
+        Returns:
+            评估结果汇总
+        """
+        if not notes:
+            return {
+                "overall_relevance": 0.0,
+                "extracted_elements": [],
+                "evaluated_notes": []
+            }
+
+        notes_to_eval = notes[:max_notes]
+        evaluated_notes = []
+
+        logger.info(f"  并行评估 {len(notes_to_eval)} 个帖子({max_workers}并发)")
+
+        # 并发评估每个帖子(最多 max_workers 个并发)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for idx, note in enumerate(notes_to_eval):
+                future = executor.submit(
+                    self.evaluate_single_note,
+                    original_feature,
+                    search_word,
+                    note,
+                    idx
+                )
+                futures.append(future)
+
+            # 收集结果
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                    evaluated_notes.append(result)
+                except Exception as e:
+                    logger.error(f"  评估帖子失败: {e}")
+
+        # 按note_index排序
+        evaluated_notes.sort(key=lambda x: x['note_index'])
+
+        # 汇总:计算整体相关度和提取元素
+        if evaluated_notes:
+            overall_relevance = sum(n['relevance'] for n in evaluated_notes) / len(evaluated_notes)
+
+            # 提取所有元素并统计频次
+            element_counts = {}
+            for note in evaluated_notes:
+                for elem in note['matched_elements']:
+                    element_counts[elem] = element_counts.get(elem, 0) + 1
+
+            # 按频次排序,取前5个
+            extracted_elements = sorted(
+                element_counts.keys(),
+                key=lambda x: element_counts[x],
+                reverse=True
+            )[:5]
+        else:
+            overall_relevance = 0.0
+            extracted_elements = []
+
+        return {
+            "overall_relevance": overall_relevance,
+            "extracted_elements": extracted_elements,
+            "evaluated_notes": evaluated_notes
+        }
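
The aggregation at the end of `evaluate_search_results_parallel` is a mean over per-note relevance plus a top-5 cut of matched elements by frequency; `collections.Counter` expresses the same counting compactly:

```python
# Equivalent aggregation with Counter (illustrative values).
from collections import Counter
from statistics import mean

evaluated_notes = [
    {"relevance": 0.9, "matched_elements": ["帐篷", "篝火"]},
    {"relevance": 0.7, "matched_elements": ["帐篷"]},
    {"relevance": 0.5, "matched_elements": ["咖啡"]},
]

overall = mean(n["relevance"] for n in evaluated_notes)
counts = Counter(e for n in evaluated_notes for e in n["matched_elements"])
top5 = [elem for elem, _ in counts.most_common(5)]

print(round(overall, 3), top5)  # 0.7 ['帐篷', '篝火', '咖啡']
```
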
+
     def evaluate_search_results(
         self,
         original_feature: str,

visualize_stage5_results.py (+818 -0)

@@ -0,0 +1,818 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage5搜索结果可视化工具
+生成带图片轮播的交互式HTML页面
+"""
+
+import json
+import os
+from datetime import datetime
+from typing import List, Dict, Any
+
+
+def load_data(json_path: str) -> List[Dict[str, Any]]:
+    """加载JSON数据"""
+    with open(json_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """计算统计数据"""
+    total_features = len(data)
+    total_search_words = 0
+    total_notes = 0
+    video_count = 0
+    normal_count = 0
+
+    for feature in data:
+        search_results = feature.get('组合评估结果', [])
+        total_search_words += len(search_results)
+
+        for search_item in search_results:
+            search_result = search_item.get('search_result', {})
+            notes = search_result.get('data', {}).get('data', [])
+            total_notes += len(notes)
+
+            for note in notes:
+                note_type = note.get('note_card', {}).get('type', '')
+                if note_type == 'video':
+                    video_count += 1
+                else:
+                    normal_count += 1
+
+    return {
+        'total_features': total_features,
+        'total_search_words': total_search_words,
+        'total_notes': total_notes,
+        'video_count': video_count,
+        'normal_count': normal_count,
+        'video_percentage': round(video_count / total_notes * 100, 1) if total_notes > 0 else 0,
+        'normal_percentage': round(normal_count / total_notes * 100, 1) if total_notes > 0 else 0
+    }
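
A minimal input showing the nesting `calculate_statistics` walks (feature → 组合评估结果 → search_result.data.data → note_card.type), with hypothetical values:

```python
# Hypothetical single-feature record with one search word and two notes.
sample = [{
    "组合评估结果": [{
        "search_result": {"data": {"data": [
            {"note_card": {"type": "video"}},
            {"note_card": {"type": "normal"}},
        ]}},
    }],
}]

print(calculate_statistics(sample))
# {'total_features': 1, 'total_search_words': 1, 'total_notes': 2,
#  'video_count': 1, 'normal_count': 1,
#  'video_percentage': 50.0, 'normal_percentage': 50.0}
```
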
+
+
+def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path: str):
+    """生成HTML可视化页面"""
+
+    # 准备数据JSON(用于JavaScript)
+    data_json = json.dumps(data, ensure_ascii=False, indent=2)
+
+    html_content = f'''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Stage5 搜索结果可视化</title>
+    <style>
+        * {{
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }}
+
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: #f5f7fa;
+            color: #333;
+            overflow-x: hidden;
+        }}
+
+        /* 顶部统计面板 */
+        .stats-panel {{
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 20px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        }}
+
+        .stats-container {{
+            max-width: 1400px;
+            margin: 0 auto;
+            display: flex;
+            justify-content: space-around;
+            align-items: center;
+            flex-wrap: wrap;
+            gap: 20px;
+        }}
+
+        .stat-item {{
+            text-align: center;
+        }}
+
+        .stat-value {{
+            font-size: 32px;
+            font-weight: bold;
+            margin-bottom: 5px;
+        }}
+
+        .stat-label {{
+            font-size: 14px;
+            opacity: 0.9;
+        }}
+
+        /* 主容器 */
+        .main-container {{
+            display: flex;
+            max-width: 1400px;
+            margin: 20px auto;
+            gap: 20px;
+            padding: 0 20px;
+            height: calc(100vh - 140px);
+        }}
+
+        /* 左侧导航 */
+        .left-sidebar {{
+            width: 30%;
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            overflow-y: auto;
+            position: sticky;
+            top: 20px;
+            height: fit-content;
+            max-height: calc(100vh - 160px);
+        }}
+
+        .feature-group {{
+            border-bottom: 1px solid #e5e7eb;
+        }}
+
+        .feature-header {{
+            padding: 15px 20px;
+            background: #f9fafb;
+            cursor: pointer;
+            user-select: none;
+            transition: background 0.2s;
+        }}
+
+        .feature-header:hover {{
+            background: #f3f4f6;
+        }}
+
+        .feature-header.active {{
+            background: #667eea;
+            color: white;
+        }}
+
+        .feature-title {{
+            font-size: 16px;
+            font-weight: 600;
+            margin-bottom: 5px;
+        }}
+
+        .feature-meta {{
+            font-size: 12px;
+            color: #6b7280;
+        }}
+
+        .feature-header.active .feature-meta {{
+            color: rgba(255,255,255,0.8);
+        }}
+
+        .search-words-list {{
+            display: none;
+            padding: 10px 0;
+        }}
+
+        .search-words-list.expanded {{
+            display: block;
+        }}
+
+        .search-word-item {{
+            padding: 12px 20px 12px 40px;
+            cursor: pointer;
+            border-left: 3px solid transparent;
+            transition: all 0.2s;
+        }}
+
+        .search-word-item:hover {{
+            background: #f9fafb;
+            border-left-color: #667eea;
+        }}
+
+        .search-word-item.active {{
+            background: #ede9fe;
+            border-left-color: #7c3aed;
+        }}
+
+        .search-word-text {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #374151;
+            margin-bottom: 4px;
+        }}
+
+        .search-word-score {{
+            display: inline-block;
+            padding: 2px 8px;
+            border-radius: 12px;
+            font-size: 11px;
+            font-weight: 600;
+            margin-left: 8px;
+        }}
+
+        .score-high {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .score-medium {{
+            background: #fef3c7;
+            color: #92400e;
+        }}
+
+        .score-low {{
+            background: #fee2e2;
+            color: #991b1b;
+        }}
+
+        .search-word-reasoning {{
+            font-size: 12px;
+            color: #6b7280;
+            margin-top: 4px;
+            display: -webkit-box;
+            -webkit-line-clamp: 2;
+            -webkit-box-orient: vertical;
+            overflow: hidden;
+        }}
+
+        /* 右侧结果区 */
+        .right-content {{
+            flex: 1;
+            overflow-y: auto;
+            padding-bottom: 40px;
+        }}
+
+        .result-block {{
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            margin-bottom: 30px;
+            padding: 20px;
+            scroll-margin-top: 20px;
+        }}
+
+        .result-header {{
+            margin-bottom: 20px;
+            padding-bottom: 15px;
+            border-bottom: 2px solid #e5e7eb;
+        }}
+
+        .result-title {{
+            font-size: 20px;
+            font-weight: 600;
+            color: #111827;
+            margin-bottom: 10px;
+        }}
+
+        .result-stats {{
+            display: flex;
+            gap: 15px;
+            font-size: 13px;
+            color: #6b7280;
+        }}
+
+        .stat-badge {{
+            background: #f3f4f6;
+            padding: 4px 10px;
+            border-radius: 4px;
+        }}
+
+        /* 帖子网格 */
+        .notes-grid {{
+            display: grid;
+            grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
+            gap: 20px;
+        }}
+
+        .note-card {{
+            border: 1px solid #e5e7eb;
+            border-radius: 8px;
+            overflow: hidden;
+            cursor: pointer;
+            transition: all 0.3s;
+            background: white;
+        }}
+
+        .note-card:hover {{
+            transform: translateY(-4px);
+            box-shadow: 0 10px 25px rgba(0,0,0,0.15);
+        }}
+
+        /* 图片轮播 */
+        .image-carousel {{
+            position: relative;
+            width: 100%;
+            height: 280px;
+            background: #f3f4f6;
+            overflow: hidden;
+        }}
+
+        .carousel-images {{
+            display: flex;
+            height: 100%;
+            transition: transform 0.3s ease;
+        }}
+
+        .carousel-image {{
+            min-width: 100%;
+            height: 100%;
+            object-fit: cover;
+        }}
+
+        .carousel-btn {{
+            position: absolute;
+            top: 50%;
+            transform: translateY(-50%);
+            background: rgba(0,0,0,0.5);
+            color: white;
+            border: none;
+            width: 32px;
+            height: 32px;
+            border-radius: 50%;
+            cursor: pointer;
+            font-size: 16px;
+            display: none;
+            align-items: center;
+            justify-content: center;
+            transition: background 0.2s;
+            z-index: 10;
+        }}
+
+        .carousel-btn:hover {{
+            background: rgba(0,0,0,0.7);
+        }}
+
+        .carousel-btn.prev {{
+            left: 8px;
+        }}
+
+        .carousel-btn.next {{
+            right: 8px;
+        }}
+
+        .note-card:hover .carousel-btn {{
+            display: flex;
+        }}
+
+        .carousel-indicators {{
+            position: absolute;
+            bottom: 10px;
+            left: 50%;
+            transform: translateX(-50%);
+            display: flex;
+            gap: 6px;
+            z-index: 10;
+        }}
+
+        .dot {{
+            width: 8px;
+            height: 8px;
+            border-radius: 50%;
+            background: rgba(255,255,255,0.5);
+            cursor: pointer;
+            transition: all 0.2s;
+        }}
+
+        .dot.active {{
+            background: white;
+            width: 24px;
+            border-radius: 4px;
+        }}
+
+        .image-counter {{
+            position: absolute;
+            top: 10px;
+            right: 10px;
+            background: rgba(0,0,0,0.6);
+            color: white;
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 12px;
+            z-index: 10;
+        }}
+
+        /* 帖子信息 */
+        .note-info {{
+            padding: 12px;
+        }}
+
+        .note-title {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #111827;
+            margin-bottom: 8px;
+            display: -webkit-box;
+            -webkit-line-clamp: 2;
+            -webkit-box-orient: vertical;
+            overflow: hidden;
+            line-height: 1.4;
+        }}
+
+        .note-meta {{
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            font-size: 12px;
+            color: #6b7280;
+        }}
+
+        .note-type {{
+            padding: 3px 8px;
+            border-radius: 4px;
+            font-weight: 500;
+        }}
+
+        .type-video {{
+            background: #dbeafe;
+            color: #1e40af;
+        }}
+
+        .type-normal {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .note-author {{
+            display: flex;
+            align-items: center;
+            gap: 6px;
+        }}
+
+        .author-avatar {{
+            width: 24px;
+            height: 24px;
+            border-radius: 50%;
+        }}
+
+        /* SVG连线层 */
+        #connection-svg {{
+            position: fixed;
+            top: 0;
+            left: 0;
+            width: 100%;
+            height: 100%;
+            pointer-events: none;
+            z-index: 1;
+        }}
+
+        .connection-line {{
+            stroke: #cbd5e1;
+            stroke-width: 1;
+            stroke-dasharray: 5,5;
+            fill: none;
+            opacity: 0.3;
+            transition: all 0.2s;
+        }}
+
+        .connection-line.active {{
+            stroke: #667eea;
+            stroke-width: 2;
+            stroke-dasharray: none;
+            opacity: 1;
+        }}
+
+        /* 滚动条样式 */
+        ::-webkit-scrollbar {{
+            width: 8px;
+            height: 8px;
+        }}
+
+        ::-webkit-scrollbar-track {{
+            background: #f1f1f1;
+        }}
+
+        ::-webkit-scrollbar-thumb {{
+            background: #888;
+            border-radius: 4px;
+        }}
+
+        ::-webkit-scrollbar-thumb:hover {{
+            background: #555;
+        }}
+    </style>
+</head>
+<body>
+    <!-- 统计面板 -->
+    <div class="stats-panel">
+        <div class="stats-container">
+            <div class="stat-item">
+                <div class="stat-value">📊 {stats['total_features']}</div>
+                <div class="stat-label">原始特征数</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">🔍 {stats['total_search_words']}</div>
+                <div class="stat-label">搜索词数</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">📝 {stats['total_notes']}</div>
+                <div class="stat-label">帖子总数</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">🎬 {stats['video_count']}</div>
+                <div class="stat-label">视频类型 ({stats['video_percentage']}%)</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">📷 {stats['normal_count']}</div>
+                <div class="stat-label">图文类型 ({stats['normal_percentage']}%)</div>
+            </div>
+        </div>
+    </div>
+
+    <!-- SVG连线层 -->
+    <svg id="connection-svg"></svg>
+
+    <!-- 主容器 -->
+    <div class="main-container">
+        <!-- 左侧导航 -->
+        <div class="left-sidebar" id="leftSidebar">
+            <!-- 通过JavaScript动态生成 -->
+        </div>
+
+        <!-- 右侧结果区 -->
+        <div class="right-content" id="rightContent">
+            <!-- 通过JavaScript动态生成 -->
+        </div>
+    </div>
+
+    <script>
+        // 数据
+        const data = {data_json};
+
+        // 渲染左侧导航
+        function renderLeftSidebar() {{
+            const sidebar = document.getElementById('leftSidebar');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const searchWords = feature['组合评估结果'] || [];
+
+                html += `
+                    <div class="feature-group">
+                        <div class="feature-header" onclick="toggleFeature(${{featureIdx}})" id="feature-header-${{featureIdx}}">
+                            <div class="feature-title">${{feature['原始特征名称']}}</div>
+                            <div class="feature-meta">
+                                ${{feature['来源层级']}} · 权重: ${{feature['权重'].toFixed(2)}} · ${{searchWords.length}}个搜索词
+                            </div>
+                        </div>
+                        <div class="search-words-list" id="search-words-${{featureIdx}}">
+                `;
+
+                searchWords.forEach((sw, swIdx) => {{
+                    const score = sw.score || 0;
+                    const scoreClass = score >= 0.9 ? 'score-high' : score >= 0.7 ? 'score-medium' : 'score-low';
+                    const blockId = `block-${{featureIdx}}-${{swIdx}}`;
+
+                    html += `
+                        <div class="search-word-item" onclick="scrollToBlock('${{blockId}}')"
+                             id="sw-${{featureIdx}}-${{swIdx}}"
+                             data-block-id="${{blockId}}">
+                            <div class="search-word-text">
+                                ${{sw.search_word}}
+                                <span class="search-word-score ${{scoreClass}}">${{score.toFixed(2)}}</span>
+                            </div>
+                            <div class="search-word-reasoning" title="${{sw.reasoning}}">
+                                ${{sw.reasoning || ''}}
+                            </div>
+                        </div>
+                    `;
+                }});
+
+                html += `
+                        </div>
+                    </div>
+                `;
+            }});
+
+            sidebar.innerHTML = html;
+        }}
+
+        // 渲染右侧结果区
+        function renderRightContent() {{
+            const content = document.getElementById('rightContent');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const searchWords = feature['组合评估结果'] || [];
+
+                searchWords.forEach((sw, swIdx) => {{
+                    const blockId = `block-${{featureIdx}}-${{swIdx}}`;
+                    const searchResult = sw.search_result || {{}};
+                    const notes = searchResult.data?.data || [];
+
+                    const videoCount = notes.filter(n => n.note_card?.type === 'video').length;
+                    const normalCount = notes.length - videoCount;
+
+                    html += `
+                        <div class="result-block" id="${{blockId}}">
+                            <div class="result-header">
+                                <div class="result-title">${{sw.search_word}}</div>
+                                <div class="result-stats">
+                                    <span class="stat-badge">📝 ${{notes.length}} 条帖子</span>
+                                    <span class="stat-badge">🎬 ${{videoCount}} 视频</span>
+                                    <span class="stat-badge">📷 ${{normalCount}} 图文</span>
+                                </div>
+                            </div>
+                            <div class="notes-grid">
+                                ${{notes.map((note, noteIdx) => renderNoteCard(note, featureIdx, swIdx, noteIdx)).join('')}}
+                            </div>
+                        </div>
+                    `;
+                }});
+            }});
+
+            content.innerHTML = html;
+        }}
+
+        // 渲染单个帖子卡片
+        function renderNoteCard(note, featureIdx, swIdx, noteIdx) {{
+            const card = note.note_card || {{}};
+            const images = card.image_list || [];
+            const title = card.display_title || '无标题';
+            const noteType = card.type || 'normal';
+            const noteId = note.id || '';
+            const user = card.user || {{}};
+            const userName = user.nick_name || '未知用户';
+            const userAvatar = user.avatar || '';
+
+            const carouselId = `carousel-${{featureIdx}}-${{swIdx}}-${{noteIdx}}`;
+
+            return `
+                <div class="note-card" onclick="openNote('${{noteId}}')">
+                    <div class="image-carousel" id="${{carouselId}}">
+                        <div class="carousel-images">
+                            ${{images.map(img => `<img class="carousel-image" src="${{img}}" alt="帖子图片" loading="lazy">`).join('')}}
+                        </div>
+                        ${{images.length > 1 ? `
+                            <button class="carousel-btn prev" onclick="event.stopPropagation(); changeImage('${{carouselId}}', -1)">←</button>
+                            <button class="carousel-btn next" onclick="event.stopPropagation(); changeImage('${{carouselId}}', 1)">→</button>
+                            <div class="carousel-indicators">
+                                ${{images.map((_, i) => `<span class="dot ${{i === 0 ? 'active' : ''}}" onclick="event.stopPropagation(); goToImage('${{carouselId}}', ${{i}})"></span>`).join('')}}
+                            </div>
+                            <span class="image-counter">1/${{images.length}}</span>
+                        ` : ''}}
+                    </div>
+                    <div class="note-info">
+                        <div class="note-title">${{title}}</div>
+                        <div class="note-meta">
+                            <span class="note-type type-${{noteType}}">
+                                ${{noteType === 'video' ? '🎬 视频' : '📷 图文'}}
+                            </span>
+                            <div class="note-author">
+                                ${{userAvatar ? `<img class="author-avatar" src="${{userAvatar}}" alt="${{userName}}">` : ''}}
+                                <span>${{userName}}</span>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            `;
+        }}
+
+        // Image carousel logic
+        const carouselStates = {{}};
+
+        function changeImage(carouselId, direction) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const images = carousel.querySelectorAll('.carousel-image');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            let newIndex = carouselStates[carouselId].currentIndex + direction;
+            if (newIndex < 0) newIndex = images.length - 1;
+            if (newIndex >= images.length) newIndex = 0;
+
+            carouselStates[carouselId].currentIndex = newIndex;
+            imagesContainer.style.transform = `translateX(-${{newIndex * 100}}%)`;
+
+            // Update the indicator dots
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === newIndex);
+            }});
+
+            // Update the image counter
+            if (counter) {{
+                counter.textContent = `${{newIndex + 1}}/${{images.length}}`;
+            }}
+        }}
+
+        function goToImage(carouselId, index) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            carouselStates[carouselId].currentIndex = index;
+            imagesContainer.style.transform = `translateX(-${{index * 100}}%)`;
+
+            // Update the indicator dots
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === index);
+            }});
+
+            // Update the image counter
+            if (counter) {{
+                counter.textContent = `${{index + 1}}/${{dots.length}}`;
+            }}
+        }}
+
+        // Expand/collapse a feature group
+        function toggleFeature(featureIdx) {{
+            const searchWordsList = document.getElementById(`search-words-${{featureIdx}}`);
+            const featureHeader = document.getElementById(`feature-header-${{featureIdx}}`);
+
+            searchWordsList.classList.toggle('expanded');
+            featureHeader.classList.toggle('active');
+        }}
+
+        // Scroll to the given result block
+        function scrollToBlock(blockId) {{
+            const block = document.getElementById(blockId);
+            if (block) {{
+                block.scrollIntoView({{ behavior: 'smooth', block: 'start' }});
+
+                // Highlight the matching search word in the sidebar
+                document.querySelectorAll('.search-word-item').forEach(item => {{
+                    item.classList.remove('active');
+                }});
+
+                document.querySelectorAll(`[data-block-id="${{blockId}}"]`).forEach(item => {{
+                    item.classList.add('active');
+                }});
+            }}
+        }}
+
+        // Open the note on Xiaohongshu
+        function openNote(noteId) {{
+            if (noteId) {{
+                window.open(`https://www.xiaohongshu.com/explore/${{noteId}}`, '_blank');
+            }}
+        }}
+
+        // Initialize on page load
+        document.addEventListener('DOMContentLoaded', () => {{
+            renderLeftSidebar();
+            renderRightContent();
+
+            // Expand the first feature group by default
+            if (data.length > 0) {{
+                toggleFeature(0);
+            }}
+        }});
+    </script>
+</body>
+</html>
+'''
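Note on the template above: the markup and JS are embedded in a Python string that is later brace-interpolated, which is why every literal brace appears doubled (`{{` / `}}`) while single-brace fields are substituted. A minimal sketch of that convention (the names are illustrative, not from this patch):

# Doubled braces come out as literal braces after formatting; single braces interpolate.
template = 'const state = {{ count: {count} }};'
print(template.format(count=3))  # -> const state = { count: 3 };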
+
+    # Write the rendered HTML to disk
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(html_content)
+
+
+def main():
+    """主函数"""
+    # 配置路径
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    json_path = os.path.join(script_dir, 'output_v2', 'stage5_with_search_results.json')
+    output_dir = os.path.join(script_dir, 'visualization')
+    os.makedirs(output_dir, exist_ok=True)
+
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    output_path = os.path.join(output_dir, f'stage5_interactive_{timestamp}.html')
+
+    # Load the data
+    print(f"📖 Loading data: {json_path}")
+    data = load_data(json_path)
+    print(f"✓ Loaded {len(data)} original features")
+
+    # Compute statistics
+    print("📊 Computing statistics...")
+    stats = calculate_statistics(data)
+    print("✓ Statistics computed:")
+    print(f"  - Original features: {stats['total_features']}")
+    print(f"  - Search words: {stats['total_search_words']}")
+    print(f"  - Total notes: {stats['total_notes']}")
+    print(f"  - Videos: {stats['video_count']} ({stats['video_percentage']}%)")
+    print(f"  - Image posts: {stats['normal_count']} ({stats['normal_percentage']}%)")
+
+    # Generate the HTML report
+    print("\n🎨 Generating visualization page...")
+    generate_html(data, stats, output_path)
+    print(f"✓ Done: {output_path}")
+
+    # Show how to open the report
+    print("\n🌐 Open it in a browser:")
+    print(f"   file://{output_path}")
+
+
+if __name__ == '__main__':
+    main()
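For reference, the page above only touches a handful of fields in stage5_with_search_results.json. A minimal record that renderRightContent/renderNoteCard can display looks roughly like the sketch below; the note-level keys are read directly by the JS above, while the outer search_words wrapper is an assumption about the stage-5 output shape:

# Hypothetical minimal stage-5 record, inferred from the accessors in the JS above.
example_feature = {
    "search_words": [{                         # assumed wrapper: the JS reads sw.search_word / sw.search_result
        "search_word": "example keyword",
        "search_result": {"data": {"data": [{  # notes are read from search_result.data.data
            "id": "note-id",                   # used for the xiaohongshu.com/explore/<id> link
            "note_card": {
                "type": "video",               # "video" or "normal" (image post)
                "display_title": "example title",
                "image_list": ["https://example.com/1.jpg"],
                "user": {"nick_name": "author", "avatar": "https://example.com/a.jpg"}
            }
        }]}}
    }]
}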

+ 78 - 6
xiaohongshu_search.py

@@ -9,9 +9,12 @@ import json
 import os
 import argparse
 import time
+import logging
 from datetime import datetime
 from typing import Dict, Any
 
+logger = logging.getLogger(__name__)
+
 
 class XiaohongshuSearch:
     """小红书笔记搜索API封装类"""
@@ -20,12 +23,13 @@ class XiaohongshuSearch:
     TOOL_NAME = "xhs_note_search"
     PLATFORM = "xiaohongshu"
 
-    def __init__(self, results_dir: str = None):
+    def __init__(self, results_dir: str = None, cache_dir: str = "search_cache"):
         """
         Initialize the API client
 
         Args:
             results_dir: Output directory for results; defaults to data/search under the project root
+            cache_dir: Cache directory, defaults to search_cache
         """
         self.api_url = f"{self.BASE_URL}/tools/call/{self.TOOL_NAME}"
 
@@ -38,19 +42,60 @@ class XiaohongshuSearch:
             project_root = os.path.dirname(os.path.dirname(script_dir))
             self.results_base_dir = os.path.join(project_root, "data", "search")
 
+        # Set up the cache directory
+        self.cache_dir = cache_dir
+        if cache_dir:
+            os.makedirs(cache_dir, exist_ok=True)
+
+    def _get_cache_key(
+        self,
+        keyword: str,
+        content_type: str,
+        sort_type: str,
+        publish_time: str
+    ) -> str:
+        """
+        Build the cache key for a search.
+
+        Args:
+            keyword: Search keyword
+            content_type: Content-type filter
+            sort_type: Sort order
+            publish_time: Publish-time filter
+
+        Returns:
+            Cache key string
+        """
+        return f"{keyword}_{content_type}_{sort_type}_{publish_time}"
+
+    def _get_cache_path(self, cache_key: str) -> str:
+        """
+        Resolve the cache file path for a key.
+
+        Args:
+            cache_key: Cache key
+
+        Returns:
+            Full path to the cache file
+        """
+        # Strip characters that are unsafe in file names
+        safe_key = cache_key.replace('/', '_').replace('\\', '_').replace(' ', '_')
+        return os.path.join(self.cache_dir, f"{safe_key}.json")
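Together, the two helpers above map one (keyword, content_type, sort_type, publish_time) tuple to one JSON file on disk. A quick sketch of the resulting layout, with illustrative values:

# Illustrative only: how one search maps to one cache file.
xhs = XiaohongshuSearch(cache_dir="search_cache")
key = xhs._get_cache_key("咖啡 探店", "不限", "综合", "不限")
# key  == "咖啡 探店_不限_综合_不限"
path = xhs._get_cache_path(key)
# path == "search_cache/咖啡_探店_不限_综合_不限.json"  (the space is sanitized to "_")

Because the key is a plain "_"-joined string, a keyword that itself contains "_" can collide with a different parameter tuple; hashing the tuple (e.g. via hashlib) would avoid this at the cost of opaque file names.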
+
     def search(
         self,
         keyword: str,
-        content_type: str = "图文",
+        content_type: str = "不限",
         sort_type: str = "综合",
         publish_time: str = "不限",
         cursor: str = "",
         timeout: int = 30,
-        max_retries: int = 3,
-        retry_delay: int = 2
+        max_retries: int = 5,
+        retry_delay: int = 2,
+        use_cache: bool = True
     ) -> Dict[str, Any]:
         """
-        Search Xiaohongshu notes (with retry support)
+        Search Xiaohongshu notes (with retry support and caching)
 
         Args:
             keyword: Search keyword
@@ -61,6 +106,7 @@ class XiaohongshuSearch:
             timeout: Request timeout in seconds (default 30)
-            max_retries: Maximum number of retries (default 3)
+            max_retries: Maximum number of retries (default 5)
             retry_delay: Delay between retries in seconds (default 2)
+            use_cache: Whether to use the on-disk cache (default True)
 
         Returns:
             JSON data of the API response
@@ -68,9 +114,24 @@ class XiaohongshuSearch:
         Raises:
             requests.exceptions.RequestException: Raised when all retries have failed
         """
+        # Check the cache first
+        if use_cache and self.cache_dir:
+            cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+            cache_path = self._get_cache_path(cache_key)
+
+            if os.path.exists(cache_path):
+                try:
+                    with open(cache_path, 'r', encoding='utf-8') as f:
+                        cached_result = json.load(f)
+                    logger.info(f"  ✓ Cache hit: {keyword}")
+                    return cached_result
+                except Exception as e:
+                    logger.warning(f"  Failed to read cache: {e}; re-running the search")
+
+        # Cache miss or caching disabled: run the actual search
         payload = {
             "keyword": keyword,
-            "content_type": content_type,
+            "content_type": '不限',  # 使用映射后的参数
             "sort_type": sort_type,
             "publish_time": publish_time,
             "cursor": cursor
@@ -106,6 +167,17 @@ class XiaohongshuSearch:
                 if attempt > 1:
                     print(f"    ✓ 重试成功")
 
+                # Save the result to the cache
+                if use_cache and self.cache_dir:
+                    try:
+                        cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+                        cache_path = self._get_cache_path(cache_key)
+                        with open(cache_path, 'w', encoding='utf-8') as f:
+                            json.dump(result, f, ensure_ascii=False, indent=2)
+                        logger.info(f"  ✓ Cached: {keyword}")
+                    except Exception as e:
+                        logger.warning(f"  Failed to write cache: {e}")
+
                 return result
 
             except requests.exceptions.RequestException as e: