刘立冬 · 3 weeks ago
parent commit e488de4152
4 changed files with 1468 additions and 231 deletions
  1. enhanced_search_v2.py  (+371 −186)
  2. llm_evaluator.py  (+201 −39)
  3. visualize_stage5_results.py  (+818 −0)
  4. xiaohongshu_search.py  (+78 −6)

enhanced_search_v2.py (+371 −186)

@@ -11,9 +11,11 @@ import copy
 import time
 import os
 import argparse
+import subprocess
 from typing import Dict, List, Any, Optional, Set, Tuple
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from itertools import combinations

 from openrouter_client import OpenRouterClient
 from llm_evaluator import LLMEvaluator
@@ -41,7 +43,10 @@ class EnhancedSearchV2:
         dimension_associations_path: str,
         optimized_clustered_data_path: str,
         openrouter_api_key: Optional[str] = None,
-        output_dir: str = "output_v2"
+        output_dir: str = "output_v2",
+        top_n: int = 10,
+        max_total_searches: Optional[int] = None,
+        search_max_workers: int = 3
     ):
         """
         Initialize the system
@@ -52,11 +57,17 @@ class EnhancedSearchV2:
             optimized_clustered_data_path: Path to the persona feature library
             openrouter_api_key: OpenRouter API key
             output_dir: Output directory
+            top_n: Keep the N highest-scoring search words per original feature (default 10)
+            max_total_searches: Global cap on the number of searches (default None = unlimited)
+            search_max_workers: Search concurrency (default 3)
         """
         self.how_json_path = how_json_path
         self.dimension_associations_path = dimension_associations_path
         self.optimized_clustered_data_path = optimized_clustered_data_path
         self.output_dir = output_dir
+        self.top_n = top_n
+        self.max_total_searches = max_total_searches
+        self.search_max_workers = search_max_workers

         # Create the output directory
         os.makedirs(output_dir, exist_ok=True)
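For reference, a minimal construction sketch showing the new knobs together; the three input paths below are hypothetical placeholders:

    searcher = EnhancedSearchV2(
        how_json_path='how.json',                         # hypothetical paths
        dimension_associations_path='associations.json',
        optimized_clustered_data_path='clustered.json',
        top_n=10,               # keep the 10 best search words per original feature
        max_total_searches=50,  # illustrative global cap; None means unlimited
        search_max_workers=3    # concurrent Xiaohongshu searches
    )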
@@ -572,38 +583,95 @@ class EnhancedSearchV2:
             'sub_classifications': sub_classifications
         }

-    # ========== Stage 3: extract feature lists ==========
+    # ========== Stage 3: filter high-similarity matches (>0.8) ==========

-    def stage3_extract_features(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def stage3_filter_high_similarity_matches(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """
         """
-        阶段3:从关联分类中提取特征列表
+        阶段3:筛选高相似度匹配(>0.8)
+
+        遍历how解构中的所有原始特征,找出匹配结果中相似度>0.8
+        且人设特征名称在Stage2关联范围内的高质量匹配
 
 
         Args:
         Args:
             associations_data: 阶段2的关联数据
             associations_data: 阶段2的关联数据
 
 
         Returns:
         Returns:
-            带特征列表的数据
+            带高相似度候选的数据
         """
         """
         logger.info("=" * 60)
         logger.info("=" * 60)
-        logger.info("阶段3:提取特征列表")
+        logger.info("阶段3:筛选高相似度匹配(>0.8)")
         logger.info("=" * 60)
         logger.info("=" * 60)
 
 
         for idx, feature_result in enumerate(associations_data, 1):
-            logger.info(f"\n[{idx}/{len(associations_data)}] Processing: {feature_result['原始特征名称']}")
+            original_feature_name = feature_result['原始特征名称']
+            logger.info(f"\n[{idx}/{len(associations_data)}] Processing: {original_feature_name}")

-            for assoc in feature_result.get('找到的关联', []):
-                target_path = assoc['目标分类路径']
-                logger.info(f"  Extracting features: {target_path}")
+            # Step 1: collect the Stage 2 association scope (classification names + tags)
+            stage2_scope = self._collect_stage2_scope(feature_result)
+            logger.info(f"  Stage 2 scope contains {len(stage2_scope)} classifications/tags")

-                # Extract features
-                features = self._find_features_by_path(target_path)
+            # Step 2: walk every original feature in the how deconstruction and find high-similarity matches
+            high_sim_candidates = []
+            total_checked = 0
+            high_sim_found = 0
+
+            how_result = self.how_data.get('how解构结果', {})
+            for level_name, level_list in how_result.items():
+                if not isinstance(level_list, list):
+                    continue

-                # Attach to the association
-                assoc['特征列表'] = features
-                logger.info(f"    Found {len(features)} features")
+                for item in level_list:
+                    for step in item.get('how步骤列表', []):
+                        for feature in step.get('特征列表', []):
+                            # Get every match for this feature
+                            matches = feature.get('匹配结果', [])
+                            total_checked += len(matches)
+
+                            # Keep matches with similarity >0.8 that fall within the Stage 2 scope
+                            for match in matches:
+                                sim = match.get('匹配结果', {}).get('相似度', 0)
+                                persona_feature_name = match.get('人设特征名称', '')
+
+                                if sim > 0.8 and persona_feature_name in stage2_scope:
+                                    high_sim_found += 1
+                                    # Record the provenance
+                                    high_sim_candidates.append({
+                                        '人设特征名称': persona_feature_name,
+                                        '相似度': sim,
+                                        '特征类型': match.get('特征类型', ''),
+                                        '特征分类': match.get('特征分类', []),
+                                        '人设特征层级': match.get('人设特征层级', ''),
+                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                        '来源原始特征': feature.get('特征名称', '')  # which original feature it came from
+                                    })
+
+            logger.info(f"  Checked {total_checked} matches")
+            logger.info(f"  Found {high_sim_found} matches with similarity >0.8")
+
+            # Sort by similarity descending and dedupe (keep only the top score per persona feature name)
+            seen_names = set()
+            unique_candidates = []
+            high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
+
+            for candidate in high_sim_candidates:
+                name = candidate['人设特征名称']
+                if name not in seen_names:
+                    seen_names.add(name)
+                    unique_candidates.append(candidate)
+
+            # Attach to the result
+            feature_result['高相似度候选'] = unique_candidates
+            logger.info(f"  {len(unique_candidates)} high-similarity candidates left after dedup")
+
+            # Show the top 5
+            if unique_candidates:
+                logger.info("  Top 5:")
+                for c in unique_candidates[:5]:
+                    logger.info(f"    • {c['人设特征名称']} ({c['相似度']:.3f}) ← from \"{c['来源原始特征']}\"")

         # Save results
-        output_path = os.path.join(self.output_dir, "stage3_features.json")
+        output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
         self._save_json(associations_data, output_path)

         logger.info(f"\n" + "=" * 60)
@@ -612,6 +680,29 @@ class EnhancedSearchV2:

         return associations_data

+
+    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+        """
+        收集Stage2找到的所有分类名和标签,形成范围集合
+
+        Args:
+            feature_result: 特征结果数据
+
+        Returns:
+            包含所有分类名和标签的集合
+        """
+        scope = set()
+
+        for assoc in feature_result.get('找到的关联', []):
+            # 添加分类名
+            scope.add(assoc['分类名称'])
+
+            # 添加所有标签
+            tags = assoc.get('标签列表', [])
+            scope.update(tags)
+
+        return scope
+
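To make the new Stage 3 selection concrete, here is a minimal, self-contained sketch of the filter-then-dedupe step (similarity > 0.8 within the Stage 2 scope, keeping only the best score per persona feature name). The flat match records are illustrative; the real ones nest the similarity under '匹配结果':

    from typing import Any, Dict, List, Set

    def pick_high_similarity(matches: List[Dict[str, Any]], scope: Set[str],
                             threshold: float = 0.8) -> List[Dict[str, Any]]:
        # Keep matches above the threshold whose persona feature name is in scope
        kept = [m for m in matches
                if m.get('相似度', 0) > threshold and m.get('人设特征名称', '') in scope]
        # Highest similarity first, so the first occurrence of a name is its best score
        kept.sort(key=lambda m: m['相似度'], reverse=True)
        seen, unique = set(), []
        for m in kept:
            if m['人设特征名称'] not in seen:
                seen.add(m['人设特征名称'])
                unique.append(m)
        return unique

    scope = {'露营', '手冲咖啡'}
    matches = [
        {'人设特征名称': '露营', '相似度': 0.85},
        {'人设特征名称': '露营', '相似度': 0.92},  # duplicate name, higher score wins
        {'人设特征名称': '烘焙', '相似度': 0.95},  # outside the Stage 2 scope, dropped
    ]
    print(pick_high_similarity(matches, scope))
    # -> [{'人设特征名称': '露营', '相似度': 0.92}]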
     def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
         """
         Look up a feature list by classification path
@@ -630,56 +721,63 @@ class EnhancedSearchV2:
         # Deep copy
         return copy.deepcopy(features)

-    # ========== Stage 4: generate search words + LLM quality evaluation ==========
+    # ========== Stage 4: multi-word combinations + LLM evaluation ==========

     def stage4_generate_and_evaluate_search_words(
         self,
-        features_data: List[Dict[str, Any]]
+        features_data: List[Dict[str, Any]],
+        max_workers: int = 4,
+        max_candidates: int = 20,
+        max_combo_length: int = 4
     ) -> List[Dict[str, Any]]:
         """
         """
-        阶段4:生成搜索词并用LLM评估质量
+        阶段4:多词组合 + LLM评估
+
+        基于Stage1的基础词和Stage3的高相似度候选,
+        生成所有2-N词组合,通过LLM评估选出Top10
 
 
         Args:
         Args:
-            features_data: 阶段3的特征数据
+            features_data: 阶段3的数据(包含高相似度候选)
+            max_workers: 并发评估的原始特征数(默认4)
+            max_candidates: 参与组合的最大候选词数(默认20)
+            max_combo_length: 最大组合词数(默认4,即基础词+3个候选)
 
 
         Returns:
         Returns:
             带LLM评估的数据
             带LLM评估的数据
         """
         """
         logger.info("=" * 60)
         logger.info("=" * 60)
-        logger.info("阶段4:生成搜索词 + LLM评估质量")
+        logger.info("阶段4:多词组合 + LLM评估")
+        logger.info(f"  最大候选词数: {max_candidates}")
+        logger.info(f"  最大组合长度: {max_combo_length} 词")
+        logger.info(f"  并发数: {max_workers} 个原始特征")
         logger.info("=" * 60)
         logger.info("=" * 60)
 
 
-        for idx, feature_result in enumerate(features_data, 1):
-            logger.info(f"\n[{idx}/{len(features_data)}] Processing: {feature_result['原始特征名称']}")
-
-            # Generate search words
-            self._add_search_words(feature_result)
-
-            # Collect all search words
-            all_search_words = self._collect_all_search_words(feature_result)
-
-            if not all_search_words:
-                logger.info(f"  No search words, skipping")
-                continue
-
-            logger.info(f"  Generated {len(all_search_words)} search words")
-
-            # Batched LLM evaluation (10 per batch)
-            logger.info(f"  Starting LLM evaluation...")
-            original_feature = feature_result['原始特征名称']
-            evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
-                original_feature=original_feature,
-                search_words=[sw['search_word'] for sw in all_search_words],
-                batch_size=10
-            )
+        total_features = len(features_data)

-            # Write the evaluation results back onto the feature nodes
-            self._write_back_evaluations(feature_result, evaluated)
+        # Process different original features in parallel with a ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks
+            futures = []
+            for idx, feature_result in enumerate(features_data, 1):
+                future = executor.submit(
+                    self._process_single_feature_combinations,
+                    idx,
+                    total_features,
+                    feature_result,
+                    max_candidates,
+                    max_combo_length
+                )
+                futures.append((future, feature_result))

-            logger.info(f"  Evaluation done, top score: {evaluated[0]['score']:.3f}")
+            # Wait for all tasks to finish and collect the results
+            for future, feature_result in futures:
+                try:
+                    _ = future.result()  # wait for completion; results are already written back into feature_result
+                except Exception as e:
+                    logger.error(f"  Evaluation failed: {feature_result['原始特征名称']}, error: {e}")

         # Save results
-        output_path = os.path.join(self.output_dir, "stage4_with_llm_scores.json")
+        output_path = os.path.join(self.output_dir, "stage4_combinations_evaluated.json")
         self._save_json(features_data, output_path)

         logger.info(f"\n" + "=" * 60)
@@ -688,95 +786,152 @@ class EnhancedSearchV2:

         return features_data

-    def _add_search_words(self, result: Dict[str, Any]):
+    def _process_single_feature_combinations(
+        self,
+        idx: int,
+        total: int,
+        feature_result: Dict[str, Any],
+        max_candidates: int,
+        max_combo_length: int
+    ) -> None:
         """
         """
-        为结果项添加search_word字段(去重)
+        处理单个原始特征的组合生成和评估
+
+        Steps:
+        1. Get base_word from Stage1's 最高匹配信息
+        2. Get candidates from Stage3's 高相似度候选 (top max_candidates)
+        3. Generate 2-N word combinations
+        4. LLM batch evaluation
+        5. Select Top 10 and write back
 
 
         Args:
         Args:
-            result: 单个结果项
+            idx: 特征索引
+            total: 总特征数
+            feature_result: 特征结果数据
+            max_candidates: 参与组合的最大候选词数
+            max_combo_length: 最大组合词数
         """
         """
-        # 获取基础词(人设特征名称)
-        base_word = result.get('最高匹配信息', {}).get('人设特征名称', '')
+        original_feature = feature_result['原始特征名称']
+        logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")
 
 
+        # 步骤1: 获取基础词
+        base_word = feature_result.get('最高匹配信息', {}).get('人设特征名称', '')
         if not base_word:
         if not base_word:
+            logger.info(f"  无基础词,跳过")
+            feature_result['组合评估结果'] = []
             return
             return
 
 
-        # 去重集合(在当前结果项范围内)
-        seen_words: Set[str] = set()
+        logger.info(f"  基础词: {base_word}")
 
 
-        # 遍历所有关联的特征列表
-        for assoc in result.get('找到的关联', []):
-            for feature in assoc.get('特征列表', []):
-                feature_name = feature.get('特征名称', '')
+        # 步骤2: 获取候选词(从高相似度候选中)
+        high_sim_candidates = feature_result.get('高相似度候选', [])
 
 
-                if not feature_name:
-                    feature['search_word'] = None
-                    continue
+        # 限制候选词数量
+        candidates = high_sim_candidates[:max_candidates]
+        candidate_words = [c['人设特征名称'] for c in candidates]
 
 
-                # 生成组合词
-                search_word = f"{base_word} {feature_name}"
+        if not candidate_words:
+            logger.info(f"  无候选词,跳过")
+            feature_result['组合评估结果'] = []
+            return
 
 
-                # 检查是否重复
-                if search_word not in seen_words:
-                    feature['search_word'] = search_word
-                    seen_words.add(search_word)
-                else:
-                    feature['search_word'] = None
+        logger.info(f"  候选词数量: {len(candidate_words)} (限制: {max_candidates})")
+
+        # 步骤3: 生成所有组合
+        all_combinations = []
+
+        # 生成1词到max_combo_length-1词的候选词组合(因为还要加上base_word)
+        for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
+            for combo in combinations(candidate_words, length):
+                # 组合成搜索词:基础词 + 候选词组合
+                search_phrase = base_word + ' ' + ' '.join(combo)
+                all_combinations.append({
+                    'search_word': search_phrase,
+                    'base_word': base_word,
+                    'candidate_words': list(combo),
+                    'combo_length': length + 1  # +1 因为包含base_word
+                })
 
 
-    def _collect_all_search_words(self, feature_result: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """
-        收集结果项中所有非空的search_word
+        logger.info(f"  生成 {len(all_combinations)} 个组合")
 
 
-        Args:
-            feature_result: 结果项
+        # 步骤4: LLM批量评估
+        logger.info(f"  开始LLM评估...")
+        evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
+            original_feature=original_feature,
+            search_words=[c['search_word'] for c in all_combinations],
+            batch_size=50
+        )
 
 
-        Returns:
-            搜索词列表,每个包含 search_word 和特征引用
-        """
-        search_words = []
-
-        for assoc_idx, assoc in enumerate(feature_result.get('找到的关联', [])):
-            for feat_idx, feature in enumerate(assoc.get('特征列表', [])):
-                sw = feature.get('search_word')
-                if sw and sw.strip():
-                    search_words.append({
-                        'search_word': sw,
-                        'assoc_idx': assoc_idx,
-                        'feat_idx': feat_idx,
-                        'feature_ref': feature  # 引用,方便写回
-                    })
+        # 步骤5: 选出Top 10
+        top_10 = evaluated[:10]
+
+        # 写回结果
+        feature_result['组合评估结果'] = top_10
 
 
-        return search_words
+        max_score = top_10[0]['score'] if top_10 else 0.0
+        logger.info(f"  评估完成,Top 10 最高分: {max_score:.3f}")
 
 
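The number of generated combinations grows quickly with max_combo_length: each feature yields the sum of C(n, k) phrases for k = 1 .. max_combo_length - 1, with n capped at max_candidates. A quick sketch of that arithmetic:

    import math

    def combo_count(n_candidates: int, max_combo_length: int) -> int:
        # k candidate words are chosen per phrase; base_word is prepended,
        # so phrase lengths run from 2 up to max_combo_length words.
        return sum(math.comb(n_candidates, k) for k in range(1, max_combo_length))

    print(combo_count(20, 4))  # 1350 phrases to evaluate per feature
    print(combo_count(20, 3))  # 210, which is why run_full_pipeline lowers max_combo_length to 3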
-    def _write_back_evaluations(
+    # ========== Stage 5: execute searches ==========
+
+    def _execute_single_search(
         self,
-        feature_result: Dict[str, Any],
-        evaluated: List[Dict[str, Any]]
-    ):
+        idx: int,
+        total: int,
+        search_word: str,
+        feature_ref: Dict[str, Any]
+    ) -> Dict[str, Any]:
         """
-        Write the LLM evaluation results back onto the feature nodes
+        Execute a single search task (for concurrent execution)

         Args:
-            feature_result: Result item
-            evaluated: List of evaluation results
+            idx: Search index
+            total: Total number of searches
+            search_word: Search word
+            feature_ref: Feature reference (results are written into it)
+
+        Returns:
+            Search result info
         """
-        # Build a lookup map
-        eval_map = {e['search_word']: e for e in evaluated}
+        logger.info(f"[{idx}/{total}] Searching: {search_word}")

-        # Write back onto the feature nodes
-        for assoc in feature_result.get('找到的关联', []):
-            for feature in assoc.get('特征列表', []):
-                sw = feature.get('search_word')
-                if sw and sw in eval_map:
-                    eval_result = eval_map[sw]
-                    feature['llm_evaluation'] = {
-                        'score': eval_result['score'],
-                        'rank': eval_result['rank'],
-                        'reasoning': eval_result['reasoning'],
-                        'original_feature': eval_result['original_feature']
-                    }
+        try:
+            result = self.search_client.search(
+                keyword=search_word,
+                content_type='不限',
+                sort_type='综合',
+                max_retries=3,
+                use_cache=True  # enable the search cache
+            )

-    # ========== Stage 5: execute searches ==========
+            note_count = len(result.get('data', {}).get('data', []))
+            logger.info(f"  ✓ Success, fetched {note_count} notes")
+
+            # Write the result in
+            feature_ref['search_result'] = result
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'success',
+                'note_count': note_count,
+                'search_params': {
+                    'keyword': search_word,
+                    'content_type': '不限',  # matches the actual request above
+                    'sort_type': '综合'
+                }
+            }
+
+            return {'status': 'success', 'search_word': search_word, 'note_count': note_count}
+
+        except Exception as e:
+            logger.error(f"  ✗ Failed: {e}")
+            feature_ref['search_result'] = None
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'failed',
+                'note_count': 0,
+                'error': str(e)
+            }
+
+            return {'status': 'failed', 'search_word': search_word, 'error': str(e)}

     def stage5_execute_searches(
         self,
@@ -799,7 +954,7 @@ class EnhancedSearchV2:
         logger.info("阶段5:执行小红书搜索")
         logger.info("阶段5:执行小红书搜索")
         logger.info("=" * 60)
         logger.info("=" * 60)
 
 
-        # 按原始特征分组收集搜索词
+        # 按原始特征分组收集搜索词(从Stage4的组合评估结果读取)
         feature_search_groups = {}
         feature_search_groups = {}
 
 
         for feature_result in features_data:
         for feature_result in features_data:
@@ -808,21 +963,19 @@ class EnhancedSearchV2:
             if original_feature not in feature_search_groups:
                 feature_search_groups[original_feature] = []

-            for assoc in feature_result.get('找到的关联', []):
-                for feature in assoc.get('特征列表', []):
-                    sw = feature.get('search_word')
-                    if not sw:
-                        continue
+            # Read from Stage 4's combination evaluation results
+            for eval_item in feature_result.get('组合评估结果', []):
+                sw = eval_item.get('search_word')
+                if not sw:
+                    continue

-                    # Get the LLM score
-                    llm_eval = feature.get('llm_evaluation', {})
-                    score = llm_eval.get('score', 0.0)
+                score = eval_item.get('score', 0.0)

-                    feature_search_groups[original_feature].append({
-                        'search_word': sw,
-                        'score': score,
-                        'feature_ref': feature
-                    })
+                feature_search_groups[original_feature].append({
+                    'search_word': sw,
+                    'score': score,
+                    'feature_ref': eval_item  # reference the evaluation item; search results are written into it
+                })

         # Take the Top N per group
         all_searches = []
@@ -844,52 +997,35 @@ class EnhancedSearchV2:

             logger.info(f"  {original_feature}: picked Top {len(selected)} of {len(sorted_list)} search words ({filtered} filtered out)")

-        logger.info(f"\n{len(all_searches)} search tasks in total (before filtering: {total_before_filter}, filtered out: {total_filtered})")
-
-        # Execute the searches
-        for idx, item in enumerate(all_searches, 1):
-            sw = item['search_word']
-            feature = item['feature_ref']
+        # Apply the global cap on search count
+        if self.max_total_searches and len(all_searches) > self.max_total_searches:
+            logger.info(f"  Applying global cap: reducing from {len(all_searches)} to {self.max_total_searches}")
+            all_searches = all_searches[:self.max_total_searches]

-            logger.info(f"[{idx}/{len(all_searches)}] Searching: {sw}")
+        logger.info(f"\n{len(all_searches)} search tasks in total (before filtering: {total_before_filter}, filtered out: {total_filtered})")
+        logger.info(f"  Executing searches concurrently (workers: {self.search_max_workers})")

-            try:
-                result = self.search_client.search(
-                    keyword=sw,
-                    content_type='图文',
-                    sort_type='综合',
-                    max_retries=3
+        # Execute the searches concurrently with a ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
+            # Submit all search tasks
+            futures = []
+            for idx, item in enumerate(all_searches, 1):
+                future = executor.submit(
+                    self._execute_single_search,
+                    idx,
+                    len(all_searches),
+                    item['search_word'],
+                    item['feature_ref']
                 )
+                futures.append(future)

-                note_count = len(result.get('data', {}).get('data', []))
-                logger.info(f"  ✓ Success, fetched {note_count} notes")
-
-                # Write the result in
-                feature['search_result'] = result
-                feature['search_metadata'] = {
-                    'searched_at': datetime.now().isoformat(),
-                    'status': 'success',
-                    'note_count': note_count,
-                    'search_params': {
-                        'keyword': sw,
-                        'content_type': '图文',
-                        'sort_type': '综合'
-                    }
-                }
-
-            except Exception as e:
-                logger.error(f"  ✗ Failed: {e}")
-                feature['search_result'] = None
-                feature['search_metadata'] = {
-                    'searched_at': datetime.now().isoformat(),
-                    'status': 'failed',
-                    'note_count': 0,
-                    'error': str(e)
-                }
-
-            # Delay between requests
-            if idx < len(all_searches):
-                time.sleep(search_delay)
+            # Wait for all searches to finish
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                    # Results are already written into feature_ref; nothing more to do
+                except Exception as e:
+                    logger.error(f"  Search task failed: {e}")

         # Save results
         output_path = os.path.join(self.output_dir, "stage5_with_search_results.json")
@@ -974,7 +1110,7 @@ class EnhancedSearchV2:
         feature_node: Dict[str, Any]
     ) -> Dict[str, Any]:
         """
-        Evaluate a single search result
+        Evaluate a single search result (using parallel evaluation)

         Args:
             original_feature: Original feature
@@ -986,12 +1122,12 @@ class EnhancedSearchV2:
         search_word = feature_node.get('search_word', '')
         notes = feature_node['search_result'].get('data', {}).get('data', [])

-        return self.llm_evaluator.evaluate_search_results(
+        return self.llm_evaluator.evaluate_search_results_parallel(
             original_feature=original_feature,
             search_word=search_word,
             notes=notes,
             max_notes=20,
-            max_images_per_note=2
+            max_workers=20  # evaluate each note with 20 concurrent workers
         )

     # ========== Stage 7: extended searches ==========
@@ -1052,9 +1188,10 @@ class EnhancedSearchV2:
             try:
                 result = self.search_client.search(
                     keyword=extended_kw,
-                    content_type='图文',
+                    content_type='不限',
                     sort_type='综合',
-                    max_retries=3
+                    max_retries=3,
+                    use_cache=True  # enable the search cache
                 )

                 note_count = len(result.get('data', {}).get('data', []))
@@ -1121,26 +1258,53 @@ class EnhancedSearchV2:
             # Stage 2
             stage2_results = self.stage2_find_associations(stage1_results)

-            # Stage 3
-            stage3_results = self.stage3_extract_features(stage2_results)
+            # Stage 3 - new approach: filter high-similarity matches
+            stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)

             # Stage 4
-            stage4_results = self.stage4_generate_and_evaluate_search_words(stage3_results)
+            stage4_results = self.stage4_generate_and_evaluate_search_words(
+                stage3_results,
+                max_workers=8,         # raise concurrency from 4 to 8
+                max_combo_length=3     # lower the combination length from 4 to 3
+            )

             # Stage 5
-            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=10)
+            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=self.top_n)

-            # Stage 6
-            stage6_results = self.stage6_evaluate_search_results(stage5_results)
+            # Stage 6 - execution disabled for now (code kept)
+            # stage6_results = self.stage6_evaluate_search_results(stage5_results)

-            # Stage 7
-            final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
+            # Stage 7 - execution disabled for now (code kept)
+            # final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
+
+            logger.info("\n" + "=" * 60)
+            logger.info("✓ Full pipeline finished (Stages 1-5)")
+            logger.info("=" * 60)

+            # Automatically generate the visualization
             logger.info("\n" + "=" * 60)
-            logger.info("✓ Full pipeline finished")
+            logger.info("Generating visualization...")
             logger.info("=" * 60)

-            return final_results
+            try:
+                result = subprocess.run(
+                    ['python3', 'visualize_stage5_results.py'],
+                    capture_output=True,
+                    text=True,
+                    timeout=60
+                )
+
+                if result.returncode == 0:
+                    logger.info("✓ Visualization generated")
+                    logger.info(result.stdout)
+                else:
+                    logger.error(f"Visualization generation failed: {result.stderr}")
+            except subprocess.TimeoutExpired:
+                logger.error("Visualization generation timed out")
+            except Exception as e:
+                logger.error(f"Visualization generation error: {e}")
+
+            return stage5_results

         except Exception as e:
             logger.error(f"Pipeline execution failed: {e}")
@@ -1152,7 +1316,7 @@ def main():
     parser = argparse.ArgumentParser(description='Enhanced Search System V2')
     parser.add_argument(
         '--how-json',
-        default='69114f150000000007001f30_how.json',
+        default='69114f150000000007001f30_how copy.json',
         help='Path to the how-deconstruction file'
     )
     parser.add_argument(
@@ -1175,6 +1339,24 @@ def main():
         default='output_v2',
         help='Output directory'
     )
+    parser.add_argument(
+        '--top-n',
+        type=int,
+        default=10,
+        help='Keep the N highest-scoring search words per original feature (default 10)'
+    )
+    parser.add_argument(
+        '--max-total-searches',
+        type=int,
+        default=None,
+        help='Global cap on the number of searches (default None = unlimited)'
+    )
+    parser.add_argument(
+        '--search-workers',
+        type=int,
+        default=3,
+        help='Search concurrency (default 3)'
+    )

     args = parser.parse_args()

@@ -1184,7 +1366,10 @@ def main():
         dimension_associations_path=args.dimension_associations,
         optimized_clustered_data_path=args.optimized_clustered,
         openrouter_api_key=args.api_key,
-        output_dir=args.output_dir
+        output_dir=args.output_dir,
+        top_n=args.top_n,
+        max_total_searches=args.max_total_searches,
+        search_max_workers=args.search_workers
     )

     # Run the full pipeline

llm_evaluator.py (+201 −39)

@@ -42,20 +42,24 @@ class LLMEvaluator:
         """
         """
         prompt = f"""你是一个小红书内容分析专家。
         prompt = f"""你是一个小红书内容分析专家。
 
 
-任务:评估搜索词能否找到包含目标特征的内容
+# 任务说明
+从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}")
-原始特征:"{original_feature}"
-组合搜索词:"{search_word}"
+## 可选词汇
+{search_word}
-评估标准:
-1. 这个搜索词在小红书上能否找到包含"{original_feature}"相关元素的帖子
-2. 搜索词的关键词组合是否合理、是否过于宽泛或过于具体
-3. 搜索词与原始特征的语义关联性
+## 要求
+1. 只能使用可选词汇中的词,可以进行以下变化:
+   - 直接使用原词或括号内的同义词
+   - 多个词组合
+   - 适当精简
+2. 不能添加可选词汇以外的新词
+3. 按推荐程度排序(越靠前越推荐)
-请仔细分析并返回JSON格式:
+## 输出格式(JSON)
 {{
-  "score": 0.75,  // 0.0-1.0,能找到相关内容的可能性
-  "reasoning": "详细的评估理由,说明为什么给出这个分数"
+  "score": 0.75,
+  "reasoning": "评估理由"
 }}

 注意:只返回JSON,不要其他内容。"""
@@ -136,7 +140,7 @@ class LLMEvaluator:
         self,
         original_feature: str,
         search_words: List[str],
-        batch_size: int = 10
+        batch_size: int = 50
     ) -> List[Dict[str, Any]]:
         """
         Evaluate search words in batches (N per batch, to reduce API calls)
@@ -162,62 +166,71 @@ class LLMEvaluator:

             logger.info(f"  Processing batch {batch_idx + 1}/{total_batches} ({len(batch_words)} search words)")

-            # Build a prompt containing multiple search words
-            words_list = "\n".join([
-                f"{i+1}. {word}"
-                for i, word in enumerate(batch_words)
-            ])
+            # Extract every unique word from the search words as the candidate vocabulary
+            available_words_set = set()
+            for word in batch_words:
+                # Split the search phrase into individual words
+                parts = word.split()
+                available_words_set.update(parts)
+
+            # Convert to a sorted list (for stability)
+            available_words = sorted(available_words_set)
+
+            # Build the vocabulary string (joined with "、")
+            available_words_str = "、".join(available_words)

-            prompt = f"""你是一个小红书内容分析专家。
+            prompt = f"""
-任务:评估以下搜索词在小红书上能否找到包含目标特征"{original_feature}"的内容
+# 任务说明
+从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}"二字)
-搜索词列表:
-{words_list}
+## 可选词汇
+{available_words_str}
-评估标准:
-1. 这个搜索词在小红书上能否找到包含"{original_feature}"相关元素的帖子
-2. 搜索词的关键词组合是否合理、是否过于宽泛或过于具体
-3. 搜索词与原始特征的语义关联性
+## 要求
+1. 只能使用可选词汇中的词,可以进行以下变化:
+   - 直接使用原词或括号内的同义词
+   - 多个词组合
+   - 适当精简
+2. 不能添加可选词汇以外的新词
+3. 按推荐程度排序(越靠前越推荐)
-请为每个搜索词返回评估结果,JSON数组格式:
+## 输出格式(JSON):
 [
   {{
     "index": 1,
-    "score": 0.75,
-    "reasoning": "详细的评估理由"
+    "search_word": "组合的搜索词",
+    "score": 0.85,
+    "reasoning": "推荐理由"
   }},
   {{
     "index": 2,
+    "search_word": "组合的搜索词",
     "score": 0.80,
     "score": 0.80,
-    "reasoning": "详细的评估理由"
+    "reasoning": "推荐理由"
   }}
 ]
-
-注意:
-- index 对应搜索词的编号(1-{len(batch_words)})
-- score 范围 0.0-1.0
 - 只返回JSON数组,不要其他内容"""

             # Call the LLM
             result = self.client.chat_json(prompt=prompt, max_retries=3)

             if result and isinstance(result, list):
-                # Process the results
-                for item in result:
-                    idx = item.get("index", 0) - 1  # convert to a 0-based index
-                    if 0 <= idx < len(batch_words):
+                # Process the results - the new format carries search_word directly
+                for idx, item in enumerate(result):
+                    search_word = item.get("search_word", "")
+                    if search_word:  # make sure there is a search word
                         all_results.append({
-                            "search_word": batch_words[idx],
+                            "search_word": search_word,
                             "score": item.get("score", 0.0),
                             "reasoning": item.get("reasoning", ""),
                             "original_feature": original_feature
                         })
                         logger.info(f"    [{start_idx + idx + 1}/{len(search_words)}] "
-                                   f"{batch_words[idx]}: {item.get('score', 0.0):.3f}")
+                                   f"{search_word}: {item.get('score', 0.0):.3f}")
             else:
                 logger.error(f"  Batch {batch_idx + 1} evaluation failed, skipping")
-                # Add default results for the failed batch
+                # Add default results for the failed batch (using the original search words)
                 for word in batch_words:
                     all_results.append({
                         "search_word": word,
@@ -237,6 +250,155 @@ class LLMEvaluator:

         return all_results

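Note the batch evaluator no longer scores the Stage 4 phrases one by one; it pools their unique words into a vocabulary and asks the LLM to compose and rank its own queries. A minimal sketch of the extraction step, with illustrative phrases:

    batch_words = ["露营 咖啡", "露营 手冲 咖啡"]
    vocab = sorted({part for phrase in batch_words for part in phrase.split()})
    print("、".join(vocab))  # -> 咖啡、手冲、露营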
+    def evaluate_single_note(
+        self,
+        original_feature: str,
+        search_word: str,
+        note: Dict[str, Any],
+        note_index: int = 0
+    ) -> Dict[str, Any]:
+        """
+        评估单个帖子(阶段6,多模态)
+
+        Args:
+            original_feature: 原始特征
+            search_word: 搜索词
+            note: 单个帖子
+            note_index: 帖子索引
+
+        Returns:
+            单个帖子的评估结果
+        """
+        card = note.get("note_card", {})
+        title = card.get("display_title", "")
+        desc = card.get("desc", "")[:500]  # cap the length
+        images = card.get("image_list", [])[:10]  # at most 10 images
+
+        prompt = f"""你是一个小红书内容分析专家。
+
+任务:评估这个帖子是否包含目标特征"{original_feature}"的元素
+
+原始特征:"{original_feature}"
+搜索词:"{search_word}"
+
+帖子内容:
+标题: {title}
+正文: {desc}
+
+请分析帖子的文字和图片内容,返回JSON格式:
+{{
+  "relevance": 0.85,  // 0.0-1.0,相关度
+  "matched_elements": ["元素1", "元素2"],  // 匹配的元素列表
+  "reasoning": "简短的匹配理由"
+}}
+
+只返回JSON,不要其他内容。"""
+
+        result = self.client.chat_json(
+            prompt=prompt,
+            images=images if images else None,
+            max_retries=3
+        )
+
+        if result:
+            return {
+                "note_index": note_index,
+                "relevance": result.get("relevance", 0.0),
+                "matched_elements": result.get("matched_elements", []),
+                "reasoning": result.get("reasoning", "")
+            }
+        else:
+            logger.error(f"  Failed to evaluate note {note_index}: {search_word}")
+            return {
+                "note_index": note_index,
+                "relevance": 0.0,
+                "matched_elements": [],
+                "reasoning": "评估失败"
+            }
+
+    def evaluate_search_results_parallel(
+        self,
+        original_feature: str,
+        search_word: str,
+        notes: List[Dict[str, Any]],
+        max_notes: int = 20,
+        max_workers: int = 20
+    ) -> Dict[str, Any]:
+        """
+        并行评估搜索结果(每个帖子独立评估)
+
+        Args:
+            original_feature: 原始特征
+            search_word: 搜索词
+            notes: 帖子列表
+            max_notes: 最多评估几条帖子
+            max_workers: 最大并发数
+
+        Returns:
+            评估结果汇总
+        """
+        if not notes:
+            return {
+                "overall_relevance": 0.0,
+                "extracted_elements": [],
+                "evaluated_notes": []
+            }
+
+        notes_to_eval = notes[:max_notes]
+        evaluated_notes = []
+
+        logger.info(f"  Evaluating {len(notes_to_eval)} notes in parallel ({max_workers} workers)")
+
+        # Evaluate the notes concurrently, max_workers at a time
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for idx, note in enumerate(notes_to_eval):
+                future = executor.submit(
+                    self.evaluate_single_note,
+                    original_feature,
+                    search_word,
+                    note,
+                    idx
+                )
+                futures.append(future)
+
+            # Collect the results
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                    evaluated_notes.append(result)
+                except Exception as e:
+                    logger.error(f"  Note evaluation failed: {e}")
+
+        # Sort by note_index
+        evaluated_notes.sort(key=lambda x: x['note_index'])
+
+        # Aggregate: compute overall relevance and extract elements
+        if evaluated_notes:
+            overall_relevance = sum(n['relevance'] for n in evaluated_notes) / len(evaluated_notes)
+
+            # Extract all elements and count their frequencies
+            element_counts = {}
+            for note in evaluated_notes:
+                for elem in note['matched_elements']:
+                    element_counts[elem] = element_counts.get(elem, 0) + 1
+
+            # Sort by frequency and take the top 5
+            extracted_elements = sorted(
+                element_counts.keys(),
+                key=lambda x: element_counts[x],
+                reverse=True
+            )[:5]
+        else:
+            overall_relevance = 0.0
+            extracted_elements = []
+
+        return {
+            "overall_relevance": overall_relevance,
+            "extracted_elements": extracted_elements,
+            "evaluated_notes": evaluated_notes
+        }
+
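The frequency tally and top-5 cut above are equivalent to collections.Counter.most_common; a minimal sketch with made-up evaluated notes:

    from collections import Counter

    evaluated_notes = [
        {'relevance': 0.9, 'matched_elements': ['帐篷', '咖啡']},
        {'relevance': 0.6, 'matched_elements': ['帐篷']},
    ]
    overall = sum(n['relevance'] for n in evaluated_notes) / len(evaluated_notes)
    counts = Counter(e for n in evaluated_notes for e in n['matched_elements'])
    top5 = [elem for elem, _ in counts.most_common(5)]
    print(overall, top5)  # 0.75 ['帐篷', '咖啡']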
     def evaluate_search_results(
         self,
         original_feature: str,

visualize_stage5_results.py (+818 −0)

@@ -0,0 +1,818 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 5 search-result visualization tool
+Generates an interactive HTML page with image carousels
+"""
+
+import json
+import os
+from datetime import datetime
+from typing import List, Dict, Any
+
+
+def load_data(json_path: str) -> List[Dict[str, Any]]:
+    """加载JSON数据"""
+    with open(json_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """计算统计数据"""
+    total_features = len(data)
+    total_search_words = 0
+    total_notes = 0
+    video_count = 0
+    normal_count = 0
+
+    for feature in data:
+        search_results = feature.get('组合评估结果', [])
+        total_search_words += len(search_results)
+
+        for search_item in search_results:
+            search_result = search_item.get('search_result', {})
+            notes = search_result.get('data', {}).get('data', [])
+            total_notes += len(notes)
+
+            for note in notes:
+                note_type = note.get('note_card', {}).get('type', '')
+                if note_type == 'video':
+                    video_count += 1
+                else:
+                    normal_count += 1
+
+    return {
+        'total_features': total_features,
+        'total_search_words': total_search_words,
+        'total_notes': total_notes,
+        'video_count': video_count,
+        'normal_count': normal_count,
+        'video_percentage': round(video_count / total_notes * 100, 1) if total_notes > 0 else 0,
+        'normal_percentage': round(normal_count / total_notes * 100, 1) if total_notes > 0 else 0
+    }
+
+
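A minimal usage sketch for this module, assuming the Stage 5 output written by the pipeline (output_v2/stage5_with_search_results.json); the HTML filename is illustrative:

    data = load_data('output_v2/stage5_with_search_results.json')
    stats = calculate_statistics(data)
    generate_html(data, stats, 'stage5_visualization.html')
    print(f"{stats['total_notes']} notes across {stats['total_search_words']} search words")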
+def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path: str):
+    """生成HTML可视化页面"""
+
+    # 准备数据JSON(用于JavaScript)
+    data_json = json.dumps(data, ensure_ascii=False, indent=2)
+
+    html_content = f'''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Stage 5 Search Results Visualization</title>
+    <style>
+        * {{
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }}
+
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: #f5f7fa;
+            color: #333;
+            overflow-x: hidden;
+        }}
+
+        /* Top statistics panel */
+        .stats-panel {{
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 20px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        }}
+
+        .stats-container {{
+            max-width: 1400px;
+            margin: 0 auto;
+            display: flex;
+            justify-content: space-around;
+            align-items: center;
+            flex-wrap: wrap;
+            gap: 20px;
+        }}
+
+        .stat-item {{
+            text-align: center;
+        }}
+
+        .stat-value {{
+            font-size: 32px;
+            font-weight: bold;
+            margin-bottom: 5px;
+        }}
+
+        .stat-label {{
+            font-size: 14px;
+            opacity: 0.9;
+        }}
+
+        /* Main container */
+        .main-container {{
+            display: flex;
+            max-width: 1400px;
+            margin: 20px auto;
+            gap: 20px;
+            padding: 0 20px;
+            height: calc(100vh - 140px);
+        }}
+
+        /* Left sidebar */
+        .left-sidebar {{
+            width: 30%;
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            overflow-y: auto;
+            position: sticky;
+            top: 20px;
+            height: fit-content;
+            max-height: calc(100vh - 160px);
+        }}
+
+        .feature-group {{
+            border-bottom: 1px solid #e5e7eb;
+        }}
+
+        .feature-header {{
+            padding: 15px 20px;
+            background: #f9fafb;
+            cursor: pointer;
+            user-select: none;
+            transition: background 0.2s;
+        }}
+
+        .feature-header:hover {{
+            background: #f3f4f6;
+        }}
+
+        .feature-header.active {{
+            background: #667eea;
+            color: white;
+        }}
+
+        .feature-title {{
+            font-size: 16px;
+            font-weight: 600;
+            margin-bottom: 5px;
+        }}
+
+        .feature-meta {{
+            font-size: 12px;
+            color: #6b7280;
+        }}
+
+        .feature-header.active .feature-meta {{
+            color: rgba(255,255,255,0.8);
+        }}
+
+        .search-words-list {{
+            display: none;
+            padding: 10px 0;
+        }}
+
+        .search-words-list.expanded {{
+            display: block;
+        }}
+
+        .search-word-item {{
+            padding: 12px 20px 12px 40px;
+            cursor: pointer;
+            border-left: 3px solid transparent;
+            transition: all 0.2s;
+        }}
+
+        .search-word-item:hover {{
+            background: #f9fafb;
+            border-left-color: #667eea;
+        }}
+
+        .search-word-item.active {{
+            background: #ede9fe;
+            border-left-color: #7c3aed;
+        }}
+
+        .search-word-text {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #374151;
+            margin-bottom: 4px;
+        }}
+
+        .search-word-score {{
+            display: inline-block;
+            padding: 2px 8px;
+            border-radius: 12px;
+            font-size: 11px;
+            font-weight: 600;
+            margin-left: 8px;
+        }}
+
+        .score-high {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .score-medium {{
+            background: #fef3c7;
+            color: #92400e;
+        }}
+
+        .score-low {{
+            background: #fee2e2;
+            color: #991b1b;
+        }}
+
+        .search-word-reasoning {{
+            font-size: 12px;
+            color: #6b7280;
+            margin-top: 4px;
+            display: -webkit-box;
+            -webkit-line-clamp: 2;
+            -webkit-box-orient: vertical;
+            overflow: hidden;
+        }}
+
+        /* Right content area */
+        .right-content {{
+            flex: 1;
+            overflow-y: auto;
+            padding-bottom: 40px;
+        }}
+
+        .result-block {{
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            margin-bottom: 30px;
+            padding: 20px;
+            scroll-margin-top: 20px;
+        }}
+
+        .result-header {{
+            margin-bottom: 20px;
+            padding-bottom: 15px;
+            border-bottom: 2px solid #e5e7eb;
+        }}
+
+        .result-title {{
+            font-size: 20px;
+            font-weight: 600;
+            color: #111827;
+            margin-bottom: 10px;
+        }}
+
+        .result-stats {{
+            display: flex;
+            gap: 15px;
+            font-size: 13px;
+            color: #6b7280;
+        }}
+
+        .stat-badge {{
+            background: #f3f4f6;
+            padding: 4px 10px;
+            border-radius: 4px;
+        }}
+
+        /* Notes grid */
+        .notes-grid {{
+            display: grid;
+            grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
+            gap: 20px;
+        }}
+
+        .note-card {{
+            border: 1px solid #e5e7eb;
+            border-radius: 8px;
+            overflow: hidden;
+            cursor: pointer;
+            transition: all 0.3s;
+            background: white;
+        }}
+
+        .note-card:hover {{
+            transform: translateY(-4px);
+            box-shadow: 0 10px 25px rgba(0,0,0,0.15);
+        }}
+
+        /* Image carousel */
+        .image-carousel {{
+            position: relative;
+            width: 100%;
+            height: 280px;
+            background: #f3f4f6;
+            overflow: hidden;
+        }}
+
+        .carousel-images {{
+            display: flex;
+            height: 100%;
+            transition: transform 0.3s ease;
+        }}
+
+        .carousel-image {{
+            min-width: 100%;
+            height: 100%;
+            object-fit: cover;
+        }}
+
+        .carousel-btn {{
+            position: absolute;
+            top: 50%;
+            transform: translateY(-50%);
+            background: rgba(0,0,0,0.5);
+            color: white;
+            border: none;
+            width: 32px;
+            height: 32px;
+            border-radius: 50%;
+            cursor: pointer;
+            font-size: 16px;
+            display: none;
+            align-items: center;
+            justify-content: center;
+            transition: background 0.2s;
+            z-index: 10;
+        }}
+
+        .carousel-btn:hover {{
+            background: rgba(0,0,0,0.7);
+        }}
+
+        .carousel-btn.prev {{
+            left: 8px;
+        }}
+
+        .carousel-btn.next {{
+            right: 8px;
+        }}
+
+        .note-card:hover .carousel-btn {{
+            display: flex;
+        }}
+
+        .carousel-indicators {{
+            position: absolute;
+            bottom: 10px;
+            left: 50%;
+            transform: translateX(-50%);
+            display: flex;
+            gap: 6px;
+            z-index: 10;
+        }}
+
+        .dot {{
+            width: 8px;
+            height: 8px;
+            border-radius: 50%;
+            background: rgba(255,255,255,0.5);
+            cursor: pointer;
+            transition: all 0.2s;
+        }}
+
+        .dot.active {{
+            background: white;
+            width: 24px;
+            border-radius: 4px;
+        }}
+
+        .image-counter {{
+            position: absolute;
+            top: 10px;
+            right: 10px;
+            background: rgba(0,0,0,0.6);
+            color: white;
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 12px;
+            z-index: 10;
+        }}
+
+        /* Note info */
+        .note-info {{
+            padding: 12px;
+        }}
+
+        .note-title {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #111827;
+            margin-bottom: 8px;
+            display: -webkit-box;
+            -webkit-line-clamp: 2;
+            -webkit-box-orient: vertical;
+            overflow: hidden;
+            line-height: 1.4;
+        }}
+
+        .note-meta {{
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            font-size: 12px;
+            color: #6b7280;
+        }}
+
+        .note-type {{
+            padding: 3px 8px;
+            border-radius: 4px;
+            font-weight: 500;
+        }}
+
+        .type-video {{
+            background: #dbeafe;
+            color: #1e40af;
+        }}
+
+        .type-normal {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .note-author {{
+            display: flex;
+            align-items: center;
+            gap: 6px;
+        }}
+
+        .author-avatar {{
+            width: 24px;
+            height: 24px;
+            border-radius: 50%;
+        }}
+
+        /* SVG connection layer */
+        #connection-svg {{
+            position: fixed;
+            top: 0;
+            left: 0;
+            width: 100%;
+            height: 100%;
+            pointer-events: none;
+            z-index: 1;
+        }}
+
+        .connection-line {{
+            stroke: #cbd5e1;
+            stroke-width: 1;
+            stroke-dasharray: 5,5;
+            fill: none;
+            opacity: 0.3;
+            transition: all 0.2s;
+        }}
+
+        .connection-line.active {{
+            stroke: #667eea;
+            stroke-width: 2;
+            stroke-dasharray: none;
+            opacity: 1;
+        }}
+
+        /* Scrollbar styling */
+        ::-webkit-scrollbar {{
+            width: 8px;
+            height: 8px;
+        }}
+
+        ::-webkit-scrollbar-track {{
+            background: #f1f1f1;
+        }}
+
+        ::-webkit-scrollbar-thumb {{
+            background: #888;
+            border-radius: 4px;
+        }}
+
+        ::-webkit-scrollbar-thumb:hover {{
+            background: #555;
+        }}
+    </style>
+</head>
+<body>
+    <!-- Statistics panel -->
+    <div class="stats-panel">
+        <div class="stats-container">
+            <div class="stat-item">
+                <div class="stat-value">📊 {stats['total_features']}</div>
+                <div class="stat-label">Original features</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">🔍 {stats['total_search_words']}</div>
+                <div class="stat-label">Search words</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">📝 {stats['total_notes']}</div>
+                <div class="stat-label">Total notes</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">🎬 {stats['video_count']}</div>
+                <div class="stat-label">Video ({stats['video_percentage']}%)</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">📷 {stats['normal_count']}</div>
+                <div class="stat-label">Image/text ({stats['normal_percentage']}%)</div>
+            </div>
+        </div>
+    </div>
+
+    <!-- SVG connection layer -->
+    <svg id="connection-svg"></svg>
+
+    <!-- Main container -->
+    <div class="main-container">
+        <!-- Left sidebar -->
+        <div class="left-sidebar" id="leftSidebar">
+            <!-- Generated dynamically by JavaScript -->
+        </div>
+
+        <!-- Right content area -->
+        <div class="right-content" id="rightContent">
+            <!-- Generated dynamically by JavaScript -->
+        </div>
+    </div>
+
+    <script>
+        // Data
+        const data = {data_json};
+
+        // Render the left sidebar
+        function renderLeftSidebar() {{
+            const sidebar = document.getElementById('leftSidebar');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const searchWords = feature['组合评估结果'] || [];
+
+                html += `
+                    <div class="feature-group">
+                        <div class="feature-header" onclick="toggleFeature(${{featureIdx}})" id="feature-header-${{featureIdx}}">
+                            <div class="feature-title">${{feature['原始特征名称']}}</div>
+                            <div class="feature-meta">
+                                ${{feature['来源层级']}} · 权重: ${{feature['权重'].toFixed(2)}} · ${{searchWords.length}}个搜索词
+                            </div>
+                        </div>
+                        <div class="search-words-list" id="search-words-${{featureIdx}}">
+                `;
+
+                searchWords.forEach((sw, swIdx) => {{
+                    const score = sw.score || 0;
+                    const scoreClass = score >= 0.9 ? 'score-high' : score >= 0.7 ? 'score-medium' : 'score-low';
+                    const blockId = `block-${{featureIdx}}-${{swIdx}}`;
+
+                    html += `
+                        <div class="search-word-item" onclick="scrollToBlock('${{blockId}}')"
+                             id="sw-${{featureIdx}}-${{swIdx}}"
+                             data-block-id="${{blockId}}">
+                            <div class="search-word-text">
+                                ${{sw.search_word}}
+                                <span class="search-word-score ${{scoreClass}}">${{score.toFixed(2)}}</span>
+                            </div>
+                            <div class="search-word-reasoning" title="${{sw.reasoning || ''}}">
+                                ${{sw.reasoning || ''}}
+                            </div>
+                        </div>
+                    `;
+                }});
+
+                html += `
+                        </div>
+                    </div>
+                `;
+            }});
+
+            sidebar.innerHTML = html;
+        }}
+
+        // 渲染右侧结果区
+        function renderRightContent() {{
+            const content = document.getElementById('rightContent');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const searchWords = feature['组合评估结果'] || [];
+
+                searchWords.forEach((sw, swIdx) => {{
+                    const blockId = `block-${{featureIdx}}-${{swIdx}}`;
+                    const searchResult = sw.search_result || {{}};
+                    const notes = searchResult.data?.data || [];
+
+                    const videoCount = notes.filter(n => n.note_card?.type === 'video').length;
+                    const normalCount = notes.length - videoCount;
+
+                    html += `
+                        <div class="result-block" id="${{blockId}}">
+                            <div class="result-header">
+                                <div class="result-title">${{sw.search_word}}</div>
+                                <div class="result-stats">
+                                    <span class="stat-badge">📝 ${{notes.length}} 条帖子</span>
+                                    <span class="stat-badge">🎬 ${{videoCount}} 视频</span>
+                                    <span class="stat-badge">📷 ${{normalCount}} 图文</span>
+                                </div>
+                            </div>
+                            <div class="notes-grid">
+                                ${{notes.map((note, noteIdx) => renderNoteCard(note, featureIdx, swIdx, noteIdx)).join('')}}
+                            </div>
+                        </div>
+                    `;
+                }});
+            }});
+
+            content.innerHTML = html;
+        }}
+
+        // 渲染单个帖子卡片
+        function renderNoteCard(note, featureIdx, swIdx, noteIdx) {{
+            const card = note.note_card || {{}};
+            const images = card.image_list || [];
+            const title = card.display_title || '无标题';
+            const noteType = card.type || 'normal';
+            const noteId = note.id || '';
+            const user = card.user || {{}};
+            const userName = user.nick_name || '未知用户';
+            const userAvatar = user.avatar || '';
+
+            const carouselId = `carousel-${{featureIdx}}-${{swIdx}}-${{noteIdx}}`;
+
+            return `
+                <div class="note-card" onclick="openNote('${{noteId}}')">
+                    <div class="image-carousel" id="${{carouselId}}">
+                        <div class="carousel-images">
+                            ${{images.map(img => `<img class="carousel-image" src="${{img}}" alt="帖子图片" loading="lazy">`).join('')}}
+                        </div>
+                        ${{images.length > 1 ? `
+                            <button class="carousel-btn prev" onclick="event.stopPropagation(); changeImage('${{carouselId}}', -1)">←</button>
+                            <button class="carousel-btn next" onclick="event.stopPropagation(); changeImage('${{carouselId}}', 1)">→</button>
+                            <div class="carousel-indicators">
+                                ${{images.map((_, i) => `<span class="dot ${{i === 0 ? 'active' : ''}}" onclick="event.stopPropagation(); goToImage('${{carouselId}}', ${{i}})"></span>`).join('')}}
+                            </div>
+                            <span class="image-counter">1/${{images.length}}</span>
+                        ` : ''}}
+                    </div>
+                    <div class="note-info">
+                        <div class="note-title">${{title}}</div>
+                        <div class="note-meta">
+                            <span class="note-type type-${{noteType}}">
+                                ${{noteType === 'video' ? '🎬 视频' : '📷 图文'}}
+                            </span>
+                            <div class="note-author">
+                                ${{userAvatar ? `<img class="author-avatar" src="${{userAvatar}}" alt="${{userName}}">` : ''}}
+                                <span>${{userName}}</span>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            `;
+        }}
+
+        // 图片轮播逻辑
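+        // carouselStates 以 carouselId 为键、按需初始化,记录各轮播当前显示的图片下标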
+        const carouselStates = {{}};
+
+        function changeImage(carouselId, direction) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const images = carousel.querySelectorAll('.carousel-image');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            let newIndex = carouselStates[carouselId].currentIndex + direction;
+            if (newIndex < 0) newIndex = images.length - 1;
+            if (newIndex >= images.length) newIndex = 0;
+
+            carouselStates[carouselId].currentIndex = newIndex;
+            imagesContainer.style.transform = `translateX(-${{newIndex * 100}}%)`;
+
+            // 更新指示器
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === newIndex);
+            }});
+
+            // 更新计数器
+            if (counter) {{
+                counter.textContent = `${{newIndex + 1}}/${{images.length}}`;
+            }}
+        }}
+
+        function goToImage(carouselId, index) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            carouselStates[carouselId].currentIndex = index;
+            imagesContainer.style.transform = `translateX(-${{index * 100}}%)`;
+
+            // 更新指示器
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === index);
+            }});
+
+            // 更新计数器
+            if (counter) {{
+                counter.textContent = `${{index + 1}}/${{dots.length}}`;
+            }}
+        }}
+
+        // 展开/折叠特征组
+        function toggleFeature(featureIdx) {{
+            const searchWordsList = document.getElementById(`search-words-${{featureIdx}}`);
+            const featureHeader = document.getElementById(`feature-header-${{featureIdx}}`);
+
+            searchWordsList.classList.toggle('expanded');
+            featureHeader.classList.toggle('active');
+        }}
+
+        // 滚动到指定结果块
+        function scrollToBlock(blockId) {{
+            const block = document.getElementById(blockId);
+            if (block) {{
+                block.scrollIntoView({{ behavior: 'smooth', block: 'start' }});
+
+                // 高亮对应的搜索词
+                document.querySelectorAll('.search-word-item').forEach(item => {{
+                    item.classList.remove('active');
+                }});
+
+                document.querySelectorAll(`[data-block-id="${{blockId}}"]`).forEach(item => {{
+                    item.classList.add('active');
+                }});
+            }}
+        }}
+
+        // 打开小红书帖子
+        function openNote(noteId) {{
+            if (noteId) {{
+                window.open(`https://www.xiaohongshu.com/explore/${{noteId}}`, '_blank');
+            }}
+        }}
+
+        // 初始化
+        document.addEventListener('DOMContentLoaded', () => {{
+            renderLeftSidebar();
+            renderRightContent();
+
+            // 默认展开第一个特征组
+            if (data.length > 0) {{
+                toggleFeature(0);
+            }}
+        }});
+    </script>
+</body>
+</html>
+'''
+
+    # 写入文件
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(html_content)
+
+
+def main():
+    """主函数"""
+    # 配置路径
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    json_path = os.path.join(script_dir, 'output_v2', 'stage5_with_search_results.json')
+    output_dir = os.path.join(script_dir, 'visualization')
+    os.makedirs(output_dir, exist_ok=True)
+
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    output_path = os.path.join(output_dir, f'stage5_interactive_{timestamp}.html')
+
+    # 加载数据
+    print(f"📖 加载数据: {json_path}")
+    data = load_data(json_path)
+    print(f"✓ 加载了 {len(data)} 个原始特征")
+
+    # 计算统计
+    print("📊 计算统计数据...")
+    stats = calculate_statistics(data)
+    print(f"✓ 统计完成:")
+    print(f"  - 原始特征: {stats['total_features']}")
+    print(f"  - 搜索词: {stats['total_search_words']}")
+    print(f"  - 帖子总数: {stats['total_notes']}")
+    print(f"  - 视频: {stats['video_count']} ({stats['video_percentage']}%)")
+    print(f"  - 图文: {stats['normal_count']} ({stats['normal_percentage']}%)")
+
+    # 生成HTML
+    print(f"\n🎨 生成可视化页面...")
+    generate_html(data, stats, output_path)
+    print(f"✓ 生成完成: {output_path}")
+
+    # 打印访问提示
+    print(f"\n🌐 在浏览器中打开查看:")
+    print(f"   file://{output_path}")
+
+
+if __name__ == '__main__':
+    main()
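+
+# 用法示例(假设流水线已生成 output_v2/stage5_with_search_results.json):
+#   python visualize_stage5_results.py
+# 输出 visualization/stage5_interactive_<时间戳>.html,用浏览器直接打开即可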

+ 78 - 6
xiaohongshu_search.py

@@ -9,9 +9,12 @@ import json
 import os
 import argparse
 import time
+import logging
 from datetime import datetime
 from typing import Dict, Any
 
+logger = logging.getLogger(__name__)
+
 
 class XiaohongshuSearch:
     """小红书笔记搜索API封装类"""
@@ -20,12 +23,13 @@ class XiaohongshuSearch:
     TOOL_NAME = "xhs_note_search"
     PLATFORM = "xiaohongshu"
 
-    def __init__(self, results_dir: str = None):
+    def __init__(self, results_dir: str = None, cache_dir: str = "search_cache"):
         """
         初始化API客户端
 
         Args:
             results_dir: 结果输出目录,默认为项目根目录下的 data/search 文件夹
+            cache_dir: 缓存目录,默认为 search_cache
         """
         self.api_url = f"{self.BASE_URL}/tools/call/{self.TOOL_NAME}"
 
@@ -38,19 +42,60 @@ class XiaohongshuSearch:
             project_root = os.path.dirname(os.path.dirname(script_dir))
             self.results_base_dir = os.path.join(project_root, "data", "search")
 
+        # 设置缓存目录
+        self.cache_dir = cache_dir
+        if cache_dir:
+            os.makedirs(cache_dir, exist_ok=True)
+
+    def _get_cache_key(
+        self,
+        keyword: str,
+        content_type: str,
+        sort_type: str,
+        publish_time: str
+    ) -> str:
+        """
+        生成缓存key
+
+        Args:
+            keyword: 搜索关键词
+            content_type: 内容类型
+            sort_type: 排序方式
+            publish_time: 发布时间
+
+        Returns:
+            缓存key字符串
+        """
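+        # 示例(参数值为假设):_get_cache_key("露营装备", "不限", "综合", "不限")
+        #   -> "露营装备_不限_综合_不限"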
+        return f"{keyword}_{content_type}_{sort_type}_{publish_time}"
+
+    def _get_cache_path(self, cache_key: str) -> str:
+        """
+        获取缓存文件路径
+
+        Args:
+            cache_key: 缓存key
+
+        Returns:
+            缓存文件完整路径
+        """
+        # 清理文件名中的非法字符
+        safe_key = cache_key.replace('/', '_').replace('\\', '_').replace(' ', '_')
+        return os.path.join(self.cache_dir, f"{safe_key}.json")
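+        # 示例(假设 cache_dir="search_cache",key 中的空格和斜杠会替换为下划线):
+        #   "露营 装备_不限_综合_不限" -> search_cache/露营_装备_不限_综合_不限.json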
+
     def search(
         self,
         keyword: str,
-        content_type: str = "图文",
+        content_type: str = "不限",
         sort_type: str = "综合",
         publish_time: str = "不限",
         cursor: str = "",
         timeout: int = 30,
-        max_retries: int = 3,
-        retry_delay: int = 2
+        max_retries: int = 5,
+        retry_delay: int = 2,
+        use_cache: bool = True
     ) -> Dict[str, Any]:
         """
-        搜索小红书笔记(带重试机制)
+        搜索小红书笔记(带重试机制和缓存)
 
         Args:
             keyword: 搜索关键词
@@ -61,6 +106,7 @@ class XiaohongshuSearch:
             timeout: 请求超时时间(秒),默认30秒
-            max_retries: 最大重试次数,默认3次
+            max_retries: 最大重试次数,默认5次
             retry_delay: 重试间隔时间(秒),默认2秒
+            use_cache: 是否使用缓存,默认True
 
         Returns:
             API响应的JSON数据
@@ -68,9 +114,24 @@ class XiaohongshuSearch:
         Raises:
             requests.exceptions.RequestException: 所有重试都失败时抛出异常
         """
+        # 检查缓存
+        if use_cache and self.cache_dir:
+            cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+            cache_path = self._get_cache_path(cache_key)
+
+            if os.path.exists(cache_path):
+                try:
+                    with open(cache_path, 'r', encoding='utf-8') as f:
+                        cached_result = json.load(f)
+                    logger.info(f"  ✓ 使用缓存: {keyword}")
+                    return cached_result
+                except Exception as e:
+                    logger.warning(f"  读取缓存失败: {e},将重新搜索")
+
+        # 缓存未命中或未启用,执行实际搜索
         payload = {
             "keyword": keyword,
-            "content_type": content_type,
+            "content_type": content_type,  # 与缓存key使用同一参数,保证缓存一致(默认已为"不限")
             "sort_type": sort_type,
             "publish_time": publish_time,
             "cursor": cursor
@@ -106,6 +167,17 @@ class XiaohongshuSearch:
                 if attempt > 1:
                     print(f"    ✓ 重试成功")
 
+                # 保存到缓存
+                if use_cache and self.cache_dir:
+                    try:
+                        cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+                        cache_path = self._get_cache_path(cache_key)
+                        with open(cache_path, 'w', encoding='utf-8') as f:
+                            json.dump(result, f, ensure_ascii=False, indent=2)
+                        logger.info(f"  ✓ 已缓存: {keyword}")
+                    except Exception as e:
+                        logger.warning(f"  保存缓存失败: {e}")
+
                 return result
 
             except requests.exceptions.RequestException as e: