刘立冬 committed 3 weeks ago
parent · commit e488de4152
4 files changed, 1468 additions and 231 deletions
  1. enhanced_search_v2.py (+371 -186)
  2. llm_evaluator.py (+201 -39)
  3. visualize_stage5_results.py (+818 -0)
  4. xiaohongshu_search.py (+78 -6)

enhanced_search_v2.py (+371 -186)

@@ -11,9 +11,11 @@ import copy
 import time
 import os
 import argparse
+import subprocess
 from typing import Dict, List, Any, Optional, Set, Tuple
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from itertools import combinations
 
 from openrouter_client import OpenRouterClient
 from llm_evaluator import LLMEvaluator
@@ -41,7 +43,10 @@ class EnhancedSearchV2:
         dimension_associations_path: str,
         optimized_clustered_data_path: str,
         openrouter_api_key: Optional[str] = None,
-        output_dir: str = "output_v2"
+        output_dir: str = "output_v2",
+        top_n: int = 10,
+        max_total_searches: Optional[int] = None,
+        search_max_workers: int = 3
     ):
         """
         初始化系统
@@ -52,11 +57,17 @@ class EnhancedSearchV2:
             optimized_clustered_data_path: 人设特征库路径
             openrouter_api_key: OpenRouter API密钥
             output_dir: 输出目录
+            top_n: 每个原始特征取评分最高的N个搜索词(默认10)
+            max_total_searches: 全局最大搜索次数限制(默认None不限制)
+            search_max_workers: 搜索并发数(默认3)
         """
         self.how_json_path = how_json_path
         self.dimension_associations_path = dimension_associations_path
         self.optimized_clustered_data_path = optimized_clustered_data_path
         self.output_dir = output_dir
+        self.top_n = top_n
+        self.max_total_searches = max_total_searches
+        self.search_max_workers = search_max_workers
 
         # 创建输出目录
         os.makedirs(output_dir, exist_ok=True)
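
For context, a hypothetical instantiation showing the three knobs this commit adds to the constructor; the paths and cap value below are placeholders, not the repo's actual data files:

```python
# Hedged usage sketch: paths and the search cap are illustrative only.
from enhanced_search_v2 import EnhancedSearchV2

system = EnhancedSearchV2(
    how_json_path="example_how.json",
    dimension_associations_path="dimension_associations.json",
    optimized_clustered_data_path="optimized_clustered_data.json",
    output_dir="output_v2",
    top_n=10,                 # keep the 10 best-scored search words per feature
    max_total_searches=100,   # global cap across all features (None = unlimited)
    search_max_workers=3,     # concurrent Xiaohongshu searches in stage 5
)
```
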
@@ -572,38 +583,95 @@ class EnhancedSearchV2:
             'sub_classifications': sub_classifications
         }
 
-    # ========== 阶段3:提取特征列表 ==========
+    # ========== 阶段3:筛选高相似度匹配(>0.8) ==========
 
-    def stage3_extract_features(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def stage3_filter_high_similarity_matches(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """
-        阶段3:从关联分类中提取特征列表
+        阶段3:筛选高相似度匹配(>0.8)
+
+        遍历how解构中的所有原始特征,找出匹配结果中相似度>0.8
+        且人设特征名称在Stage2关联范围内的高质量匹配
 
         Args:
             associations_data: 阶段2的关联数据
 
         Returns:
-            带特征列表的数据
+            带高相似度候选的数据
         """
         logger.info("=" * 60)
-        logger.info("阶段3:提取特征列表")
+        logger.info("阶段3:筛选高相似度匹配(>0.8)")
         logger.info("=" * 60)
 
         for idx, feature_result in enumerate(associations_data, 1):
-            logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {feature_result['原始特征名称']}")
+            original_feature_name = feature_result['原始特征名称']
+            logger.info(f"\n[{idx}/{len(associations_data)}] 处理: {original_feature_name}")
 
-            for assoc in feature_result.get('找到的关联', []):
-                target_path = assoc['目标分类路径']
-                logger.info(f"  提取特征: {target_path}")
+            # 步骤1: 收集Stage2的关联范围(分类名+标签)
+            stage2_scope = self._collect_stage2_scope(feature_result)
+            logger.info(f"  Stage2范围包含 {len(stage2_scope)} 个分类/标签")
 
-                # 提取特征
-                features = self._find_features_by_path(target_path)
+            # 步骤2: 遍历how解构中的所有原始特征,找出高相似度匹配
+            high_sim_candidates = []
+            total_checked = 0
+            high_sim_found = 0
+
+            how_result = self.how_data.get('how解构结果', {})
+            for level_name, level_list in how_result.items():
+                if not isinstance(level_list, list):
+                    continue
 
-                # 添加到关联中
-                assoc['特征列表'] = features
-                logger.info(f"    找到 {len(features)} 个特征")
+                for item in level_list:
+                    for step in item.get('how步骤列表', []):
+                        for feature in step.get('特征列表', []):
+                            # 获取该特征的所有匹配
+                            matches = feature.get('匹配结果', [])
+                            total_checked += len(matches)
+
+                            # 筛选相似度>0.8且在Stage2范围内的匹配
+                            for match in matches:
+                                sim = match.get('匹配结果', {}).get('相似度', 0)
+                                persona_feature_name = match.get('人设特征名称', '')
+
+                                if sim > 0.8 and persona_feature_name in stage2_scope:
+                                    high_sim_found += 1
+                                    # 记录来源信息
+                                    high_sim_candidates.append({
+                                        '人设特征名称': persona_feature_name,
+                                        '相似度': sim,
+                                        '特征类型': match.get('特征类型', ''),
+                                        '特征分类': match.get('特征分类', []),
+                                        '人设特征层级': match.get('人设特征层级', ''),
+                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                        '来源原始特征': feature.get('特征名称', '')  # 记录来自哪个原始特征
+                                    })
+
+            logger.info(f"  检查了 {total_checked} 个匹配")
+            logger.info(f"  找到 {high_sim_found} 个相似度>0.8的匹配")
+
+            # 按相似度降序排序,并去重(同一个人设特征名称只保留最高分)
+            seen_names = set()
+            unique_candidates = []
+            high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
+
+            for candidate in high_sim_candidates:
+                name = candidate['人设特征名称']
+                if name not in seen_names:
+                    seen_names.add(name)
+                    unique_candidates.append(candidate)
+
+            # 添加到结果中
+            feature_result['高相似度候选'] = unique_candidates
+            logger.info(f"  去重后筛选出 {len(unique_candidates)} 个高相似度候选")
+
+            # 显示前5个
+            if unique_candidates:
+                logger.info(f"  Top 5:")
+                for c in unique_candidates[:5]:
+                    logger.info(f"    • {c['人设特征名称']} ({c['相似度']:.3f}) ← 来自\"{c['来源原始特征']}\"")
 
         # 保存结果
-        output_path = os.path.join(self.output_dir, "stage3_features.json")
+        output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
         self._save_json(associations_data, output_path)
 
         logger.info(f"\n" + "=" * 60)
@@ -612,6 +680,29 @@ class EnhancedSearchV2:
 
         return associations_data
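
The filtering above keeps only the best-scoring entry per 人设特征名称: the list is sorted by 相似度 descending first, so the first occurrence of each name is necessarily its highest score. A minimal standalone sketch of that sort-then-dedup idiom:

```python
# Self-contained sketch of the sort-then-dedup pattern used in stage 3
# (example values are illustrative).
candidates = [
    {"人设特征名称": "露营", "相似度": 0.82},
    {"人设特征名称": "徒步", "相似度": 0.91},
    {"人设特征名称": "露营", "相似度": 0.95},
]
candidates.sort(key=lambda c: c["相似度"], reverse=True)

seen, unique = set(), []
for c in candidates:
    if c["人设特征名称"] not in seen:
        seen.add(c["人设特征名称"])
        unique.append(c)

print(unique)
# [{'人设特征名称': '露营', '相似度': 0.95}, {'人设特征名称': '徒步', '相似度': 0.91}]
```
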
 
+
+    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+        """
+        收集Stage2找到的所有分类名和标签,形成范围集合
+
+        Args:
+            feature_result: 特征结果数据
+
+        Returns:
+            包含所有分类名和标签的集合
+        """
+        scope = set()
+
+        for assoc in feature_result.get('找到的关联', []):
+            # 添加分类名
+            scope.add(assoc['分类名称'])
+
+            # 添加所有标签
+            tags = assoc.get('标签列表', [])
+            scope.update(tags)
+
+        return scope
+
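
What `_collect_stage2_scope` produces for a hypothetical Stage 2 record (illustrative data, not from the repo):

```python
# Illustrative input: one 找到的关联 list with classification names and tags.
feature_result = {
    "找到的关联": [
        {"分类名称": "户外活动", "标签列表": ["露营", "徒步"]},
        {"分类名称": "穿搭", "标签列表": ["机能风"]},
    ]
}

scope = set()
for assoc in feature_result["找到的关联"]:
    scope.add(assoc["分类名称"])             # classification name
    scope.update(assoc.get("标签列表", []))   # all tags

print(scope)  # {'户外活动', '露营', '徒步', '穿搭', '机能风'} (set order varies)
```
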
     def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
         """
         根据路径查找特征列表
@@ -630,56 +721,63 @@ class EnhancedSearchV2:
         # 深拷贝
         return copy.deepcopy(features)
 
-    # ========== 阶段4:生成搜索词 + LLM评估质量 ==========
+    # ========== 阶段4:多词组合 + LLM评估 ==========
 
     def stage4_generate_and_evaluate_search_words(
         self,
-        features_data: List[Dict[str, Any]]
+        features_data: List[Dict[str, Any]],
+        max_workers: int = 4,
+        max_candidates: int = 20,
+        max_combo_length: int = 4
     ) -> List[Dict[str, Any]]:
         """
-        阶段4:生成搜索词并用LLM评估质量
+        阶段4:多词组合 + LLM评估
+
+        基于Stage1的基础词和Stage3的高相似度候选,
+        生成所有2-N词组合,通过LLM评估选出Top10
 
         Args:
-            features_data: 阶段3的特征数据
+            features_data: 阶段3的数据(包含高相似度候选)
+            max_workers: 并发评估的原始特征数(默认4)
+            max_candidates: 参与组合的最大候选词数(默认20)
+            max_combo_length: 最大组合词数(默认4,即基础词+3个候选)
 
         Returns:
             带LLM评估的数据
         """
         logger.info("=" * 60)
-        logger.info("阶段4:生成搜索词 + LLM评估质量")
+        logger.info("阶段4:多词组合 + LLM评估")
+        logger.info(f"  最大候选词数: {max_candidates}")
+        logger.info(f"  最大组合长度: {max_combo_length} 词")
+        logger.info(f"  并发数: {max_workers} 个原始特征")
         logger.info("=" * 60)
 
-        for idx, feature_result in enumerate(features_data, 1):
-            logger.info(f"\n[{idx}/{len(features_data)}] 处理: {feature_result['原始特征名称']}")
-
-            # 生成搜索词
-            self._add_search_words(feature_result)
-
-            # 收集所有搜索词
-            all_search_words = self._collect_all_search_words(feature_result)
-
-            if not all_search_words:
-                logger.info(f"  无搜索词,跳过")
-                continue
-
-            logger.info(f"  生成 {len(all_search_words)} 个搜索词")
-
-            # LLM分批评估(每10个一批)
-            logger.info(f"  开始LLM评估...")
-            original_feature = feature_result['原始特征名称']
-            evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
-                original_feature=original_feature,
-                search_words=[sw['search_word'] for sw in all_search_words],
-                batch_size=10
-            )
+        total_features = len(features_data)
 
-            # 将评估结果写回到特征节点
-            self._write_back_evaluations(feature_result, evaluated)
+        # 使用ThreadPoolExecutor并行处理不同的原始特征
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # 提交所有任务
+            futures = []
+            for idx, feature_result in enumerate(features_data, 1):
+                future = executor.submit(
+                    self._process_single_feature_combinations,
+                    idx,
+                    total_features,
+                    feature_result,
+                    max_candidates,
+                    max_combo_length
+                )
+                futures.append((future, feature_result))
 
-            logger.info(f"  评估完成,最高分: {evaluated[0]['score']:.3f}")
+            # 等待所有任务完成并收集结果
+            for future, feature_result in futures:
+                try:
+                    _ = future.result()  # 等待完成,结果已经写回到feature_result中
+                except Exception as e:
+                    logger.error(f"  评估失败: {feature_result['原始特征名称']}, 错误: {e}")
 
         # 保存结果
-        output_path = os.path.join(self.output_dir, "stage4_with_llm_scores.json")
+        output_path = os.path.join(self.output_dir, "stage4_combinations_evaluated.json")
         self._save_json(features_data, output_path)
 
         logger.info(f"\n" + "=" * 60)
@@ -688,95 +786,152 @@ class EnhancedSearchV2:
 
         return features_data
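
Stage 4's concurrency works because each worker mutates only its own `feature_result` dict, so no locking is needed and `future.result()` exists purely to surface worker exceptions. The pattern in isolation:

```python
# Self-contained sketch of the write-back-into-own-dict pattern.
from concurrent.futures import ThreadPoolExecutor

def process(task: dict) -> None:
    task["result"] = task["n"] * task["n"]  # each worker touches only its task

tasks = [{"n": n} for n in range(8)]
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(process, t) for t in tasks]
    for f in futures:
        f.result()  # re-raises any worker exception in the parent thread

print([t["result"] for t in tasks])  # [0, 1, 4, 9, 16, 25, 36, 49]
```
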
 
-    def _add_search_words(self, result: Dict[str, Any]):
+    def _process_single_feature_combinations(
+        self,
+        idx: int,
+        total: int,
+        feature_result: Dict[str, Any],
+        max_candidates: int,
+        max_combo_length: int
+    ) -> None:
         """
-        为结果项添加search_word字段(去重)
+        处理单个原始特征的组合生成和评估
+
+        Steps:
+        1. Get base_word from Stage1's 最高匹配信息
+        2. Get candidates from Stage3's 高相似度候选 (top max_candidates)
+        3. Generate 2-N word combinations
+        4. LLM batch evaluation
+        5. Select Top 10 and write back
 
         Args:
-            result: 单个结果项
+            idx: 特征索引
+            total: 总特征数
+            feature_result: 特征结果数据
+            max_candidates: 参与组合的最大候选词数
+            max_combo_length: 最大组合词数
         """
-        # 获取基础词(人设特征名称)
-        base_word = result.get('最高匹配信息', {}).get('人设特征名称', '')
+        original_feature = feature_result['原始特征名称']
+        logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")
 
+        # 步骤1: 获取基础词
+        base_word = feature_result.get('最高匹配信息', {}).get('人设特征名称', '')
         if not base_word:
+            logger.info(f"  无基础词,跳过")
+            feature_result['组合评估结果'] = []
             return
 
-        # 去重集合(在当前结果项范围内)
-        seen_words: Set[str] = set()
+        logger.info(f"  基础词: {base_word}")
 
-        # 遍历所有关联的特征列表
-        for assoc in result.get('找到的关联', []):
-            for feature in assoc.get('特征列表', []):
-                feature_name = feature.get('特征名称', '')
+        # 步骤2: 获取候选词(从高相似度候选中)
+        high_sim_candidates = feature_result.get('高相似度候选', [])
 
-                if not feature_name:
-                    feature['search_word'] = None
-                    continue
+        # 限制候选词数量
+        candidates = high_sim_candidates[:max_candidates]
+        candidate_words = [c['人设特征名称'] for c in candidates]
 
-                # 生成组合词
-                search_word = f"{base_word} {feature_name}"
+        if not candidate_words:
+            logger.info(f"  无候选词,跳过")
+            feature_result['组合评估结果'] = []
+            return
 
-                # 检查是否重复
-                if search_word not in seen_words:
-                    feature['search_word'] = search_word
-                    seen_words.add(search_word)
-                else:
-                    feature['search_word'] = None
+        logger.info(f"  候选词数量: {len(candidate_words)} (限制: {max_candidates})")
+
+        # 步骤3: 生成所有组合
+        all_combinations = []
+
+        # 生成1词到max_combo_length-1词的候选词组合(因为还要加上base_word)
+        for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
+            for combo in combinations(candidate_words, length):
+                # 组合成搜索词:基础词 + 候选词组合
+                search_phrase = base_word + ' ' + ' '.join(combo)
+                all_combinations.append({
+                    'search_word': search_phrase,
+                    'base_word': base_word,
+                    'candidate_words': list(combo),
+                    'combo_length': length + 1  # +1 因为包含base_word
+                })
 
-    def _collect_all_search_words(self, feature_result: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """
-        收集结果项中所有非空的search_word
+        logger.info(f"  生成 {len(all_combinations)} 个组合")
 
-        Args:
-            feature_result: 结果项
+        # 步骤4: LLM批量评估
+        logger.info(f"  开始LLM评估...")
+        evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
+            original_feature=original_feature,
+            search_words=[c['search_word'] for c in all_combinations],
+            batch_size=50
+        )
 
-        Returns:
-            搜索词列表,每个包含 search_word 和特征引用
-        """
-        search_words = []
-
-        for assoc_idx, assoc in enumerate(feature_result.get('找到的关联', [])):
-            for feat_idx, feature in enumerate(assoc.get('特征列表', [])):
-                sw = feature.get('search_word')
-                if sw and sw.strip():
-                    search_words.append({
-                        'search_word': sw,
-                        'assoc_idx': assoc_idx,
-                        'feat_idx': feat_idx,
-                        'feature_ref': feature  # 引用,方便写回
-                    })
+        # 步骤5: 选出Top 10
+        top_10 = evaluated[:10]
+
+        # 写回结果
+        feature_result['组合评估结果'] = top_10
 
-        return search_words
+        max_score = top_10[0]['score'] if top_10 else 0.0
+        logger.info(f"  评估完成,Top 10 最高分: {max_score:.3f}")
 
-    def _write_back_evaluations(
+    # ========== 阶段5:执行搜索 ==========
+
+    def _execute_single_search(
         self,
-        feature_result: Dict[str, Any],
-        evaluated: List[Dict[str, Any]]
-    ):
+        idx: int,
+        total: int,
+        search_word: str,
+        feature_ref: Dict[str, Any]
+    ) -> Dict[str, Any]:
         """
-        将LLM评估结果写回到特征节点
+        执行单个搜索任务(用于并发执行)
 
         Args:
-            feature_result: 结果项
-            evaluated: 评估结果列表
+            idx: 搜索索引
+            total: 总搜索数
+            search_word: 搜索词
+            feature_ref: 特征引用(用于写入结果)
+
+        Returns:
+            搜索结果信息
         """
-        # 创建查找映射
-        eval_map = {e['search_word']: e for e in evaluated}
+        logger.info(f"[{idx}/{total}] 搜索: {search_word}")
 
-        # 写回到特征节点
-        for assoc in feature_result.get('找到的关联', []):
-            for feature in assoc.get('特征列表', []):
-                sw = feature.get('search_word')
-                if sw and sw in eval_map:
-                    eval_result = eval_map[sw]
-                    feature['llm_evaluation'] = {
-                        'score': eval_result['score'],
-                        'rank': eval_result['rank'],
-                        'reasoning': eval_result['reasoning'],
-                        'original_feature': eval_result['original_feature']
-                    }
+        try:
+            result = self.search_client.search(
+                keyword=search_word,
+                content_type='不限',
+                sort_type='综合',
+                max_retries=3,
+                use_cache=True  # 启用搜索缓存
+            )
 
-    # ========== 阶段5:执行搜索 ==========
+            note_count = len(result.get('data', {}).get('data', []))
+            logger.info(f"  ✓ 成功,获取 {note_count} 条帖子")
+
+            # 写入结果
+            feature_ref['search_result'] = result
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'success',
+                'note_count': note_count,
+                'search_params': {
+                    'keyword': search_word,
+                    'content_type': '不限',
+                    'sort_type': '综合'
+                }
+            }
+
+            return {'status': 'success', 'search_word': search_word, 'note_count': note_count}
+
+        except Exception as e:
+            logger.error(f"  ✗ 失败: {e}")
+            feature_ref['search_result'] = None
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'failed',
+                'note_count': 0,
+                'error': str(e)
+            }
+
+            return {'status': 'failed', 'search_word': search_word, 'error': str(e)}
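
`use_cache=True` suggests the search client memoizes responses per keyword; the actual caching lives in xiaohongshu_search.py, whose hunk is not shown here, so the following is only a hedged sketch of the idea, not the real implementation:

```python
# Hypothetical keyword-keyed cache wrapper -- an illustration of what
# use_cache=True implies, not the actual xiaohongshu_search.py code.
from typing import Any, Callable, Dict

def with_keyword_cache(search_fn: Callable[[str], Dict[str, Any]]):
    cache: Dict[str, Dict[str, Any]] = {}

    def wrapper(keyword: str, use_cache: bool = True) -> Dict[str, Any]:
        if use_cache and keyword in cache:
            return cache[keyword]  # cache hit: skip the network call
        result = search_fn(keyword)
        cache[keyword] = result
        return result

    return wrapper
```
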
 
     def stage5_execute_searches(
         self,
@@ -799,7 +954,7 @@ class EnhancedSearchV2:
         logger.info("阶段5:执行小红书搜索")
         logger.info("=" * 60)
 
-        # 按原始特征分组收集搜索词
+        # 按原始特征分组收集搜索词(从Stage4的组合评估结果读取)
         feature_search_groups = {}
 
         for feature_result in features_data:
@@ -808,21 +963,19 @@ class EnhancedSearchV2:
             if original_feature not in feature_search_groups:
                 feature_search_groups[original_feature] = []
 
-            for assoc in feature_result.get('找到的关联', []):
-                for feature in assoc.get('特征列表', []):
-                    sw = feature.get('search_word')
-                    if not sw:
-                        continue
+            # 从Stage4的组合评估结果读取
+            for eval_item in feature_result.get('组合评估结果', []):
+                sw = eval_item.get('search_word')
+                if not sw:
+                    continue
 
-                    # 获取LLM评分
-                    llm_eval = feature.get('llm_evaluation', {})
-                    score = llm_eval.get('score', 0.0)
+                score = eval_item.get('score', 0.0)
 
-                    feature_search_groups[original_feature].append({
-                        'search_word': sw,
-                        'score': score,
-                        'feature_ref': feature
-                    })
+                feature_search_groups[original_feature].append({
+                    'search_word': sw,
+                    'score': score,
+                    'feature_ref': eval_item  # 引用评估项,用于写入搜索结果
+                })
 
         # 每组取Top N
         all_searches = []
@@ -844,52 +997,35 @@ class EnhancedSearchV2:
 
             logger.info(f"  {original_feature}: 从 {len(sorted_list)} 个搜索词中选择 Top {len(selected)} (过滤 {filtered} 个)")
 
-        logger.info(f"\n共 {len(all_searches)} 个搜索任务(过滤前: {total_before_filter}, 过滤掉: {total_filtered})")
-
-        # 执行搜索
-        for idx, item in enumerate(all_searches, 1):
-            sw = item['search_word']
-            feature = item['feature_ref']
+        # 应用全局搜索次数限制
+        if self.max_total_searches and len(all_searches) > self.max_total_searches:
+            logger.info(f"  应用全局限制:从 {len(all_searches)} 个减少到 {self.max_total_searches} 个")
+            all_searches = all_searches[:self.max_total_searches]
 
-            logger.info(f"[{idx}/{len(all_searches)}] 搜索: {sw}")
+        logger.info(f"\n共 {len(all_searches)} 个搜索任务(过滤前: {total_before_filter}, 过滤掉: {total_filtered})")
+        logger.info(f"  并发执行搜索(并发数: {self.search_max_workers})")
 
-            try:
-                result = self.search_client.search(
-                    keyword=sw,
-                    content_type='图文',
-                    sort_type='综合',
-                    max_retries=3
+        # 使用ThreadPoolExecutor并发执行搜索
+        with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
+            # 提交所有搜索任务
+            futures = []
+            for idx, item in enumerate(all_searches, 1):
+                future = executor.submit(
+                    self._execute_single_search,
+                    idx,
+                    len(all_searches),
+                    item['search_word'],
+                    item['feature_ref']
                 )
+                futures.append(future)
 
-                note_count = len(result.get('data', {}).get('data', []))
-                logger.info(f"  ✓ 成功,获取 {note_count} 条帖子")
-
-                # 写入结果
-                feature['search_result'] = result
-                feature['search_metadata'] = {
-                    'searched_at': datetime.now().isoformat(),
-                    'status': 'success',
-                    'note_count': note_count,
-                    'search_params': {
-                        'keyword': sw,
-                        'content_type': '图文',
-                        'sort_type': '综合'
-                    }
-                }
-
-            except Exception as e:
-                logger.error(f"  ✗ 失败: {e}")
-                feature['search_result'] = None
-                feature['search_metadata'] = {
-                    'searched_at': datetime.now().isoformat(),
-                    'status': 'failed',
-                    'note_count': 0,
-                    'error': str(e)
-                }
-
-            # 延迟
-            if idx < len(all_searches):
-                time.sleep(search_delay)
+            # 等待所有搜索完成
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                    # 结果已经写入feature_ref,无需额外处理
+                except Exception as e:
+                    logger.error(f"  搜索任务失败: {e}")
 
         # 保存结果
         output_path = os.path.join(self.output_dir, "stage5_with_search_results.json")
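
The executor loop above uses `as_completed`, so failures surface as soon as any search finishes rather than in submission order; ordering doesn't matter because each worker has already written its outcome into its `feature_ref`. The finish-order behavior in miniature:

```python
# as_completed yields futures in completion order, not submission order.
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_search(keyword: str, delay: float) -> str:
    time.sleep(delay)
    return keyword

with ThreadPoolExecutor(max_workers=3) as executor:
    futures = {executor.submit(fake_search, kw, d): kw
               for kw, d in [("a", 0.3), ("b", 0.1), ("c", 0.2)]}
    for future in as_completed(futures):
        print(futures[future], "done")  # prints: b, c, a
```
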
@@ -974,7 +1110,7 @@ class EnhancedSearchV2:
         feature_node: Dict[str, Any]
     ) -> Dict[str, Any]:
         """
-        评估单个搜索结果
+        评估单个搜索结果(使用并行评估)
 
         Args:
             original_feature: 原始特征
@@ -986,12 +1122,12 @@ class EnhancedSearchV2:
         search_word = feature_node.get('search_word', '')
         notes = feature_node['search_result'].get('data', {}).get('data', [])
 
-        return self.llm_evaluator.evaluate_search_results(
+        return self.llm_evaluator.evaluate_search_results_parallel(
             original_feature=original_feature,
             search_word=search_word,
             notes=notes,
             max_notes=20,
-            max_images_per_note=2
+            max_workers=20  # 最多20个并发,逐帖独立评估
         )
 
     # ========== 阶段7:扩展搜索 ==========
@@ -1052,9 +1188,10 @@ class EnhancedSearchV2:
             try:
                 result = self.search_client.search(
                     keyword=extended_kw,
-                    content_type='图文',
+                    content_type='不限',
                     sort_type='综合',
-                    max_retries=3
+                    max_retries=3,
+                    use_cache=True  # 启用搜索缓存
                 )
 
                 note_count = len(result.get('data', {}).get('data', []))
@@ -1121,26 +1258,53 @@ class EnhancedSearchV2:
             # 阶段2
             stage2_results = self.stage2_find_associations(stage1_results)
 
-            # 阶段3
-            stage3_results = self.stage3_extract_features(stage2_results)
+            # 阶段3 - 使用新方法:筛选高相似度匹配
+            stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)
 
             # 阶段4
-            stage4_results = self.stage4_generate_and_evaluate_search_words(stage3_results)
+            stage4_results = self.stage4_generate_and_evaluate_search_words(
+                stage3_results,
+                max_workers=8,         # 提高并发从4到8
+                max_combo_length=3     # 降低组合长度从4到3
+            )
 
             # 阶段5
-            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=10)
+            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=self.top_n)
 
-            # 阶段6
-            stage6_results = self.stage6_evaluate_search_results(stage5_results)
+            # 阶段6 - 暂时跳过执行(代码保留)
+            # stage6_results = self.stage6_evaluate_search_results(stage5_results)
 
-            # 阶段7
-            final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
+            # 阶段7 - 暂时跳过执行(代码保留)
+            # final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
+
+            logger.info("\n" + "=" * 60)
+            logger.info("✓ 完整流程执行完成(Stage1-5)")
+            logger.info("=" * 60)
 
+            # 自动执行可视化
             logger.info("\n" + "=" * 60)
-            logger.info("✓ 完整流程执行完成")
+            logger.info("开始生成可视化...")
             logger.info("=" * 60)
 
-            return final_results
+            try:
+                result = subprocess.run(
+                    ['python3', 'visualize_stage5_results.py'],
+                    capture_output=True,
+                    text=True,
+                    timeout=60
+                )
+
+                if result.returncode == 0:
+                    logger.info("✓ 可视化生成成功")
+                    logger.info(result.stdout)
+                else:
+                    logger.error(f"可视化生成失败: {result.stderr}")
+            except subprocess.TimeoutExpired:
+                logger.error("可视化生成超时")
+            except Exception as e:
+                logger.error(f"可视化生成异常: {e}")
+
+            return stage5_results
 
         except Exception as e:
             logger.error(f"流程执行失败: {e}")
@@ -1152,7 +1316,7 @@ def main():
     parser = argparse.ArgumentParser(description='增强搜索系统V2')
     parser.add_argument(
         '--how-json',
-        default='69114f150000000007001f30_how.json',
+        default='69114f150000000007001f30_how copy.json',
         help='How解构文件路径'
     )
     parser.add_argument(
@@ -1175,6 +1339,24 @@ def main():
         default='output_v2',
         help='输出目录'
     )
+    parser.add_argument(
+        '--top-n',
+        type=int,
+        default=10,
+        help='每个原始特征取评分最高的N个搜索词(默认10)'
+    )
+    parser.add_argument(
+        '--max-total-searches',
+        type=int,
+        default=None,
+        help='全局最大搜索次数限制(默认None不限制)'
+    )
+    parser.add_argument(
+        '--search-workers',
+        type=int,
+        default=3,
+        help='搜索并发数(默认3)'
+    )
 
     args = parser.parse_args()
 
@@ -1184,7 +1366,10 @@ def main():
         dimension_associations_path=args.dimension_associations,
         optimized_clustered_data_path=args.optimized_clustered,
         openrouter_api_key=args.api_key,
-        output_dir=args.output_dir
+        output_dir=args.output_dir,
+        top_n=args.top_n,
+        max_total_searches=args.max_total_searches,
+        search_max_workers=args.search_workers
     )
 
     # 执行完整流程

llm_evaluator.py (+201 -39)

@@ -42,20 +42,24 @@ class LLMEvaluator:
         """
         prompt = f"""你是一个小红书内容分析专家。
 
-任务:评估搜索词能否找到包含目标特征的内容
+# 任务说明
+从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}")
 
-原始特征:"{original_feature}"
-组合搜索词:"{search_word}"
+## 可选词汇
+{search_word}
 
-评估标准:
-1. 这个搜索词在小红书上能否找到包含"{original_feature}"相关元素的帖子
-2. 搜索词的关键词组合是否合理、是否过于宽泛或过于具体
-3. 搜索词与原始特征的语义关联性
+## 要求
+1. 只能使用可选词汇中的词,可以进行以下变化:
+   - 直接使用原词或括号内的同义词
+   - 多个词组合
+   - 适当精简
+2. 不能添加可选词汇以外的新词
+3. 按推荐程度排序(越靠前越推荐)
 
-请仔细分析并返回JSON格式:
+## 输出格式(JSON)
 {{
-  "score": 0.75,  // 0.0-1.0,能找到相关内容的可能性
-  "reasoning": "详细的评估理由,说明为什么给出这个分数"
+  "score": 0.75,
+  "reasoning": "评估理由"
 }}
 
 注意:只返回JSON,不要其他内容。"""
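
`chat_json` belongs to `openrouter_client` (untouched by this commit) and evidently returns parsed JSON, or a falsy value on failure. As a hedged sketch, any such helper has to tolerate models wrapping their JSON in markdown fences:

```python
# Hypothetical defensive parse of raw LLM output -- illustrative only,
# not the actual openrouter_client implementation.
import json
import re
from typing import Any, Optional

def parse_llm_json(raw: str) -> Optional[Any]:
    # Strip ``` / ```json fences the model may wrap around its JSON.
    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip())
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return None

print(parse_llm_json('```json\n{"score": 0.75, "reasoning": "ok"}\n```'))
# {'score': 0.75, 'reasoning': 'ok'}
```
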
@@ -136,7 +140,7 @@ class LLMEvaluator:
         self,
         original_feature: str,
         search_words: List[str],
-        batch_size: int = 10
+        batch_size: int = 50
     ) -> List[Dict[str, Any]]:
         """
         分批评估搜索词(每批N个,减少API调用)
@@ -162,62 +166,71 @@ class LLMEvaluator:
 
             logger.info(f"  处理第 {batch_idx + 1}/{total_batches} 批({len(batch_words)} 个搜索词)")
 
-            # 构建包含多个搜索词的prompt
-            words_list = "\n".join([
-                f"{i+1}. {word}"
-                for i, word in enumerate(batch_words)
-            ])
+            # 从搜索词中提取所有独特的词作为可选词汇
+            available_words_set = set()
+            for word in batch_words:
+                # 分割搜索词,提取单个词
+                parts = word.split()
+                available_words_set.update(parts)
+
+            # 转换为列表并排序(保证稳定性)
+            available_words = sorted(list(available_words_set))
+
+            # 构建可选词汇字符串(顿号分隔)
+            available_words_str = "、".join(available_words)
 
-            prompt = f"""你是一个小红书内容分析专家。
+            prompt = f"""
 
-任务:评估以下搜索词在小红书上能否找到包含目标特征"{original_feature}"的内容
+# 任务说明
+从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}"二字)
 
-搜索词列表:
-{words_list}
+## 可选词汇
+{available_words_str}
 
-评估标准:
-1. 这个搜索词在小红书上能否找到包含"{original_feature}"相关元素的帖子
-2. 搜索词的关键词组合是否合理、是否过于宽泛或过于具体
-3. 搜索词与原始特征的语义关联性
+## 要求
+1. 只能使用可选词汇中的词,可以进行以下变化:
+   - 直接使用原词或括号内的同义词
+   - 多个词组合
+   - 适当精简
+2. 不能添加可选词汇以外的新词
+3. 按推荐程度排序(越靠前越推荐)
 
-请为每个搜索词返回评估结果,JSON数组格式:
+## 输出格式(JSON):
 [
   {{
     "index": 1,
-    "score": 0.75,
-    "reasoning": "详细的评估理由"
+    "search_word": "组合的搜索词",
+    "score": 0.85,
+    "reasoning": "推荐理由"
   }},
   {{
     "index": 2,
+    "search_word": "组合的搜索词",
     "score": 0.80,
-    "reasoning": "详细的评估理由"
+    "reasoning": "推荐理由"
   }}
 ]
-
-注意:
-- index 对应搜索词的编号(1-{len(batch_words)})
-- score 范围 0.0-1.0
 - 只返回JSON数组,不要其他内容"""
 
             # 调用LLM
             result = self.client.chat_json(prompt=prompt, max_retries=3)
 
             if result and isinstance(result, list):
-                # 处理结果
-                for item in result:
-                    idx = item.get("index", 0) - 1  # 转换为0-based索引
-                    if 0 <= idx < len(batch_words):
+                # 处理结果 - 新格式直接包含search_word
+                for idx, item in enumerate(result):
+                    search_word = item.get("search_word", "")
+                    if search_word:  # 确保有搜索词
                         all_results.append({
-                            "search_word": batch_words[idx],
+                            "search_word": search_word,
                             "score": item.get("score", 0.0),
                             "reasoning": item.get("reasoning", ""),
                             "original_feature": original_feature
                         })
                         logger.info(f"    [{start_idx + idx + 1}/{len(search_words)}] "
-                                   f"{batch_words[idx]}: {item.get('score', 0.0):.3f}")
+                                   f"{search_word}: {item.get('score', 0.0):.3f}")
             else:
                 logger.error(f"  第 {batch_idx + 1} 批评估失败,跳过")
-                # 为失败的批次添加默认结果
+                # 为失败的批次添加默认结果(使用原搜索词)
                 for word in batch_words:
                     all_results.append({
                         "search_word": word,
@@ -237,6 +250,155 @@ class LLMEvaluator:
 
         return all_results
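
Each batch now pools every unique token from its combined phrases into a single 可选词汇 list, sorted so the prompt stays stable across runs. The extraction in miniature:

```python
# Token pooling as done per batch: split on whitespace, union, sort.
batch_words = ["露营 徒步", "露营 机能风 冲锋衣", "徒步 冲锋衣"]

available = sorted({tok for phrase in batch_words for tok in phrase.split()})
print("、".join(available))  # 冲锋衣、徒步、机能风、露营
```
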
 
+    def evaluate_single_note(
+        self,
+        original_feature: str,
+        search_word: str,
+        note: Dict[str, Any],
+        note_index: int = 0
+    ) -> Dict[str, Any]:
+        """
+        评估单个帖子(阶段6,多模态)
+
+        Args:
+            original_feature: 原始特征
+            search_word: 搜索词
+            note: 单个帖子
+            note_index: 帖子索引
+
+        Returns:
+            单个帖子的评估结果
+        """
+        card = note.get("note_card", {})
+        title = card.get("display_title", "")
+        desc = card.get("desc", "")[:500]  # 限制长度
+        images = card.get("image_list", [])[:10]  # 最多10张图
+
+        prompt = f"""你是一个小红书内容分析专家。
+
+任务:评估这个帖子是否包含目标特征"{original_feature}"的元素
+
+原始特征:"{original_feature}"
+搜索词:"{search_word}"
+
+帖子内容:
+标题: {title}
+正文: {desc}
+
+请分析帖子的文字和图片内容,返回JSON格式:
+{{
+  "relevance": 0.85,  // 0.0-1.0,相关度
+  "matched_elements": ["元素1", "元素2"],  // 匹配的元素列表
+  "reasoning": "简短的匹配理由"
+}}
+
+只返回JSON,不要其他内容。"""
+
+        result = self.client.chat_json(
+            prompt=prompt,
+            images=images if images else None,
+            max_retries=3
+        )
+
+        if result:
+            return {
+                "note_index": note_index,
+                "relevance": result.get("relevance", 0.0),
+                "matched_elements": result.get("matched_elements", []),
+                "reasoning": result.get("reasoning", "")
+            }
+        else:
+            logger.error(f"  评估帖子 {note_index} 失败: {search_word}")
+            return {
+                "note_index": note_index,
+                "relevance": 0.0,
+                "matched_elements": [],
+                "reasoning": "评估失败"
+            }
+
+    def evaluate_search_results_parallel(
+        self,
+        original_feature: str,
+        search_word: str,
+        notes: List[Dict[str, Any]],
+        max_notes: int = 20,
+        max_workers: int = 20
+    ) -> Dict[str, Any]:
+        """
+        并行评估搜索结果(每个帖子独立评估)
+
+        Args:
+            original_feature: 原始特征
+            search_word: 搜索词
+            notes: 帖子列表
+            max_notes: 最多评估几条帖子
+            max_workers: 最大并发数
+
+        Returns:
+            评估结果汇总
+        """
+        if not notes:
+            return {
+                "overall_relevance": 0.0,
+                "extracted_elements": [],
+                "evaluated_notes": []
+            }
+
+        notes_to_eval = notes[:max_notes]
+        evaluated_notes = []
+
+        logger.info(f"  并行评估 {len(notes_to_eval)} 个帖子({max_workers}并发)")
+
+        # 并发评估每个帖子(最多 max_workers 个并发)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for idx, note in enumerate(notes_to_eval):
+                future = executor.submit(
+                    self.evaluate_single_note,
+                    original_feature,
+                    search_word,
+                    note,
+                    idx
+                )
+                futures.append(future)
+
+            # 收集结果
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                    evaluated_notes.append(result)
+                except Exception as e:
+                    logger.error(f"  评估帖子失败: {e}")
+
+        # 按note_index排序
+        evaluated_notes.sort(key=lambda x: x['note_index'])
+
+        # 汇总:计算整体相关度和提取元素
+        if evaluated_notes:
+            overall_relevance = sum(n['relevance'] for n in evaluated_notes) / len(evaluated_notes)
+
+            # 提取所有元素并统计频次
+            element_counts = {}
+            for note in evaluated_notes:
+                for elem in note['matched_elements']:
+                    element_counts[elem] = element_counts.get(elem, 0) + 1
+
+            # 按频次排序,取前5个
+            extracted_elements = sorted(
+                element_counts.keys(),
+                key=lambda x: element_counts[x],
+                reverse=True
+            )[:5]
+        else:
+            overall_relevance = 0.0
+            extracted_elements = []
+
+        return {
+            "overall_relevance": overall_relevance,
+            "extracted_elements": extracted_elements,
+            "evaluated_notes": evaluated_notes
+        }
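
The aggregation at the end of `evaluate_search_results_parallel` is a mean over per-note relevance plus a top-5 cut of matched elements by frequency; `collections.Counter` expresses the same counting compactly:

```python
# Equivalent aggregation with Counter (illustrative values).
from collections import Counter
from statistics import mean

evaluated_notes = [
    {"relevance": 0.9, "matched_elements": ["帐篷", "篝火"]},
    {"relevance": 0.7, "matched_elements": ["帐篷"]},
    {"relevance": 0.5, "matched_elements": ["咖啡"]},
]

overall = mean(n["relevance"] for n in evaluated_notes)
counts = Counter(e for n in evaluated_notes for e in n["matched_elements"])
top5 = [elem for elem, _ in counts.most_common(5)]

print(round(overall, 3), top5)  # 0.7 ['帐篷', '篝火', '咖啡']
```
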
+
     def evaluate_search_results(
         self,
         original_feature: str,

visualize_stage5_results.py (+818 -0)

@@ -0,0 +1,818 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage5搜索结果可视化工具
+生成带图片轮播的交互式HTML页面
+"""
+
+import json
+import os
+from datetime import datetime
+from typing import List, Dict, Any
+
+
+def load_data(json_path: str) -> List[Dict[str, Any]]:
+    """加载JSON数据"""
+    with open(json_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """计算统计数据"""
+    total_features = len(data)
+    total_search_words = 0
+    total_notes = 0
+    video_count = 0
+    normal_count = 0
+
+    for feature in data:
+        search_results = feature.get('组合评估结果', [])
+        total_search_words += len(search_results)
+
+        for search_item in search_results:
+            search_result = search_item.get('search_result', {})
+            notes = search_result.get('data', {}).get('data', [])
+            total_notes += len(notes)
+
+            for note in notes:
+                note_type = note.get('note_card', {}).get('type', '')
+                if note_type == 'video':
+                    video_count += 1
+                else:
+                    normal_count += 1
+
+    return {
+        'total_features': total_features,
+        'total_search_words': total_search_words,
+        'total_notes': total_notes,
+        'video_count': video_count,
+        'normal_count': normal_count,
+        'video_percentage': round(video_count / total_notes * 100, 1) if total_notes > 0 else 0,
+        'normal_percentage': round(normal_count / total_notes * 100, 1) if total_notes > 0 else 0
+    }
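
A minimal input showing the nesting `calculate_statistics` walks (feature → 组合评估结果 → search_result.data.data → note_card.type), with hypothetical values:

```python
# Hypothetical single-feature record with one search word and two notes.
sample = [{
    "组合评估结果": [{
        "search_result": {"data": {"data": [
            {"note_card": {"type": "video"}},
            {"note_card": {"type": "normal"}},
        ]}},
    }],
}]

print(calculate_statistics(sample))
# {'total_features': 1, 'total_search_words': 1, 'total_notes': 2,
#  'video_count': 1, 'normal_count': 1,
#  'video_percentage': 50.0, 'normal_percentage': 50.0}
```
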
+
+
+def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path: str):
+    """生成HTML可视化页面"""
+
+    # 准备数据JSON(用于JavaScript)
+    data_json = json.dumps(data, ensure_ascii=False, indent=2)
+
+    html_content = f'''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Stage5 搜索结果可视化</title>
+    <style>
+        * {{
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }}
+
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: #f5f7fa;
+            color: #333;
+            overflow-x: hidden;
+        }}
+
+        /* 顶部统计面板 */
+        .stats-panel {{
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 20px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        }}
+
+        .stats-container {{
+            max-width: 1400px;
+            margin: 0 auto;
+            display: flex;
+            justify-content: space-around;
+            align-items: center;
+            flex-wrap: wrap;
+            gap: 20px;
+        }}
+
+        .stat-item {{
+            text-align: center;
+        }}
+
+        .stat-value {{
+            font-size: 32px;
+            font-weight: bold;
+            margin-bottom: 5px;
+        }}
+
+        .stat-label {{
+            font-size: 14px;
+            opacity: 0.9;
+        }}
+
+        /* 主容器 */
+        .main-container {{
+            display: flex;
+            max-width: 1400px;
+            margin: 20px auto;
+            gap: 20px;
+            padding: 0 20px;
+            height: calc(100vh - 140px);
+        }}
+
+        /* 左侧导航 */
+        .left-sidebar {{
+            width: 30%;
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            overflow-y: auto;
+            position: sticky;
+            top: 20px;
+            height: fit-content;
+            max-height: calc(100vh - 160px);
+        }}
+
+        .feature-group {{
+            border-bottom: 1px solid #e5e7eb;
+        }}
+
+        .feature-header {{
+            padding: 15px 20px;
+            background: #f9fafb;
+            cursor: pointer;
+            user-select: none;
+            transition: background 0.2s;
+        }}
+
+        .feature-header:hover {{
+            background: #f3f4f6;
+        }}
+
+        .feature-header.active {{
+            background: #667eea;
+            color: white;
+        }}
+
+        .feature-title {{
+            font-size: 16px;
+            font-weight: 600;
+            margin-bottom: 5px;
+        }}
+
+        .feature-meta {{
+            font-size: 12px;
+            color: #6b7280;
+        }}
+
+        .feature-header.active .feature-meta {{
+            color: rgba(255,255,255,0.8);
+        }}
+
+        .search-words-list {{
+            display: none;
+            padding: 10px 0;
+        }}
+
+        .search-words-list.expanded {{
+            display: block;
+        }}
+
+        .search-word-item {{
+            padding: 12px 20px 12px 40px;
+            cursor: pointer;
+            border-left: 3px solid transparent;
+            transition: all 0.2s;
+        }}
+
+        .search-word-item:hover {{
+            background: #f9fafb;
+            border-left-color: #667eea;
+        }}
+
+        .search-word-item.active {{
+            background: #ede9fe;
+            border-left-color: #7c3aed;
+        }}
+
+        .search-word-text {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #374151;
+            margin-bottom: 4px;
+        }}
+
+        .search-word-score {{
+            display: inline-block;
+            padding: 2px 8px;
+            border-radius: 12px;
+            font-size: 11px;
+            font-weight: 600;
+            margin-left: 8px;
+        }}
+
+        .score-high {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .score-medium {{
+            background: #fef3c7;
+            color: #92400e;
+        }}
+
+        .score-low {{
+            background: #fee2e2;
+            color: #991b1b;
+        }}
+
+        .search-word-reasoning {{
+            font-size: 12px;
+            color: #6b7280;
+            margin-top: 4px;
+            display: -webkit-box;
+            -webkit-line-clamp: 2;
+            -webkit-box-orient: vertical;
+            overflow: hidden;
+        }}
+
+        /* 右侧结果区 */
+        .right-content {{
+            flex: 1;
+            overflow-y: auto;
+            padding-bottom: 40px;
+        }}
+
+        .result-block {{
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            margin-bottom: 30px;
+            padding: 20px;
+            scroll-margin-top: 20px;
+        }}
+
+        .result-header {{
+            margin-bottom: 20px;
+            padding-bottom: 15px;
+            border-bottom: 2px solid #e5e7eb;
+        }}
+
+        .result-title {{
+            font-size: 20px;
+            font-weight: 600;
+            color: #111827;
+            margin-bottom: 10px;
+        }}
+
+        .result-stats {{
+            display: flex;
+            gap: 15px;
+            font-size: 13px;
+            color: #6b7280;
+        }}
+
+        .stat-badge {{
+            background: #f3f4f6;
+            padding: 4px 10px;
+            border-radius: 4px;
+        }}
+
+        /* 帖子网格 */
+        .notes-grid {{
+            display: grid;
+            grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
+            gap: 20px;
+        }}
+
+        .note-card {{
+            border: 1px solid #e5e7eb;
+            border-radius: 8px;
+            overflow: hidden;
+            cursor: pointer;
+            transition: all 0.3s;
+            background: white;
+        }}
+
+        .note-card:hover {{
+            transform: translateY(-4px);
+            box-shadow: 0 10px 25px rgba(0,0,0,0.15);
+        }}
+
+        /* 图片轮播 */
+        .image-carousel {{
+            position: relative;
+            width: 100%;
+            height: 280px;
+            background: #f3f4f6;
+            overflow: hidden;
+        }}
+
+        .carousel-images {{
+            display: flex;
+            height: 100%;
+            transition: transform 0.3s ease;
+        }}
+
+        .carousel-image {{
+            min-width: 100%;
+            height: 100%;
+            object-fit: cover;
+        }}
+
+        .carousel-btn {{
+            position: absolute;
+            top: 50%;
+            transform: translateY(-50%);
+            background: rgba(0,0,0,0.5);
+            color: white;
+            border: none;
+            width: 32px;
+            height: 32px;
+            border-radius: 50%;
+            cursor: pointer;
+            font-size: 16px;
+            display: none;
+            align-items: center;
+            justify-content: center;
+            transition: background 0.2s;
+            z-index: 10;
+        }}
+
+        .carousel-btn:hover {{
+            background: rgba(0,0,0,0.7);
+        }}
+
+        .carousel-btn.prev {{
+            left: 8px;
+        }}
+
+        .carousel-btn.next {{
+            right: 8px;
+        }}
+
+        .note-card:hover .carousel-btn {{
+            display: flex;
+        }}
+
+        .carousel-indicators {{
+            position: absolute;
+            bottom: 10px;
+            left: 50%;
+            transform: translateX(-50%);
+            display: flex;
+            gap: 6px;
+            z-index: 10;
+        }}
+
+        .dot {{
+            width: 8px;
+            height: 8px;
+            border-radius: 50%;
+            background: rgba(255,255,255,0.5);
+            cursor: pointer;
+            transition: all 0.2s;
+        }}
+
+        .dot.active {{
+            background: white;
+            width: 24px;
+            border-radius: 4px;
+        }}
+
+        .image-counter {{
+            position: absolute;
+            top: 10px;
+            right: 10px;
+            background: rgba(0,0,0,0.6);
+            color: white;
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 12px;
+            z-index: 10;
+        }}
+
+        /* 帖子信息 */
+        .note-info {{
+            padding: 12px;
+        }}
+
+        .note-title {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #111827;
+            margin-bottom: 8px;
+            display: -webkit-box;
+            -webkit-line-clamp: 2;
+            -webkit-box-orient: vertical;
+            overflow: hidden;
+            line-height: 1.4;
+        }}
+
+        .note-meta {{
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            font-size: 12px;
+            color: #6b7280;
+        }}
+
+        .note-type {{
+            padding: 3px 8px;
+            border-radius: 4px;
+            font-weight: 500;
+        }}
+
+        .type-video {{
+            background: #dbeafe;
+            color: #1e40af;
+        }}
+
+        .type-normal {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .note-author {{
+            display: flex;
+            align-items: center;
+            gap: 6px;
+        }}
+
+        .author-avatar {{
+            width: 24px;
+            height: 24px;
+            border-radius: 50%;
+        }}
+
+        /* SVG连线层 */
+        #connection-svg {{
+            position: fixed;
+            top: 0;
+            left: 0;
+            width: 100%;
+            height: 100%;
+            pointer-events: none;
+            z-index: 1;
+        }}
+
+        .connection-line {{
+            stroke: #cbd5e1;
+            stroke-width: 1;
+            stroke-dasharray: 5,5;
+            fill: none;
+            opacity: 0.3;
+            transition: all 0.2s;
+        }}
+
+        .connection-line.active {{
+            stroke: #667eea;
+            stroke-width: 2;
+            stroke-dasharray: none;
+            opacity: 1;
+        }}
+
+        /* 滚动条样式 */
+        ::-webkit-scrollbar {{
+            width: 8px;
+            height: 8px;
+        }}
+
+        ::-webkit-scrollbar-track {{
+            background: #f1f1f1;
+        }}
+
+        ::-webkit-scrollbar-thumb {{
+            background: #888;
+            border-radius: 4px;
+        }}
+
+        ::-webkit-scrollbar-thumb:hover {{
+            background: #555;
+        }}
+    </style>
+</head>
+<body>
+    <!-- 统计面板 -->
+    <div class="stats-panel">
+        <div class="stats-container">
+            <div class="stat-item">
+                <div class="stat-value">📊 {stats['total_features']}</div>
+                <div class="stat-label">原始特征数</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">🔍 {stats['total_search_words']}</div>
+                <div class="stat-label">搜索词数</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">📝 {stats['total_notes']}</div>
+                <div class="stat-label">帖子总数</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">🎬 {stats['video_count']}</div>
+                <div class="stat-label">视频类型 ({stats['video_percentage']}%)</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">📷 {stats['normal_count']}</div>
+                <div class="stat-label">图文类型 ({stats['normal_percentage']}%)</div>
+            </div>
+        </div>
+    </div>
+
+    <!-- SVG连线层 -->
+    <svg id="connection-svg"></svg>
+
+    <!-- 主容器 -->
+    <div class="main-container">
+        <!-- 左侧导航 -->
+        <div class="left-sidebar" id="leftSidebar">
+            <!-- 通过JavaScript动态生成 -->
+        </div>
+
+        <!-- 右侧结果区 -->
+        <div class="right-content" id="rightContent">
+            <!-- 通过JavaScript动态生成 -->
+        </div>
+    </div>
+
+    <script>
+        // 数据
+        const data = {data_json};
+
+        // 渲染左侧导航
+        function renderLeftSidebar() {{
+            const sidebar = document.getElementById('leftSidebar');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const searchWords = feature['组合评估结果'] || [];
+
+                html += `
+                    <div class="feature-group">
+                        <div class="feature-header" onclick="toggleFeature(${{featureIdx}})" id="feature-header-${{featureIdx}}">
+                            <div class="feature-title">${{feature['原始特征名称']}}</div>
+                            <div class="feature-meta">
+                                ${{feature['来源层级']}} · 权重: ${{feature['权重'].toFixed(2)}} · ${{searchWords.length}}个搜索词
+                            </div>
+                        </div>
+                        <div class="search-words-list" id="search-words-${{featureIdx}}">
+                `;
+
+                searchWords.forEach((sw, swIdx) => {{
+                    const score = sw.score || 0;
+                    const scoreClass = score >= 0.9 ? 'score-high' : score >= 0.7 ? 'score-medium' : 'score-low';
+                    const blockId = `block-${{featureIdx}}-${{swIdx}}`;
+
+                    html += `
+                        <div class="search-word-item" onclick="scrollToBlock('${{blockId}}')"
+                             id="sw-${{featureIdx}}-${{swIdx}}"
+                             data-block-id="${{blockId}}">
+                            <div class="search-word-text">
+                                ${{sw.search_word}}
+                                <span class="search-word-score ${{scoreClass}}">${{score.toFixed(2)}}</span>
+                            </div>
+                            <div class="search-word-reasoning" title="${{sw.reasoning}}">
+                                ${{sw.reasoning || ''}}
+                            </div>
+                        </div>
+                    `;
+                }});
+
+                html += `
+                        </div>
+                    </div>
+                `;
+            }});
+
+            sidebar.innerHTML = html;
+        }}
+
+        // 渲染右侧结果区
+        function renderRightContent() {{
+            const content = document.getElementById('rightContent');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const searchWords = feature['组合评估结果'] || [];
+
+                searchWords.forEach((sw, swIdx) => {{
+                    const blockId = `block-${{featureIdx}}-${{swIdx}}`;
+                    const searchResult = sw.search_result || {{}};
+                    const notes = searchResult.data?.data || [];
+
+                    const videoCount = notes.filter(n => n.note_card?.type === 'video').length;
+                    const normalCount = notes.length - videoCount;
+
+                    html += `
+                        <div class="result-block" id="${{blockId}}">
+                            <div class="result-header">
+                                <div class="result-title">${{sw.search_word}}</div>
+                                <div class="result-stats">
+                                    <span class="stat-badge">📝 ${{notes.length}} 条帖子</span>
+                                    <span class="stat-badge">🎬 ${{videoCount}} 视频</span>
+                                    <span class="stat-badge">📷 ${{normalCount}} 图文</span>
+                                </div>
+                            </div>
+                            <div class="notes-grid">
+                                ${{notes.map((note, noteIdx) => renderNoteCard(note, featureIdx, swIdx, noteIdx)).join('')}}
+                            </div>
+                        </div>
+                    `;
+                }});
+            }});
+
+            content.innerHTML = html;
+        }}
+
+        // 渲染单个帖子卡片
+        function renderNoteCard(note, featureIdx, swIdx, noteIdx) {{
+            const card = note.note_card || {{}};
+            const images = card.image_list || [];
+            const title = card.display_title || '无标题';
+            const noteType = card.type || 'normal';
+            const noteId = note.id || '';
+            const user = card.user || {{}};
+            const userName = user.nick_name || '未知用户';
+            const userAvatar = user.avatar || '';
+
+            const carouselId = `carousel-${{featureIdx}}-${{swIdx}}-${{noteIdx}}`;
+
+            return `
+                <div class="note-card" onclick="openNote('${{noteId}}')">
+                    <div class="image-carousel" id="${{carouselId}}">
+                        <div class="carousel-images">
+                            ${{images.map(img => `<img class="carousel-image" src="${{img}}" alt="帖子图片" loading="lazy">`).join('')}}
+                        </div>
+                        ${{images.length > 1 ? `
+                            <button class="carousel-btn prev" onclick="event.stopPropagation(); changeImage('${{carouselId}}', -1)">←</button>
+                            <button class="carousel-btn next" onclick="event.stopPropagation(); changeImage('${{carouselId}}', 1)">→</button>
+                            <div class="carousel-indicators">
+                                ${{images.map((_, i) => `<span class="dot ${{i === 0 ? 'active' : ''}}" onclick="event.stopPropagation(); goToImage('${{carouselId}}', ${{i}})"></span>`).join('')}}
+                            </div>
+                            <span class="image-counter">1/${{images.length}}</span>
+                        ` : ''}}
+                    </div>
+                    <div class="note-info">
+                        <div class="note-title">${{title}}</div>
+                        <div class="note-meta">
+                            <span class="note-type type-${{noteType}}">
+                                ${{noteType === 'video' ? '🎬 视频' : '📷 图文'}}
+                            </span>
+                            <div class="note-author">
+                                ${{userAvatar ? `<img class="author-avatar" src="${{userAvatar}}" alt="${{userName}}">` : ''}}
+                                <span>${{userName}}</span>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            `;
+        }}
+
+        // Image carousel logic
+        const carouselStates = {{}};
+
+        function changeImage(carouselId, direction) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const images = carousel.querySelectorAll('.carousel-image');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            let newIndex = carouselStates[carouselId].currentIndex + direction;
+            if (newIndex < 0) newIndex = images.length - 1;
+            if (newIndex >= images.length) newIndex = 0;
+
+            carouselStates[carouselId].currentIndex = newIndex;
+            imagesContainer.style.transform = `translateX(-${{newIndex * 100}}%)`;
+
+            // Update the indicator dots
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === newIndex);
+            }});
+
+            // Update the image counter
+            if (counter) {{
+                counter.textContent = `${{newIndex + 1}}/${{images.length}}`;
+            }}
+        }}
+
+        function goToImage(carouselId, index) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            carouselStates[carouselId].currentIndex = index;
+            imagesContainer.style.transform = `translateX(-${{index * 100}}%)`;
+
+            // Update the indicator dots
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === index);
+            }});
+
+            // Update the image counter
+            if (counter) {{
+                counter.textContent = `${{index + 1}}/${{dots.length}}`;
+            }}
+        }}
+
+        // Expand/collapse a feature group
+        function toggleFeature(featureIdx) {{
+            const searchWordsList = document.getElementById(`search-words-${{featureIdx}}`);
+            const featureHeader = document.getElementById(`feature-header-${{featureIdx}}`);
+
+            searchWordsList.classList.toggle('expanded');
+            featureHeader.classList.toggle('active');
+        }}
+
+        // Scroll to the given result block
+        function scrollToBlock(blockId) {{
+            const block = document.getElementById(blockId);
+            if (block) {{
+                block.scrollIntoView({{ behavior: 'smooth', block: 'start' }});
+
+                // Highlight the matching search word in the sidebar
+                document.querySelectorAll('.search-word-item').forEach(item => {{
+                    item.classList.remove('active');
+                }});
+
+                document.querySelectorAll(`[data-block-id="${{blockId}}"]`).forEach(item => {{
+                    item.classList.add('active');
+                }});
+            }}
+        }}
+
+        // Open the note on Xiaohongshu
+        function openNote(noteId) {{
+            if (noteId) {{
+                window.open(`https://www.xiaohongshu.com/explore/${{noteId}}`, '_blank');
+            }}
+        }}
+
+        // Initialize on page load
+        document.addEventListener('DOMContentLoaded', () => {{
+            renderLeftSidebar();
+            renderRightContent();
+
+            // Expand the first feature group by default
+            if (data.length > 0) {{
+                toggleFeature(0);
+            }}
+        }});
+    </script>
+</body>
+</html>
+'''
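Note on the template above: the markup and JS are embedded in a Python string that is later brace-interpolated, which is why every literal brace appears doubled (`{{` / `}}`) while single-brace fields are substituted. A minimal sketch of that convention (the names are illustrative, not from this patch):

# Doubled braces come out as literal braces after formatting; single braces interpolate.
template = 'const state = {{ count: {count} }};'
print(template.format(count=3))  # -> const state = { count: 3 };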
+
+    # Write the rendered HTML to disk
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(html_content)
+
+
+def main():
+    """主函数"""
+    # 配置路径
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    json_path = os.path.join(script_dir, 'output_v2', 'stage5_with_search_results.json')
+    output_dir = os.path.join(script_dir, 'visualization')
+    os.makedirs(output_dir, exist_ok=True)
+
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    output_path = os.path.join(output_dir, f'stage5_interactive_{timestamp}.html')
+
+    # Load the data
+    print(f"📖 Loading data: {json_path}")
+    data = load_data(json_path)
+    print(f"✓ Loaded {len(data)} original features")
+
+    # Compute statistics
+    print("📊 Computing statistics...")
+    stats = calculate_statistics(data)
+    print("✓ Statistics computed:")
+    print(f"  - Original features: {stats['total_features']}")
+    print(f"  - Search words: {stats['total_search_words']}")
+    print(f"  - Total notes: {stats['total_notes']}")
+    print(f"  - Videos: {stats['video_count']} ({stats['video_percentage']}%)")
+    print(f"  - Image posts: {stats['normal_count']} ({stats['normal_percentage']}%)")
+
+    # Generate the HTML report
+    print("\n🎨 Generating visualization page...")
+    generate_html(data, stats, output_path)
+    print(f"✓ Done: {output_path}")
+
+    # Show how to open the report
+    print("\n🌐 Open it in a browser:")
+    print(f"   file://{output_path}")
+
+
+if __name__ == '__main__':
+    main()
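For reference, the page above only touches a handful of fields in stage5_with_search_results.json. A minimal record that renderRightContent/renderNoteCard can display looks roughly like the sketch below; the note-level keys are read directly by the JS above, while the outer search_words wrapper is an assumption about the stage-5 output shape:

# Hypothetical minimal stage-5 record, inferred from the accessors in the JS above.
example_feature = {
    "search_words": [{                         # assumed wrapper: the JS reads sw.search_word / sw.search_result
        "search_word": "example keyword",
        "search_result": {"data": {"data": [{  # notes are read from search_result.data.data
            "id": "note-id",                   # used for the xiaohongshu.com/explore/<id> link
            "note_card": {
                "type": "video",               # "video" or "normal" (image post)
                "display_title": "example title",
                "image_list": ["https://example.com/1.jpg"],
                "user": {"nick_name": "author", "avatar": "https://example.com/a.jpg"}
            }
        }]}}
    }]
}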

+ 78 - 6
xiaohongshu_search.py

@@ -9,9 +9,12 @@ import json
 import os
 import argparse
 import time
+import logging
 from datetime import datetime
 from typing import Dict, Any
 
+logger = logging.getLogger(__name__)
+
 
 class XiaohongshuSearch:
     """小红书笔记搜索API封装类"""
@@ -20,12 +23,13 @@ class XiaohongshuSearch:
     TOOL_NAME = "xhs_note_search"
     PLATFORM = "xiaohongshu"
 
-    def __init__(self, results_dir: str = None):
+    def __init__(self, results_dir: str = None, cache_dir: str = "search_cache"):
         """
         Initialize the API client
 
         Args:
             results_dir: Output directory for results; defaults to data/search under the project root
+            cache_dir: Cache directory, defaults to search_cache
         """
         self.api_url = f"{self.BASE_URL}/tools/call/{self.TOOL_NAME}"
 
@@ -38,19 +42,60 @@ class XiaohongshuSearch:
             project_root = os.path.dirname(os.path.dirname(script_dir))
             self.results_base_dir = os.path.join(project_root, "data", "search")
 
+        # Set up the cache directory
+        self.cache_dir = cache_dir
+        if cache_dir:
+            os.makedirs(cache_dir, exist_ok=True)
+
+    def _get_cache_key(
+        self,
+        keyword: str,
+        content_type: str,
+        sort_type: str,
+        publish_time: str
+    ) -> str:
+        """
+        Build the cache key for a search.
+
+        Args:
+            keyword: Search keyword
+            content_type: Content-type filter
+            sort_type: Sort order
+            publish_time: Publish-time filter
+
+        Returns:
+            Cache key string
+        """
+        return f"{keyword}_{content_type}_{sort_type}_{publish_time}"
+
+    def _get_cache_path(self, cache_key: str) -> str:
+        """
+        Resolve the cache file path for a key.
+
+        Args:
+            cache_key: Cache key
+
+        Returns:
+            Full path to the cache file
+        """
+        # Strip characters that are unsafe in file names
+        safe_key = cache_key.replace('/', '_').replace('\\', '_').replace(' ', '_')
+        return os.path.join(self.cache_dir, f"{safe_key}.json")
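Together, the two helpers above map one (keyword, content_type, sort_type, publish_time) tuple to one JSON file on disk. A quick sketch of the resulting layout, with illustrative values:

# Illustrative only: how one search maps to one cache file.
xhs = XiaohongshuSearch(cache_dir="search_cache")
key = xhs._get_cache_key("咖啡 探店", "不限", "综合", "不限")
# key  == "咖啡 探店_不限_综合_不限"
path = xhs._get_cache_path(key)
# path == "search_cache/咖啡_探店_不限_综合_不限.json"  (the space is sanitized to "_")

Because the key is a plain "_"-joined string, a keyword that itself contains "_" can collide with a different parameter tuple; hashing the tuple (e.g. via hashlib) would avoid this at the cost of opaque file names.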
+
     def search(
         self,
         keyword: str,
-        content_type: str = "图文",
+        content_type: str = "不限",
         sort_type: str = "综合",
         publish_time: str = "不限",
         cursor: str = "",
         timeout: int = 30,
-        max_retries: int = 3,
-        retry_delay: int = 2
+        max_retries: int = 5,
+        retry_delay: int = 2,
+        use_cache: bool = True
     ) -> Dict[str, Any]:
         """
-        Search Xiaohongshu notes (with retry support)
+        Search Xiaohongshu notes (with retry support and caching)
 
         Args:
             keyword: Search keyword
@@ -61,6 +106,7 @@ class XiaohongshuSearch:
             timeout: Request timeout in seconds (default 30)
-            max_retries: Maximum number of retries (default 3)
+            max_retries: Maximum number of retries (default 5)
             retry_delay: Delay between retries in seconds (default 2)
+            use_cache: Whether to use the on-disk cache (default True)
 
         Returns:
             JSON data of the API response
@@ -68,9 +114,24 @@ class XiaohongshuSearch:
         Raises:
             requests.exceptions.RequestException: Raised when all retries have failed
         """
+        # Check the cache first
+        if use_cache and self.cache_dir:
+            cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+            cache_path = self._get_cache_path(cache_key)
+
+            if os.path.exists(cache_path):
+                try:
+                    with open(cache_path, 'r', encoding='utf-8') as f:
+                        cached_result = json.load(f)
+                    logger.info(f"  ✓ Cache hit: {keyword}")
+                    return cached_result
+                except Exception as e:
+                    logger.warning(f"  Failed to read cache: {e}; re-running the search")
+
+        # Cache miss or caching disabled: run the actual search
         payload = {
             "keyword": keyword,
-            "content_type": content_type,
+            "content_type": '不限',  # 使用映射后的参数
             "sort_type": sort_type,
             "publish_time": publish_time,
             "cursor": cursor
@@ -106,6 +167,17 @@ class XiaohongshuSearch:
                 if attempt > 1:
                     print(f"    ✓ 重试成功")
 
+                # Save the result to the cache
+                if use_cache and self.cache_dir:
+                    try:
+                        cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+                        cache_path = self._get_cache_path(cache_key)
+                        with open(cache_path, 'w', encoding='utf-8') as f:
+                            json.dump(result, f, ensure_ascii=False, indent=2)
+                        logger.info(f"  ✓ Cached: {keyword}")
+                    except Exception as e:
+                        logger.warning(f"  Failed to write cache: {e}")
+
                 return result
 
             except requests.exceptions.RequestException as e: