刘立冬 · 3 weeks ago
parent commit e488de4152
4 changed files with 1468 additions and 231 deletions
  1. enhanced_search_v2.py  (+371 −186)
  2. llm_evaluator.py  (+201 −39)
  3. visualize_stage5_results.py  (+818 −0)
  4. xiaohongshu_search.py  (+78 −6)

enhanced_search_v2.py (+371 −186)

@@ -11,9 +11,11 @@ import copy
 import time
 import os
 import argparse
+import subprocess
 from typing import Dict, List, Any, Optional, Set, Tuple
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from itertools import combinations

 from openrouter_client import OpenRouterClient
 from llm_evaluator import LLMEvaluator
@@ -41,7 +43,10 @@ class EnhancedSearchV2:
         dimension_associations_path: str,
         optimized_clustered_data_path: str,
         openrouter_api_key: Optional[str] = None,
-        output_dir: str = "output_v2"
+        output_dir: str = "output_v2",
+        top_n: int = 10,
+        max_total_searches: Optional[int] = None,
+        search_max_workers: int = 3
     ):
         """
         Initialize the system
@@ -52,11 +57,17 @@ class EnhancedSearchV2:
             optimized_clustered_data_path: Path to the persona feature library
             openrouter_api_key: OpenRouter API key
             output_dir: Output directory
+            top_n: Keep the N highest-scoring search words per original feature (default 10)
+            max_total_searches: Global cap on the number of searches (default None = unlimited)
+            search_max_workers: Search concurrency (default 3)
         """
         self.how_json_path = how_json_path
         self.dimension_associations_path = dimension_associations_path
         self.optimized_clustered_data_path = optimized_clustered_data_path
         self.output_dir = output_dir
+        self.top_n = top_n
+        self.max_total_searches = max_total_searches
+        self.search_max_workers = search_max_workers

         # Create the output directory
         os.makedirs(output_dir, exist_ok=True)
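For reference, a minimal construction sketch showing the new knobs together; the three input paths below are hypothetical placeholders:

    searcher = EnhancedSearchV2(
        how_json_path='how.json',                         # hypothetical paths
        dimension_associations_path='associations.json',
        optimized_clustered_data_path='clustered.json',
        top_n=10,               # keep the 10 best search words per original feature
        max_total_searches=50,  # illustrative global cap; None means unlimited
        search_max_workers=3    # concurrent Xiaohongshu searches
    )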
@@ -572,38 +583,95 @@ class EnhancedSearchV2:
             'sub_classifications': sub_classifications
         }

-    # ========== Stage 3: extract feature lists ==========
+    # ========== Stage 3: filter high-similarity matches (>0.8) ==========

-    def stage3_extract_features(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def stage3_filter_high_similarity_matches(self, associations_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """
         """
-        阶段3:从关联分类中提取特征列表
+        阶段3:筛选高相似度匹配(>0.8)
+
+        遍历how解构中的所有原始特征,找出匹配结果中相似度>0.8
+        且人设特征名称在Stage2关联范围内的高质量匹配
 
 
         Args:
         Args:
             associations_data: 阶段2的关联数据
             associations_data: 阶段2的关联数据
 
 
         Returns:
         Returns:
-            带特征列表的数据
+            带高相似度候选的数据
         """
         """
         logger.info("=" * 60)
         logger.info("=" * 60)
-        logger.info("阶段3:提取特征列表")
+        logger.info("阶段3:筛选高相似度匹配(>0.8)")
         logger.info("=" * 60)
         logger.info("=" * 60)
 
 
         for idx, feature_result in enumerate(associations_data, 1):
-            logger.info(f"\n[{idx}/{len(associations_data)}] Processing: {feature_result['原始特征名称']}")
+            original_feature_name = feature_result['原始特征名称']
+            logger.info(f"\n[{idx}/{len(associations_data)}] Processing: {original_feature_name}")

-            for assoc in feature_result.get('找到的关联', []):
-                target_path = assoc['目标分类路径']
-                logger.info(f"  Extracting features: {target_path}")
+            # Step 1: collect the Stage 2 association scope (classification names + tags)
+            stage2_scope = self._collect_stage2_scope(feature_result)
+            logger.info(f"  Stage 2 scope contains {len(stage2_scope)} classifications/tags")

-                # Extract features
-                features = self._find_features_by_path(target_path)
+            # Step 2: walk every original feature in the how deconstruction and find high-similarity matches
+            high_sim_candidates = []
+            total_checked = 0
+            high_sim_found = 0
+
+            how_result = self.how_data.get('how解构结果', {})
+            for level_name, level_list in how_result.items():
+                if not isinstance(level_list, list):
+                    continue

-                # Attach to the association
-                assoc['特征列表'] = features
-                logger.info(f"    Found {len(features)} features")
+                for item in level_list:
+                    for step in item.get('how步骤列表', []):
+                        for feature in step.get('特征列表', []):
+                            # Get every match for this feature
+                            matches = feature.get('匹配结果', [])
+                            total_checked += len(matches)
+
+                            # Keep matches with similarity >0.8 that fall within the Stage 2 scope
+                            for match in matches:
+                                sim = match.get('匹配结果', {}).get('相似度', 0)
+                                persona_feature_name = match.get('人设特征名称', '')
+
+                                if sim > 0.8 and persona_feature_name in stage2_scope:
+                                    high_sim_found += 1
+                                    # Record the provenance
+                                    high_sim_candidates.append({
+                                        '人设特征名称': persona_feature_name,
+                                        '相似度': sim,
+                                        '特征类型': match.get('特征类型', ''),
+                                        '特征分类': match.get('特征分类', []),
+                                        '人设特征层级': match.get('人设特征层级', ''),
+                                        '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                        '匹配说明': match.get('匹配结果', {}).get('说明', ''),
+                                        '来源原始特征': feature.get('特征名称', '')  # which original feature it came from
+                                    })
+
+            logger.info(f"  Checked {total_checked} matches")
+            logger.info(f"  Found {high_sim_found} matches with similarity >0.8")
+
+            # Sort by similarity descending and dedupe (keep only the top score per persona feature name)
+            seen_names = set()
+            unique_candidates = []
+            high_sim_candidates.sort(key=lambda x: x['相似度'], reverse=True)
+
+            for candidate in high_sim_candidates:
+                name = candidate['人设特征名称']
+                if name not in seen_names:
+                    seen_names.add(name)
+                    unique_candidates.append(candidate)
+
+            # Attach to the result
+            feature_result['高相似度候选'] = unique_candidates
+            logger.info(f"  {len(unique_candidates)} high-similarity candidates left after dedup")
+
+            # Show the top 5
+            if unique_candidates:
+                logger.info("  Top 5:")
+                for c in unique_candidates[:5]:
+                    logger.info(f"    • {c['人设特征名称']} ({c['相似度']:.3f}) ← from \"{c['来源原始特征']}\"")

         # Save results
-        output_path = os.path.join(self.output_dir, "stage3_features.json")
+        output_path = os.path.join(self.output_dir, "stage3_high_similarity.json")
         self._save_json(associations_data, output_path)

         logger.info(f"\n" + "=" * 60)
@@ -612,6 +680,29 @@ class EnhancedSearchV2:

         return associations_data

+
+    def _collect_stage2_scope(self, feature_result: Dict[str, Any]) -> Set[str]:
+        """
+        收集Stage2找到的所有分类名和标签,形成范围集合
+
+        Args:
+            feature_result: 特征结果数据
+
+        Returns:
+            包含所有分类名和标签的集合
+        """
+        scope = set()
+
+        for assoc in feature_result.get('找到的关联', []):
+            # 添加分类名
+            scope.add(assoc['分类名称'])
+
+            # 添加所有标签
+            tags = assoc.get('标签列表', [])
+            scope.update(tags)
+
+        return scope
+
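To make the new Stage 3 selection concrete, here is a minimal, self-contained sketch of the filter-then-dedupe step (similarity > 0.8 within the Stage 2 scope, keeping only the best score per persona feature name). The flat match records are illustrative; the real ones nest the similarity under '匹配结果':

    from typing import Any, Dict, List, Set

    def pick_high_similarity(matches: List[Dict[str, Any]], scope: Set[str],
                             threshold: float = 0.8) -> List[Dict[str, Any]]:
        # Keep matches above the threshold whose persona feature name is in scope
        kept = [m for m in matches
                if m.get('相似度', 0) > threshold and m.get('人设特征名称', '') in scope]
        # Highest similarity first, so the first occurrence of a name is its best score
        kept.sort(key=lambda m: m['相似度'], reverse=True)
        seen, unique = set(), []
        for m in kept:
            if m['人设特征名称'] not in seen:
                seen.add(m['人设特征名称'])
                unique.append(m)
        return unique

    scope = {'露营', '手冲咖啡'}
    matches = [
        {'人设特征名称': '露营', '相似度': 0.85},
        {'人设特征名称': '露营', '相似度': 0.92},  # duplicate name, higher score wins
        {'人设特征名称': '烘焙', '相似度': 0.95},  # outside the Stage 2 scope, dropped
    ]
    print(pick_high_similarity(matches, scope))
    # -> [{'人设特征名称': '露营', '相似度': 0.92}]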
     def _find_features_by_path(self, target_classification: str) -> List[Dict[str, Any]]:
         """
         Look up a feature list by classification path
@@ -630,56 +721,63 @@ class EnhancedSearchV2:
         # Deep copy
         return copy.deepcopy(features)

-    # ========== Stage 4: generate search words + LLM quality evaluation ==========
+    # ========== Stage 4: multi-word combinations + LLM evaluation ==========

     def stage4_generate_and_evaluate_search_words(
         self,
-        features_data: List[Dict[str, Any]]
+        features_data: List[Dict[str, Any]],
+        max_workers: int = 4,
+        max_candidates: int = 20,
+        max_combo_length: int = 4
     ) -> List[Dict[str, Any]]:
         """
         """
-        阶段4:生成搜索词并用LLM评估质量
+        阶段4:多词组合 + LLM评估
+
+        基于Stage1的基础词和Stage3的高相似度候选,
+        生成所有2-N词组合,通过LLM评估选出Top10
 
 
         Args:
         Args:
-            features_data: 阶段3的特征数据
+            features_data: 阶段3的数据(包含高相似度候选)
+            max_workers: 并发评估的原始特征数(默认4)
+            max_candidates: 参与组合的最大候选词数(默认20)
+            max_combo_length: 最大组合词数(默认4,即基础词+3个候选)
 
 
         Returns:
         Returns:
             带LLM评估的数据
             带LLM评估的数据
         """
         """
         logger.info("=" * 60)
         logger.info("=" * 60)
-        logger.info("阶段4:生成搜索词 + LLM评估质量")
+        logger.info("阶段4:多词组合 + LLM评估")
+        logger.info(f"  最大候选词数: {max_candidates}")
+        logger.info(f"  最大组合长度: {max_combo_length} 词")
+        logger.info(f"  并发数: {max_workers} 个原始特征")
         logger.info("=" * 60)
         logger.info("=" * 60)
 
 
-        for idx, feature_result in enumerate(features_data, 1):
-            logger.info(f"\n[{idx}/{len(features_data)}] Processing: {feature_result['原始特征名称']}")
-
-            # Generate search words
-            self._add_search_words(feature_result)
-
-            # Collect all search words
-            all_search_words = self._collect_all_search_words(feature_result)
-
-            if not all_search_words:
-                logger.info(f"  No search words, skipping")
-                continue
-
-            logger.info(f"  Generated {len(all_search_words)} search words")
-
-            # Batched LLM evaluation (10 per batch)
-            logger.info(f"  Starting LLM evaluation...")
-            original_feature = feature_result['原始特征名称']
-            evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
-                original_feature=original_feature,
-                search_words=[sw['search_word'] for sw in all_search_words],
-                batch_size=10
-            )
+        total_features = len(features_data)

-            # Write the evaluation results back onto the feature nodes
-            self._write_back_evaluations(feature_result, evaluated)
+        # Process different original features in parallel with a ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks
+            futures = []
+            for idx, feature_result in enumerate(features_data, 1):
+                future = executor.submit(
+                    self._process_single_feature_combinations,
+                    idx,
+                    total_features,
+                    feature_result,
+                    max_candidates,
+                    max_combo_length
+                )
+                futures.append((future, feature_result))

-            logger.info(f"  Evaluation done, top score: {evaluated[0]['score']:.3f}")
+            # Wait for all tasks to finish and collect the results
+            for future, feature_result in futures:
+                try:
+                    _ = future.result()  # wait for completion; results are already written back into feature_result
+                except Exception as e:
+                    logger.error(f"  Evaluation failed: {feature_result['原始特征名称']}, error: {e}")

         # Save results
-        output_path = os.path.join(self.output_dir, "stage4_with_llm_scores.json")
+        output_path = os.path.join(self.output_dir, "stage4_combinations_evaluated.json")
         self._save_json(features_data, output_path)

         logger.info(f"\n" + "=" * 60)
@@ -688,95 +786,152 @@ class EnhancedSearchV2:

         return features_data

-    def _add_search_words(self, result: Dict[str, Any]):
+    def _process_single_feature_combinations(
+        self,
+        idx: int,
+        total: int,
+        feature_result: Dict[str, Any],
+        max_candidates: int,
+        max_combo_length: int
+    ) -> None:
         """
         """
-        为结果项添加search_word字段(去重)
+        处理单个原始特征的组合生成和评估
+
+        Steps:
+        1. Get base_word from Stage1's 最高匹配信息
+        2. Get candidates from Stage3's 高相似度候选 (top max_candidates)
+        3. Generate 2-N word combinations
+        4. LLM batch evaluation
+        5. Select Top 10 and write back
 
 
         Args:
         Args:
-            result: 单个结果项
+            idx: 特征索引
+            total: 总特征数
+            feature_result: 特征结果数据
+            max_candidates: 参与组合的最大候选词数
+            max_combo_length: 最大组合词数
         """
         """
-        # 获取基础词(人设特征名称)
-        base_word = result.get('最高匹配信息', {}).get('人设特征名称', '')
+        original_feature = feature_result['原始特征名称']
+        logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")
 
 
+        # 步骤1: 获取基础词
+        base_word = feature_result.get('最高匹配信息', {}).get('人设特征名称', '')
         if not base_word:
         if not base_word:
+            logger.info(f"  无基础词,跳过")
+            feature_result['组合评估结果'] = []
             return
             return
 
 
-        # 去重集合(在当前结果项范围内)
-        seen_words: Set[str] = set()
+        logger.info(f"  基础词: {base_word}")
 
 
-        # 遍历所有关联的特征列表
-        for assoc in result.get('找到的关联', []):
-            for feature in assoc.get('特征列表', []):
-                feature_name = feature.get('特征名称', '')
+        # 步骤2: 获取候选词(从高相似度候选中)
+        high_sim_candidates = feature_result.get('高相似度候选', [])
 
 
-                if not feature_name:
-                    feature['search_word'] = None
-                    continue
+        # 限制候选词数量
+        candidates = high_sim_candidates[:max_candidates]
+        candidate_words = [c['人设特征名称'] for c in candidates]
 
 
-                # 生成组合词
-                search_word = f"{base_word} {feature_name}"
+        if not candidate_words:
+            logger.info(f"  无候选词,跳过")
+            feature_result['组合评估结果'] = []
+            return
 
 
-                # 检查是否重复
-                if search_word not in seen_words:
-                    feature['search_word'] = search_word
-                    seen_words.add(search_word)
-                else:
-                    feature['search_word'] = None
+        logger.info(f"  候选词数量: {len(candidate_words)} (限制: {max_candidates})")
+
+        # 步骤3: 生成所有组合
+        all_combinations = []
+
+        # 生成1词到max_combo_length-1词的候选词组合(因为还要加上base_word)
+        for length in range(1, min(max_combo_length, len(candidate_words) + 1)):
+            for combo in combinations(candidate_words, length):
+                # 组合成搜索词:基础词 + 候选词组合
+                search_phrase = base_word + ' ' + ' '.join(combo)
+                all_combinations.append({
+                    'search_word': search_phrase,
+                    'base_word': base_word,
+                    'candidate_words': list(combo),
+                    'combo_length': length + 1  # +1 因为包含base_word
+                })
 
 
-    def _collect_all_search_words(self, feature_result: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """
-        收集结果项中所有非空的search_word
+        logger.info(f"  生成 {len(all_combinations)} 个组合")
 
 
-        Args:
-            feature_result: 结果项
+        # 步骤4: LLM批量评估
+        logger.info(f"  开始LLM评估...")
+        evaluated = self.llm_evaluator.evaluate_search_words_in_batches(
+            original_feature=original_feature,
+            search_words=[c['search_word'] for c in all_combinations],
+            batch_size=50
+        )
 
 
-        Returns:
-            搜索词列表,每个包含 search_word 和特征引用
-        """
-        search_words = []
-
-        for assoc_idx, assoc in enumerate(feature_result.get('找到的关联', [])):
-            for feat_idx, feature in enumerate(assoc.get('特征列表', [])):
-                sw = feature.get('search_word')
-                if sw and sw.strip():
-                    search_words.append({
-                        'search_word': sw,
-                        'assoc_idx': assoc_idx,
-                        'feat_idx': feat_idx,
-                        'feature_ref': feature  # 引用,方便写回
-                    })
+        # 步骤5: 选出Top 10
+        top_10 = evaluated[:10]
+
+        # 写回结果
+        feature_result['组合评估结果'] = top_10
 
 
-        return search_words
+        max_score = top_10[0]['score'] if top_10 else 0.0
+        logger.info(f"  评估完成,Top 10 最高分: {max_score:.3f}")
 
 
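The number of generated combinations grows quickly with max_combo_length: each feature yields the sum of C(n, k) phrases for k = 1 .. max_combo_length - 1, with n capped at max_candidates. A quick sketch of that arithmetic:

    import math

    def combo_count(n_candidates: int, max_combo_length: int) -> int:
        # k candidate words are chosen per phrase; base_word is prepended,
        # so phrase lengths run from 2 up to max_combo_length words.
        return sum(math.comb(n_candidates, k) for k in range(1, max_combo_length))

    print(combo_count(20, 4))  # 1350 phrases to evaluate per feature
    print(combo_count(20, 3))  # 210, which is why run_full_pipeline lowers max_combo_length to 3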
-    def _write_back_evaluations(
+    # ========== Stage 5: execute searches ==========
+
+    def _execute_single_search(
         self,
-        feature_result: Dict[str, Any],
-        evaluated: List[Dict[str, Any]]
-    ):
+        idx: int,
+        total: int,
+        search_word: str,
+        feature_ref: Dict[str, Any]
+    ) -> Dict[str, Any]:
         """
-        Write the LLM evaluation results back onto the feature nodes
+        Execute a single search task (for concurrent execution)

         Args:
-            feature_result: Result item
-            evaluated: List of evaluation results
+            idx: Search index
+            total: Total number of searches
+            search_word: Search word
+            feature_ref: Feature reference (results are written into it)
+
+        Returns:
+            Search result info
         """
-        # Build a lookup map
-        eval_map = {e['search_word']: e for e in evaluated}
+        logger.info(f"[{idx}/{total}] Searching: {search_word}")

-        # Write back onto the feature nodes
-        for assoc in feature_result.get('找到的关联', []):
-            for feature in assoc.get('特征列表', []):
-                sw = feature.get('search_word')
-                if sw and sw in eval_map:
-                    eval_result = eval_map[sw]
-                    feature['llm_evaluation'] = {
-                        'score': eval_result['score'],
-                        'rank': eval_result['rank'],
-                        'reasoning': eval_result['reasoning'],
-                        'original_feature': eval_result['original_feature']
-                    }
+        try:
+            result = self.search_client.search(
+                keyword=search_word,
+                content_type='不限',
+                sort_type='综合',
+                max_retries=3,
+                use_cache=True  # enable the search cache
+            )

-    # ========== Stage 5: execute searches ==========
+            note_count = len(result.get('data', {}).get('data', []))
+            logger.info(f"  ✓ Success, fetched {note_count} notes")
+
+            # Write the result in
+            feature_ref['search_result'] = result
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'success',
+                'note_count': note_count,
+                'search_params': {
+                    'keyword': search_word,
+                    'content_type': '不限',  # matches the actual request above
+                    'sort_type': '综合'
+                }
+            }
+
+            return {'status': 'success', 'search_word': search_word, 'note_count': note_count}
+
+        except Exception as e:
+            logger.error(f"  ✗ Failed: {e}")
+            feature_ref['search_result'] = None
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'failed',
+                'note_count': 0,
+                'error': str(e)
+            }
+
+            return {'status': 'failed', 'search_word': search_word, 'error': str(e)}

     def stage5_execute_searches(
         self,
@@ -799,7 +954,7 @@ class EnhancedSearchV2:
         logger.info("阶段5:执行小红书搜索")
         logger.info("阶段5:执行小红书搜索")
         logger.info("=" * 60)
         logger.info("=" * 60)
 
 
-        # 按原始特征分组收集搜索词
+        # 按原始特征分组收集搜索词(从Stage4的组合评估结果读取)
         feature_search_groups = {}
         feature_search_groups = {}
 
 
         for feature_result in features_data:
         for feature_result in features_data:
@@ -808,21 +963,19 @@ class EnhancedSearchV2:
             if original_feature not in feature_search_groups:
                 feature_search_groups[original_feature] = []

-            for assoc in feature_result.get('找到的关联', []):
-                for feature in assoc.get('特征列表', []):
-                    sw = feature.get('search_word')
-                    if not sw:
-                        continue
+            # Read from Stage 4's combination evaluation results
+            for eval_item in feature_result.get('组合评估结果', []):
+                sw = eval_item.get('search_word')
+                if not sw:
+                    continue

-                    # Get the LLM score
-                    llm_eval = feature.get('llm_evaluation', {})
-                    score = llm_eval.get('score', 0.0)
+                score = eval_item.get('score', 0.0)

-                    feature_search_groups[original_feature].append({
-                        'search_word': sw,
-                        'score': score,
-                        'feature_ref': feature
-                    })
+                feature_search_groups[original_feature].append({
+                    'search_word': sw,
+                    'score': score,
+                    'feature_ref': eval_item  # reference the evaluation item; search results are written into it
+                })

         # Take the Top N per group
         all_searches = []
@@ -844,52 +997,35 @@ class EnhancedSearchV2:

             logger.info(f"  {original_feature}: picked Top {len(selected)} of {len(sorted_list)} search words ({filtered} filtered out)")

-        logger.info(f"\n{len(all_searches)} search tasks in total (before filtering: {total_before_filter}, filtered out: {total_filtered})")
-
-        # Execute the searches
-        for idx, item in enumerate(all_searches, 1):
-            sw = item['search_word']
-            feature = item['feature_ref']
+        # Apply the global cap on search count
+        if self.max_total_searches and len(all_searches) > self.max_total_searches:
+            logger.info(f"  Applying global cap: reducing from {len(all_searches)} to {self.max_total_searches}")
+            all_searches = all_searches[:self.max_total_searches]

-            logger.info(f"[{idx}/{len(all_searches)}] Searching: {sw}")
+        logger.info(f"\n{len(all_searches)} search tasks in total (before filtering: {total_before_filter}, filtered out: {total_filtered})")
+        logger.info(f"  Executing searches concurrently (workers: {self.search_max_workers})")

-            try:
-                result = self.search_client.search(
-                    keyword=sw,
-                    content_type='图文',
-                    sort_type='综合',
-                    max_retries=3
+        # Execute the searches concurrently with a ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
+            # Submit all search tasks
+            futures = []
+            for idx, item in enumerate(all_searches, 1):
+                future = executor.submit(
+                    self._execute_single_search,
+                    idx,
+                    len(all_searches),
+                    item['search_word'],
+                    item['feature_ref']
                 )
+                futures.append(future)

-                note_count = len(result.get('data', {}).get('data', []))
-                logger.info(f"  ✓ Success, fetched {note_count} notes")
-
-                # Write the result in
-                feature['search_result'] = result
-                feature['search_metadata'] = {
-                    'searched_at': datetime.now().isoformat(),
-                    'status': 'success',
-                    'note_count': note_count,
-                    'search_params': {
-                        'keyword': sw,
-                        'content_type': '图文',
-                        'sort_type': '综合'
-                    }
-                }
-
-            except Exception as e:
-                logger.error(f"  ✗ Failed: {e}")
-                feature['search_result'] = None
-                feature['search_metadata'] = {
-                    'searched_at': datetime.now().isoformat(),
-                    'status': 'failed',
-                    'note_count': 0,
-                    'error': str(e)
-                }
-
-            # Delay between requests
-            if idx < len(all_searches):
-                time.sleep(search_delay)
+            # Wait for all searches to finish
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                    # Results are already written into feature_ref; nothing more to do
+                except Exception as e:
+                    logger.error(f"  Search task failed: {e}")

         # Save results
         output_path = os.path.join(self.output_dir, "stage5_with_search_results.json")
@@ -974,7 +1110,7 @@ class EnhancedSearchV2:
         feature_node: Dict[str, Any]
     ) -> Dict[str, Any]:
         """
-        Evaluate a single search result
+        Evaluate a single search result (using parallel evaluation)

         Args:
             original_feature: Original feature
@@ -986,12 +1122,12 @@ class EnhancedSearchV2:
         search_word = feature_node.get('search_word', '')
         notes = feature_node['search_result'].get('data', {}).get('data', [])

-        return self.llm_evaluator.evaluate_search_results(
+        return self.llm_evaluator.evaluate_search_results_parallel(
             original_feature=original_feature,
             search_word=search_word,
             notes=notes,
             max_notes=20,
-            max_images_per_note=2
+            max_workers=20  # evaluate each note with 20 concurrent workers
         )

     # ========== Stage 7: extended searches ==========
@@ -1052,9 +1188,10 @@ class EnhancedSearchV2:
             try:
                 result = self.search_client.search(
                     keyword=extended_kw,
-                    content_type='图文',
+                    content_type='不限',
                     sort_type='综合',
-                    max_retries=3
+                    max_retries=3,
+                    use_cache=True  # enable the search cache
                 )

                 note_count = len(result.get('data', {}).get('data', []))
@@ -1121,26 +1258,53 @@ class EnhancedSearchV2:
             # Stage 2
             stage2_results = self.stage2_find_associations(stage1_results)

-            # Stage 3
-            stage3_results = self.stage3_extract_features(stage2_results)
+            # Stage 3 - new approach: filter high-similarity matches
+            stage3_results = self.stage3_filter_high_similarity_matches(stage2_results)

             # Stage 4
-            stage4_results = self.stage4_generate_and_evaluate_search_words(stage3_results)
+            stage4_results = self.stage4_generate_and_evaluate_search_words(
+                stage3_results,
+                max_workers=8,         # raise concurrency from 4 to 8
+                max_combo_length=3     # lower the combination length from 4 to 3
+            )

             # Stage 5
-            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=10)
+            stage5_results = self.stage5_execute_searches(stage4_results, search_delay=2.0, top_n=self.top_n)

-            # Stage 6
-            stage6_results = self.stage6_evaluate_search_results(stage5_results)
+            # Stage 6 - execution disabled for now (code kept)
+            # stage6_results = self.stage6_evaluate_search_results(stage5_results)

-            # Stage 7
-            final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
+            # Stage 7 - execution disabled for now (code kept)
+            # final_results = self.stage7_extended_searches(stage6_results, search_delay=2.0)
+
+            logger.info("\n" + "=" * 60)
+            logger.info("✓ Full pipeline finished (Stages 1-5)")
+            logger.info("=" * 60)

+            # Automatically generate the visualization
             logger.info("\n" + "=" * 60)
-            logger.info("✓ Full pipeline finished")
+            logger.info("Generating visualization...")
             logger.info("=" * 60)

-            return final_results
+            try:
+                result = subprocess.run(
+                    ['python3', 'visualize_stage5_results.py'],
+                    capture_output=True,
+                    text=True,
+                    timeout=60
+                )
+
+                if result.returncode == 0:
+                    logger.info("✓ Visualization generated")
+                    logger.info(result.stdout)
+                else:
+                    logger.error(f"Visualization generation failed: {result.stderr}")
+            except subprocess.TimeoutExpired:
+                logger.error("Visualization generation timed out")
+            except Exception as e:
+                logger.error(f"Visualization generation error: {e}")
+
+            return stage5_results

         except Exception as e:
             logger.error(f"Pipeline execution failed: {e}")
@@ -1152,7 +1316,7 @@ def main():
     parser = argparse.ArgumentParser(description='Enhanced Search System V2')
     parser.add_argument(
         '--how-json',
-        default='69114f150000000007001f30_how.json',
+        default='69114f150000000007001f30_how copy.json',
         help='Path to the how-deconstruction file'
     )
     parser.add_argument(
@@ -1175,6 +1339,24 @@ def main():
         default='output_v2',
         help='Output directory'
     )
+    parser.add_argument(
+        '--top-n',
+        type=int,
+        default=10,
+        help='Keep the N highest-scoring search words per original feature (default 10)'
+    )
+    parser.add_argument(
+        '--max-total-searches',
+        type=int,
+        default=None,
+        help='Global cap on the number of searches (default None = unlimited)'
+    )
+    parser.add_argument(
+        '--search-workers',
+        type=int,
+        default=3,
+        help='Search concurrency (default 3)'
+    )

     args = parser.parse_args()

@@ -1184,7 +1366,10 @@ def main():
         dimension_associations_path=args.dimension_associations,
         optimized_clustered_data_path=args.optimized_clustered,
         openrouter_api_key=args.api_key,
-        output_dir=args.output_dir
+        output_dir=args.output_dir,
+        top_n=args.top_n,
+        max_total_searches=args.max_total_searches,
+        search_max_workers=args.search_workers
     )

     # Run the full pipeline

llm_evaluator.py (+201 −39)

@@ -42,20 +42,24 @@ class LLMEvaluator:
         """
         """
         prompt = f"""你是一个小红书内容分析专家。
         prompt = f"""你是一个小红书内容分析专家。
 
 
-任务:评估搜索词能否找到包含目标特征的内容
+# 任务说明
+从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}")
-原始特征:"{original_feature}"
-组合搜索词:"{search_word}"
+## 可选词汇
+{search_word}
-评估标准:
-1. 这个搜索词在小红书上能否找到包含"{original_feature}"相关元素的帖子
-2. 搜索词的关键词组合是否合理、是否过于宽泛或过于具体
-3. 搜索词与原始特征的语义关联性
+## 要求
+1. 只能使用可选词汇中的词,可以进行以下变化:
+   - 直接使用原词或括号内的同义词
+   - 多个词组合
+   - 适当精简
+2. 不能添加可选词汇以外的新词
+3. 按推荐程度排序(越靠前越推荐)
-请仔细分析并返回JSON格式:
+## 输出格式(JSON)
 {{
-  "score": 0.75,  // 0.0-1.0,能找到相关内容的可能性
-  "reasoning": "详细的评估理由,说明为什么给出这个分数"
+  "score": 0.75,
+  "reasoning": "评估理由"
 }}

 注意:只返回JSON,不要其他内容。"""
@@ -136,7 +140,7 @@ class LLMEvaluator:
         self,
         original_feature: str,
         search_words: List[str],
-        batch_size: int = 10
+        batch_size: int = 50
     ) -> List[Dict[str, Any]]:
         """
         Evaluate search words in batches (N per batch, to reduce API calls)
@@ -162,62 +166,71 @@ class LLMEvaluator:

             logger.info(f"  Processing batch {batch_idx + 1}/{total_batches} ({len(batch_words)} search words)")

-            # Build a prompt containing multiple search words
-            words_list = "\n".join([
-                f"{i+1}. {word}"
-                for i, word in enumerate(batch_words)
-            ])
+            # Extract every unique word from the search words as the candidate vocabulary
+            available_words_set = set()
+            for word in batch_words:
+                # Split the search phrase into individual words
+                parts = word.split()
+                available_words_set.update(parts)
+
+            # Convert to a sorted list (for stability)
+            available_words = sorted(available_words_set)
+
+            # Build the vocabulary string (joined with "、")
+            available_words_str = "、".join(available_words)

-            prompt = f"""你是一个小红书内容分析专家。
+            prompt = f"""
-任务:评估以下搜索词在小红书上能否找到包含目标特征"{original_feature}"的内容
+# 任务说明
+从给定关键词中提取并组合适合在小红书搜索的query词(目标是找到【{original_feature}】相关内容,但query中不能直接出现"{original_feature}"二字)
-搜索词列表:
-{words_list}
+## 可选词汇
+{available_words_str}
-评估标准:
-1. 这个搜索词在小红书上能否找到包含"{original_feature}"相关元素的帖子
-2. 搜索词的关键词组合是否合理、是否过于宽泛或过于具体
-3. 搜索词与原始特征的语义关联性
+## 要求
+1. 只能使用可选词汇中的词,可以进行以下变化:
+   - 直接使用原词或括号内的同义词
+   - 多个词组合
+   - 适当精简
+2. 不能添加可选词汇以外的新词
+3. 按推荐程度排序(越靠前越推荐)
-请为每个搜索词返回评估结果,JSON数组格式:
+## 输出格式(JSON):
 [
   {{
     "index": 1,
-    "score": 0.75,
-    "reasoning": "详细的评估理由"
+    "search_word": "组合的搜索词",
+    "score": 0.85,
+    "reasoning": "推荐理由"
   }},
   {{
     "index": 2,
+    "search_word": "组合的搜索词",
     "score": 0.80,
     "score": 0.80,
-    "reasoning": "详细的评估理由"
+    "reasoning": "推荐理由"
   }}
 ]
-
-注意:
-- index 对应搜索词的编号(1-{len(batch_words)})
-- score 范围 0.0-1.0
 - 只返回JSON数组,不要其他内容"""

             # Call the LLM
             result = self.client.chat_json(prompt=prompt, max_retries=3)

             if result and isinstance(result, list):
-                # Process the results
-                for item in result:
-                    idx = item.get("index", 0) - 1  # convert to a 0-based index
-                    if 0 <= idx < len(batch_words):
+                # Process the results - the new format carries search_word directly
+                for idx, item in enumerate(result):
+                    search_word = item.get("search_word", "")
+                    if search_word:  # make sure there is a search word
                         all_results.append({
-                            "search_word": batch_words[idx],
+                            "search_word": search_word,
                             "score": item.get("score", 0.0),
                             "reasoning": item.get("reasoning", ""),
                             "original_feature": original_feature
                         })
                         logger.info(f"    [{start_idx + idx + 1}/{len(search_words)}] "
-                                   f"{batch_words[idx]}: {item.get('score', 0.0):.3f}")
+                                   f"{search_word}: {item.get('score', 0.0):.3f}")
             else:
                 logger.error(f"  Batch {batch_idx + 1} evaluation failed, skipping")
-                # Add default results for the failed batch
+                # Add default results for the failed batch (using the original search words)
                 for word in batch_words:
                     all_results.append({
                         "search_word": word,
@@ -237,6 +250,155 @@ class LLMEvaluator:

         return all_results

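Note the batch evaluator no longer scores the Stage 4 phrases one by one; it pools their unique words into a vocabulary and asks the LLM to compose and rank its own queries. A minimal sketch of the extraction step, with illustrative phrases:

    batch_words = ["露营 咖啡", "露营 手冲 咖啡"]
    vocab = sorted({part for phrase in batch_words for part in phrase.split()})
    print("、".join(vocab))  # -> 咖啡、手冲、露营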
+    def evaluate_single_note(
+        self,
+        original_feature: str,
+        search_word: str,
+        note: Dict[str, Any],
+        note_index: int = 0
+    ) -> Dict[str, Any]:
+        """
+        评估单个帖子(阶段6,多模态)
+
+        Args:
+            original_feature: 原始特征
+            search_word: 搜索词
+            note: 单个帖子
+            note_index: 帖子索引
+
+        Returns:
+            单个帖子的评估结果
+        """
+        card = note.get("note_card", {})
+        title = card.get("display_title", "")
+        desc = card.get("desc", "")[:500]  # cap the length
+        images = card.get("image_list", [])[:10]  # at most 10 images
+
+        prompt = f"""你是一个小红书内容分析专家。
+
+任务:评估这个帖子是否包含目标特征"{original_feature}"的元素
+
+原始特征:"{original_feature}"
+搜索词:"{search_word}"
+
+帖子内容:
+标题: {title}
+正文: {desc}
+
+请分析帖子的文字和图片内容,返回JSON格式:
+{{
+  "relevance": 0.85,  // 0.0-1.0,相关度
+  "matched_elements": ["元素1", "元素2"],  // 匹配的元素列表
+  "reasoning": "简短的匹配理由"
+}}
+
+只返回JSON,不要其他内容。"""
+
+        result = self.client.chat_json(
+            prompt=prompt,
+            images=images if images else None,
+            max_retries=3
+        )
+
+        if result:
+            return {
+                "note_index": note_index,
+                "relevance": result.get("relevance", 0.0),
+                "matched_elements": result.get("matched_elements", []),
+                "reasoning": result.get("reasoning", "")
+            }
+        else:
+            logger.error(f"  Failed to evaluate note {note_index}: {search_word}")
+            return {
+                "note_index": note_index,
+                "relevance": 0.0,
+                "matched_elements": [],
+                "reasoning": "评估失败"
+            }
+
+    def evaluate_search_results_parallel(
+        self,
+        original_feature: str,
+        search_word: str,
+        notes: List[Dict[str, Any]],
+        max_notes: int = 20,
+        max_workers: int = 20
+    ) -> Dict[str, Any]:
+        """
+        并行评估搜索结果(每个帖子独立评估)
+
+        Args:
+            original_feature: 原始特征
+            search_word: 搜索词
+            notes: 帖子列表
+            max_notes: 最多评估几条帖子
+            max_workers: 最大并发数
+
+        Returns:
+            评估结果汇总
+        """
+        if not notes:
+            return {
+                "overall_relevance": 0.0,
+                "extracted_elements": [],
+                "evaluated_notes": []
+            }
+
+        notes_to_eval = notes[:max_notes]
+        evaluated_notes = []
+
+        logger.info(f"  Evaluating {len(notes_to_eval)} notes in parallel ({max_workers} workers)")
+
+        # Evaluate the notes concurrently, max_workers at a time
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for idx, note in enumerate(notes_to_eval):
+                future = executor.submit(
+                    self.evaluate_single_note,
+                    original_feature,
+                    search_word,
+                    note,
+                    idx
+                )
+                futures.append(future)
+
+            # Collect the results
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                    evaluated_notes.append(result)
+                except Exception as e:
+                    logger.error(f"  Note evaluation failed: {e}")
+
+        # Sort by note_index
+        evaluated_notes.sort(key=lambda x: x['note_index'])
+
+        # Aggregate: compute overall relevance and extract elements
+        if evaluated_notes:
+            overall_relevance = sum(n['relevance'] for n in evaluated_notes) / len(evaluated_notes)
+
+            # Extract all elements and count their frequencies
+            element_counts = {}
+            for note in evaluated_notes:
+                for elem in note['matched_elements']:
+                    element_counts[elem] = element_counts.get(elem, 0) + 1
+
+            # Sort by frequency and take the top 5
+            extracted_elements = sorted(
+                element_counts.keys(),
+                key=lambda x: element_counts[x],
+                reverse=True
+            )[:5]
+        else:
+            overall_relevance = 0.0
+            extracted_elements = []
+
+        return {
+            "overall_relevance": overall_relevance,
+            "extracted_elements": extracted_elements,
+            "evaluated_notes": evaluated_notes
+        }
+
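The frequency tally and top-5 cut above are equivalent to collections.Counter.most_common; a minimal sketch with made-up evaluated notes:

    from collections import Counter

    evaluated_notes = [
        {'relevance': 0.9, 'matched_elements': ['帐篷', '咖啡']},
        {'relevance': 0.6, 'matched_elements': ['帐篷']},
    ]
    overall = sum(n['relevance'] for n in evaluated_notes) / len(evaluated_notes)
    counts = Counter(e for n in evaluated_notes for e in n['matched_elements'])
    top5 = [elem for elem, _ in counts.most_common(5)]
    print(overall, top5)  # 0.75 ['帐篷', '咖啡']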
     def evaluate_search_results(
         self,
         original_feature: str,

visualize_stage5_results.py (+818 −0)

@@ -0,0 +1,818 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 5 search-result visualization tool
+Generates an interactive HTML page with image carousels
+"""
+
+import json
+import os
+from datetime import datetime
+from typing import List, Dict, Any
+
+
+def load_data(json_path: str) -> List[Dict[str, Any]]:
+    """加载JSON数据"""
+    with open(json_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """计算统计数据"""
+    total_features = len(data)
+    total_search_words = 0
+    total_notes = 0
+    video_count = 0
+    normal_count = 0
+
+    for feature in data:
+        search_results = feature.get('组合评估结果', [])
+        total_search_words += len(search_results)
+
+        for search_item in search_results:
+            search_result = search_item.get('search_result', {})
+            notes = search_result.get('data', {}).get('data', [])
+            total_notes += len(notes)
+
+            for note in notes:
+                note_type = note.get('note_card', {}).get('type', '')
+                if note_type == 'video':
+                    video_count += 1
+                else:
+                    normal_count += 1
+
+    return {
+        'total_features': total_features,
+        'total_search_words': total_search_words,
+        'total_notes': total_notes,
+        'video_count': video_count,
+        'normal_count': normal_count,
+        'video_percentage': round(video_count / total_notes * 100, 1) if total_notes > 0 else 0,
+        'normal_percentage': round(normal_count / total_notes * 100, 1) if total_notes > 0 else 0
+    }
+
+
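A minimal usage sketch for this module, assuming the Stage 5 output written by the pipeline (output_v2/stage5_with_search_results.json); the HTML filename is illustrative:

    data = load_data('output_v2/stage5_with_search_results.json')
    stats = calculate_statistics(data)
    generate_html(data, stats, 'stage5_visualization.html')
    print(f"{stats['total_notes']} notes across {stats['total_search_words']} search words")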
+def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path: str):
+    """生成HTML可视化页面"""
+
+    # 准备数据JSON(用于JavaScript)
+    data_json = json.dumps(data, ensure_ascii=False, indent=2)
+
+    html_content = f'''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Stage 5 Search Results Visualization</title>
+    <style>
+        * {{
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }}
+
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: #f5f7fa;
+            color: #333;
+            overflow-x: hidden;
+        }}
+
+        /* Top statistics panel */
+        .stats-panel {{
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 20px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        }}
+
+        .stats-container {{
+            max-width: 1400px;
+            margin: 0 auto;
+            display: flex;
+            justify-content: space-around;
+            align-items: center;
+            flex-wrap: wrap;
+            gap: 20px;
+        }}
+
+        .stat-item {{
+            text-align: center;
+        }}
+
+        .stat-value {{
+            font-size: 32px;
+            font-weight: bold;
+            margin-bottom: 5px;
+        }}
+
+        .stat-label {{
+            font-size: 14px;
+            opacity: 0.9;
+        }}
+
+        /* Main container */
+        .main-container {{
+            display: flex;
+            max-width: 1400px;
+            margin: 20px auto;
+            gap: 20px;
+            padding: 0 20px;
+            height: calc(100vh - 140px);
+        }}
+
+        /* Left sidebar */
+        .left-sidebar {{
+            width: 30%;
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            overflow-y: auto;
+            position: sticky;
+            top: 20px;
+            height: fit-content;
+            max-height: calc(100vh - 160px);
+        }}
+
+        .feature-group {{
+            border-bottom: 1px solid #e5e7eb;
+        }}
+
+        .feature-header {{
+            padding: 15px 20px;
+            background: #f9fafb;
+            cursor: pointer;
+            user-select: none;
+            transition: background 0.2s;
+        }}
+
+        .feature-header:hover {{
+            background: #f3f4f6;
+        }}
+
+        .feature-header.active {{
+            background: #667eea;
+            color: white;
+        }}
+
+        .feature-title {{
+            font-size: 16px;
+            font-weight: 600;
+            margin-bottom: 5px;
+        }}
+
+        .feature-meta {{
+            font-size: 12px;
+            color: #6b7280;
+        }}
+
+        .feature-header.active .feature-meta {{
+            color: rgba(255,255,255,0.8);
+        }}
+
+        .search-words-list {{
+            display: none;
+            padding: 10px 0;
+        }}
+
+        .search-words-list.expanded {{
+            display: block;
+        }}
+
+        .search-word-item {{
+            padding: 12px 20px 12px 40px;
+            cursor: pointer;
+            border-left: 3px solid transparent;
+            transition: all 0.2s;
+        }}
+
+        .search-word-item:hover {{
+            background: #f9fafb;
+            border-left-color: #667eea;
+        }}
+
+        .search-word-item.active {{
+            background: #ede9fe;
+            border-left-color: #7c3aed;
+        }}
+
+        .search-word-text {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #374151;
+            margin-bottom: 4px;
+        }}
+
+        .search-word-score {{
+            display: inline-block;
+            padding: 2px 8px;
+            border-radius: 12px;
+            font-size: 11px;
+            font-weight: 600;
+            margin-left: 8px;
+        }}
+
+        .score-high {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .score-medium {{
+            background: #fef3c7;
+            color: #92400e;
+        }}
+
+        .score-low {{
+            background: #fee2e2;
+            color: #991b1b;
+        }}
+
+        .search-word-reasoning {{
+            font-size: 12px;
+            color: #6b7280;
+            margin-top: 4px;
+            display: -webkit-box;
+            -webkit-line-clamp: 2;
+            -webkit-box-orient: vertical;
+            overflow: hidden;
+        }}
+
+        /* Right content area */
+        .right-content {{
+            flex: 1;
+            overflow-y: auto;
+            padding-bottom: 40px;
+        }}
+
+        .result-block {{
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            margin-bottom: 30px;
+            padding: 20px;
+            scroll-margin-top: 20px;
+        }}
+
+        .result-header {{
+            margin-bottom: 20px;
+            padding-bottom: 15px;
+            border-bottom: 2px solid #e5e7eb;
+        }}
+
+        .result-title {{
+            font-size: 20px;
+            font-weight: 600;
+            color: #111827;
+            margin-bottom: 10px;
+        }}
+
+        .result-stats {{
+            display: flex;
+            gap: 15px;
+            font-size: 13px;
+            color: #6b7280;
+        }}
+
+        .stat-badge {{
+            background: #f3f4f6;
+            padding: 4px 10px;
+            border-radius: 4px;
+        }}
+
+        /* Notes grid */
+        .notes-grid {{
+            display: grid;
+            grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
+            gap: 20px;
+        }}
+
+        .note-card {{
+            border: 1px solid #e5e7eb;
+            border-radius: 8px;
+            overflow: hidden;
+            cursor: pointer;
+            transition: all 0.3s;
+            background: white;
+        }}
+
+        .note-card:hover {{
+            transform: translateY(-4px);
+            box-shadow: 0 10px 25px rgba(0,0,0,0.15);
+        }}
+
+        /* Image carousel */
+        .image-carousel {{
+            position: relative;
+            width: 100%;
+            height: 280px;
+            background: #f3f4f6;
+            overflow: hidden;
+        }}
+
+        .carousel-images {{
+            display: flex;
+            height: 100%;
+            transition: transform 0.3s ease;
+        }}
+
+        .carousel-image {{
+            min-width: 100%;
+            height: 100%;
+            object-fit: cover;
+        }}
+
+        .carousel-btn {{
+            position: absolute;
+            top: 50%;
+            transform: translateY(-50%);
+            background: rgba(0,0,0,0.5);
+            color: white;
+            border: none;
+            width: 32px;
+            height: 32px;
+            border-radius: 50%;
+            cursor: pointer;
+            font-size: 16px;
+            display: none;
+            align-items: center;
+            justify-content: center;
+            transition: background 0.2s;
+            z-index: 10;
+        }}
+
+        .carousel-btn:hover {{
+            background: rgba(0,0,0,0.7);
+        }}
+
+        .carousel-btn.prev {{
+            left: 8px;
+        }}
+
+        .carousel-btn.next {{
+            right: 8px;
+        }}
+
+        .note-card:hover .carousel-btn {{
+            display: flex;
+        }}
+
+        .carousel-indicators {{
+            position: absolute;
+            bottom: 10px;
+            left: 50%;
+            transform: translateX(-50%);
+            display: flex;
+            gap: 6px;
+            z-index: 10;
+        }}
+
+        .dot {{
+            width: 8px;
+            height: 8px;
+            border-radius: 50%;
+            background: rgba(255,255,255,0.5);
+            cursor: pointer;
+            transition: all 0.2s;
+        }}
+
+        .dot.active {{
+            background: white;
+            width: 24px;
+            border-radius: 4px;
+        }}
+
+        .image-counter {{
+            position: absolute;
+            top: 10px;
+            right: 10px;
+            background: rgba(0,0,0,0.6);
+            color: white;
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 12px;
+            z-index: 10;
+        }}
+
+        /* Note info */
+        .note-info {{
+            padding: 12px;
+        }}
+
+        .note-title {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #111827;
+            margin-bottom: 8px;
+            display: -webkit-box;
+            -webkit-line-clamp: 2;
+            -webkit-box-orient: vertical;
+            overflow: hidden;
+            line-height: 1.4;
+        }}
+
+        .note-meta {{
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            font-size: 12px;
+            color: #6b7280;
+        }}
+
+        .note-type {{
+            padding: 3px 8px;
+            border-radius: 4px;
+            font-weight: 500;
+        }}
+
+        .type-video {{
+            background: #dbeafe;
+            color: #1e40af;
+        }}
+
+        .type-normal {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .note-author {{
+            display: flex;
+            align-items: center;
+            gap: 6px;
+        }}
+
+        .author-avatar {{
+            width: 24px;
+            height: 24px;
+            border-radius: 50%;
+        }}
+
+        /* SVG connection layer */
+        #connection-svg {{
+            position: fixed;
+            top: 0;
+            left: 0;
+            width: 100%;
+            height: 100%;
+            pointer-events: none;
+            z-index: 1;
+        }}
+
+        .connection-line {{
+            stroke: #cbd5e1;
+            stroke-width: 1;
+            stroke-dasharray: 5,5;
+            fill: none;
+            opacity: 0.3;
+            transition: all 0.2s;
+        }}
+
+        .connection-line.active {{
+            stroke: #667eea;
+            stroke-width: 2;
+            stroke-dasharray: none;
+            opacity: 1;
+        }}
+
+        /* Scrollbar styling */
+        ::-webkit-scrollbar {{
+            width: 8px;
+            height: 8px;
+        }}
+
+        ::-webkit-scrollbar-track {{
+            background: #f1f1f1;
+        }}
+
+        ::-webkit-scrollbar-thumb {{
+            background: #888;
+            border-radius: 4px;
+        }}
+
+        ::-webkit-scrollbar-thumb:hover {{
+            background: #555;
+        }}
+    </style>
+</head>
+<body>
+    <!-- Statistics panel -->
+    <div class="stats-panel">
+        <div class="stats-container">
+            <div class="stat-item">
+                <div class="stat-value">📊 {stats['total_features']}</div>
+                <div class="stat-label">Original features</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">🔍 {stats['total_search_words']}</div>
+                <div class="stat-label">Search words</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">📝 {stats['total_notes']}</div>
+                <div class="stat-label">Total notes</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">🎬 {stats['video_count']}</div>
+                <div class="stat-label">Video ({stats['video_percentage']}%)</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-value">📷 {stats['normal_count']}</div>
+                <div class="stat-label">Image/text ({stats['normal_percentage']}%)</div>
+            </div>
+        </div>
+    </div>
+
+    <!-- SVG connection layer -->
+    <svg id="connection-svg"></svg>
+
+    <!-- Main container -->
+    <div class="main-container">
+        <!-- Left sidebar -->
+        <div class="left-sidebar" id="leftSidebar">
+            <!-- Generated dynamically by JavaScript -->
+        </div>
+
+        <!-- Right content area -->
+        <div class="right-content" id="rightContent">
+            <!-- Generated dynamically by JavaScript -->
+        </div>
+    </div>
+
+    <script>
+        // Data
+        const data = {data_json};
+
+        // Render the left sidebar
+        function renderLeftSidebar() {{
+            const sidebar = document.getElementById('leftSidebar');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const searchWords = feature['组合评估结果'] || [];
+
+                html += `
+                    <div class="feature-group">
+                        <div class="feature-header" onclick="toggleFeature(${{featureIdx}})" id="feature-header-${{featureIdx}}">
+                            <div class="feature-title">${{feature['原始特征名称']}}</div>
+                            <div class="feature-meta">
+                                ${{feature['来源层级']}} · 权重: ${{feature['权重'].toFixed(2)}} · ${{searchWords.length}}个搜索词
+                            </div>
+                        </div>
+                        <div class="search-words-list" id="search-words-${{featureIdx}}">
+                `;
+
+                searchWords.forEach((sw, swIdx) => {{
+                    const score = sw.score || 0;
+                    const scoreClass = score >= 0.9 ? 'score-high' : score >= 0.7 ? 'score-medium' : 'score-low';
+                    const blockId = `block-${{featureIdx}}-${{swIdx}}`;
+
+                    html += `
+                        <div class="search-word-item" onclick="scrollToBlock('${{blockId}}')"
+                             id="sw-${{featureIdx}}-${{swIdx}}"
+                             data-block-id="${{blockId}}">
+                            <div class="search-word-text">
+                                ${{sw.search_word}}
+                                <span class="search-word-score ${{scoreClass}}">${{score.toFixed(2)}}</span>
+                            </div>
+                            <div class="search-word-reasoning" title="${{sw.reasoning || ''}}">
+                                ${{sw.reasoning || ''}}
+                            </div>
+                        </div>
+                    `;
+                }});
+
+                html += `
+                        </div>
+                    </div>
+                `;
+            }});
+
+            sidebar.innerHTML = html;
+        }}
+
+        // 渲染右侧结果区
+        function renderRightContent() {{
+            const content = document.getElementById('rightContent');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const searchWords = feature['组合评估结果'] || [];
+
+                searchWords.forEach((sw, swIdx) => {{
+                    const blockId = `block-${{featureIdx}}-${{swIdx}}`;
+                    const searchResult = sw.search_result || {{}};
+                    const notes = searchResult.data?.data || [];
+
+                    const videoCount = notes.filter(n => n.note_card?.type === 'video').length;
+                    const normalCount = notes.length - videoCount;
+
+                    html += `
+                        <div class="result-block" id="${{blockId}}">
+                            <div class="result-header">
+                                <div class="result-title">${{sw.search_word}}</div>
+                                <div class="result-stats">
+                                    <span class="stat-badge">📝 ${{notes.length}} 条帖子</span>
+                                    <span class="stat-badge">🎬 ${{videoCount}} 视频</span>
+                                    <span class="stat-badge">📷 ${{normalCount}} 图文</span>
+                                </div>
+                            </div>
+                            <div class="notes-grid">
+                                ${{notes.map((note, noteIdx) => renderNoteCard(note, featureIdx, swIdx, noteIdx)).join('')}}
+                            </div>
+                        </div>
+                    `;
+                }});
+            }});
+
+            content.innerHTML = html;
+        }}
+
+        // 渲染单个帖子卡片
+        function renderNoteCard(note, featureIdx, swIdx, noteIdx) {{
+            const card = note.note_card || {{}};
+            const images = card.image_list || [];
+            const title = card.display_title || '无标题';
+            const noteType = card.type || 'normal';
+            const noteId = note.id || '';
+            const user = card.user || {{}};
+            const userName = user.nick_name || '未知用户';
+            const userAvatar = user.avatar || '';
+
+            const carouselId = `carousel-${{featureIdx}}-${{swIdx}}-${{noteIdx}}`;
+
+            return `
+                <div class="note-card" onclick="openNote('${{noteId}}')">
+                    <div class="image-carousel" id="${{carouselId}}">
+                        <div class="carousel-images">
+                            ${{images.map(img => `<img class="carousel-image" src="${{img}}" alt="帖子图片" loading="lazy">`).join('')}}
+                        </div>
+                        ${{images.length > 1 ? `
+                            <button class="carousel-btn prev" onclick="event.stopPropagation(); changeImage('${{carouselId}}', -1)">←</button>
+                            <button class="carousel-btn next" onclick="event.stopPropagation(); changeImage('${{carouselId}}', 1)">→</button>
+                            <div class="carousel-indicators">
+                                ${{images.map((_, i) => `<span class="dot ${{i === 0 ? 'active' : ''}}" onclick="event.stopPropagation(); goToImage('${{carouselId}}', ${{i}})"></span>`).join('')}}
+                            </div>
+                            <span class="image-counter">1/${{images.length}}</span>
+                        ` : ''}}
+                    </div>
+                    <div class="note-info">
+                        <div class="note-title">${{title}}</div>
+                        <div class="note-meta">
+                            <span class="note-type type-${{noteType}}">
+                                ${{noteType === 'video' ? '🎬 视频' : '📷 图文'}}
+                            </span>
+                            <div class="note-author">
+                                ${{userAvatar ? `<img class="author-avatar" src="${{userAvatar}}" alt="${{userName}}">` : ''}}
+                                <span>${{userName}}</span>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            `;
+        }}
+
+        // 图片轮播逻辑
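+        // carouselStates 以 carouselId 为键、按需初始化,记录各轮播当前显示的图片下标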
+        const carouselStates = {{}};
+
+        function changeImage(carouselId, direction) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const images = carousel.querySelectorAll('.carousel-image');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            let newIndex = carouselStates[carouselId].currentIndex + direction;
+            if (newIndex < 0) newIndex = images.length - 1;
+            if (newIndex >= images.length) newIndex = 0;
+
+            carouselStates[carouselId].currentIndex = newIndex;
+            imagesContainer.style.transform = `translateX(-${{newIndex * 100}}%)`;
+
+            // 更新指示器
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === newIndex);
+            }});
+
+            // 更新计数器
+            if (counter) {{
+                counter.textContent = `${{newIndex + 1}}/${{images.length}}`;
+            }}
+        }}
+
+        function goToImage(carouselId, index) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            carouselStates[carouselId].currentIndex = index;
+            imagesContainer.style.transform = `translateX(-${{index * 100}}%)`;
+
+            // 更新指示器
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === index);
+            }});
+
+            // 更新计数器
+            if (counter) {{
+                counter.textContent = `${{index + 1}}/${{dots.length}}`;
+            }}
+        }}
+
+        // 展开/折叠特征组
+        function toggleFeature(featureIdx) {{
+            const searchWordsList = document.getElementById(`search-words-${{featureIdx}}`);
+            const featureHeader = document.getElementById(`feature-header-${{featureIdx}}`);
+
+            searchWordsList.classList.toggle('expanded');
+            featureHeader.classList.toggle('active');
+        }}
+
+        // 滚动到指定结果块
+        function scrollToBlock(blockId) {{
+            const block = document.getElementById(blockId);
+            if (block) {{
+                block.scrollIntoView({{ behavior: 'smooth', block: 'start' }});
+
+                // 高亮对应的搜索词
+                document.querySelectorAll('.search-word-item').forEach(item => {{
+                    item.classList.remove('active');
+                }});
+
+                document.querySelectorAll(`[data-block-id="${{blockId}}"]`).forEach(item => {{
+                    item.classList.add('active');
+                }});
+            }}
+        }}
+
+        // 打开小红书帖子
+        function openNote(noteId) {{
+            if (noteId) {{
+                window.open(`https://www.xiaohongshu.com/explore/${{noteId}}`, '_blank');
+            }}
+        }}
+
+        // 初始化
+        document.addEventListener('DOMContentLoaded', () => {{
+            renderLeftSidebar();
+            renderRightContent();
+
+            // 默认展开第一个特征组
+            if (data.length > 0) {{
+                toggleFeature(0);
+            }}
+        }});
+    </script>
+</body>
+</html>
+'''
+
+    # 写入文件
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(html_content)
+
+
+def main():
+    """主函数"""
+    # 配置路径
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    json_path = os.path.join(script_dir, 'output_v2', 'stage5_with_search_results.json')
+    output_dir = os.path.join(script_dir, 'visualization')
+    os.makedirs(output_dir, exist_ok=True)
+
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    output_path = os.path.join(output_dir, f'stage5_interactive_{timestamp}.html')
+
+    # 加载数据
+    print(f"📖 加载数据: {json_path}")
+    data = load_data(json_path)
+    print(f"✓ 加载了 {len(data)} 个原始特征")
+
+    # 计算统计
+    print("📊 计算统计数据...")
+    stats = calculate_statistics(data)
+    print(f"✓ 统计完成:")
+    print(f"  - 原始特征: {stats['total_features']}")
+    print(f"  - 搜索词: {stats['total_search_words']}")
+    print(f"  - 帖子总数: {stats['total_notes']}")
+    print(f"  - 视频: {stats['video_count']} ({stats['video_percentage']}%)")
+    print(f"  - 图文: {stats['normal_count']} ({stats['normal_percentage']}%)")
+
+    # 生成HTML
+    print(f"\n🎨 生成可视化页面...")
+    generate_html(data, stats, output_path)
+    print(f"✓ 生成完成: {output_path}")
+
+    # 打印访问提示
+    print(f"\n🌐 在浏览器中打开查看:")
+    print(f"   file://{output_path}")
+
+
+if __name__ == '__main__':
+    main()
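+
+# 用法示例(假设流水线已生成 output_v2/stage5_with_search_results.json):
+#   python visualize_stage5_results.py
+# 输出 visualization/stage5_interactive_<时间戳>.html,用浏览器直接打开即可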

+ 78 - 6
xiaohongshu_search.py

@@ -9,9 +9,12 @@ import json
 import os
 import argparse
 import time
+import logging
 from datetime import datetime
 from typing import Dict, Any
 
+logger = logging.getLogger(__name__)
+
 
 class XiaohongshuSearch:
     """小红书笔记搜索API封装类"""
@@ -20,12 +23,13 @@ class XiaohongshuSearch:
     TOOL_NAME = "xhs_note_search"
     PLATFORM = "xiaohongshu"
 
-    def __init__(self, results_dir: str = None):
+    def __init__(self, results_dir: str = None, cache_dir: str = "search_cache"):
         """
         初始化API客户端
 
         Args:
             results_dir: 结果输出目录,默认为项目根目录下的 data/search 文件夹
+            cache_dir: 缓存目录,默认为 search_cache
         """
         self.api_url = f"{self.BASE_URL}/tools/call/{self.TOOL_NAME}"
 
@@ -38,19 +42,60 @@ class XiaohongshuSearch:
             project_root = os.path.dirname(os.path.dirname(script_dir))
             self.results_base_dir = os.path.join(project_root, "data", "search")
 
+        # 设置缓存目录
+        self.cache_dir = cache_dir
+        if cache_dir:
+            os.makedirs(cache_dir, exist_ok=True)
+
+    def _get_cache_key(
+        self,
+        keyword: str,
+        content_type: str,
+        sort_type: str,
+        publish_time: str
+    ) -> str:
+        """
+        生成缓存key
+
+        Args:
+            keyword: 搜索关键词
+            content_type: 内容类型
+            sort_type: 排序方式
+            publish_time: 发布时间
+
+        Returns:
+            缓存key字符串
+        """
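+        # 示例(参数值为假设):_get_cache_key("露营装备", "不限", "综合", "不限")
+        #   -> "露营装备_不限_综合_不限"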
+        return f"{keyword}_{content_type}_{sort_type}_{publish_time}"
+
+    def _get_cache_path(self, cache_key: str) -> str:
+        """
+        获取缓存文件路径
+
+        Args:
+            cache_key: 缓存key
+
+        Returns:
+            缓存文件完整路径
+        """
+        # 清理文件名中的非法字符
+        safe_key = cache_key.replace('/', '_').replace('\\', '_').replace(' ', '_')
+        return os.path.join(self.cache_dir, f"{safe_key}.json")
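+        # 示例(假设 cache_dir="search_cache",key 中的空格和斜杠会替换为下划线):
+        #   "露营 装备_不限_综合_不限" -> search_cache/露营_装备_不限_综合_不限.json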
+
     def search(
         self,
         keyword: str,
-        content_type: str = "图文",
+        content_type: str = "不限",
         sort_type: str = "综合",
         publish_time: str = "不限",
         cursor: str = "",
         timeout: int = 30,
-        max_retries: int = 3,
-        retry_delay: int = 2
+        max_retries: int = 5,
+        retry_delay: int = 2,
+        use_cache: bool = True
     ) -> Dict[str, Any]:
         """
-        搜索小红书笔记(带重试机制)
+        搜索小红书笔记(带重试机制和缓存)
 
         Args:
             keyword: 搜索关键词
@@ -61,6 +106,7 @@ class XiaohongshuSearch:
             timeout: 请求超时时间(秒),默认30秒
-            max_retries: 最大重试次数,默认3次
+            max_retries: 最大重试次数,默认5次
             retry_delay: 重试间隔时间(秒),默认2秒
+            use_cache: 是否使用缓存,默认True
 
         Returns:
             API响应的JSON数据
@@ -68,9 +114,24 @@ class XiaohongshuSearch:
         Raises:
             requests.exceptions.RequestException: 所有重试都失败时抛出异常
         """
+        # 检查缓存
+        if use_cache and self.cache_dir:
+            cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+            cache_path = self._get_cache_path(cache_key)
+
+            if os.path.exists(cache_path):
+                try:
+                    with open(cache_path, 'r', encoding='utf-8') as f:
+                        cached_result = json.load(f)
+                    logger.info(f"  ✓ 使用缓存: {keyword}")
+                    return cached_result
+                except Exception as e:
+                    logger.warning(f"  读取缓存失败: {e},将重新搜索")
+
+        # 缓存未命中或未启用,执行实际搜索
         payload = {
             "keyword": keyword,
-            "content_type": content_type,
+            "content_type": content_type,  # 与缓存key使用同一参数,保证缓存一致(默认已为"不限")
             "sort_type": sort_type,
             "publish_time": publish_time,
             "cursor": cursor
@@ -106,6 +167,17 @@ class XiaohongshuSearch:
                 if attempt > 1:
                     print(f"    ✓ 重试成功")
 
+                # 保存到缓存
+                if use_cache and self.cache_dir:
+                    try:
+                        cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+                        cache_path = self._get_cache_path(cache_key)
+                        with open(cache_path, 'w', encoding='utf-8') as f:
+                            json.dump(result, f, ensure_ascii=False, indent=2)
+                        logger.info(f"  ✓ 已缓存: {keyword}")
+                    except Exception as e:
+                        logger.warning(f"  保存缓存失败: {e}")
+
                 return result
 
             except requests.exceptions.RequestException as e: