刘立冬 2 weeks ago
parent
commit
8aad6ff988
38 changed files, 10,540 additions and 62 deletions
  1. scripts/add_search_words.py (+230 -0)
  2. scripts/execute_search_tasks.py (+413 -0)
  3. scripts/import_search_results.py (+219 -0)
  4. scripts/run_deconstruction.py (+445 -0)
  5. scripts/run_similarity_analysis.py (+246 -0)
  6. scripts/run_visualizer.py (+76 -0)
  7. scripts/visualize_cascade.py (+187 -0)
  8. scripts/visualize_integrated_results.py (+37 -0)
  9. src/__init__.py (+0 -0)
  10. src/analyzers/__init__.py (+0 -0)
  11. src/analyzers/post_deconstruction_analyzer.py (+603 -0)
  12. src/analyzers/similarity_analyzer.py (+757 -0)
  13. src/api/base.py (+51 -0)
  14. src/clients/__init__.py (+0 -0)
  15. src/clients/deconstruction_api_client.py (+253 -0)
  16. src/clients/openrouter_client.py (+277 -0)
  17. src/clients/xiaohongshu_search.py (+331 -0)
  18. src/evaluators/__init__.py (+0 -0)
  19. src/evaluators/llm_evaluator.py (+54 -34)
  20. src/models/__init__.py (+72 -0)
  21. src/models/candidate.py (+174 -0)
  22. src/models/deconstruction.py (+184 -0)
  23. src/models/evaluation.py (+170 -0)
  24. src/models/post.py (+158 -0)
  25. src/models/query.py (+174 -0)
  26. src/models/similarity.py (+221 -0)
  27. src/pipeline/__init__.py (+0 -0)
  28. src/pipeline/feature_search_pipeline.py (+1291 -0)
  29. src/visualizers/__init__.py (+0 -0)
  30. src/visualizers/cascade_search_visualizer.py (+1341 -0)
  31. src/visualizers/deconstruction_visualizer.py (+24 -28)
  32. src/visualizers/search_results_visualizer.py (+1487 -0)
  33. tools/analyze_associations.py (+255 -0)
  34. tools/analyze_content_types.py (+100 -0)
  35. tools/analyze_feature_matches.py (+202 -0)
  36. tools/analyze_specific_feature.py (+168 -0)
  37. tools/analyze_stage6_results.py (+236 -0)
  38. tools/remove_association_methods.py (+104 -0)

+ 230 - 0
scripts/add_search_words.py

@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+为关联特征生成检索词并去重
+
+读取 associated_tags_results.json,为每个特征生成组合检索词,
+并在同一结果项内去重。
+"""
+
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List, Any, Set
+import argparse
+
+
+# 配置日志
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger(__name__)
+
+
+class SearchWordGenerator:
+    """检索词生成器"""
+
+    def __init__(self, input_path: str):
+        """
+        初始化生成器
+
+        Args:
+            input_path: 输入JSON文件路径
+        """
+        self.input_path = input_path
+        self.data = self._load_json(input_path)
+        self.stats = {
+            '处理的结果项数': 0,
+            '生成的总组合词数': 0,
+            '唯一组合词数': 0,
+            '重复过滤的词数': 0,
+            '每项详情': []
+        }
+
+    def _load_json(self, file_path: str) -> List[Dict]:
+        """加载JSON文件"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except Exception as e:
+            logger.error(f"加载文件 {file_path} 失败: {e}")
+            raise
+
+    def generate_search_words(self) -> List[Dict[str, Any]]:
+        """
+        为所有结果项生成检索词
+
+        Returns:
+            增强后的数据列表
+        """
+        logger.info("=" * 60)
+        logger.info("开始生成检索词")
+        logger.info("=" * 60)
+
+        enhanced_data = []
+
+        for idx, result in enumerate(self.data, 1):
+            logger.info(f"\n处理第 {idx}/{len(self.data)} 个结果项")
+
+            # 获取基础词(人设特征名称)
+            base_word = result.get('最高匹配信息', {}).get('人设特征名称', '')
+            original_feature = result.get('原始特征名称', '')
+
+            logger.info(f"  原始特征: {original_feature}")
+            logger.info(f"  人设特征名称(基础词): {base_word}")
+
+            if not base_word:
+                logger.warning(f"  警告:未找到人设特征名称,跳过")
+                enhanced_data.append(result)
+                continue
+
+            # 用于去重的集合(在当前结果项范围内)
+            seen_words: Set[str] = set()
+            item_stats = {
+                '原始特征': original_feature,
+                '人设特征名称': base_word,
+                '总特征数': 0,
+                '唯一组合词数': 0,
+                '重复词数': 0,
+                '组合词列表': []
+            }
+
+            # 遍历所有关联
+            associations = result.get('找到的关联', [])
+            for assoc_idx, assoc in enumerate(associations):
+                target_path = assoc.get('目标分类路径', '')
+                features = assoc.get('特征列表', [])
+
+                logger.info(f"  处理关联 {assoc_idx + 1}/{len(associations)}: {target_path}")
+                logger.info(f"    特征数: {len(features)}")
+
+                # 遍历特征列表
+                for feature in features:
+                    feature_name = feature.get('特征名称', '')
+                    item_stats['总特征数'] += 1
+
+                    if not feature_name:
+                        feature['search_word'] = None
+                        continue
+
+                    # 生成组合词
+                    search_word = f"{base_word} {feature_name}"
+
+                    # 检查是否重复
+                    if search_word not in seen_words:
+                        # 首次出现,填充
+                        feature['search_word'] = search_word
+                        seen_words.add(search_word)
+                        item_stats['唯一组合词数'] += 1
+                        item_stats['组合词列表'].append(search_word)
+                        logger.debug(f"      + 新增: {search_word}")
+                    else:
+                        # 重复,留空
+                        feature['search_word'] = None
+                        item_stats['重复词数'] += 1
+                        logger.debug(f"      - 重复(留空): {search_word}")
+
+            # 记录统计
+            logger.info(f"  完成:总特征 {item_stats['总特征数']} 个,"
+                       f"唯一组合词 {item_stats['唯一组合词数']} 个,"
+                       f"重复 {item_stats['重复词数']} 个")
+
+            self.stats['处理的结果项数'] += 1
+            self.stats['生成的总组合词数'] += item_stats['总特征数']
+            self.stats['唯一组合词数'] += item_stats['唯一组合词数']
+            self.stats['重复过滤的词数'] += item_stats['重复词数']
+            self.stats['每项详情'].append(item_stats)
+
+            enhanced_data.append(result)
+
+        logger.info("\n" + "=" * 60)
+        logger.info("生成完成")
+        logger.info("=" * 60)
+        logger.info(f"处理的结果项数: {self.stats['处理的结果项数']}")
+        logger.info(f"生成的总组合词数: {self.stats['生成的总组合词数']}")
+        logger.info(f"唯一组合词数: {self.stats['唯一组合词数']}")
+        logger.info(f"重复过滤的词数: {self.stats['重复过滤的词数']}")
+
+        return enhanced_data
+
+    def save_results(self, enhanced_data: List[Dict[str, Any]], output_path: str):
+        """保存增强后的数据"""
+        try:
+            with open(output_path, 'w', encoding='utf-8') as f:
+                json.dump(enhanced_data, f, ensure_ascii=False, indent=2)
+            logger.info(f"增强数据已保存到: {output_path}")
+        except Exception as e:
+            logger.error(f"保存结果失败: {e}")
+            raise
+
+    def save_stats(self, stats_path: str):
+        """保存统计信息"""
+        try:
+            with open(stats_path, 'w', encoding='utf-8') as f:
+                json.dump(self.stats, f, ensure_ascii=False, indent=2)
+            logger.info(f"统计信息已保存到: {stats_path}")
+        except Exception as e:
+            logger.error(f"保存统计信息失败: {e}")
+            raise
+
+
+def main():
+    """主函数"""
+    parser = argparse.ArgumentParser(description='为关联特征生成检索词并去重')
+    parser.add_argument(
+        '--input',
+        default='associated_tags_results.json',
+        help='输入JSON文件路径(默认: associated_tags_results.json)'
+    )
+    parser.add_argument(
+        '--output',
+        default='associated_tags_results_with_search.json',
+        help='输出JSON文件路径(默认: associated_tags_results_with_search.json)'
+    )
+    parser.add_argument(
+        '--stats',
+        default='search_words_stats.json',
+        help='统计信息输出路径(默认: search_words_stats.json)'
+    )
+    parser.add_argument(
+        '--debug',
+        action='store_true',
+        help='启用调试日志'
+    )
+
+    args = parser.parse_args()
+
+    # 设置日志级别
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+
+    # 创建生成器
+    generator = SearchWordGenerator(input_path=args.input)
+
+    # 生成检索词
+    enhanced_data = generator.generate_search_words()
+
+    # 保存结果
+    generator.save_results(enhanced_data, args.output)
+    generator.save_stats(args.stats)
+
+    # 输出汇总
+    logger.info("\n" + "=" * 60)
+    logger.info("处理完成汇总")
+    logger.info("=" * 60)
+    logger.info(f"输入文件: {args.input}")
+    logger.info(f"输出文件: {args.output}")
+    logger.info(f"统计文件: {args.stats}")
+    logger.info(f"")
+    logger.info(f"处理结果:")
+    logger.info(f"  - 结果项数: {generator.stats['处理的结果项数']}")
+    logger.info(f"  - 总特征数: {generator.stats['生成的总组合词数']}")
+    logger.info(f"  - 唯一组合词: {generator.stats['唯一组合词数']}")
+    logger.info(f"  - 重复过滤: {generator.stats['重复过滤的词数']}")
+    logger.info(f"  - 去重率: {generator.stats['重复过滤的词数'] / max(generator.stats['生成的总组合词数'], 1) * 100:.1f}%")
+
+
+if __name__ == '__main__':
+    main()
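
A minimal sketch of the data shape this script consumes and produces may help. The field names below are taken from the code above; the concrete values and the rest of the structure are illustrative assumptions:

# 假设的最小示例(字段名取自上方脚本,具体取值仅为示意):
result_item = {
    "原始特征名称": "太阳镜",
    "最高匹配信息": {"人设特征名称": "墨镜"},  # 基础词
    "找到的关联": [{
        "目标分类路径": "配饰/眼镜",
        "特征列表": [
            {"特征名称": "圆框"},  # 处理后 -> "search_word": "墨镜 圆框"
            {"特征名称": "圆框"},  # 同一结果项内重复 -> "search_word": None
        ],
    }],
}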

+ 413 - 0
scripts/execute_search_tasks.py

@@ -0,0 +1,413 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+小红书搜索任务执行器
+
+读取 associated_tags_results_with_search.json,
+对所有非空的 search_word 执行小红书搜索,
+并将结果写入到对应的特征节点下。
+"""
+
+import sys
+import os
+
+# 将项目根目录添加到Python路径
+project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, project_root)
+
+import json
+import logging
+import time
+import copy
+from pathlib import Path
+from typing import Dict, List, Any, Set, Optional
+from datetime import datetime
+import argparse
+
+from src.clients.xiaohongshu_search import XiaohongshuSearch
+
+
+# 配置日志
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    handlers=[
+        logging.FileHandler('search_execution.log', encoding='utf-8'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+class SearchTaskExecutor:
+    """搜索任务执行器"""
+
+    def __init__(
+        self,
+        input_path: str,
+        output_path: str = None,
+        progress_path: str = 'search_progress.json',
+        search_delay: float = 2.0,
+        content_type: str = '图文',
+        sort_type: str = '综合'
+    ):
+        """
+        初始化执行器
+
+        Args:
+            input_path: 输入JSON文件路径
+            output_path: 输出JSON文件路径
+            progress_path: 进度文件路径
+            search_delay: 每次搜索间隔时间(秒)
+            content_type: 内容类型
+            sort_type: 排序方式
+        """
+        self.input_path = input_path
+        self.output_path = output_path or input_path.replace(
+            '.json', '_with_search_data.json'
+        )
+        self.progress_path = progress_path
+        self.search_delay = search_delay
+        self.content_type = content_type
+        self.sort_type = sort_type
+
+        # 初始化搜索客户端
+        self.search_client = XiaohongshuSearch()
+
+        # 统计信息
+        self.stats = {
+            '总特征数': 0,
+            '有search_word的特征数': 0,
+            '唯一search_word数': 0,
+            '已完成搜索数': 0,
+            '成功搜索数': 0,
+            '失败搜索数': 0,
+            '跳过搜索数': 0
+        }
+
+    def load_json(self, file_path: str) -> Any:
+        """加载JSON文件"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except FileNotFoundError:
+            logger.warning(f"文件不存在: {file_path}")
+            return None
+        except Exception as e:
+            logger.error(f"加载文件失败 {file_path}: {e}")
+            raise
+
+    def save_json(self, data: Any, file_path: str):
+        """保存JSON文件"""
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                json.dump(data, f, ensure_ascii=False, indent=2)
+            logger.info(f"已保存: {file_path}")
+        except Exception as e:
+            logger.error(f"保存文件失败 {file_path}: {e}")
+            raise
+
+    def load_progress(self) -> Dict[str, Any]:
+        """加载进度文件"""
+        progress = self.load_json(self.progress_path)
+        if progress is None:
+            return {
+                'completed_searches': {},  # search_word -> result
+                'started_at': datetime.now().isoformat(),
+                'last_updated': None
+            }
+        return progress
+
+    def save_progress(self, progress: Dict[str, Any]):
+        """保存进度文件"""
+        progress['last_updated'] = datetime.now().isoformat()
+        self.save_json(progress, self.progress_path)
+
+    def collect_search_words(self, data: List[Dict[str, Any]]) -> Dict[str, List[tuple]]:
+        """
+        收集所有需要搜索的关键词
+
+        Args:
+            data: 输入数据列表
+
+        Returns:
+            字典,key 为 search_word,value 为特征位置列表
+            位置格式: (result_idx, assoc_idx, feature_idx)
+        """
+        search_word_map = {}  # search_word -> [(result_idx, assoc_idx, feature_idx), ...]
+
+        for result_idx, result in enumerate(data):
+            for assoc_idx, assoc in enumerate(result.get('找到的关联', [])):
+                for feature_idx, feature in enumerate(assoc.get('特征列表', [])):
+                    self.stats['总特征数'] += 1
+
+                    search_word = feature.get('search_word')
+                    if search_word and search_word.strip():
+                        self.stats['有search_word的特征数'] += 1
+
+                        if search_word not in search_word_map:
+                            search_word_map[search_word] = []
+
+                        search_word_map[search_word].append(
+                            (result_idx, assoc_idx, feature_idx)
+                        )
+
+        self.stats['唯一search_word数'] = len(search_word_map)
+        return search_word_map
+
+    def execute_search(
+        self,
+        search_word: str,
+        max_retries: int = 3
+    ) -> Optional[Dict[str, Any]]:
+        """
+        执行单个搜索
+
+        Args:
+            search_word: 搜索关键词
+            max_retries: 最大重试次数
+
+        Returns:
+            搜索结果字典,失败返回 None
+        """
+        try:
+            logger.info(f"  搜索: {search_word}")
+
+            result = self.search_client.search(
+                keyword=search_word,
+                content_type=self.content_type,
+                sort_type=self.sort_type,
+                max_retries=max_retries
+            )
+
+            # 提取帖子数量
+            note_count = len(result.get('data', {}).get('data', []))
+            logger.info(f"    ✓ 成功,获取 {note_count} 条帖子")
+
+            return result
+
+        except Exception as e:
+            logger.error(f"    ✗ 失败: {e}")
+            return None
+
+    def process_searches(
+        self,
+        data: List[Dict[str, Any]],
+        search_word_map: Dict[str, List[tuple]],
+        progress: Dict[str, Any]
+    ):
+        """
+        执行所有搜索任务
+
+        Args:
+            data: 输入数据(会被修改)
+            search_word_map: 搜索词映射
+            progress: 进度数据
+        """
+        completed_searches = progress['completed_searches']
+        total_searches = len(search_word_map)
+
+        logger.info("=" * 60)
+        logger.info("开始执行搜索任务")
+        logger.info("=" * 60)
+        logger.info(f"唯一搜索词数: {total_searches}")
+        logger.info(f"已完成: {len(completed_searches)}")
+        logger.info(f"待执行: {total_searches - len(completed_searches)}")
+        logger.info("")
+
+        # 遍历所有唯一的搜索词
+        for idx, (search_word, positions) in enumerate(search_word_map.items(), 1):
+            logger.info(f"[{idx}/{total_searches}] 处理: {search_word}")
+            logger.info(f"  影响 {len(positions)} 个特征节点")
+
+            # 检查是否已完成
+            if search_word in completed_searches:
+                logger.info(f"  ⊙ 已完成(使用缓存结果)")
+                search_result = completed_searches[search_word]
+                self.stats['跳过搜索数'] += 1
+            else:
+                # 执行搜索
+                search_result = self.execute_search(search_word)
+
+                # 记录结果到进度文件
+                completed_searches[search_word] = search_result
+                self.stats['已完成搜索数'] += 1
+
+                if search_result:
+                    self.stats['成功搜索数'] += 1
+                else:
+                    self.stats['失败搜索数'] += 1
+
+                # 保存进度
+                self.save_progress(progress)
+
+                # 延迟,避免请求过快
+                if idx < total_searches:  # 最后一次不需要延迟
+                    time.sleep(self.search_delay)
+
+            # 将搜索结果写入到所有相关的特征节点
+            self._write_results_to_features(
+                data, positions, search_word, search_result
+            )
+
+        logger.info("")
+        logger.info("=" * 60)
+        logger.info("搜索任务执行完成")
+        logger.info("=" * 60)
+
+    def _write_results_to_features(
+        self,
+        data: List[Dict[str, Any]],
+        positions: List[tuple],
+        search_word: str,
+        search_result: Optional[Dict[str, Any]]
+    ):
+        """
+        将搜索结果写入到所有相关的特征节点
+
+        Args:
+            data: 数据列表(会被修改)
+            positions: 特征位置列表
+            search_word: 搜索关键词
+            search_result: 搜索结果
+        """
+        for result_idx, assoc_idx, feature_idx in positions:
+            feature = data[result_idx]['找到的关联'][assoc_idx]['特征列表'][feature_idx]
+
+            # 添加搜索结果
+            if search_result:
+                # 深拷贝,确保每个特征有独立的数据
+                feature['search_result'] = copy.deepcopy(search_result)
+
+                # 添加元数据
+                note_count = len(search_result.get('data', {}).get('data', []))
+                feature['search_metadata'] = {
+                    'searched_at': datetime.now().isoformat(),
+                    'status': 'success',
+                    'note_count': note_count,
+                    'search_params': {
+                        'keyword': search_word,
+                        'content_type': self.content_type,
+                        'sort_type': self.sort_type
+                    }
+                }
+            else:
+                # 搜索失败
+                feature['search_result'] = None
+                feature['search_metadata'] = {
+                    'searched_at': datetime.now().isoformat(),
+                    'status': 'failed',
+                    'note_count': 0,
+                    'search_params': {
+                        'keyword': search_word,
+                        'content_type': self.content_type,
+                        'sort_type': self.sort_type
+                    }
+                }
+
+    def execute(self):
+        """执行完整流程"""
+        logger.info("=" * 60)
+        logger.info("搜索任务执行器启动")
+        logger.info("=" * 60)
+        logger.info(f"输入文件: {self.input_path}")
+        logger.info(f"输出文件: {self.output_path}")
+        logger.info(f"进度文件: {self.progress_path}")
+        logger.info(f"搜索延迟: {self.search_delay} 秒")
+        logger.info("")
+
+        # 1. 加载输入数据
+        logger.info("步骤1: 加载输入数据")
+        data = self.load_json(self.input_path)
+        if not data:
+            logger.error("输入数据为空,退出")
+            return
+
+        # 2. 加载进度
+        logger.info("步骤2: 加载进度文件")
+        progress = self.load_progress()
+
+        # 3. 收集搜索词
+        logger.info("步骤3: 收集搜索关键词")
+        search_word_map = self.collect_search_words(data)
+        logger.info(f"  总特征数: {self.stats['总特征数']}")
+        logger.info(f"  有search_word的特征数: {self.stats['有search_word的特征数']}")
+        logger.info(f"  唯一search_word数: {self.stats['唯一search_word数']}")
+        logger.info("")
+
+        # 4. 执行搜索
+        logger.info("步骤4: 执行搜索任务")
+        self.process_searches(data, search_word_map, progress)
+
+        # 5. 保存结果
+        logger.info("步骤5: 保存结果")
+        self.save_json(data, self.output_path)
+
+        # 6. 输出统计
+        logger.info("")
+        logger.info("=" * 60)
+        logger.info("执行统计")
+        logger.info("=" * 60)
+        for key, value in self.stats.items():
+            logger.info(f"  {key}: {value}")
+
+        logger.info("")
+        logger.info("✓ 执行完成")
+
+
+def main():
+    """主函数"""
+    parser = argparse.ArgumentParser(description='小红书搜索任务执行器')
+    parser.add_argument(
+        '--input',
+        default='associated_tags_results_with_search.json',
+        help='输入JSON文件路径(默认: associated_tags_results_with_search.json)'
+    )
+    parser.add_argument(
+        '--output',
+        default=None,
+        help='输出JSON文件路径(默认: 输入文件名_with_search_data.json)'
+    )
+    parser.add_argument(
+        '--progress',
+        default='search_progress.json',
+        help='进度文件路径(默认: search_progress.json)'
+    )
+    parser.add_argument(
+        '--delay',
+        type=float,
+        default=2.0,
+        help='每次搜索间隔时间(秒,默认: 2.0)'
+    )
+    parser.add_argument(
+        '--content-type',
+        default='图文',
+        choices=['不限', '视频', '图文'],
+        help='内容类型(默认: 图文)'
+    )
+    parser.add_argument(
+        '--sort-type',
+        default='综合',
+        choices=['综合', '最新', '最多点赞', '最多评论'],
+        help='排序方式(默认: 综合)'
+    )
+
+    args = parser.parse_args()
+
+    # 创建执行器
+    executor = SearchTaskExecutor(
+        input_path=args.input,
+        output_path=args.output,
+        progress_path=args.progress,
+        search_delay=args.delay,
+        content_type=args.content_type,
+        sort_type=args.sort_type
+    )
+
+    # 执行
+    executor.execute()
+
+
+if __name__ == '__main__':
+    main()
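
The progress file is what makes the executor resumable: completed_searches caches the full response keyed by search_word, so a rerun reuses cached results instead of calling the API again. A sketch of its shape, inferred from load_progress/save_progress above (values illustrative):

# search_progress.json 的大致形状(由上方 load_progress/save_progress 推断,取值仅为示意):
progress = {
    "completed_searches": {
        "墨镜 圆框": {"data": {"data": ["...帖子列表..."]}},  # 失败的搜索会被存为 None
    },
    "started_at": "2025-01-01T12:00:00",
    "last_updated": "2025-01-01T12:34:56",
}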

+ 219 - 0
scripts/import_search_results.py

@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+搜索结果导入工具
+
+将 search_progress.json 中已完成的搜索结果导入到
+associated_tags_results_with_search.json 对应的特征节点中。
+
+匹配规则:根据特征的 search_word 字段匹配
+"""
+
+import json
+import copy
+from datetime import datetime
+from typing import Dict, Any
+import argparse
+
+
+def load_json(file_path: str) -> Any:
+    """加载JSON文件"""
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except Exception as e:
+        print(f"错误: 加载文件失败 {file_path}: {e}")
+        raise
+
+
+def save_json(data: Any, file_path: str):
+    """保存JSON文件"""
+    try:
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+        print(f"✓ 已保存: {file_path}")
+    except Exception as e:
+        print(f"错误: 保存文件失败 {file_path}: {e}")
+        raise
+
+
+def import_search_results(
+    source_data_path: str,
+    progress_path: str,
+    output_path: str,
+    content_type: str = '图文',
+    sort_type: str = '综合'
+):
+    """
+    导入搜索结果
+
+    Args:
+        source_data_path: 源数据文件路径(包含特征和search_word)
+        progress_path: 进度文件路径(包含已完成的搜索结果)
+        output_path: 输出文件路径
+        content_type: 内容类型
+        sort_type: 排序方式
+    """
+    print("=" * 60)
+    print("搜索结果导入工具")
+    print("=" * 60)
+    print()
+
+    # 1. 加载源数据
+    print(f"步骤1: 加载源数据文件")
+    print(f"  {source_data_path}")
+    source_data = load_json(source_data_path)
+    print(f"  ✓ 已加载 {len(source_data)} 个结果项")
+    print()
+
+    # 2. 加载进度文件
+    print(f"步骤2: 加载搜索进度文件")
+    print(f"  {progress_path}")
+    progress = load_json(progress_path)
+    completed_searches = progress.get('completed_searches', {})
+    print(f"  ✓ 已加载 {len(completed_searches)} 个搜索结果")
+    print()
+
+    # 3. 统计特征信息
+    print("步骤3: 统计特征信息")
+    total_features = 0
+    features_with_search_word = 0
+    unique_search_words = set()
+
+    for result in source_data:
+        for assoc in result.get('找到的关联', []):
+            for feature in assoc.get('特征列表', []):
+                total_features += 1
+                search_word = feature.get('search_word')
+                if search_word:
+                    features_with_search_word += 1
+                    unique_search_words.add(search_word)
+
+    print(f"  总特征数: {total_features}")
+    print(f"  有search_word的特征: {features_with_search_word}")
+    print(f"  唯一search_word数: {len(unique_search_words)}")
+    print()
+
+    # 4. 导入搜索结果
+    print("步骤4: 导入搜索结果")
+    matched_count = 0
+    not_found_count = 0
+    success_count = 0
+    failed_count = 0
+
+    for result_idx, result in enumerate(source_data):
+        for assoc_idx, assoc in enumerate(result.get('找到的关联', [])):
+            for feature_idx, feature in enumerate(assoc.get('特征列表', [])):
+                search_word = feature.get('search_word')
+
+                # 跳过空的 search_word
+                if not search_word:
+                    continue
+
+                # 查找对应的搜索结果
+                if search_word in completed_searches:
+                    matched_count += 1
+                    search_result = completed_searches[search_word]
+
+                    # 深拷贝搜索结果,避免共享引用
+                    feature['search_result'] = copy.deepcopy(search_result)
+
+                    # 添加元数据
+                    if search_result and search_result.get('data'):
+                        note_count = len(search_result.get('data', {}).get('data', []))
+                        feature['search_metadata'] = {
+                            'searched_at': datetime.now().isoformat(),
+                            'status': 'success',
+                            'note_count': note_count,
+                            'search_params': {
+                                'keyword': search_word,
+                                'content_type': content_type,
+                                'sort_type': sort_type
+                            }
+                        }
+                        success_count += 1
+                    else:
+                        # 搜索结果为空或失败
+                        feature['search_metadata'] = {
+                            'searched_at': datetime.now().isoformat(),
+                            'status': 'failed',
+                            'note_count': 0,
+                            'search_params': {
+                                'keyword': search_word,
+                                'content_type': content_type,
+                                'sort_type': sort_type
+                            }
+                        }
+                        failed_count += 1
+                else:
+                    not_found_count += 1
+
+    print(f"  匹配成功: {matched_count} 个特征")
+    print(f"  搜索成功: {success_count} 个")
+    print(f"  搜索失败: {failed_count} 个")
+    print(f"  未找到搜索结果: {not_found_count} 个")
+    print()
+
+    # 5. 保存结果
+    print("步骤5: 保存输出文件")
+    print(f"  {output_path}")
+    save_json(source_data, output_path)
+    print()
+
+    # 6. 输出统计信息
+    print("=" * 60)
+    print("导入完成")
+    print("=" * 60)
+    print()
+    print(f"总特征数: {total_features}")
+    print(f"有search_word的特征: {features_with_search_word}")
+    print(f"已导入搜索结果: {matched_count} ({matched_count/features_with_search_word*100:.1f}%)")
+    print(f"  - 成功: {success_count}")
+    print(f"  - 失败: {failed_count}")
+    print(f"待搜索: {not_found_count} ({not_found_count/features_with_search_word*100:.1f}%)")
+    print()
+
+
+def main():
+    """主函数"""
+    parser = argparse.ArgumentParser(description='搜索结果导入工具')
+    parser.add_argument(
+        '--source',
+        default='associated_tags_results_with_search.json',
+        help='源数据文件路径(默认: associated_tags_results_with_search.json)'
+    )
+    parser.add_argument(
+        '--progress',
+        default='search_progress.json',
+        help='进度文件路径(默认: search_progress.json)'
+    )
+    parser.add_argument(
+        '--output',
+        default='associated_tags_results_with_search_data.json',
+        help='输出文件路径(默认: associated_tags_results_with_search_data.json)'
+    )
+    parser.add_argument(
+        '--content-type',
+        default='图文',
+        help='内容类型(默认: 图文)'
+    )
+    parser.add_argument(
+        '--sort-type',
+        default='综合',
+        help='排序方式(默认: 综合)'
+    )
+
+    args = parser.parse_args()
+
+    # 执行导入
+    import_search_results(
+        source_data_path=args.source,
+        progress_path=args.progress,
+        output_path=args.output,
+        content_type=args.content_type,
+        sort_type=args.sort_type
+    )
+
+
+if __name__ == '__main__':
+    main()
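
A typical invocation, spelling out the defaults so the data flow between the three files is explicit:

python3 scripts/import_search_results.py \
    --source associated_tags_results_with_search.json \
    --progress search_progress.json \
    --output associated_tags_results_with_search_data.json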

+ 445 - 0
scripts/run_deconstruction.py

@@ -0,0 +1,445 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 6 独立运行脚本
+从 Stage 5 结果开始,进行深度解构分析
+支持指定 feature 和数量限制
+"""
+
+import sys
+import os
+
+# 将项目根目录添加到Python路径
+project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, project_root)
+
+import json
+import logging
+import argparse
+import webbrowser
+from pathlib import Path
+from src.analyzers.post_deconstruction_analyzer import PostDeconstructionAnalyzer
+from src.analyzers.similarity_analyzer import SimilarityAnalyzer
+import src.visualizers.deconstruction_visualizer as deconstruction_visualizer
+
+# 配置日志
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    handlers=[
+        logging.FileHandler('deconstruction_standalone.log', encoding='utf-8'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """主函数"""
+    parser = argparse.ArgumentParser(
+        description='深度解构分析(独立运行,支持流水线执行)',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog='''
+基础用法示例:
+  # 只处理"墨镜"特征的前10个高分帖子
+  python3 scripts/run_deconstruction.py --feature "墨镜" --max-notes 10
+
+  # 处理"墨镜"和"耳环"两个特征,每个最多5个
+  python3 scripts/run_deconstruction.py --feature "墨镜" "耳环" --max-notes 5
+
+  # 按数据原始顺序处理前50个(不排序)
+  python3 scripts/run_deconstruction.py --sort-by none --max-notes 50
+
+  # 处理所有特征,按时间排序,前20个
+  python3 scripts/run_deconstruction.py --sort-by time --max-notes 20
+
+  # 只处理"墨镜",按互动量排序,跳过前3个
+  python3 scripts/run_deconstruction.py --feature "墨镜" --sort-by engagement --skip 3
+
+  # 降低分数阈值,处理更多帖子
+  python3 scripts/run_deconstruction.py --feature "墨镜" --min-score 6.0 --max-notes 30
+
+流水线执行示例(推荐):
+  # 完整流水线: 深度解构 → 相似度分析 → 可视化 → 自动打开浏览器
+  python3 scripts/run_deconstruction.py --feature "墨镜" --max-notes 10 --run-similarity --visualize
+
+  # 深度解构 → 相似度分析(不生成可视化)
+  python3 scripts/run_deconstruction.py --feature "墨镜" --max-notes 10 --run-similarity
+
+  # 深度解构 → 可视化(跳过相似度分析)
+  python3 scripts/run_deconstruction.py --feature "墨镜" --max-notes 10 --visualize
+
+  # 完整流水线,不自动打开浏览器
+  python3 scripts/run_deconstruction.py --feature "墨镜" --run-similarity --visualize --no-open
+
+  # 自定义相似度分析权重
+  python3 scripts/run_deconstruction.py --feature "墨镜" --run-similarity --visualize \\
+    --similarity-weight-embedding 0.7 --similarity-weight-semantic 0.3
+
+  # 过滤低相似度特征
+  python3 scripts/run_deconstruction.py --feature "墨镜" --run-similarity --visualize \\
+    --similarity-min-similarity 0.3
+
+配置文件示例:
+  # 使用配置文件(支持所有参数)
+  python3 scripts/run_deconstruction.py --config pipeline_config.json
+
+  # 配置文件示例内容(pipeline_config.json):
+  {
+    "feature": ["墨镜"],
+    "max_notes": 10,
+    "timeout": 600,
+    "run_similarity": true,
+    "visualize": true,
+    "similarity_weight_embedding": 0.5,
+    "similarity_weight_semantic": 0.5
+  }
+        '''
+    )
+
+    # 输入输出配置
+    parser.add_argument(
+        '--input',
+        default='output_v2/evaluated_results.json',
+        help='评估结果文件路径(默认: output_v2/evaluated_results.json)'
+    )
+    parser.add_argument(
+        '--output',
+        default='output_v2/deep_analysis_results.json',
+        help='深度分析输出文件路径(默认: output_v2/deep_analysis_results.json)'
+    )
+
+    # Feature 过滤(新增)
+    parser.add_argument(
+        '--feature',
+        nargs='+',
+        default=None,
+        help='指定要处理的原始特征名称(可指定多个),如: --feature "墨镜" "耳环"。不指定则处理所有特征'
+    )
+
+    # 过滤参数
+    parser.add_argument(
+        '--min-score',
+        type=float,
+        default=0.8,
+        help='最低分数阈值,只处理 >= 此分数的帖子(默认: 0.8)'
+    )
+    parser.add_argument(
+        '--skip',
+        type=int,
+        default=0,
+        help='跳过前 N 个帖子(默认: 0)'
+    )
+    parser.add_argument(
+        '--max-notes',
+        type=int,
+        default=None,
+        help='最多处理多少个帖子(默认: None 不限制)'
+    )
+    parser.add_argument(
+        '--sort-by',
+        choices=['none', 'score', 'time', 'engagement'],
+        default='score',
+        help='排序方式: none(不排序,保持数据原始顺序), score(评分), time(时间), engagement(互动量)(默认: score)'
+    )
+
+    # API 配置
+    parser.add_argument(
+        '--api-url',
+        default='http://192.168.245.150:7000/what/analysis/single',
+        help='解构 API 地址(默认: http://192.168.245.150:7000/what/analysis/single)'
+    )
+    parser.add_argument(
+        '--timeout',
+        type=int,
+        default=800,
+        help='API 超时时间(秒)(默认: 800)'
+    )
+    parser.add_argument(
+        '--max-retries',
+        type=int,
+        default=3,
+        help='API 最大重试次数(默认: 3)'
+    )
+
+    # 并发配置
+    parser.add_argument(
+        '--max-workers',
+        type=int,
+        default=5,
+        help='并发处理数(默认: 5)'
+    )
+
+    # 从配置文件加载
+    parser.add_argument(
+        '--config',
+        default=None,
+        help='从 JSON 配置文件加载参数'
+    )
+
+    # 流水线控制参数
+    parser.add_argument(
+        '--run-similarity',
+        action='store_true',
+        help='深度解构完成后自动运行相似度分析'
+    )
+    parser.add_argument(
+        '--visualize',
+        action='store_true',
+        help='生成可视化结果'
+    )
+    parser.add_argument(
+        '--open-browser',
+        action='store_true',
+        default=True,
+        help='自动在浏览器中打开可视化结果(默认: True)'
+    )
+    parser.add_argument(
+        '--no-open',
+        action='store_true',
+        help='禁用自动打开浏览器'
+    )
+
+    # Stage 7 输出配置
+    parser.add_argument(
+        '--similarity-output',
+        default='output_v2/similarity_analysis_results.json',
+        help='相似度分析输出文件路径(默认: output_v2/similarity_analysis_results.json)'
+    )
+
+    # Stage 7 相似度配置
+    parser.add_argument(
+        '--similarity-weight-embedding',
+        type=float,
+        default=0.5,
+        help='相似度分析向量模型权重(默认: 0.5)'
+    )
+    parser.add_argument(
+        '--similarity-weight-semantic',
+        type=float,
+        default=0.5,
+        help='相似度分析 LLM 模型权重(默认: 0.5)'
+    )
+    parser.add_argument(
+        '--similarity-min-similarity',
+        type=float,
+        default=0.0,
+        help='相似度分析最小相似度阈值(默认: 0.0)'
+    )
+    parser.add_argument(
+        '--similarity-max-workers',
+        type=int,
+        default=5,
+        help='相似度分析最大并发数(默认: 5)'
+    )
+
+    # 可视化输出配置
+    parser.add_argument(
+        '--viz-output',
+        default=None,
+        help='可视化输出目录(默认: visualization/)'
+    )
+
+    args = parser.parse_args()
+
+    # 如果提供了配置文件,加载配置
+    if args.config:
+        logger.info(f"从配置文件加载参数: {args.config}")
+        with open(args.config, 'r', encoding='utf-8') as f:
+            config = json.load(f)
+
+        # 配置文件中的参数会覆盖命令行参数
+        for key, value in config.items():
+            setattr(args, key.replace('-', '_'), value)
+
+    # 检查输入文件是否存在
+    if not os.path.exists(args.input):
+        logger.error(f"输入文件不存在: {args.input}")
+        return
+
+    # 加载 Stage 5 结果
+    logger.info(f"加载评估结果: {args.input}")
+    with open(args.input, 'r', encoding='utf-8') as f:
+        evaluation_results = json.load(f)
+
+    # 打印配置
+    logger.info("=" * 60)
+    logger.info("运行配置:")
+    logger.info(f"  输入文件: {args.input}")
+    logger.info(f"  输出文件: {args.output}")
+    if args.feature:
+        logger.info(f"  指定特征: {', '.join(args.feature)}")
+    else:
+        logger.info(f"  指定特征: 全部")
+    logger.info(f"  API 地址: {args.api_url}")
+    logger.info(f"  最低分数阈值: {args.min_score}")
+    logger.info(f"  跳过前 N 个: {args.skip}")
+    logger.info(f"  最多处理数: {args.max_notes if args.max_notes else '不限制'}")
+    logger.info(f"  排序方式: {args.sort_by}")
+    logger.info(f"  并发数: {args.max_workers}")
+    logger.info(f"  API 超时: {args.timeout}秒")
+    logger.info(f"  最大重试: {args.max_retries}次")
+    logger.info("=" * 60)
+
+    # 创建分析器
+    analyzer = PostDeconstructionAnalyzer(
+        api_url=args.api_url,
+        max_workers=args.max_workers,
+        max_notes=args.max_notes,
+        min_score=args.min_score,
+        skip_count=args.skip,
+        sort_by=args.sort_by,
+        timeout=args.timeout,
+        max_retries=args.max_retries,
+        output_dir=os.path.dirname(args.output) or 'output_v2',
+        target_features=args.feature  # 传递 feature 过滤参数
+    )
+
+    # 运行分析
+    try:
+        deep_results = analyzer.run(
+            evaluation_results=evaluation_results,
+            output_path=args.output
+        )
+
+        # 打印结果摘要
+        logger.info("\n" + "=" * 60)
+        logger.info("深度解构分析完成!")
+        logger.info(f"  总匹配帖子数: {deep_results['metadata']['total_matched_notes']}")
+        logger.info(f"  实际处理数: {deep_results['metadata']['processed_notes']}")
+        logger.info(f"  成功: {deep_results['metadata']['success_count']}")
+        logger.info(f"  失败: {deep_results['metadata']['failed_count']}")
+        logger.info(f"  总耗时: {deep_results['metadata']['processing_time_seconds']}秒")
+        logger.info(f"  结果已保存: {args.output}")
+        logger.info("=" * 60)
+
+        # Stage 7: 相似度分析
+        similarity_results = None
+        if args.run_similarity:
+            logger.info("\n" + "=" * 60)
+            logger.info("开始执行相似度分析...")
+            logger.info("=" * 60)
+
+            try:
+                # 创建 Stage 7 分析器
+                similarity_analyzer = SimilarityAnalyzer(
+                    weight_embedding=args.similarity_weight_embedding,
+                    weight_semantic=args.similarity_weight_semantic,
+                    max_workers=args.similarity_max_workers,
+                    min_similarity=args.similarity_min_similarity,
+                    target_features=args.feature
+                )
+
+                # 运行 Stage 7 分析
+                similarity_results = similarity_analyzer.run(
+                    deconstruction_results=deep_results,
+                    output_path=args.similarity_output
+                )
+
+                # 打印 Stage 7 结果摘要
+                logger.info("\n" + "=" * 60)
+                logger.info("相似度分析完成!")
+                metadata = similarity_results['metadata']
+                overall_stats = metadata['overall_statistics']
+
+                logger.info(f"  处理帖子数: {overall_stats['total_notes']}")
+                logger.info(f"  提取特征总数: {overall_stats['total_features_extracted']}")
+                logger.info(f"  平均特征数/帖子: {overall_stats['avg_features_per_note']:.2f}")
+                logger.info(f"  平均最高相似度: {overall_stats['avg_max_similarity']:.3f}")
+                logger.info(f"  包含高相似度特征的帖子: {overall_stats['notes_with_high_similarity']}")
+                logger.info(f"  总耗时: {metadata['processing_time_seconds']:.2f}秒")
+                logger.info(f"  结果已保存: {args.similarity_output}")
+                logger.info("=" * 60)
+
+                # 打印 Top 5 高相似度特征示例
+                if similarity_results['results']:
+                    logger.info("\nTop 5 高相似度特征示例:")
+                    all_features = []
+                    for result in similarity_results['results']:
+                        for feat in result['deconstructed_features'][:5]:
+                            all_features.append({
+                                'note_id': result['note_id'],
+                                'feature_name': feat['feature_name'],
+                                'dimension': feat['dimension'],
+                                'similarity': feat['similarity_score']
+                            })
+
+                    # 按相似度排序,取 Top 5
+                    all_features.sort(key=lambda x: x['similarity'], reverse=True)
+                    for i, feat in enumerate(all_features[:5], 1):
+                        logger.info(f"  {i}. [{feat['note_id'][:12]}...] "
+                                   f"{feat['feature_name']} ({feat['dimension']}) "
+                                   f"- 相似度: {feat['similarity']:.3f}")
+
+            except Exception as e:
+                logger.error(f"相似度分析失败: {e}", exc_info=True)
+                logger.warning("继续执行后续步骤...")
+
+        # 可视化生成
+        viz_path = None
+        if args.visualize:
+            logger.info("\n" + "=" * 60)
+            logger.info("开始生成可视化结果...")
+            logger.info("=" * 60)
+
+            try:
+                # 准备可视化所需的数据文件路径
+                viz_args = [
+                    '--evaluation-results', args.input,
+                    '--deep-analysis-results', args.output
+                ]
+
+                # 如果有 Stage 7 结果,添加到参数中
+                if similarity_results and args.similarity_output:
+                    viz_args.extend(['--similarity-results', args.similarity_output])
+
+                # 如果指定了可视化输出目录
+                if args.viz_output:
+                    viz_args.extend(['--output-dir', args.viz_output])
+
+                # 调用可视化模块(临时替换 sys.argv 传参,结束后恢复)
+                original_argv = sys.argv
+                try:
+                    sys.argv = ['deconstruction_visualizer.py'] + viz_args
+                    viz_path = deconstruction_visualizer.main()
+                finally:
+                    sys.argv = original_argv
+
+                if viz_path:
+                    logger.info("\n" + "=" * 60)
+                    logger.info("可视化生成完成!")
+                    logger.info(f"  可视化文件: {viz_path}")
+                    logger.info("=" * 60)
+
+                    # 自动打开浏览器
+                    if args.open_browser and not args.no_open:
+                        logger.info("\n正在打开浏览器...")
+                        try:
+                            # 使用 Path.as_uri() 来正确处理包含中文和特殊字符的路径
+                            file_url = Path(viz_path).resolve().as_uri()
+                            webbrowser.open(file_url)
+                            logger.info("浏览器已打开")
+                        except Exception as e:
+                            logger.warning(f"无法自动打开浏览器: {e}")
+                            logger.info(f"请手动打开: {os.path.abspath(viz_path)}")
+                else:
+                    logger.warning("可视化生成返回了空路径")
+
+            except Exception as e:
+                logger.error(f"可视化生成失败: {e}", exc_info=True)
+                logger.warning("跳过可视化步骤")
+
+        # 流水线执行完成
+        logger.info("\n" + "=" * 60)
+        logger.info("流水线执行完成!")
+        logger.info("=" * 60)
+
+    except Exception as e:
+        logger.error(f"执行失败: {e}", exc_info=True)
+        raise
+
+
+if __name__ == '__main__':
+    main()
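
For completeness, a condensed sketch of the same deconstruction-then-similarity pipeline driven from Python instead of the CLI. It uses only constructor arguments and run() signatures that appear in the script above; the paths and the feature name are the script's own defaults and examples:

# 纯代码方式串联深度解构与相似度分析(参数与上方脚本一致,仅为示意):
import json
from src.analyzers.post_deconstruction_analyzer import PostDeconstructionAnalyzer
from src.analyzers.similarity_analyzer import SimilarityAnalyzer

with open('output_v2/evaluated_results.json', 'r', encoding='utf-8') as f:
    evaluation_results = json.load(f)

analyzer = PostDeconstructionAnalyzer(
    api_url='http://192.168.245.150:7000/what/analysis/single',
    max_workers=5, max_notes=10, min_score=0.8,
    skip_count=0, sort_by='score', timeout=800, max_retries=3,
    output_dir='output_v2', target_features=['墨镜'],
)
deep_results = analyzer.run(
    evaluation_results=evaluation_results,
    output_path='output_v2/deep_analysis_results.json',
)

similarity_analyzer = SimilarityAnalyzer(
    weight_embedding=0.5, weight_semantic=0.5,
    max_workers=5, min_similarity=0.0, target_features=['墨镜'],
)
similarity_analyzer.run(
    deconstruction_results=deep_results,
    output_path='output_v2/similarity_analysis_results.json',
)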

+ 246 - 0
scripts/run_similarity_analysis.py

@@ -0,0 +1,246 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""相似度分析独立运行脚本"""
+
+import sys
+import os
+
+# 将项目根目录添加到Python路径
+project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, project_root)
+
+import json
+import logging
+import argparse
+from src.analyzers.similarity_analyzer import SimilarityAnalyzer
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='解构特征相似度分析(独立运行)',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+使用示例:
+  # 基础用法 - 处理"墨镜"特征
+  python3 scripts/run_similarity_analysis.py --feature "墨镜"
+
+  # 处理多个特征
+  python3 scripts/run_similarity_analysis.py --feature "墨镜" "耳环"
+
+  # 自定义权重配置
+  python3 scripts/run_similarity_analysis.py --feature "墨镜" --weight-embedding 0.7 --weight-semantic 0.3
+
+  # 过滤低相似度特征
+  python3 scripts/run_similarity_analysis.py --feature "墨镜" --min-similarity 0.3
+
+  # 使用配置文件
+  python3 scripts/run_similarity_analysis.py --config stage8_config.json
+
+  # 自定义输入输出路径
+  python3 scripts/run_similarity_analysis.py --input output_v2/deep_analysis_custom.json --output output_v2/similarity_custom.json
+        """
+    )
+
+    # 输入输出
+    parser.add_argument(
+        '--input',
+        default='output_v2/deep_analysis_results.json',
+        help='解构分析结果文件路径(默认: output_v2/deep_analysis_results.json)'
+    )
+    parser.add_argument(
+        '--output',
+        default='output_v2/similarity_analysis_results.json',
+        help='输出文件路径(默认: output_v2/similarity_analysis_results.json)'
+    )
+
+    # 特征过滤
+    parser.add_argument(
+        '--feature',
+        nargs='+',
+        default=None,
+        help='指定要处理的原始特征名称(可指定多个),如: --feature "墨镜" "耳环"'
+    )
+
+    # 相似度配置
+    parser.add_argument(
+        '--weight-embedding',
+        type=float,
+        default=0.5,
+        help='向量模型权重(默认: 0.5)'
+    )
+    parser.add_argument(
+        '--weight-semantic',
+        type=float,
+        default=0.5,
+        help='LLM 模型权重(默认: 0.5)'
+    )
+    parser.add_argument(
+        '--min-similarity',
+        type=float,
+        default=0.0,
+        help='最小相似度阈值,低于此值的特征会被过滤(默认: 0.0,保留所有)'
+    )
+
+    # 并发配置
+    parser.add_argument(
+        '--max-workers',
+        type=int,
+        default=5,
+        help='最大并发数(默认: 5)'
+    )
+
+    # 综合得分P计算配置
+    parser.add_argument(
+        '--evaluation-path',
+        default='output_v2/evaluated_results.json',
+        help='评估结果数据文件路径,用于计算综合得分P(默认: output_v2/evaluated_results.json)'
+    )
+    parser.add_argument(
+        '--no-update-evaluation',
+        action='store_true',
+        help='不计算和更新综合得分P(默认会计算)'
+    )
+
+    # 配置文件
+    parser.add_argument(
+        '--config',
+        help='从配置文件读取参数(JSON 格式)'
+    )
+
+    # 日志级别
+    parser.add_argument(
+        '--log-level',
+        default='INFO',
+        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
+        help='日志级别(默认: INFO)'
+    )
+
+    args = parser.parse_args()
+
+    # 配置日志
+    logging.basicConfig(
+        level=getattr(logging, args.log_level),
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    logger = logging.getLogger(__name__)
+
+    # 如果提供了配置文件,从文件读取参数
+    if args.config:
+        logger.info(f"从配置文件读取参数: {args.config}")
+        try:
+            with open(args.config, 'r', encoding='utf-8') as f:
+                config = json.load(f)
+
+            # 配置文件中出现的参数会覆盖对应的命令行取值(包括显式指定的);未出现的保持命令行值
+            args.input = config.get('input', args.input)
+            args.output = config.get('output', args.output)
+            args.feature = config.get('feature', args.feature)
+            args.weight_embedding = config.get('weight_embedding', args.weight_embedding)
+            args.weight_semantic = config.get('weight_semantic', args.weight_semantic)
+            args.min_similarity = config.get('min_similarity', args.min_similarity)
+            args.max_workers = config.get('max_workers', args.max_workers)
+            args.evaluation_path = config.get('evaluation_path', args.evaluation_path)
+            if 'no_update_evaluation' in config:
+                args.no_update_evaluation = config.get('no_update_evaluation', args.no_update_evaluation)
+
+        except Exception as e:
+            logger.error(f"读取配置文件失败: {e}")
+            return 1
+
+    # 验证输入文件
+    if not os.path.exists(args.input):
+        logger.error(f"输入文件不存在: {args.input}")
+        return 1
+
+    # 读取解构分析结果
+    logger.info(f"读取解构分析结果: {args.input}")
+    try:
+        with open(args.input, 'r', encoding='utf-8') as f:
+            deconstruction_results = json.load(f)
+    except Exception as e:
+        logger.error(f"读取解构分析结果失败: {e}")
+        return 1
+
+    # 打印配置信息
+    logger.info("\n" + "=" * 60)
+    logger.info("相似度分析配置:")
+    logger.info("=" * 60)
+    logger.info(f"输入文件: {args.input}")
+    logger.info(f"输出文件: {args.output}")
+    if args.feature:
+        logger.info(f"目标特征: {', '.join(args.feature)}")
+    else:
+        logger.info(f"目标特征: 全部")
+    logger.info(f"向量模型权重: {args.weight_embedding}")
+    logger.info(f"LLM 模型权重: {args.weight_semantic}")
+    logger.info(f"最小相似度阈值: {args.min_similarity}")
+    logger.info(f"最大并发数: {args.max_workers}")
+    logger.info(f"评估结果文件路径: {args.evaluation_path}")
+    logger.info(f"计算综合得分P: {'否' if args.no_update_evaluation else '是'}")
+    logger.info("=" * 60 + "\n")
+
+    # 创建分析器
+    try:
+        analyzer = SimilarityAnalyzer(
+            weight_embedding=args.weight_embedding,
+            weight_semantic=args.weight_semantic,
+            max_workers=args.max_workers,
+            min_similarity=args.min_similarity,
+            target_features=args.feature,
+            evaluation_results_path=args.evaluation_path,
+            update_evaluation_scores=not args.no_update_evaluation
+        )
+    except Exception as e:
+        logger.error(f"创建分析器失败: {e}")
+        return 1
+
+    # 运行分析
+    try:
+        similarity_results = analyzer.run(deconstruction_results, output_path=args.output)
+
+        # 打印摘要
+        logger.info("\n" + "=" * 60)
+        logger.info("相似度分析完成")
+        logger.info("=" * 60)
+
+        metadata = similarity_results['metadata']
+        overall_stats = metadata['overall_statistics']
+
+        logger.info(f"处理帖子数: {overall_stats['total_notes']}")
+        logger.info(f"提取特征总数: {overall_stats['total_features_extracted']}")
+        logger.info(f"平均特征数/帖子: {overall_stats['avg_features_per_note']}")
+        logger.info(f"平均最高相似度: {overall_stats['avg_max_similarity']}")
+        logger.info(f"包含高相似度特征的帖子: {overall_stats['notes_with_high_similarity']}")
+        logger.info(f"总耗时: {metadata['processing_time_seconds']}秒")
+        logger.info(f"结果已保存: {args.output}")
+        logger.info("=" * 60 + "\n")
+
+        # 打印 Top 5 高相似度特征示例
+        if similarity_results['results']:
+            logger.info("Top 5 高相似度特征示例:")
+            all_features = []
+            for result in similarity_results['results']:
+                for feat in result['deconstructed_features'][:5]:  # 每个帖子取前5个
+                    all_features.append({
+                        'note_id': result['note_id'],
+                        'feature_name': feat['feature_name'],
+                        'dimension': feat['dimension'],
+                        'similarity': feat['similarity_score']
+                    })
+
+            # 按相似度排序,取 Top 5
+            all_features.sort(key=lambda x: x['similarity'], reverse=True)
+            for i, feat in enumerate(all_features[:5], 1):
+                logger.info(f"  {i}. [{feat['note_id'][:12]}...] "
+                           f"{feat['feature_name']} ({feat['dimension']}) "
+                           f"- 相似度: {feat['similarity']:.3f}")
+
+        return 0
+
+    except Exception as e:
+        logger.error(f"相似度分析失败: {e}", exc_info=True)
+        return 1
+
+
+if __name__ == '__main__':
+    exit(main())
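
A config file for the --config option, using exactly the keys the loader above reads. The file name stage8_config.json comes from the usage examples in the epilog; the values here are illustrative, not recommendations:

{
  "input": "output_v2/deep_analysis_results.json",
  "output": "output_v2/similarity_analysis_results.json",
  "feature": ["墨镜"],
  "weight_embedding": 0.7,
  "weight_semantic": 0.3,
  "min_similarity": 0.3,
  "max_workers": 5,
  "evaluation_path": "output_v2/evaluated_results.json",
  "no_update_evaluation": false
}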

+ 76 - 0
scripts/run_visualizer.py

@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+分类树可视化执行脚本
+"""
+import sys
+import os
+from pathlib import Path
+
+# 将项目根目录添加到Python路径
+project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, project_root)
+
+from src.visualizers.classification_tree_visualizer import visualize_classification_tree
+
+
+def main():
+    # 设置路径
+    base_dir = os.getcwd()  # 以当前工作目录作为数据根目录
+
+    # 必需的文件
+    optimized_data_path = os.path.join(base_dir, "optimized_clustered_data_gemini-3-pro-preview.json")
+    posts_dir = base_dir  # 当前目录作为帖子目录
+
+    # 可选的关联分析文件
+    dimension_associations_path = os.path.join(base_dir, "dimension_associations_analysis.json")
+    intra_dimension_associations_path = os.path.join(base_dir, "intra_dimension_associations_analysis.json")
+    expanded_orthogonal_combinations_path = os.path.join(base_dir, "expanded_orthogonal_combinations.json")
+    enriched_xuanti_point_map_path = os.path.join(base_dir, "enriched_xuanti_point_map.json")
+
+    # 检查必需文件是否存在
+    if not os.path.exists(optimized_data_path):
+        print(f"❌ 找不到优化数据文件: {optimized_data_path}")
+        return
+
+    print(f"📂 优化数据文件: {optimized_data_path}")
+    print(f"📂 帖子目录: {posts_dir}")
+
+    # 检查可选文件
+    if os.path.exists(dimension_associations_path):
+        print(f"✅ 找到跨维度关联分析数据")
+    if os.path.exists(intra_dimension_associations_path):
+        print(f"✅ 找到维度内部关联分析数据")
+    if os.path.exists(expanded_orthogonal_combinations_path):
+        print(f"✅ 找到扩展正交组合数据")
+    if os.path.exists(enriched_xuanti_point_map_path):
+        print(f"✅ 找到丰富选题点映射数据")
+
+    # xuanti_point_map 使用空字典(如果没有外部依赖)
+    xuanti_point_map = {}
+
+    print("\n🚀 开始生成可视化...")
+
+    try:
+        output_path = visualize_classification_tree(
+            optimized_data_path=optimized_data_path,
+            posts_dir=posts_dir,
+            xuanti_point_map=xuanti_point_map,
+            dimension_associations_path=dimension_associations_path if os.path.exists(dimension_associations_path) else None,
+            intra_dimension_associations_path=intra_dimension_associations_path if os.path.exists(intra_dimension_associations_path) else None,
+            expanded_orthogonal_combinations_path=expanded_orthogonal_combinations_path if os.path.exists(expanded_orthogonal_combinations_path) else None,
+            enriched_xuanti_point_map_path=enriched_xuanti_point_map_path if os.path.exists(enriched_xuanti_point_map_path) else None
+        )
+
+        print(f"\n🎉 可视化完成!")
+        print(f"📄 输出文件: {output_path}")
+        print(f"\n💡 请在浏览器中打开: file://{output_path}")
+
+    except Exception as e:
+        print(f"\n❌ 生成可视化失败: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
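
Because base_dir resolves to the current working directory, the script is meant to be launched from the folder that holds the JSON data files. A hypothetical session (both paths are placeholders):

cd /path/to/data_dir        # 目录下需有 optimized_clustered_data_gemini-3-pro-preview.json
python3 /path/to/repo/scripts/run_visualizer.py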

+ 187 - 0
scripts/visualize_cascade.py

@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+生成级联树形可视化
+使用全新的cascade_visualizer生成四层级联展示
+"""
+
+import sys
+import json
+from pathlib import Path
+from datetime import datetime
+
+# 添加项目根目录到路径
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+from src.visualizers.cascade_visualizer import CascadeVisualizer
+
+
+def load_json(file_path: Path) -> dict:
+    """加载JSON文件"""
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def main():
+    """主函数"""
+    print("🎯 级联树形可视化生成器")
+    print("=" * 60)
+
+    # 定义路径
+    output_dir = project_root / "output_v2"
+    visualization_dir = project_root / "visualization"
+
+    # 加载评估数据
+    evaluated_file = output_dir / "evaluated_results.json"
+    print(f"📖 加载评估数据: {evaluated_file}")
+
+    if not evaluated_file.exists():
+        print(f"❌ 文件不存在: {evaluated_file}")
+        return
+
+    evaluated_data = load_json(evaluated_file)
+    print(f"✓ 加载了 {len(evaluated_data)} 个原始特征")
+
+    # 加载解构数据
+    deep_analysis_file = output_dir / "deep_analysis_results.json"
+    print(f"📖 加载解构数据: {deep_analysis_file}")
+
+    if not deep_analysis_file.exists():
+        print(f"❌ 文件不存在: {deep_analysis_file}")
+        return
+
+    deep_analysis_full = load_json(deep_analysis_file)
+    deep_analysis_data = deep_analysis_full.get('results', [])
+    print(f"✓ 加载了 {len(deep_analysis_data)} 个解构结果")
+
+    # 加载Stage8数据
+    similarity_file = output_dir / "similarity_analysis_results.json"
+    print(f"📖 加载Stage8数据: {similarity_file}")
+
+    similarity_data = []
+    if similarity_file.exists():
+        similarity_full = load_json(similarity_file)
+        similarity_data = similarity_full.get('results', [])
+        print(f"✓ 加载了 {len(similarity_data)} 个相似度评分")
+    else:
+        print("⚠️ Stage8数据文件不存在,将使用默认值")
+
+    # 计算统计数据
+    print("\n📊 计算统计数据...")
+    stats = calculate_stats(evaluated_data)
+    print("✓ 统计完成:")
+    print(f"  - 原始特征: {stats['原始特征数']}")
+    print(f"  - 搜索词总数: {stats['搜索词总数']}")
+    print(f"  - 帖子总数: {stats['帖子总数']}")
+    print(f"  - 完全匹配: {stats['完全匹配']} ({stats['完全匹配率']})")
+
+    # 提取所有特征信息
+    print("\n📊 提取所有特征信息...")
+    all_features = extract_all_features(evaluated_data, deep_analysis_data, similarity_data)
+    print(f"✓ 提取了 {len(all_features)} 个特征")
+
+    # 统计分类
+    high_similarity = sum(1 for f in all_features if f.get('相似度得分', 0) >= 0.8)
+    partial_match = sum(1 for f in all_features if 0.5 <= f.get('相似度得分', 0) < 0.8)
+    low_similarity = sum(1 for f in all_features if f.get('相似度得分', 0) < 0.5)
+
+    print(f"  - 高相似度特征(≥0.8): {high_similarity} 个")
+    print(f"  - 部分匹配特征(0.5-0.8): {partial_match} 个")
+    print(f"  - 低相似度特征(<0.5): {low_similarity} 个")
+
+    # 生成可视化
+    print("\n🎨 生成级联可视化页面...")
+    visualizer = CascadeVisualizer()
+
+    # 生成输出文件名
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_file = visualization_dir / f"cascade_results_{timestamp}.html"
+
+    # 生成HTML
+    result_file = visualizer.generate_html(all_features, stats, str(output_file))
+
+    print(f"✓ 生成完成: {result_file}")
+    print(f"\n🌐 在浏览器中打开查看:")
+    print(f"   file://{result_file}")
+
+
+def calculate_stats(evaluated_data: list) -> dict:
+    """计算统计数据"""
+    stats = {
+        '原始特征数': len(evaluated_data),
+        '搜索词总数': 0,
+        '帖子总数': 0,
+        '完全匹配': 0,
+        '相似匹配': 0,
+        '弱相似': 0,
+        '无匹配': 0,
+        '已过滤': 0
+    }
+
+    total_notes = 0
+    complete_notes = 0
+
+    for item in evaluated_data:
+        groups = item.get('组合评估结果_分组', [])
+        for group in groups:
+            searches = group.get('top10_searches', [])
+            stats['搜索词总数'] += len(searches)
+
+            for search in searches:
+                eval_data = search.get('evaluation_with_filter', {})
+                search_stats = eval_data.get('statistics', {})
+
+                stats['完全匹配'] += search_stats.get('完全匹配(0.8-1.0)', 0)
+                stats['相似匹配'] += search_stats.get('相似匹配(0.6-0.79)', 0)
+                stats['弱相似'] += search_stats.get('弱相似(0.5-0.59)', 0)
+                stats['无匹配'] += search_stats.get('无匹配(≤0.4)', 0)
+                stats['已过滤'] += eval_data.get('filtered_count', 0)
+
+                # 统计帖子总数
+                notes = search.get('search_result', {}).get('data', {}).get('data', [])
+                total_notes += len(notes)
+
+                # 统计完全匹配的帖子
+                notes_with_scores = eval_data.get('notes_with_scores', [])
+                for note_eval in notes_with_scores:
+                    match_level = note_eval.get('match_level', '')
+                    if '完全匹配' in match_level:
+                        complete_notes += 1
+
+    stats['帖子总数'] = total_notes
+    stats['完全匹配率'] = f"{(complete_notes / total_notes * 100):.1f}%" if total_notes > 0 else "0%"
+
+    return stats
+
+
+def extract_all_features(evaluated_data: list, deep_analysis_data: list, similarity_data: dict) -> list:
+    """
+    提取所有特征信息,整合评估数据、解构数据和相似度数据
+
+    注意: 当前实现只从评估数据中读取相似度得分,
+    deep_analysis_data 与 similarity_data 参数暂未参与整合。
+    """
+    all_features = []
+
+    # 遍历评估数据
+    for eval_item in evaluated_data:
+        post_target_word = eval_item.get('帖子目标词', '')
+        persona_feature = eval_item.get('人设特征名称', '')
+
+        # 简化处理:直接从eval_item中获取相似度得分
+        # 如果没有,默认为0.5(部分匹配)
+        similarity_score = eval_item.get('相似度得分', 0.5)
+
+        # 整合数据
+        feature = {
+            '帖子目标词': post_target_word,
+            '人设特征名称': persona_feature,
+            '相似度得分': similarity_score,
+            '组合评估结果_分组': eval_item.get('组合评估结果_分组', [])
+        }
+
+        all_features.append(feature)
+
+    return all_features
+
+
+if __name__ == "__main__":
+    main()
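
A minimal, hypothetical input item for calculate_stats() above, to make the nesting explicit. All counts are invented for illustration; only the key paths are taken from the code:

sample_item = {
    '组合评估结果_分组': [{
        'top10_searches': [{
            'evaluation_with_filter': {
                'statistics': {
                    '完全匹配(0.8-1.0)': 2,
                    '相似匹配(0.6-0.79)': 3,
                    '弱相似(0.5-0.59)': 1,
                    '无匹配(≤0.4)': 4,
                },
                'filtered_count': 1,
                'notes_with_scores': [
                    {'match_level': '完全匹配'},
                    {'match_level': '无匹配'},
                ],
            },
            'search_result': {'data': {'data': [{} for _ in range(10)]}},
        }]
    }]
}

stats = calculate_stats([sample_item])
# -> 搜索词总数=1, 帖子总数=10, 完全匹配=2, 完全匹配率='10.0%'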

+ 37 - 0
scripts/visualize_integrated_results.py

@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+整合可视化工具
+调用已适配新数据结构的可视化器,整合评估结果、深度解构和相似度分析
+"""
+
+import sys
+import os
+
+# 将项目根目录添加到Python路径
+project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, project_root)
+
+# 导入可视化器主函数
+from src.visualizers.deconstruction_visualizer import main as visualize_main
+
+
+if __name__ == '__main__':
+    """
+    包装脚本:调用已适配新数据结构的可视化器
+
+    读取三个数据文件:
+    1. output_v2/evaluated_results.json - 评估结果
+    2. output_v2/deep_analysis_results.json - 深度解构结果
+    3. output_v2/similarity_analysis_results.json - 相似度分析结果
+
+    生成整合的HTML可视化报告到 visualization/ 目录
+    """
+    try:
+        visualize_main()
+        sys.exit(0)
+    except Exception as e:
+        print(f"\n❌ 可视化生成失败: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)

+ 0 - 0
src/__init__.py


+ 0 - 0
src/analyzers/__init__.py


+ 603 - 0
src/analyzers/post_deconstruction_analyzer.py

@@ -0,0 +1,603 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+帖子解构分析器
+对评估结果中完全匹配的帖子进行深度解构分析
+"""
+
+import os
+import json
+import time
+import logging
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, List, Any, Optional
+
+from src.clients.deconstruction_api_client import DeconstructionAPIClient, map_note_to_api_format
+
+logger = logging.getLogger(__name__)
+
+try:
+    from tqdm import tqdm
+    TQDM_AVAILABLE = True
+except ImportError:
+    TQDM_AVAILABLE = False
+    # logger 必须在此之前定义,否则这里会抛 NameError
+    logger.warning("tqdm 未安装,将使用简单进度显示。安装命令: pip install tqdm")
+
+
+class PostDeconstructionAnalyzer:
+    """完全匹配帖子的深度解构分析"""
+
+    def __init__(
+        self,
+        api_url: str = "http://192.168.245.150:7000/what/analysis/single",
+        max_workers: int = 5,
+        max_notes: Optional[int] = None,
+        min_score: float = 8.0,
+        skip_count: int = 0,
+        sort_by: str = 'score',
+        timeout: int = 800,
+        max_retries: int = 3,
+        output_dir: str = "output_v2",
+        enable_image_download: bool = True,
+        image_server_url: str = "http://localhost:8765",
+        image_download_dir: str = "downloaded_images",
+        target_features: Optional[List[str]] = None
+    ):
+        """
+        初始化帖子解构分析器
+
+        Args:
+            api_url: API 地址
+            max_workers: 并发数
+            max_notes: 最多处理多少个帖子(None = 不限制)
+            min_score: 最低分数阈值(只处理 >= 此分数的帖子)
+            skip_count: 跳过前 N 个
+            sort_by: 排序方式 ('none' | 'score' | 'time' | 'engagement')
+                    - 'none': 不排序,保持Stage6数据原始顺序
+                    - 'score': 按评分降序
+                    - 'time': 按时间降序
+                    - 'engagement': 按互动量降序
+            timeout: API 超时时间
+            max_retries: API 最大重试次数
+            output_dir: 输出目录
+            enable_image_download: 是否启用图片下载(下载小红书图片并转换为本地URL)
+            image_server_url: 图片服务器URL
+            image_download_dir: 图片下载目录
+            target_features: 指定要处理的原始特征列表(None = 处理所有特征)
+        """
+        self.max_workers = max_workers
+        self.max_notes = max_notes
+        self.min_score = min_score
+        self.skip_count = skip_count
+        self.sort_by = sort_by
+        self.output_dir = output_dir
+        self.enable_image_download = enable_image_download
+        self.target_features = target_features  # 新增:目标特征过滤
+
+        # 初始化 API 客户端
+        self.api_client = DeconstructionAPIClient(
+            api_url=api_url,
+            timeout=timeout,
+            max_retries=max_retries
+        )
+
+        # 图片下载功能已弃用,直接使用原始图片URL
+        # 保留参数以向后兼容,但不再使用
+        if self.enable_image_download:
+            logger.warning("  注意: enable_image_download 参数已弃用,将直接使用原始图片URL")
+
+    def extract_matched_notes_from_evaluation(
+        self,
+        evaluation_results: List[Dict]
+    ) -> List[Dict]:
+        """
+        从评估结果中提取所有完全匹配的帖子
+
+        Args:
+            evaluation_results: 评估结果(列表)
+
+        Returns:
+            完全匹配的帖子列表
+        """
+        matched_notes = []
+
+        # 评估结果是一个列表,每个元素是一个 feature_group
+        for feature_group in evaluation_results:
+            original_feature = feature_group.get('原始特征名称', '')
+
+            # 如果指定了 target_features,只处理指定的特征
+            if self.target_features and original_feature not in self.target_features:
+                continue
+
+            # 遍历 组合评估结果_分组(这一层包含了 top10_searches)
+            for combo_group in feature_group.get('组合评估结果_分组', []):
+                # top10_searches 包含所有搜索结果
+                for search_item in combo_group.get('top10_searches', []):
+                    search_word = search_item.get('search_word', '')
+                    source_word = search_item.get('source_word', '')
+                    evaluation = search_item.get('evaluation_with_filter', {})
+
+                    # 检查是否有搜索结果
+                    if 'search_result' not in search_item:
+                        continue
+
+                    notes = search_item['search_result'].get('data', {}).get('data', [])
+
+                    # 遍历评估结果
+                    for note_eval in evaluation.get('notes_evaluation', []):
+                        score = note_eval.get('综合得分', 0)
+
+                        # 只处理完全匹配的(分数 >= min_score)
+                        if score >= self.min_score:
+                            note_index = note_eval.get('note_index', -1)
+                            if 0 <= note_index < len(notes):
+                                note = notes[note_index]
+
+                                matched_notes.append({
+                                    'note': note,
+                                    'note_card': note.get('note_card', {}),
+                                    'evaluation': note_eval,
+                                    'search_word': search_word,
+                                    'source_word': source_word,
+                                    'original_feature': original_feature,
+                                    'top3_persona_features': feature_group.get('top3匹配信息', [])
+                                })
+
+        return matched_notes
+
+    def sort_matched_notes(
+        self,
+        matched_notes: List[Dict]
+    ) -> List[Dict]:
+        """
+        对完全匹配的帖子进行排序
+
+        Args:
+            matched_notes: 匹配的帖子列表
+
+        Returns:
+            排序后的帖子列表
+        """
+        if self.sort_by == 'none':
+            # 不排序,保持数据原始顺序
+            return matched_notes
+
+        elif self.sort_by == 'score':
+            # 按评分降序(优先处理高分帖子)
+            return sorted(
+                matched_notes,
+                key=lambda x: x['evaluation'].get('综合得分', 0),
+                reverse=True
+            )
+
+        elif self.sort_by == 'time':
+            # 按时间降序(优先处理最新帖子)
+            return sorted(
+                matched_notes,
+                key=lambda x: x['note_card'].get('publish_timestamp', 0),
+                reverse=True
+            )
+
+        elif self.sort_by == 'engagement':
+            # 按互动量降序(点赞+收藏+评论)
+            def calc_engagement(note_data):
+                interact = note_data['note_card'].get('interact_info', {})
+                return (
+                    interact.get('liked_count', 0) +
+                    interact.get('collected_count', 0) +
+                    interact.get('comment_count', 0)
+                )
+
+            return sorted(
+                matched_notes,
+                key=calc_engagement,
+                reverse=True
+            )
+
+        return matched_notes
+
+    def _save_intermediate_results(
+        self,
+        results: List[Dict],
+        output_path: str,
+        processed_count: int,
+        total_count: int,
+        start_time: float
+    ):
+        """
+        保存中间结果
+
+        Args:
+            results: 当前结果列表
+            output_path: 输出路径
+            processed_count: 已处理数量
+            total_count: 总数量
+            start_time: 开始时间
+        """
+        # 构建中间结果文件路径
+        base_dir = os.path.dirname(output_path) or 'output_v2'
+        base_name = os.path.basename(output_path)
+        name_without_ext = os.path.splitext(base_name)[0]
+
+        intermediate_path = os.path.join(
+            base_dir,
+            f"{name_without_ext}_partial_{processed_count}of{total_count}.json"
+        )
+
+        # 统计成功失败数
+        success_count = sum(1 for r in results if r['api_response']['status'] == 'success')
+        failed_count = len(results) - success_count
+
+        # 构建中间结果
+        intermediate_result = {
+            'metadata': {
+                'stage': 'deconstruction_partial',
+                'description': f'部分结果({processed_count}/{total_count})',
+                'processed_notes': len(results),
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'saved_at': datetime.now().isoformat(),
+                'processing_time_seconds': round(time.time() - start_time, 2)
+            },
+            'results': results
+        }
+
+        # 保存
+        os.makedirs(base_dir, exist_ok=True)
+        with open(intermediate_path, 'w', encoding='utf-8') as f:
+            json.dump(intermediate_result, f, ensure_ascii=False, indent=2)
+
+        logger.info(f"    已保存中间结果: {intermediate_path} ({processed_count}/{total_count})")
+
+    def process_single_note(
+        self,
+        matched_note_data: Dict,
+        index: int,
+        total: int
+    ) -> Dict:
+        """
+        处理单个帖子的解构分析
+
+        Args:
+            matched_note_data: 匹配的帖子数据
+            index: 当前索引(用于日志)
+            total: 总数(用于日志)
+
+        Returns:
+            处理结果
+        """
+        note = matched_note_data['note']
+        note_card = matched_note_data['note_card']
+        evaluation = matched_note_data['evaluation']
+        search_word = matched_note_data['search_word']
+        original_feature = matched_note_data['original_feature']
+
+        note_id = note.get('id', '')
+        note_title = note_card.get('display_title', '')[:30]  # 前30个字符
+
+        logger.info(f"[{index}/{total}] 解构分析: {note_id}")
+        logger.info(f"  标题: {note_title}...")
+        logger.info(f"  搜索词: {search_word}")
+        logger.info(f"  原始特征: {original_feature}")
+
+        # 获取关键匹配点(用于保存到结果中)
+        key_points = evaluation.get('关键匹配点', [])
+
+        # 获取 top3 人设特征
+        top3_features = matched_note_data.get('top3_persona_features', [])
+
+        # 构建 start_points - 只使用 top3 的第一个人设特征名称
+        start_points = []
+        if top3_features:
+            first_feature = top3_features[0].get('人设特征名称', '')
+            if first_feature:
+                start_points = [first_feature]
+
+        logger.info(f"  start_points: {start_points}")
+        if top3_features:
+            logger.info(f"  top3人设特征: {[f.get('人设特征名称', '') for f in top3_features[:3]]}")
+
+        # 直接使用原始图片URL,不做任何处理
+        original_images = note_card.get('image_list', [])
+        if original_images:
+            logger.info(f"  图片数量: {len(original_images)}")
+
+        # 映射数据为 API 格式(直接使用原始图片URL)
+        api_payload = map_note_to_api_format(
+            note=note,
+            note_card=note_card,
+            evaluation=evaluation,
+            search_word=search_word,
+            original_feature=original_feature,
+            start_points=start_points,
+            processed_image_urls=None  # 不传递处理后的URL,使用原始URL
+        )
+
+        # 调用 API
+        start_time = time.time()
+        api_response = self.api_client.call_api(api_payload)
+        processing_time = (time.time() - start_time) * 1000  # 毫秒
+
+        # 构建结果
+        result = {
+            'note_id': note_id,
+            'search_word': search_word,
+            'original_feature': original_feature,
+            'source_word': matched_note_data['source_word'],
+            'evaluation_score': evaluation.get('综合得分', 0),
+            'evaluation_type': evaluation.get('匹配类型', ''),
+            'evaluation_confidence': evaluation.get('置信度', ''),
+            'key_matching_points': key_points,
+            'note_data': {
+                'title': note_card.get('display_title', ''),
+                'author': note_card.get('user', {}).get('nick_name', ''),
+                'link': f"https://www.xiaohongshu.com/explore/{note_id}"
+            },
+            'api_request': api_payload,
+            'api_response': api_response,
+            'processed_at': datetime.now().isoformat(),
+            'processing_time_ms': round(processing_time, 2)
+        }
+
+        if api_response['status'] == 'success':
+            logger.info(f"  ✓ 成功 ({processing_time:.0f}ms)")
+        else:
+            logger.error(f"  ✗ 失败: {api_response['error']}")
+
+        return result
+
+    def run(
+        self,
+        evaluation_results: List[Dict],
+        output_path: Optional[str] = None
+    ) -> Dict:
+        """
+        执行深度解构分析
+
+        Args:
+            evaluation_results: 评估结果(列表,与 extract_matched_notes_from_evaluation 的输入一致)
+            output_path: 输出路径(可选)
+
+        Returns:
+            解构分析结果
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info("解构分析: 完全匹配帖子的深度解构分析")
+        logger.info("=" * 60)
+
+        # 打印配置参数
+        logger.info("配置参数:")
+        logger.info(f"  API 地址: {self.api_client.api_url}")
+        if self.target_features:
+            logger.info(f"  目标特征: {', '.join(self.target_features)}")
+        else:
+            logger.info(f"  目标特征: 全部")
+        logger.info(f"  最低分数阈值: {self.min_score}")
+        logger.info(f"  并发数: {self.max_workers}")
+        logger.info(f"  最多处理帖子数: {self.max_notes if self.max_notes else '不限制'}")
+        logger.info(f"  跳过前 N 个: {self.skip_count}")
+        logger.info(f"  排序方式: {self.sort_by}")
+        logger.info(f"  API 超时: {self.api_client.timeout}秒")
+        logger.info(f"  最大重试次数: {self.api_client.max_retries}")
+
+        # 默认输出路径
+        if output_path is None:
+            output_path = os.path.join(self.output_dir, "deep_analysis_results.json")
+
+        # 1. 提取完全匹配的帖子
+        matched_notes = self.extract_matched_notes_from_evaluation(evaluation_results)
+        total_matched = len(matched_notes)
+
+        logger.info(f"  完全匹配帖子总数: {total_matched} (分数 >= {self.min_score})")
+
+        if total_matched == 0:
+            logger.warning("  没有找到完全匹配的帖子")
+            return {
+                'metadata': {
+                    'stage': 'deconstruction',
+                    'total_matched_notes': 0,
+                    'processed_notes': 0
+                },
+                'results': []
+            }
+
+        # 2. 排序
+        matched_notes = self.sort_matched_notes(matched_notes)
+        logger.info(f"  排序方式: {self.sort_by}")
+
+        # 3. 跳过前 N 个
+        if self.skip_count > 0:
+            logger.info(f"  跳过前 {self.skip_count} 个")
+            matched_notes = matched_notes[self.skip_count:]
+
+        # 4. 限制数量
+        if self.max_notes is not None and len(matched_notes) > self.max_notes:
+            logger.info(f"  数量限制: {self.max_notes}")
+            matched_notes = matched_notes[:self.max_notes]
+
+        to_process = len(matched_notes)
+        logger.info(f"  实际处理: {to_process} 个")
+        logger.info(f"  并发数: {self.max_workers}")
+        logger.info(f"  API: {self.api_client.api_url}")
+
+        if to_process == 0:
+            logger.warning("  没有需要处理的帖子")
+            return {
+                'metadata': {
+                    'stage': 'deconstruction',
+                    'total_matched_notes': total_matched,
+                    'processed_notes': 0,
+                    'skipped_notes': self.skip_count
+                },
+                'results': []
+            }
+
+        # 5. 并行处理
+        results = []
+        start_time = time.time()
+        save_interval = 10  # 每处理10个帖子保存一次
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = []
+            for idx, note_data in enumerate(matched_notes, start=1):
+                future = executor.submit(
+                    self.process_single_note,
+                    note_data,
+                    idx,
+                    to_process
+                )
+                futures.append(future)
+
+            # 收集结果(带进度显示)
+            if TQDM_AVAILABLE:
+                # 使用 tqdm 进度条
+                logger.info("  使用进度条显示...")
+                iterator = tqdm(
+                    as_completed(futures),
+                    total=len(futures),
+                    desc="  处理进度",
+                    unit="帖子",
+                    ncols=100
+                )
+            else:
+                # 简单进度显示
+                iterator = as_completed(futures)
+
+            processed_count = 0
+            for future in iterator:
+                try:
+                    result = future.result()
+                    results.append(result)
+                    processed_count += 1
+
+                    # 增量保存(每处理一定数量保存一次)
+                    if processed_count % save_interval == 0:
+                        self._save_intermediate_results(
+                            results,
+                            output_path,
+                            processed_count,
+                            to_process,
+                            start_time
+                        )
+
+                    # 简单进度显示(如果没有 tqdm)
+                    if not TQDM_AVAILABLE and processed_count % 5 == 0:
+                        logger.info(f"  进度: {processed_count}/{to_process}")
+
+                except Exception as e:
+                    logger.error(f"  处理失败: {e}")
+
+        processing_time = time.time() - start_time
+
+        # 6. 统计
+        success_count = sum(1 for r in results if r['api_response']['status'] == 'success')
+        failed_count = len(results) - success_count
+
+        logger.info(f"\n  总耗时: {processing_time:.1f}s")
+        logger.info(f"  成功: {success_count}")
+        logger.info(f"  失败: {failed_count}")
+
+        # 6.5. 加载已有结果(如果存在)并合并
+        existing_results = []
+        if os.path.exists(output_path):
+            logger.info(f"\n  检测到已有结果文件,准备合并...")
+            try:
+                with open(output_path, 'r', encoding='utf-8') as f:
+                    existing_data = json.load(f)
+                    existing_results = existing_data.get('results', [])
+                    logger.info(f"  已有结果数: {len(existing_results)}")
+            except Exception as e:
+                logger.warning(f"  加载已有结果失败: {e},将覆盖写入")
+                existing_results = []
+
+        # 6.6. 合并新旧结果(基于 note_id 去重)
+        if existing_results:
+            # 建立已有结果的 note_id 索引
+            existing_note_ids = {r['note_id']: r for r in existing_results}
+
+            # 统计更新数量
+            updated_count = 0
+            for new_result in results:
+                if new_result['note_id'] in existing_note_ids:
+                    updated_count += 1
+                # 用新结果更新已有结果(新结果优先)
+                existing_note_ids[new_result['note_id']] = new_result
+
+            # 合并后的完整结果
+            merged_results = list(existing_note_ids.values())
+
+            logger.info(f"  合并后总结果数: {len(merged_results)}")
+            logger.info(f"  本次新增: {len(results) - updated_count} 条")
+            logger.info(f"  本次更新: {updated_count} 条")
+        else:
+            merged_results = results
+            logger.info(f"  无已有结果,直接保存")
+
+        # 7. 构建最终结果
+        final_result = {
+            'metadata': {
+                'stage': 'deconstruction',
+                'description': '完全匹配帖子的深度解构分析',
+                'target_features': self.target_features if self.target_features else '全部',
+                'total_matched_notes': total_matched,
+                'processed_notes': len(results),
+                'total_results_count': len(merged_results),
+                'new_results_count': len(results),
+                'skipped_notes': self.skip_count,
+                'max_notes_limit': self.max_notes,
+                'sort_by': self.sort_by,
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'api_url': self.api_client.api_url,
+                'min_score_threshold': self.min_score,
+                'created_at': datetime.now().isoformat(),
+                'processing_time_seconds': round(processing_time, 2)
+            },
+            'results': merged_results
+        }
+
+        # 8. 保存结果
+        os.makedirs(os.path.dirname(output_path) or self.output_dir, exist_ok=True)
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(final_result, f, ensure_ascii=False, indent=2)
+
+        logger.info(f"  结果已保存: {output_path}")
+
+        return final_result
+
+
+def test_post_deconstruction_analyzer():
+    """测试帖子解构分析器"""
+    # 读取 评估结果
+    evaluation_path = "output_v2/evaluated_results.json"
+
+    if not os.path.exists(evaluation_path):
+        print(f"评估结果不存在: {evaluation_path}")
+        return
+
+    with open(evaluation_path, 'r', encoding='utf-8') as f:
+        evaluation_results = json.load(f)
+
+    # 创建分析器
+    analyzer = PostDeconstructionAnalyzer(
+        max_workers=3,
+        max_notes=5,  # 只测试 5 个
+        skip_count=0,
+        sort_by='score'
+    )
+
+    # 运行分析
+    deconstruction_results = analyzer.run(evaluation_results)
+
+    print(f"\n处理了 {deconstruction_results['metadata']['processed_notes']} 个帖子")
+    print(f"成功: {deconstruction_results['metadata']['success_count']}")
+    print(f"失败: {deconstruction_results['metadata']['failed_count']}")
+
+
+if __name__ == '__main__':
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    test_post_deconstruction_analyzer()
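
A quick sketch of the merge semantics in step 6.6 of run() above: old and new results are keyed by note_id, and a new result silently replaces an old one with the same id (toy data, not from the pipeline):

existing = [{'note_id': 'a', 'v': 1}, {'note_id': 'b', 'v': 1}]
incoming = [{'note_id': 'b', 'v': 2}, {'note_id': 'c', 'v': 2}]

merged_by_id = {r['note_id']: r for r in existing}
for r in incoming:                 # new results take precedence
    merged_by_id[r['note_id']] = r

merged = list(merged_by_id.values())
# -> 'a' kept (v=1), 'b' updated (v=2), 'c' added (v=2)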

+ 757 - 0
src/analyzers/similarity_analyzer.py

@@ -0,0 +1,757 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+解构特征相似度分析器
+计算解构特征与原始特征的相似度评分
+"""
+
+import os
+import json
+import time
+import logging
+import asyncio
+from datetime import datetime
+from typing import Dict, List, Any, Optional
+from lib.hybrid_similarity import compare_phrases_cartesian
+
+try:
+    from tqdm import tqdm
+    TQDM_AVAILABLE = True
+except ImportError:
+    TQDM_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+
+def extract_deconstructed_features(api_response: Dict) -> List[Dict]:
+    """
+    从三点解构中提取所有特征
+
+    Args:
+        api_response: 解构分析 的 api_response 对象
+
+    Returns:
+        特征列表,每个特征包含:
+        - feature_name: 特征名称
+        - dimension: 维度 (灵感点-全新内容/灵感点-共性差异/灵感点-共性内容/目的点/关键点)
+        - dimension_detail: 维度细分 (实质/形式/意图等)
+        - weight: 权重
+        - source_index: 在该维度中的索引
+        - source_*: 溯源信息 (候选编号、目的点描述、关键点描述等)
+    """
+    features = []
+
+    # 检查 API 响应状态
+    if api_response.get('status') != 'success':
+        logger.warning("  API 响应状态不是 success,无法提取特征")
+        return features
+
+    result = api_response.get('result', {})
+
+    # 检查是否有 data 字段
+    if 'data' not in result:
+        logger.warning("  API 响应中没有 data 字段")
+        return features
+
+    data = result['data']
+    three_point = data.get('三点解构', {})
+
+    if not three_point:
+        logger.warning("  三点解构数据为空")
+        return features
+
+    # 1. 提取灵感点 (3个子类别)
+    inspiration = three_point.get('灵感点', {})
+    for category in ['全新内容', '共性差异', '共性内容']:
+        items = inspiration.get(category, [])
+        for idx, item in enumerate(items):
+            extracted_features = item.get('提取的特征', [])
+            for feat in extracted_features:
+                feature_name = feat.get('特征名称', '')
+                if not feature_name:
+                    continue
+
+                features.append({
+                    'feature_name': feature_name,
+                    'dimension': f'灵感点-{category}',
+                    'dimension_detail': feat.get('维度分类', ''),  # 注意字段名
+                    'weight': feat.get('权重', 0),
+                    'source_index': idx,
+                    'source_candidate_number': item.get('候选编号', 0),
+                    'source_inspiration': item.get('灵感点', '')
+                })
+
+    # 2. 提取目的点
+    purpose = three_point.get('目的点', {})
+    purposes_list = purpose.get('purposes', [])
+    for idx, item in enumerate(purposes_list):
+        extracted_features = item.get('提取的特征', [])
+        for feat in extracted_features:
+            feature_name = feat.get('特征名称', '')
+            if not feature_name:
+                continue
+
+            features.append({
+                'feature_name': feature_name,
+                'dimension': '目的点',
+                'dimension_detail': feat.get('特征分类', ''),  # 注意字段名
+                'weight': feat.get('权重', 0),
+                'source_index': idx,
+                'source_purpose': item.get('目的点', ''),
+                'source_purpose_dimension': item.get('维度', {})
+            })
+
+    # 3. 提取关键点
+    key_points_data = three_point.get('关键点', {})
+    key_points_list = key_points_data.get('key_points', [])
+    for idx, item in enumerate(key_points_list):
+        extracted_features = item.get('提取的特征', [])
+        for feat in extracted_features:
+            feature_name = feat.get('特征名称', '')
+            if not feature_name:
+                continue
+
+            features.append({
+                'feature_name': feature_name,
+                'dimension': '关键点',
+                'dimension_detail': feat.get('维度', ''),  # 注意字段名
+                'weight': feat.get('权重', 0),
+                'source_index': idx,
+                'source_candidate_number': item.get('候选编号', 0),
+                'source_key_point': item.get('关键点', ''),
+                'source_key_point_dimension': item.get('维度', '')
+            })
+
+    logger.info(f"  提取特征数量: {len(features)}")
+    if features:
+        # 统计各维度数量
+        dimension_counts = {}
+        for feat in features:
+            dim = feat['dimension']
+            dimension_counts[dim] = dimension_counts.get(dim, 0) + 1
+        logger.info(f"  维度分布: {dimension_counts}")
+
+    return features
+
+
+async def calculate_similarity_for_note(
+    note_result: Dict,
+    original_feature: str,
+    weight_embedding: float = 0.5,
+    weight_semantic: float = 0.5,
+    min_similarity: float = 0.0
+) -> Dict:
+    """
+    计算单个帖子的所有特征与原始特征的相似度
+
+    Args:
+        note_result: 解构分析 的单个 result 对象
+        original_feature: 原始特征名称
+        weight_embedding: 向量模型权重(当前未传入 compare_phrases_cartesian,仅作记录)
+        weight_semantic: LLM 模型权重(同上)
+        min_similarity: 最小相似度阈值,低于此值的特征会被过滤
+
+    Returns:
+        包含相似度信息的结果对象
+    """
+    note_id = note_result.get('note_id', '')
+
+    logger.info(f"  [{note_id}] 开始计算相似度...")
+
+    # 1. 提取解构特征
+    deconstructed_features = extract_deconstructed_features(
+        note_result['api_response']
+    )
+
+    if not deconstructed_features:
+        logger.warning(f"  [{note_id}] 没有提取到特征")
+        return {
+            'note_id': note_id,
+            'original_feature': original_feature,
+            'evaluation_score': note_result.get('evaluation_score', 0),
+            'search_word': note_result.get('search_word', ''),
+            'note_data': note_result.get('note_data', {}),
+            'deconstructed_features': [],
+            'similarity_statistics': {
+                'total_features': 0,
+                'max_similarity': 0,
+                'min_similarity': 0,
+                'avg_similarity': 0,
+                'high_similarity_count': 0,
+                'medium_similarity_count': 0,
+                'low_similarity_count': 0
+            }
+        }
+
+    # 2. 构建特征名称列表
+    feature_names = [f['feature_name'] for f in deconstructed_features]
+
+    logger.info(f"  [{note_id}] 调用相似度计算 API (1×{len(feature_names)} 笛卡尔积)...")
+
+    # 3. 批量计算相似度 (1×N 笛卡尔积)
+    try:
+        start_time = time.time()
+        similarity_results = await compare_phrases_cartesian(
+            phrases_a=[original_feature],
+            phrases_b=feature_names,
+            max_concurrent=50
+        )
+        elapsed = time.time() - start_time
+        logger.info(f"  [{note_id}] 相似度计算完成 ({elapsed:.1f}秒)")
+
+        # 4. 映射结果回特征对象
+        for i, feat in enumerate(deconstructed_features):
+            feat['similarity_score'] = similarity_results[0][i]['相似度']
+            feat['similarity_explanation'] = similarity_results[0][i]['说明']
+
+        # 5. 过滤低相似度特征
+        if min_similarity > 0:
+            original_count = len(deconstructed_features)
+            deconstructed_features = [
+                f for f in deconstructed_features
+                if f['similarity_score'] >= min_similarity
+            ]
+            filtered_count = original_count - len(deconstructed_features)
+            if filtered_count > 0:
+                logger.info(f"  [{note_id}] 过滤掉 {filtered_count} 个低相似度特征 (< {min_similarity})")
+
+        # 6. 计算统计信息
+        if deconstructed_features:
+            scores = [f['similarity_score'] for f in deconstructed_features]
+            statistics = {
+                'total_features': len(scores),
+                'max_similarity': round(max(scores), 3),
+                'min_similarity': round(min(scores), 3),
+                'avg_similarity': round(sum(scores) / len(scores), 3),
+                'high_similarity_count': sum(1 for s in scores if s >= 0.7),
+                'medium_similarity_count': sum(1 for s in scores if 0.5 <= s < 0.7),
+                'low_similarity_count': sum(1 for s in scores if s < 0.5)
+            }
+
+            # 7. 按相似度降序排序
+            deconstructed_features.sort(key=lambda x: x['similarity_score'], reverse=True)
+
+            logger.info(f"  [{note_id}] 统计: 最高={statistics['max_similarity']}, "
+                       f"平均={statistics['avg_similarity']}, "
+                       f"高相似度={statistics['high_similarity_count']}个")
+        else:
+            statistics = {
+                'total_features': 0,
+                'max_similarity': 0,
+                'min_similarity': 0,
+                'avg_similarity': 0,
+                'high_similarity_count': 0,
+                'medium_similarity_count': 0,
+                'low_similarity_count': 0
+            }
+
+        return {
+            'note_id': note_id,
+            'original_feature': original_feature,
+            'evaluation_score': note_result.get('evaluation_score', 0),
+            'search_word': note_result.get('search_word', ''),
+            'note_data': note_result.get('note_data', {}),
+            'deconstructed_features': deconstructed_features,
+            'similarity_statistics': statistics,
+            'processing_time_seconds': round(elapsed, 2)
+        }
+
+    except Exception as e:
+        logger.error(f"  [{note_id}] 相似度计算失败: {e}")
+        return {
+            'note_id': note_id,
+            'original_feature': original_feature,
+            'evaluation_score': note_result.get('evaluation_score', 0),
+            'search_word': note_result.get('search_word', ''),
+            'note_data': note_result.get('note_data', {}),
+            'deconstructed_features': [],
+            'similarity_statistics': {
+                'total_features': 0,
+                'error': str(e)
+            }
+        }
+
+
+class SimilarityAnalyzer:
+    """相似度分析: 解构特征与原始特征的相似度分析"""
+
+    def __init__(
+        self,
+        weight_embedding: float = 0.5,
+        weight_semantic: float = 0.5,
+        max_workers: int = 5,
+        min_similarity: float = 0.0,
+        output_dir: str = "output_v2",
+        target_features: Optional[List[str]] = None,
+        evaluation_results_path: str = 'output_v2/evaluated_results.json',
+        update_evaluation_scores: bool = True
+    ):
+        """
+        初始化相似度分析器
+
+        Args:
+            weight_embedding: 向量模型权重(默认 0.5)
+            weight_semantic: LLM 模型权重(默认 0.5)
+            max_workers: 最大并发数(默认 5)
+            min_similarity: 最小相似度阈值(默认 0.0,保留所有特征)
+            output_dir: 输出目录
+            target_features: 指定要处理的原始特征列表(None = 处理所有特征)
+            evaluation_results_path: 评估结果数据文件路径(用于计算综合得分)
+            update_evaluation_scores: 是否计算并更新评估结果的综合得分(默认 True)
+        """
+        self.weight_embedding = weight_embedding
+        self.weight_semantic = weight_semantic
+        self.max_workers = max_workers
+        self.min_similarity = min_similarity
+        self.output_dir = output_dir
+        self.target_features = target_features
+        self.evaluation_results_path = evaluation_results_path
+        self.update_evaluation_scores = update_evaluation_scores
+
+        # 验证权重
+        total_weight = weight_embedding + weight_semantic
+        if abs(total_weight - 1.0) > 0.001:
+            raise ValueError(f"权重之和必须为1.0,当前为: {total_weight}")
+
+    def _save_intermediate_results(
+        self,
+        results: List[Dict],
+        output_path: str,
+        processed_count: int,
+        total_count: int,
+        start_time: float
+    ):
+        """保存中间结果"""
+        base_dir = os.path.dirname(output_path) or self.output_dir
+        base_name = os.path.basename(output_path)
+        name_without_ext = os.path.splitext(base_name)[0]
+
+        intermediate_path = os.path.join(
+            base_dir,
+            f"{name_without_ext}_partial_{processed_count}of{total_count}.json"
+        )
+
+        # 统计
+        total_features = sum(r['similarity_statistics'].get('total_features', 0) for r in results)
+        # 使用 .get 防御: 计算失败的结果里没有 max_similarity 字段
+        avg_max_sim = sum(r['similarity_statistics'].get('max_similarity', 0) for r in results) / len(results)
+
+        intermediate_result = {
+            'metadata': {
+                'stage': 'similarity_analysis_partial',
+                'description': f'部分结果({processed_count}/{total_count})',
+                'processed_notes': len(results),
+                'total_features_extracted': total_features,
+                'avg_max_similarity': round(avg_max_sim, 3),
+                'saved_at': datetime.now().isoformat(),
+                'processing_time_seconds': round(time.time() - start_time, 2)
+            },
+            'results': results
+        }
+
+        os.makedirs(base_dir, exist_ok=True)
+        with open(intermediate_path, 'w', encoding='utf-8') as f:
+            json.dump(intermediate_result, f, ensure_ascii=False, indent=2)
+
+        logger.info(f"    已保存中间结果: {intermediate_path}")
+
+    async def run_async(
+        self,
+        deconstruction_results: Dict,
+        output_path: Optional[str] = None
+    ) -> Dict:
+        """
+        执行 相似度分析 相似度分析(异步版本)
+
+        Args:
+            deconstruction_results: 解构分析 结果
+            output_path: 输出路径(可选)
+
+        Returns:
+            相似度分析 结果
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info("相似度分析: 解构特征与原始特征的相似度分析")
+        logger.info("=" * 60)
+
+        # 打印配置
+        logger.info("配置参数:")
+        logger.info(f"  向量模型权重: {self.weight_embedding}")
+        logger.info(f"  LLM 模型权重: {self.weight_semantic}")
+        logger.info(f"  最大并发数: {self.max_workers}")
+        logger.info(f"  最小相似度阈值: {self.min_similarity}")
+        if self.target_features:
+            logger.info(f"  目标特征: {', '.join(self.target_features)}")
+        else:
+            logger.info(f"  目标特征: 全部")
+
+        # 默认输出路径
+        if output_path is None:
+            output_path = os.path.join(self.output_dir, "similarity_analysis_results.json")
+
+        # 提取 解构分析 结果
+        results_list = deconstruction_results.get('results', [])
+
+        # 过滤目标特征
+        if self.target_features:
+            results_list = [
+                r for r in results_list
+                if r.get('original_feature') in self.target_features
+            ]
+
+        total_notes = len(results_list)
+        logger.info(f"  待处理帖子数: {total_notes}")
+
+        if total_notes == 0:
+            logger.warning("  没有需要处理的帖子")
+            return {
+                'metadata': {
+                    'stage': 'similarity_analysis',
+                    'processed_notes': 0
+                },
+                'results': []
+            }
+
+        # 创建任务列表
+        start_time = time.time()
+        results = []
+
+        # 使用 Semaphore 控制并发数
+        semaphore = asyncio.Semaphore(self.max_workers)
+
+        async def bounded_task(result):
+            async with semaphore:
+                return await calculate_similarity_for_note(
+                    result,
+                    result.get('original_feature', ''),
+                    self.weight_embedding,
+                    self.weight_semantic,
+                    self.min_similarity
+                )
+
+        tasks = [bounded_task(result) for result in results_list]
+
+        # 带进度条执行
+        if TQDM_AVAILABLE:
+            logger.info("  使用进度条显示...")
+            processed_count = 0
+            save_interval = 10
+
+            for coro in tqdm(
+                asyncio.as_completed(tasks),
+                total=len(tasks),
+                desc="  相似度计算进度",
+                unit="帖子",
+                ncols=100
+            ):
+                result = await coro
+                results.append(result)
+                processed_count += 1
+
+                # 增量保存
+                if processed_count % save_interval == 0:
+                    self._save_intermediate_results(
+                        results,
+                        output_path,
+                        processed_count,
+                        total_notes,
+                        start_time
+                    )
+        else:
+            # 简单执行
+            results = await asyncio.gather(*tasks)
+            logger.info(f"  完成: {len(results)}/{total_notes}")
+
+        processing_time = time.time() - start_time
+
+        # 计算总体统计
+        total_features = sum(r['similarity_statistics']['total_features'] for r in results)
+        all_max_similarities = [r['similarity_statistics']['max_similarity'] for r in results if r['similarity_statistics']['total_features'] > 0]
+
+        overall_stats = {
+            'total_notes': total_notes,
+            'total_features_extracted': total_features,
+            'avg_features_per_note': round(total_features / total_notes, 1) if total_notes > 0 else 0,
+            'avg_max_similarity': round(sum(all_max_similarities) / len(all_max_similarities), 3) if all_max_similarities else 0,
+            'notes_with_high_similarity': sum(1 for r in results if r['similarity_statistics'].get('high_similarity_count', 0) > 0)
+        }
+
+        logger.info(f"\n  总耗时: {processing_time:.1f}秒")
+        logger.info(f"  总特征数: {total_features}")
+        logger.info(f"  平均特征数/帖子: {overall_stats['avg_features_per_note']}")
+        logger.info(f"  平均最高相似度: {overall_stats['avg_max_similarity']}")
+        logger.info(f"  包含高相似度特征的帖子: {overall_stats['notes_with_high_similarity']}")
+
+        # 构建最终结果
+        final_result = {
+            'metadata': {
+                'stage': 'similarity_analysis',
+                'description': '解构特征与原始特征的相似度评分',
+                'source_file': deconstruction_results.get('metadata', {}).get('created_at', ''),  # 注: 实际存的是上游解构结果的创建时间
+                'target_features': self.target_features if self.target_features else '全部',
+                'similarity_config': {
+                    'algorithm': 'hybrid_similarity',
+                    'weight_embedding': self.weight_embedding,
+                    'weight_semantic': self.weight_semantic,
+                    'min_similarity_threshold': self.min_similarity
+                },
+                'overall_statistics': overall_stats,
+                'created_at': datetime.now().isoformat(),
+                'processing_time_seconds': round(processing_time, 2)
+            },
+            'results': results
+        }
+
+        # 保存结果
+        os.makedirs(os.path.dirname(output_path) or self.output_dir, exist_ok=True)
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(final_result, f, ensure_ascii=False, indent=2)
+
+        logger.info(f"  结果已保存: {output_path}")
+
+        # 计算并更新综合得分P
+        if self.update_evaluation_scores:
+            logger.info("\n" + "=" * 60)
+            logger.info("开始计算综合得分P并更新评估结果数据...")
+            logger.info("=" * 60)
+            self._calculate_and_update_comprehensive_scores(results)
+
+        return final_result
+
+    def _calculate_and_update_comprehensive_scores(self, similarity_results: List[Dict]):
+        """
+        计算综合得分P并更新评估结果数据
+
+        Args:
+            similarity_results: 相似度分析的结果列表
+        """
+        try:
+            # 1. 加载 评估结果 数据
+            logger.info(f"  加载 评估结果 数据: {self.evaluation_results_path}")
+            if not os.path.exists(self.evaluation_results_path):
+                logger.error(f"  评估结果 文件不存在: {self.evaluation_results_path}")
+                return
+
+            with open(self.evaluation_results_path, 'r', encoding='utf-8') as f:
+                evaluation_data = json.load(f)
+
+            # 2. 构建 相似度分析 映射 (note_id → max_similarity)
+            logger.info("  构建相似度映射...")
+            similarity_map = {}
+            for result in similarity_results:
+                note_id = result['note_id']
+                max_similarity = result['similarity_statistics']['max_similarity']
+                similarity_map[note_id] = max_similarity
+
+            logger.info(f"  相似度映射条目数: {len(similarity_map)}")
+
+            # 3. 遍历 评估结果 中的所有原始特征和搜索词,计算 P 值
+            # 评估结果 数据是一个列表,每个元素是一个原始特征
+            updated_count = 0
+            total_searches = 0
+
+            logger.info(f"  开始遍历 {len(evaluation_data)} 个原始特征...")
+
+            for feature_item in evaluation_data:
+                original_feature = feature_item.get('原始特征名称', '')
+                logger.info(f"\n  处理原始特征: {original_feature}")
+
+                # 遍历每个分组
+                for group in feature_item.get('组合评估结果_分组', []):
+                    source_word = group.get('source_word', '')
+
+                    # 遍历该分组的所有搜索词
+                    for search_item in group.get('top10_searches', []):
+                        search_word = search_item.get('search_word', '')
+                        total_searches += 1
+
+                        logger.info(f"    处理搜索词: {search_word} (来源: {source_word})")
+
+                        # 计算该搜索词的综合得分
+                        p_score, p_detail = self._calculate_single_query_score(
+                            search_item,
+                            similarity_map
+                        )
+
+                        # 更新搜索词数据
+                        if p_score is not None:
+                            search_item['comprehensive_score'] = round(p_score, 3)
+                            search_item['comprehensive_score_detail'] = p_detail
+                            updated_count += 1
+                            logger.info(f"      综合得分P = {p_score:.3f} (M={p_detail['M']}, N={p_detail['N']})")
+                        else:
+                            logger.warning(f"      无法计算综合得分(可能缺少数据)")
+
+            # 4. 保存更新后的 评估结果 数据
+            logger.info(f"\n  保存更新后的 评估结果 数据...")
+            logger.info(f"  已更新 {updated_count}/{total_searches} 个搜索词")
+
+            with open(self.evaluation_results_path, 'w', encoding='utf-8') as f:
+                json.dump(evaluation_data, f, ensure_ascii=False, indent=2)
+
+            logger.info(f"  更新完成: {self.evaluation_results_path}")
+
+        except Exception as e:
+            logger.error(f"  计算综合得分失败: {e}", exc_info=True)
+
+    def _calculate_single_query_score(
+        self,
+        query: Dict,
+        similarity_map: Dict[str, float]
+    ) -> tuple[Optional[float], Optional[Dict]]:
+        """
+        计算单个查询的综合得分P
+
+        Args:
+            query: 评估结果 中的单个查询对象
+            similarity_map: note_id → max_similarity 的映射
+
+        Returns:
+            (P值, 详细计算信息) 或 (None, None)
+        """
+        # 获取总帖子数 N
+        evaluation_with_filter = query.get('evaluation_with_filter', {})
+        N = evaluation_with_filter.get('total_notes', 0)
+
+        if N == 0:
+            logger.warning(f"    查询总帖子数为0,无法计算P值")
+            return None, None
+
+        # 获取笔记评估数据和原始笔记数据
+        notes_evaluation = evaluation_with_filter.get('notes_evaluation', [])
+        search_result = query.get('search_result', {})
+        notes_data = search_result.get('data', {}).get('data', [])
+
+        if not notes_evaluation or not notes_data:
+            logger.warning(f"    缺少评估数据或笔记数据")
+            return 0.0, {
+                'N': N,
+                'M': 0,
+                'total_contribution': 0.0,
+                'complete_matches': []
+            }
+
+        # 获取完全匹配的帖子列表 (综合得分 >= 0.8)
+        complete_matches_data = []
+        for note_eval in notes_evaluation:
+            score = note_eval.get('综合得分', 0)
+            if score >= 0.8:
+                note_index = note_eval.get('note_index', -1)
+                if 0 <= note_index < len(notes_data):
+                    # 从原始数据中获取note_id
+                    note_id = notes_data[note_index].get('id', '')
+                    note_card = notes_data[note_index].get('note_card', {})
+                    note_title = note_card.get('display_title', '')
+
+                    complete_matches_data.append({
+                        'note_id': note_id,
+                        'note_title': note_title,
+                        'evaluation_score': score,
+                        'note_index': note_index
+                    })
+
+        M = len(complete_matches_data)
+        logger.info(f"    完全匹配数: M = {M}/{N}")
+
+        if M == 0:
+            # 没有完全匹配,P = 0
+            return 0.0, {
+                'N': N,
+                'M': 0,
+                'total_contribution': 0.0,
+                'complete_matches': []
+            }
+
+        # 计算每个完全匹配的贡献 a×b
+        contributions = []
+        total_contribution = 0.0
+
+        for match in complete_matches_data:
+            note_id = match['note_id']
+            evaluation_score = match['evaluation_score']  # a 值
+
+            # 从 similarity_map 获取 b 值
+            max_similarity = similarity_map.get(note_id, 0)  # b 值
+
+            # 计算贡献
+            contribution = evaluation_score * max_similarity
+            total_contribution += contribution
+
+            # 保存详细信息
+            contributions.append({
+                'note_id': note_id,
+                'note_title': match['note_title'],
+                'evaluation_score': round(evaluation_score, 3),
+                'max_similarity': round(max_similarity, 3),
+                'contribution': round(contribution, 3)
+            })
+
+        # 计算综合得分 P = Σ(a×b) / N
+        P = total_contribution / N
+
+        # 按贡献降序排序
+        contributions.sort(key=lambda x: x['contribution'], reverse=True)
+
+        # 构建详细信息
+        detail = {
+            'N': N,
+            'M': M,
+            'total_contribution': round(total_contribution, 3),
+            'complete_matches': contributions
+        }
+
+        return P, detail
+
+    def run(
+        self,
+        deconstruction_results: Dict,
+        output_path: Optional[str] = None
+    ) -> Dict:
+        """
+        执行 相似度分析 相似度分析(同步版本)
+
+        Args:
+            deconstruction_results: 解构分析 结果
+            output_path: 输出路径(可选)
+
+        Returns:
+            相似度分析 结果
+        """
+        return asyncio.run(self.run_async(deconstruction_results, output_path))
+
+
+def test_similarity_analyzer():
+    """测试相似度分析器"""
+    # 读取解构分析结果
+    deconstruction_path = "output_v2/deep_analysis_results.json"
+
+    if not os.path.exists(deconstruction_path):
+        print(f"解构分析结果不存在: {deconstruction_path}")
+        return
+
+    with open(deconstruction_path, 'r', encoding='utf-8') as f:
+        deconstruction_results = json.load(f)
+
+    # 创建分析器
+    analyzer = SimilarityAnalyzer(
+        weight_embedding=0.5,
+        weight_semantic=0.5,
+        max_workers=3,
+        min_similarity=0.3,
+        target_features=["墨镜"]
+    )
+
+    # 运行分析
+    similarity_results = analyzer.run(deconstruction_results)
+
+    print(f"\n处理了 {similarity_results['metadata']['overall_statistics']['total_notes']} 个帖子")
+    print(f"提取了 {similarity_results['metadata']['overall_statistics']['total_features_extracted']} 个特征")
+    print(f"平均最高相似度: {similarity_results['metadata']['overall_statistics']['avg_max_similarity']}")
+
+
+if __name__ == '__main__':
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    test_similarity_analyzer()
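
The comprehensive score computed in _calculate_single_query_score() above is P = Σ(aᵢ·bᵢ) / N, where aᵢ is the evaluation score of each complete match (≥ 0.8), bᵢ is that note's max deconstructed-feature similarity, and N is the total number of returned notes. A worked toy example with invented numbers:

a = [0.90, 0.85]   # evaluation scores of the M = 2 complete matches
b = [0.72, 0.40]   # max_similarity looked up per note_id
N = 10             # total notes returned for the query

P = sum(ai * bi for ai, bi in zip(a, b)) / N
# = (0.648 + 0.340) / 10 = 0.0988 -> stored as round(P, 3) == 0.099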

+ 51 - 0
src/api/base.py

@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+API客户端基类
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+import logging
+import time
+from functools import wraps
+
+
+class BaseAPIClient(ABC):
+    """API客户端基类,提供统一的重试、缓存、日志功能"""
+
+    def __init__(
+        self,
+        api_name: str,
+        max_retries: int = 3,
+        retry_delay: float = 2.0,
+        logger: Optional[logging.Logger] = None
+    ):
+        self.api_name = api_name
+        self.max_retries = max_retries
+        self.retry_delay = retry_delay
+        self.logger = logger or logging.getLogger(api_name)
+
+    def with_retry(self, func):
+        """重试装饰器"""
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            for attempt in range(1, self.max_retries + 1):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    self.logger.warning(
+                        f"[{self.api_name}] Attempt {attempt}/{self.max_retries} failed: {e}"
+                    )
+                    if attempt < self.max_retries:
+                        time.sleep(self.retry_delay)
+                    else:
+                        self.logger.error(f"[{self.api_name}] All retries failed: {e}")
+                        raise
+            return None
+        return wrapper
+
+    @abstractmethod
+    def call(self, *args, **kwargs) -> Any:
+        """具体的API调用逻辑,子类实现"""
+        pass
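
A minimal subclass sketch showing how with_retry is meant to wrap a concrete call. HttpJSONClient and its endpoint are hypothetical, for illustration only:

import requests

class HttpJSONClient(BaseAPIClient):
    """Hypothetical client: GET a URL and return parsed JSON, with retries."""

    def __init__(self):
        super().__init__(api_name="http_json", max_retries=3, retry_delay=2.0)
        # wrap the raw request once; the wrapper retries up to max_retries times
        self._get = self.with_retry(self._get_once)

    def _get_once(self, url: str) -> dict:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()   # any HTTP error becomes an exception -> retry
        return resp.json()

    def call(self, url: str) -> dict:
        return self._get(url)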

+ 0 - 0
src/clients/__init__.py


+ 253 - 0
src/clients/deconstruction_api_client.py

@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 7 API 客户端
+用于调用深度解构分析 API
+"""
+
+import time
+import logging
+import requests
+from datetime import datetime
+from typing import Dict, List, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+
+def map_note_to_api_format(
+    note: Dict,
+    note_card: Dict,
+    evaluation: Dict,
+    search_word: str,
+    original_feature: str,
+    start_points: List[str],
+    processed_image_urls: Optional[List[str]] = None
+) -> Dict:
+    """
+    将小红书笔记数据映射为 API 所需格式
+
+    Args:
+        note: 笔记原始数据
+        note_card: 笔记卡片信息
+        evaluation: 评估结果
+        search_word: 搜索词
+        original_feature: 原始特征
+        start_points: 起点列表
+        processed_image_urls: 处理后的图片URL列表(如果提供,将替代原始URL)
+
+    Returns:
+        API 请求格式的数据
+    """
+    # 构建小红书链接
+    note_id = note.get('id', '')
+    link = f"https://www.xiaohongshu.com/explore/{note_id}"
+
+    # 获取用户信息
+    user = note_card.get('user', {})
+    interact_info = note_card.get('interact_info', {})
+
+    # 获取发布时间(需要转换为毫秒时间戳)
+    publish_ts = note_card.get('publish_timestamp', 0)
+    publish_ts_ms = publish_ts * 1000 if publish_ts else 0
+
+    # 格式化发布日期
+    publish_date = ''
+    if publish_ts:
+        try:
+            publish_date = datetime.fromtimestamp(publish_ts).strftime('%Y-%m-%d %H:%M:%S')
+        except:
+            publish_date = ''
+
+    # 使用处理后的图片URL,如果没有则使用原始URL
+    image_urls = processed_image_urls if processed_image_urls else note_card.get('image_list', [])
+
+    return {
+        "post_data": {
+            "channel_content_id": note_id,
+            "link": link,
+            "xsec_token": "",  # 通常为空
+            "comment_count": interact_info.get('comment_count', 0),
+            "images": image_urls,
+            "like_count": interact_info.get('liked_count', 0),
+            "body_text": note_card.get('desc', ''),
+            "title": note_card.get('display_title', ''),
+            "collect_count": interact_info.get('collected_count', 0),
+            "channel_account_id": user.get('user_id', ''),
+            "channel_account_name": user.get('nick_name', ''),
+            "publish_timestamp": publish_ts_ms,
+            "modify_timestamp": publish_ts_ms,
+            "update_timestamp": int(time.time() * 1000),
+            "publish_date": publish_date,
+            "content_type": "note",
+            "video": {}  # 图文类型无视频
+        },
+        "question_data": {
+            "target": original_feature,      # 例如: "墨镜"
+            "start_points": start_points,    # 例如: ["墨镜", "猫咪服饰造型元素", "图片中猫咪佩戴墨镜"]
+            "query": search_word             # 例如: "猫咪服饰造型元素"
+        }
+    }
+
+
+class DeconstructionAPIClient:
+    """解构分析 API 客户端"""
+
+    def __init__(
+        self,
+        api_url: str = "http://192.168.245.150:7000/what/analysis/single",
+        timeout: int = 800,
+        max_retries: int = 3
+    ):
+        """
+        初始化 API 客户端
+
+        Args:
+            api_url: API 地址
+            timeout: 超时时间(秒)
+            max_retries: 最大重试次数
+        """
+        self.api_url = api_url
+        self.timeout = timeout
+        self.max_retries = max_retries
+
+    def call_api(
+        self,
+        api_payload: Dict
+    ) -> Dict:
+        """
+        调用解构 API(带重试机制)
+
+        Args:
+            api_payload: API 请求数据
+
+        Returns:
+            {
+                'status': 'success' | 'failed',
+                'result': API响应数据(成功时),
+                'error': 错误信息(失败时)
+            }
+        """
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.post(
+                    self.api_url,
+                    json=api_payload,
+                    headers={'Content-Type': 'application/json'},
+                    timeout=self.timeout
+                )
+
+                if response.status_code == 200:
+                    return {
+                        'status': 'success',
+                        'result': response.json(),
+                        'error': None
+                    }
+                else:
+                    error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
+
+                    # 如果还有重试机会,继续重试
+                    if attempt < self.max_retries - 1:
+                        wait_time = 2 ** attempt  # 指数退避: 1s, 2s, 4s
+                        logger.warning(f"    API 调用失败,{wait_time}s 后重试 ({attempt + 1}/{self.max_retries})")
+                        time.sleep(wait_time)
+                        continue
+
+                    # 最后一次重试也失败
+                    return {
+                        'status': 'failed',
+                        'result': None,
+                        'error': error_msg
+                    }
+
+            except requests.Timeout:
+                if attempt < self.max_retries - 1:
+                    wait_time = 2 ** attempt
+                    logger.warning(f"    API 超时,{wait_time}s 后重试 ({attempt + 1}/{self.max_retries})")
+                    time.sleep(wait_time)
+                    continue
+
+                return {
+                    'status': 'failed',
+                    'result': None,
+                    'error': f'API timeout after {self.timeout}s'
+                }
+
+            except Exception as e:
+                if attempt < self.max_retries - 1:
+                    wait_time = 2 ** attempt
+                    logger.warning(f"    API 异常,{wait_time}s 后重试 ({attempt + 1}/{self.max_retries}): {e}")
+                    time.sleep(wait_time)
+                    continue
+
+                return {
+                    'status': 'failed',
+                    'result': None,
+                    'error': f'Exception: {str(e)}'
+                }
+
+        # 理论上不会到这里
+        return {
+            'status': 'failed',
+            'result': None,
+            'error': 'Max retries exceeded'
+        }
+
+
+def test_api_client():
+    """测试 API 客户端"""
+    # 模拟数据
+    test_note = {
+        'id': '68ba3a27000000001c00f8fc'
+    }
+
+    test_note_card = {
+        'display_title': '测试标题',
+        'desc': '测试内容',
+        'image_list': [
+            'https://example.com/image1.jpg',
+            'https://example.com/image2.jpg'
+        ],
+        'user': {
+            'user_id': '123456',
+            'nick_name': '测试用户'
+        },
+        'interact_info': {
+            'liked_count': 100,
+            'collected_count': 50,
+            'comment_count': 10
+        },
+        'publish_timestamp': 1640000000
+    }
+
+    test_evaluation = {
+        '综合得分': 9.0,
+        '关键匹配点': ['测试匹配点1', '测试匹配点2']
+    }
+
+    # 数据映射测试
+    api_payload = map_note_to_api_format(
+        note=test_note,
+        note_card=test_note_card,
+        evaluation=test_evaluation,
+        search_word='测试搜索词',
+        original_feature='测试特征',
+        start_points=['起点1', '起点2']
+    )
+
+    print("API Payload:")
+    import json
+    print(json.dumps(api_payload, ensure_ascii=False, indent=2))
+
+    # API 调用测试(需要实际 API 服务)
+    # client = DeconstructionAPIClient()
+    # result = client.call_api(api_payload)
+    # print("\nAPI Result:")
+    # print(json.dumps(result, ensure_ascii=False, indent=2))
+
+
+if __name__ == '__main__':
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    test_api_client()
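
A minimal usage sketch for the pieces above (module path and default endpoint as declared in this file; all note fields are placeholder values):

    # Sketch: build one payload and call the API; errors come back in-band.
    from src.clients.deconstruction_api_client import (
        DeconstructionAPIClient,
        map_note_to_api_format,
    )

    payload = map_note_to_api_format(
        note={'id': 'note-001'},
        note_card={
            'display_title': '标题', 'desc': '正文', 'image_list': [],
            'user': {}, 'interact_info': {}, 'publish_timestamp': 1640000000,
        },
        evaluation={},
        search_word='猫咪 墨镜',
        original_feature='墨镜',
        start_points=['墨镜'],
    )

    client = DeconstructionAPIClient(timeout=120, max_retries=2)
    outcome = client.call_api(payload)
    if outcome['status'] == 'failed':
        print(outcome['error'])  # call_api never raises; check the status field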

+ 277 - 0
src/clients/openrouter_client.py

@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+OpenRouter API 客户端
+支持文本和多模态(图片)任务
+"""
+
+import os
+import json
+import requests
+import logging
+import time
+from typing import List, Dict, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class OpenRouterClient:
+    """OpenRouter API客户端"""
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "google/gemini-2.5-pro",
+        max_tokens: int = 8192,
+        temperature: float = 0.3,
+        retry_delay: int = 3
+    ):
+        """
+        初始化客户端
+
+        Args:
+            api_key: API密钥,默认从环境变量读取
+            model: 模型名称
+            max_tokens: 最大token数
+            temperature: 温度参数
+            retry_delay: 默认重试延迟(秒)
+        """
+        self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
+        if not self.api_key:
+            raise ValueError("OPENROUTER_API_KEY not found in environment variables")
+
+        self.base_url = "https://openrouter.ai/api/v1"
+        self.model = model
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+        self.retry_delay = retry_delay
+
+        logger.info(f"OpenRouter客户端已初始化: model={model}, max_tokens={max_tokens}, retry_delay={retry_delay}s")
+
+    def chat(
+        self,
+        prompt: str,
+        images: Optional[List[str]] = None,
+        system_prompt: Optional[str] = None,
+        max_retries: int = 3,
+        retry_delay: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """
+        调用LLM进行对话
+
+        Args:
+            prompt: 用户提示词
+            images: 图片URL列表(可选,用于多模态任务)
+            system_prompt: 系统提示词(可选)
+            max_retries: 最大重试次数
+            retry_delay: 重试延迟(秒),None则使用实例默认值
+
+        Returns:
+            LLM响应
+        """
+        # 使用实例默认retry_delay(如果未指定)
+        if retry_delay is None:
+            retry_delay = self.retry_delay
+
+        # 构建消息
+        messages = []
+
+        # 添加系统提示词
+        if system_prompt:
+            messages.append({
+                "role": "system",
+                "content": system_prompt
+            })
+
+        # 构建用户消息
+        if images:
+            # 多模态消息
+            content = [{"type": "text", "text": prompt}]
+            for img_url in images:
+                content.append({
+                    "type": "image_url",
+                    "image_url": {"url": img_url}
+                })
+            messages.append({
+                "role": "user",
+                "content": content
+            })
+        else:
+            # 纯文本消息
+            messages.append({
+                "role": "user",
+                "content": prompt
+            })
+
+        # 构建请求
+        payload = {
+            "model": self.model,
+            "messages": messages,
+            "max_tokens": self.max_tokens,
+            "temperature": self.temperature
+        }
+
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json"
+        }
+
+        # 重试循环
+        last_exception = None
+        for attempt in range(1, max_retries + 1):
+            try:
+                if attempt > 1:
+                    logger.info(f"  重试第 {attempt - 1}/{max_retries - 1} 次")
+                    time.sleep(retry_delay)
+
+                response = requests.post(
+                    f"{self.base_url}/chat/completions",
+                    json=payload,
+                    headers=headers,
+                    timeout=60
+                )
+                response.raise_for_status()
+
+                result = response.json()
+
+                # 提取响应内容
+                if "choices" in result and len(result["choices"]) > 0:
+                    content = result["choices"][0]["message"]["content"]
+
+                    # 尝试解析JSON
+                    try:
+                        # 如果响应是JSON格式,解析它
+                        if content.strip().startswith('{'):
+                            parsed = json.loads(content)
+                            return {
+                                "success": True,
+                                "content": content,
+                                "parsed": parsed,
+                                "raw_response": result
+                            }
+                    except json.JSONDecodeError:
+                        pass
+
+                    return {
+                        "success": True,
+                        "content": content,
+                        "raw_response": result
+                    }
+                else:
+                    raise Exception(f"Invalid API response: {result}")
+
+            except requests.exceptions.RequestException as e:
+                last_exception = e
+                logger.error(f"  API调用失败 (第{attempt}次尝试): {e}")
+
+                if attempt >= max_retries:
+                    logger.error(f"  已达最大重试次数 {max_retries}")
+
+        # 所有重试都失败
+        return {
+            "success": False,
+            "error": str(last_exception),
+            "content": None
+        }
+
+    def chat_json(
+        self,
+        prompt: str,
+        images: Optional[List[str]] = None,
+        system_prompt: Optional[str] = None,
+        max_retries: int = 3
+    ) -> Optional[Dict[str, Any]]:
+        """
+        调用LLM并期望返回JSON格式
+
+        Args:
+            prompt: 用户提示词(应包含返回JSON的指示)
+            images: 图片URL列表
+            system_prompt: 系统提示词
+            max_retries: 最大重试次数
+
+        Returns:
+            解析后的JSON对象,失败返回None
+        """
+        result = self.chat(
+            prompt=prompt,
+            images=images,
+            system_prompt=system_prompt,
+            max_retries=max_retries
+        )
+
+        if not result["success"]:
+            logger.error(f"LLM调用失败: {result.get('error')}")
+            return None
+
+        # 如果已经解析了JSON
+        if "parsed" in result:
+            return result["parsed"]
+
+        # 尝试从content中解析JSON
+        content = result["content"]
+
+        # 尝试提取JSON(可能包含在markdown代码块中)
+        if "```json" in content:
+            # 提取代码块中的JSON
+            start = content.find("```json") + 7
+            end = content.find("```", start)
+            json_str = content[start:end].strip()
+        elif "```" in content:
+            # 普通代码块
+            start = content.find("```") + 3
+            end = content.find("```", start)
+            json_str = content[start:end].strip()
+        else:
+            # 直接尝试解析
+            json_str = content.strip()
+
+        try:
+            return json.loads(json_str)
+        except json.JSONDecodeError as e:
+            logger.error(f"JSON解析失败: {e}")
+            logger.error(f"原始内容: {content[:500]}")
+            return None
+
+
+def test_client():
+    """测试客户端"""
+    # 需要设置环境变量 OPENROUTER_API_KEY
+    client = OpenRouterClient()
+
+    # 测试文本任务
+    print("\n=== 测试文本任务 ===")
+    result = client.chat_json(
+        prompt="""
+        评估搜索词"猫咪 宠物"能否找到包含"拟人"相关元素的内容。
+
+        返回JSON格式:
+        {
+          "score": 0.0-1.0,
+          "reasoning": "评估理由"
+        }
+        """
+    )
+    print(json.dumps(result, ensure_ascii=False, indent=2))
+
+    # 测试多模态任务
+    print("\n=== 测试多模态任务 ===")
+    result = client.chat_json(
+        prompt="""
+        这张图片中是否包含与"拟人"相关的元素?
+
+        返回JSON格式:
+        {
+          "has_element": true/false,
+          "elements": ["元素1", "元素2"],
+          "reasoning": "理由"
+        }
+        """,
+        images=["http://example.com/cat.jpg"]  # 示例图片
+    )
+    print(json.dumps(result, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    test_client()
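
The fence-stripping logic in chat_json can be exercised on its own; a self-contained sketch (the helper name extract_json is illustrative, not part of this diff):

    import json

    def extract_json(content: str):
        # Mirrors chat_json: prefer a ```json block, then any ``` block, else raw text.
        if "```json" in content:
            start = content.find("```json") + 7
        elif "```" in content:
            start = content.find("```") + 3
        else:
            return json.loads(content.strip())
        end = content.find("```", start)          # -1 when the closing fence is missing
        return json.loads(content[start:end if end != -1 else None].strip())

    assert extract_json('```json\n{"score": 0.8}\n```') == {"score": 0.8}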

+ 331 - 0
src/clients/xiaohongshu_search.py

@@ -0,0 +1,331 @@
+#!/usr/bin/env python3
+"""
+小红书笔记搜索工具
+根据关键词搜索小红书笔记,支持多种筛选条件
+"""
+
+import requests
+import json
+import os
+import argparse
+import time
+import logging
+from datetime import datetime
+from typing import Dict, Any
+
+logger = logging.getLogger(__name__)
+
+
+class XiaohongshuSearch:
+    """小红书笔记搜索API封装类"""
+
+    BASE_URL = "http://47.84.182.56:8001"
+    TOOL_NAME = "xhs_note_search"
+    PLATFORM = "xiaohongshu"
+
+    def __init__(self, results_dir: str = None, cache_dir: str = "search_cache"):
+        """
+        初始化API客户端
+
+        Args:
+            results_dir: 结果输出目录,默认为项目根目录下的 data/search 文件夹
+            cache_dir: 缓存目录,默认为 search_cache
+        """
+        self.api_url = f"{self.BASE_URL}/tools/call/{self.TOOL_NAME}"
+
+        # 设置结果输出目录
+        if results_dir:
+            self.results_base_dir = results_dir
+        else:
+            # 默认使用项目根目录的 data/search 文件夹
+            script_dir = os.path.dirname(os.path.abspath(__file__))
+            project_root = os.path.dirname(os.path.dirname(script_dir))
+            self.results_base_dir = os.path.join(project_root, "data", "search")
+
+        # 设置缓存目录
+        self.cache_dir = cache_dir
+        if cache_dir:
+            os.makedirs(cache_dir, exist_ok=True)
+
+    def _get_cache_key(
+        self,
+        keyword: str,
+        content_type: str,
+        sort_type: str,
+        publish_time: str
+    ) -> str:
+        """
+        生成缓存key
+
+        Args:
+            keyword: 搜索关键词
+            content_type: 内容类型
+            sort_type: 排序方式
+            publish_time: 发布时间
+
+        Returns:
+            缓存key字符串
+        """
+        return f"{keyword}_{content_type}_{sort_type}_{publish_time}"
+
+    def _get_cache_path(self, cache_key: str) -> str:
+        """
+        获取缓存文件路径
+
+        Args:
+            cache_key: 缓存key
+
+        Returns:
+            缓存文件完整路径
+        """
+        # 清理文件名中的非法字符
+        safe_key = cache_key.replace('/', '_').replace('\\', '_').replace(' ', '_')
+        return os.path.join(self.cache_dir, f"{safe_key}.json")
+
+    def search(
+        self,
+        keyword: str,
+        content_type: str = "不限",
+        sort_type: str = "综合",
+        publish_time: str = "不限",
+        cursor: str = "",
+        timeout: int = 30,
+        max_retries: int = 5,
+        retry_delay: int = 2,
+        use_cache: bool = True
+    ) -> Dict[str, Any]:
+        """
+        搜索小红书笔记(带重试机制和缓存)
+
+        Args:
+            keyword: 搜索关键词
+            content_type: 内容类型,可选值:不限、视频、图文,默认为'不限'
+            sort_type: 排序方式,可选值:综合、最新、最多点赞、最多评论,默认为'综合'
+            publish_time: 发布时间筛选,可选值:不限、一天内、一周内、半年内,默认为'不限'
+            cursor: 翻页游标,第一页默认为空,下一页的游标在上一页的返回值中获取
+            timeout: 请求超时时间(秒),默认30秒
+            max_retries: 最大重试次数,默认5次
+            retry_delay: 重试间隔时间(秒),默认2秒
+            use_cache: 是否使用缓存,默认True
+
+        Returns:
+            API响应的JSON数据
+
+        Raises:
+            requests.exceptions.RequestException: 所有重试都失败时抛出异常
+        """
+        # 检查缓存
+        if use_cache and self.cache_dir:
+            cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+            cache_path = self._get_cache_path(cache_key)
+
+            if os.path.exists(cache_path):
+                try:
+                    with open(cache_path, 'r', encoding='utf-8') as f:
+                        cached_result = json.load(f)
+                    logger.info(f"  ✓ 使用缓存: {keyword}")
+                    return cached_result
+                except Exception as e:
+                    logger.warning(f"  读取缓存失败: {e},将重新搜索")
+
+        # 缓存未命中或未启用,执行实际搜索
+        payload = {
+            "keyword": keyword,
+            "content_type": '不限',  # 使用映射后的参数
+            "sort_type": sort_type,
+            "publish_time": publish_time,
+            "cursor": cursor
+        }
+
+        last_exception = None
+
+        # 重试循环:最多尝试 max_retries 次
+        for attempt in range(1, max_retries + 1):
+            try:
+                if attempt > 1:
+                    print(f"    重试第 {attempt - 1}/{max_retries - 1} 次: {keyword}")
+
+                response = requests.post(
+                    self.api_url,
+                    json=payload,
+                    timeout=timeout,
+                    headers={"Content-Type": "application/json"}
+                )
+                response.raise_for_status()
+                api_response = response.json()
+
+                # 解析API返回的result字段(是JSON字符串)
+                if not api_response.get("success"):
+                    raise Exception(f"API返回失败: {api_response}")
+
+                result_str = api_response.get("result", "{}")
+                result = json.loads(result_str)
+
+                # 预处理返回数据:提取 image_list 中的 URL 字符串
+                self._preprocess_response(result)
+
+                if attempt > 1:
+                    print(f"    ✓ 重试成功")
+
+                # 保存到缓存
+                if use_cache and self.cache_dir:
+                    try:
+                        cache_key = self._get_cache_key(keyword, content_type, sort_type, publish_time)
+                        cache_path = self._get_cache_path(cache_key)
+                        with open(cache_path, 'w', encoding='utf-8') as f:
+                            json.dump(result, f, ensure_ascii=False, indent=2)
+                        logger.info(f"  ✓ 已缓存: {keyword}")
+                    except Exception as e:
+                        logger.warning(f"  保存缓存失败: {e}")
+
+                return result
+
+            except requests.exceptions.RequestException as e:
+                last_exception = e
+
+                if attempt < max_retries:
+                    # 还有重试机会,等待后继续
+                    print(f"    ✗ 请求失败 (第{attempt}次尝试): {e}")
+                    print(f"    等待 {retry_delay} 秒后重试...")
+                    time.sleep(retry_delay)
+                else:
+                    # 已达最大重试次数,抛出异常
+                    print(f"    ✗ 请求失败 (已达最大重试次数 {max_retries}): {e}")
+
+        # 所有重试都失败,抛出最后一次的异常
+        raise last_exception
+
+    def _preprocess_response(self, result: Dict[str, Any]) -> None:
+        """
+        预处理搜索结果,将 image_list 中的字典格式转换为 URL 字符串列表
+        并限制返回的帖子数量为10个
+
+        Args:
+            result: API返回的原始结果字典(会直接修改)
+        """
+        # 获取帖子列表
+        data_wrapper = result.get("data", {})
+        notes = data_wrapper.get("data", [])
+
+        # 限制为前10个帖子
+        if len(notes) > 10:
+            notes = notes[:10]
+            data_wrapper["data"] = notes
+            logger.info(f"  限制搜索结果为前10个帖子")
+
+        for note in notes:
+            note_card = note.get("note_card", {})
+            image_list_raw = note_card.get("image_list", [])
+
+            # 提取 URL 字符串
+            image_list = []
+            for img in image_list_raw:
+                if isinstance(img, dict) and "image_url" in img:
+                    image_list.append(img["image_url"])
+                elif isinstance(img, str):
+                    # 如果已经是字符串,直接使用
+                    image_list.append(img)
+
+            # 更新为预处理后的列表
+            note_card["image_list"] = image_list
+
+    def save_result(self, keyword: str, result: Dict[str, Any], page: int = 1) -> str:
+        """
+        保存结果到文件
+        目录结构: {results_base_dir}/xiaohongshu_search/关键词/时间戳_page{页码}.json
+
+        Args:
+            keyword: 搜索关键词
+            result: API返回的结果
+            page: 页码
+
+        Returns:
+            保存的文件路径
+        """
+        # 创建目录结构: {results_base_dir}/xiaohongshu_search/关键词/
+        result_dir = os.path.join(self.results_base_dir, "xiaohongshu_search", keyword)
+        os.makedirs(result_dir, exist_ok=True)
+
+        # 文件名使用时间戳和页码
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"{timestamp}_page{page}.json"
+        filepath = os.path.join(result_dir, filename)
+
+        # 保存结果
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(result, f, ensure_ascii=False, indent=2)
+
+        return filepath
+
+
+def main():
+    """示例使用"""
+    # 解析命令行参数
+    parser = argparse.ArgumentParser(description='小红书笔记搜索工具')
+    parser.add_argument(
+        '--results-dir',
+        type=str,
+        default='data/search',
+        help='结果输出目录 (默认: data/search)'
+    )
+    parser.add_argument(
+        '--keyword',
+        type=str,
+        required=True,
+        help='搜索关键词 (必填)'
+    )
+    parser.add_argument(
+        '--content-type',
+        type=str,
+        default='不限',
+        choices=['不限', '视频', '图文'],
+        help='内容类型 (默认: 不限)'
+    )
+    parser.add_argument(
+        '--sort-type',
+        type=str,
+        default='综合',
+        choices=['综合', '最新', '最多点赞', '最多评论'],
+        help='排序方式 (默认: 综合)'
+    )
+    parser.add_argument(
+        '--publish-time',
+        type=str,
+        default='不限',
+        choices=['不限', '一天内', '一周内', '半年内'],
+        help='发布时间筛选 (默认: 不限)'
+    )
+    parser.add_argument(
+        '--cursor',
+        type=str,
+        default='',
+        help='翻页游标 (默认为空,即第一页)'
+    )
+    parser.add_argument(
+        '--page',
+        type=int,
+        default=1,
+        help='页码标识,用于保存文件名 (默认: 1)'
+    )
+    args = parser.parse_args()
+
+    # 创建API客户端实例
+    client = XiaohongshuSearch(results_dir=args.results_dir)
+
+    # 执行搜索并保存
+    try:
+        result = client.search(
+            args.keyword,
+            args.content_type,
+            args.sort_type,
+            args.publish_time,
+            args.cursor
+        )
+        filepath = client.save_result(args.keyword, result, args.page)
+        print(f"Output: {filepath}")
+    except Exception as e:
+        print(f"Error: {e}", file=__import__('sys').stderr)
+
+
+if __name__ == "__main__":
+    main()
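
A paging sketch for the client above. Note that _get_cache_key ignores the cursor, so later pages would collide with the page-1 cache entry; caching is therefore disabled for the second call. The data.cursor field name is an assumption about the API response, not something shown in this diff:

    from src.clients.xiaohongshu_search import XiaohongshuSearch

    client = XiaohongshuSearch(results_dir="data/search")
    first = client.search("猫咪 墨镜", sort_type="最新")
    client.save_result("猫咪 墨镜", first, page=1)

    cursor = first.get("data", {}).get("cursor", "")  # assumed location of the next-page cursor
    if cursor:
        second = client.search("猫咪 墨镜", sort_type="最新", cursor=cursor, use_cache=False)
        client.save_result("猫咪 墨镜", second, page=2)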

+ 0 - 0
src/evaluators/__init__.py


+ 54 - 34
src/evaluators/llm_evaluator.py

@@ -292,22 +292,14 @@ class LLMEvaluator:
 
         prompt = f"""# 角色
 你是一个专业的搜索query生成专家。你的任务是根据输入信息,生成最优的搜索query组合。
+# 核心规则(必须严格遵守)
+- 目标动机严格隔离,仅用于最终匹配度评估,Query生成过程中不得使用目标动机原文
+- query构成:仅由"中心词(如果有)+待选词的完整、未拆分形式"直接组成,严禁对原始词汇进行任何形式的增、删、改、拆分或重组,包括但不限于将一个词拆分成多个部分进行组合,或将多个词的部分内容进行拼接。
+- 单个query结构:2-4个词,考虑词的前后顺序
 
-## 核心规则(必须严格遵守)
-1. **绝对禁止**:
-   - 目标动机严格隔离,仅用于最终匹配度评估,Query生成过程中不得使用目标动机原文
-   - Query中不得包含动机词汇(如"如何"、"方法"、"技巧"、"教程"等意图词)
-   - 所有分析基于真实信息,不可假设推导
-
-2. **query构成**:仅由"中心词(如果有)+待选词"直接组成,无额外信息
-3. **query结构**:2-4个词,考虑前后顺序,无相似或语义重叠的query
-4. **输出数量**:生成1-4条备选query
-搜
-# 输入格式
-目标特征:{original_feature}
+# 输入
 中心词:{base_word}
 待选词:{candidate_words_str}
-
 注:带权重的词用括号标注权重值,无权重或权重为0则平权
 
 # query生成流程
@@ -325,8 +317,8 @@ class LLMEvaluator:
 
 **关联性分级:**
 - **强关联(0.7-1.0)**:两词在语义上紧密配合,常在同一场景共现,组合后形成完整概念
-- **中关联(0.4-0.69)**:两词有明确关联但不强制共现,组合后有一定语义增益
-- **无关联(0.0-0.39)**:两词无明显语义关联,组合无意义
+- **中关联(0.3-0.69)**:两词有明确关联但不强制共现,组合后有一定语义增益
+- **弱关联(0.0-0.29)**:两词无明显语义关联,组合无意义
 
 ## 第三步:互补性分析
 
@@ -335,11 +327,16 @@ class LLMEvaluator:
 **互补性分级:**
 - **强互补**:两词描述不同维度,组合后语义更完整(如:主体+场景、形式+内容)
 - **弱互补**:两词有差异但语义部分重叠
-- **语义重叠**:两词描述同一维度,组合无新增价值(避免)
+- **语义重叠**:两词描述同一维度,组合无新增价值
+
+**语义重叠的判定标准:**
+- **重叠度>70%**:确实重复,应避免
+- **重叠度40-70%**:有差异,允许共存
+- **重叠度<40%**:互补,优先保留
 
 **常见互补维度组合:**
 - 主体+场景
-- 形式+内容
+- 形式+内容  
 - 内容+应用方式
 - 载体+场景+情绪
 
@@ -349,35 +346,52 @@ class LLMEvaluator:
 - 与中心词(或其他待选词)的关联强度
 - 原始权重高低
 - 互补性强弱
+- 角度独特性(是否覆盖不同语义维度)
 
 **排序原则:**
-强关联+高权重+强互补 > 强关联+无权重+强互补 > 中关联+高权重
+强关联+高权重+强互补 > 强关联+无权重+强互补 > 中关联+高权重+独特角度 > 中关联+强互补
+
+## 第五步:生成query
 
-## 第五步:组合生成query
+**整体query生成规则**(确保从不同优先级和角度生成query):每种形式可生成1-2个query
+- 强关联+强互补(核心query,最精准)
+- 强关联+弱互补或中关联+强互补(扩展query,覆盖相关内容)
+- 中关联+弱互补但角度独特(覆盖query,探索边缘相关内容)
+- 创新组合或探索性query(低关联但可能发现意外相关内容)
 
 **组合策略:**
 
 **如果有中心词:**
-1. 中心词 + 强关联且强互补的待选词(1-2个)
-2. 中心词 + 强关联但弱互补的待选词(1-2个)
-3. 仅用待选词组合(当纯待选词组合语义更完整时)
+1. 中心词 + 强关联且强互补的待选词(1-2个词)
+2. 中心词 + 强关联但弱互补的待选词(1-2个词)
+3. 中心词 + 中关联但角度独特的待选词(1-2个词)
+4. 仅用待选词组合(当纯待选词组合语义更完整时)
 
 **如果无中心词:**
 1. 2-3个强关联且强互补的待选词组合
-2. 1个核心词 + 1-2个中关联但强互补词
+2. 1个核心词 + 1-2个中关联但强互补或角度独特的词
+3. 探索性组合:关联度中等但可能产生新视角的词组合
 
 **组合规则:**
-- 同一语义维度只保留1个最优词
+- 同一语义维度可保留2个有明显差异的词组合
 - 优先选择互补性强的词组合
-- 构成词数控制在2-3个
+- 构成单个query的词数控制在2-3个
 - 考虑词的前后顺序(词定语在前,核心名词在后;场景词在前,实体词在后)
+- **query数量控制在3-8个**,在保证质量前提下尽可能生成更多不同角度的query
+- 即使query在语义上有轻微重叠(重叠度40-70%),只要切入角度不同也应保留
+
+**多样性要求:**
+- 从不同语义维度生成query(形式、场景、内容、情感、应用等)
+- 确保各层次query都有代表,覆盖从核心到边缘的搜索空间
+- 允许部分query探索性地组合中关联词,以发现潜在相关内容
+
 **组合理由:**
-说明为什么选择这些词组合,词与词之间如何协同工作,形成什么样的搜索语义场
+说明为什么选择这些词组合,词与词之间如何协同工作,形成什么样的搜索语义场,属于哪个生成层次
 
 ## 第六步:query与目标动机匹配度评估
 **重要说明:** 只有在query生成完成后,才将query与目标动机进行匹配度评估
 **匹配分含义:**
-匹配分 = 此query能找到目标动机所需内容的概率(0-1之间)
+匹配分 = 此query语义扩展能找到目标动机所需内容的概率(0-1之间)
 
 **评分标准:**
 - **0.8-1.0分**:query在语意上与目标强关联,能精准召回目标动机所需内容,覆盖核心要素
@@ -402,22 +416,28 @@ class LLMEvaluator:
 {{
   "queries": [
     {{
-      "query": "查询词",
-      "中心词": "{base_word}",
-      "组合理由": "query词组合理由的详细说明,深度解释该query与目标及中心词的逻辑关联。目标特征的核心诉求是什么,基于这个诉求,选择了哪些词,为什么这些词最相关(说明权重、语义覆盖等原因)这些词如何协同工作,形成什么样的搜索语义场,词与词之间有什么语义延展关系,这个query预期能召回什么类型的内容,为什么能找到目标",
+      "query": "query内容",
+      "组合理由": "query词组合理由的详细说明,深度解释该query与中心词的逻辑关联。选择了哪些词,为什么这些词最相关(说明权重、语义覆盖、关联强度、互补性等原因),这些词如何协同工作,形成什么样的搜索语义场,词与词之间有什么语义延展关系,这个query预期能召回什么类型的内容",
       "与目标匹配分": 0.85,
-      "source_word ": "来源词,待选词和中心词组合"
+      "匹配分理由": "目标特征的核心诉求是什么,基于这个诉求,该query为什么能找到目标,query的语义场如何与目标动机产生关联,为什么能/不能召回目标所需内容",
+      "source_word": "产生这个query的来源词,待选词和中心词组合,多个组合空格分隔"
     }}
   ]
 }}
 
-
 **关键点:**
 1. query生成阶段:只考虑词与词之间的语义关联和互补性
 2. 匹配评估阶段:才将生成的query与目标动机进行匹配度分析
 3. 目标动机不参与query生成,仅用于最终评估
-
-注意:只返回JSON,不要其他内容。"""
+4. 通过分层生成确保query数量充足且覆盖不同优先级
+
+**source_word规则**(重要):
+1. 格式:空格分隔的词汇
+2. 来源:**必须且只能**从"中心词 + 待选词"中提取
+3. 提取规则:该query实际使用到的所有原始词汇
+4. 禁止:同义替换、添加新词
+5. 必须包含:中心词(如果query中使用了中心词)
+"""
 
         # 调用 LLM
         llm_results = self.client.chat_json(prompt=prompt, max_retries=3)
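
A sketch of how a caller might defensively consume the JSON the prompt above requests (keys are the Chinese field names from the output schema; the sample dict stands in for a real LLM response):

    llm_results = {"queries": [
        {"query": "猫咪 墨镜", "与目标匹配分": 0.85, "source_word": "猫咪 墨镜"},
        {"query": "宠物 造型", "与目标匹配分": 0.60, "source_word": "宠物 造型"},
    ]}

    ranked = []
    for q in (llm_results or {}).get("queries", []):
        text = q.get("query", "").strip()
        score = float(q.get("与目标匹配分", 0.0))
        sources = q.get("source_word", "").split()  # space-separated, per the prompt rules
        if text and 0.0 <= score <= 1.0:
            ranked.append((score, text, sources))

    ranked.sort(reverse=True)  # highest 与目标匹配分 first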

+ 72 - 0
src/models/__init__.py

@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+数据模型模块
+"""
+
+from .post import Post, TopicPoint, PersonaMatch
+from .candidate import (
+    Candidate,
+    CandidateSource,
+    TopicPointCandidates,
+    PostCandidates
+)
+from .query import (
+    SearchQuery,
+    TopicPointQueries,
+    PostQueries
+)
+from .evaluation import (
+    NoteEvaluation,
+    QueryEvaluation,
+    PostEvaluation
+)
+from .deconstruction import (
+    DeconstructedFeature,
+    DeconstructionResult,
+    PostDeconstruction
+)
+from .similarity import (
+    SimilarFeature,
+    SimilarityStatistics,
+    ComprehensiveScoreDetail,
+    SimilarityScore,
+    OverallSimilarityStatistics,
+    PostSimilarityScores
+)
+
+__all__ = [
+    # Post模型
+    'Post',
+    'TopicPoint',
+    'PersonaMatch',
+
+    # Candidate模型
+    'Candidate',
+    'CandidateSource',
+    'TopicPointCandidates',
+    'PostCandidates',
+
+    # Query模型
+    'SearchQuery',
+    'TopicPointQueries',
+    'PostQueries',
+
+    # Evaluation模型
+    'NoteEvaluation',
+    'QueryEvaluation',
+    'PostEvaluation',
+
+    # Deconstruction模型
+    'DeconstructedFeature',
+    'DeconstructionResult',
+    'PostDeconstruction',
+
+    # Similarity模型
+    'SimilarFeature',
+    'SimilarityStatistics',
+    'ComprehensiveScoreDetail',
+    'SimilarityScore',
+    'OverallSimilarityStatistics',
+    'PostSimilarityScores',
+]
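
With these re-exports, callers import from the package root rather than the submodules; a one-line sketch:

    from src.models import SearchQuery

    q = SearchQuery(query_text="猫咪 墨镜", score=0.85)
    assert q.to_dict()["query_text"] == "猫咪 墨镜"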

+ 174 - 0
src/models/candidate.py

@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+候选词数据模型
+"""
+
+from dataclasses import dataclass, field
+from typing import List, Dict, Any, Optional
+from enum import Enum
+
+
+class CandidateSource(Enum):
+    """候选词来源"""
+    GLOBAL = "global"                   # 全局提取(整个how文件)
+    CURRENT_POST = "current_post"       # 当前帖子
+
+
+@dataclass
+class Candidate:
+    """候选词模型"""
+
+    # 基本信息
+    persona_feature_name: str           # 人设特征名称
+    similarity: float                   # 相似度得分
+
+    # 来源信息(关键!)
+    source: CandidateSource             # 来源:global / current_post
+    source_topic_point: Optional[str] = None  # 如果来自当前帖子,记录来源选题点名称
+
+    # 元数据
+    feature_type: str = ""              # 特征类型:标签/分类
+    feature_level: str = ""             # 人设特征层级
+    classification_path: str = ""        # 分类路径(用/分隔)
+    match_reason: str = ""              # 匹配说明
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典(保存时使用)"""
+        return {
+            "persona_feature_name": self.persona_feature_name,
+            "similarity": self.similarity,
+            "source": self.source.value,
+            "source_topic_point": self.source_topic_point,
+            "feature_type": self.feature_type,
+            "feature_level": self.feature_level,
+            "classification_path": self.classification_path,
+            "match_reason": self.match_reason
+        }
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'Candidate':
+        """从字典创建"""
+        source_str = data.get('source', 'global')
+        source = CandidateSource(source_str) if source_str else CandidateSource.GLOBAL
+
+        return Candidate(
+            persona_feature_name=data.get('persona_feature_name', ''),
+            similarity=data.get('similarity', 0.0),
+            source=source,
+            source_topic_point=data.get('source_topic_point'),
+            feature_type=data.get('feature_type', ''),
+            feature_level=data.get('feature_level', ''),
+            classification_path=data.get('classification_path', ''),
+            match_reason=data.get('match_reason', '')
+        )
+
+    def get_source_label(self) -> str:
+        """获取来源标签(用于展示)"""
+        return "当前帖子" if self.source == CandidateSource.CURRENT_POST else "全局"
+
+
+@dataclass
+class TopicPointCandidates:
+    """选题点的候选词集合"""
+    topic_point_name: str               # 选题点名称
+    topic_point_level: str              # 选题点层级
+    topic_point_description: str = ""   # 选题点描述
+    candidates: List[Candidate] = field(default_factory=list)  # 候选词列表
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        return {
+            "topic_point_name": self.topic_point_name,
+            "topic_point_level": self.topic_point_level,
+            "topic_point_description": self.topic_point_description,
+            "candidates": [c.to_dict() for c in self.candidates],
+            "statistics": {
+                "total_candidates": len(self.candidates),
+                "from_current_post": sum(
+                    1 for c in self.candidates
+                    if c.source == CandidateSource.CURRENT_POST
+                ),
+                "from_global": sum(
+                    1 for c in self.candidates
+                    if c.source == CandidateSource.GLOBAL
+                )
+            }
+        }
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'TopicPointCandidates':
+        """从字典创建"""
+        candidates = [
+            Candidate.from_dict(c)
+            for c in data.get('candidates', [])
+        ]
+
+        return TopicPointCandidates(
+            topic_point_name=data.get('topic_point_name', ''),
+            topic_point_level=data.get('topic_point_level', ''),
+            topic_point_description=data.get('topic_point_description', ''),
+            candidates=candidates
+        )
+
+    def get_top_n(self, n: int = 20) -> List[Candidate]:
+        """获取Top N个候选词(按相似度降序)"""
+        return sorted(
+            self.candidates,
+            key=lambda c: c.similarity,
+            reverse=True
+        )[:n]
+
+    def filter_by_source(self, source: CandidateSource) -> List[Candidate]:
+        """按来源过滤"""
+        return [c for c in self.candidates if c.source == source]
+
+
+@dataclass
+class PostCandidates:
+    """帖子的所有候选词"""
+    post_id: str                        # 帖子ID
+    topic_points_candidates: List[TopicPointCandidates] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        total_candidates = sum(len(tpc.candidates) for tpc in self.topic_points_candidates)
+        total_from_current = sum(
+            len(tpc.filter_by_source(CandidateSource.CURRENT_POST))
+            for tpc in self.topic_points_candidates
+        )
+        total_from_global = sum(
+            len(tpc.filter_by_source(CandidateSource.GLOBAL))
+            for tpc in self.topic_points_candidates
+        )
+
+        return {
+            "post_id": self.post_id,
+            "topic_points_candidates": [tpc.to_dict() for tpc in self.topic_points_candidates],
+            "statistics": {
+                "total_topic_points": len(self.topic_points_candidates),
+                "total_candidates": total_candidates,
+                "from_current_post": total_from_current,
+                "from_global": total_from_global
+            }
+        }
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'PostCandidates':
+        """从字典创建"""
+        tpc_list = [
+            TopicPointCandidates.from_dict(tpc)
+            for tpc in data.get('topic_points_candidates', [])
+        ]
+
+        return PostCandidates(
+            post_id=data.get('post_id', ''),
+            topic_points_candidates=tpc_list
+        )
+
+    def get_topic_point_candidates(self, topic_point_name: str) -> Optional[TopicPointCandidates]:
+        """获取指定选题点的候选词"""
+        for tpc in self.topic_points_candidates:
+            if tpc.topic_point_name == topic_point_name:
+                return tpc
+        return None
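
A small sketch of the aggregate helpers above (feature names are placeholders):

    from src.models.candidate import Candidate, CandidateSource, TopicPointCandidates

    tpc = TopicPointCandidates(
        topic_point_name="猫咪拟人造型",
        topic_point_level="灵感点列表",
        candidates=[
            Candidate("墨镜", 0.92, CandidateSource.CURRENT_POST,
                      source_topic_point="猫咪拟人造型"),
            Candidate("宠物服饰", 0.78, CandidateSource.GLOBAL),
        ],
    )

    best = tpc.get_top_n(1)[0]                   # sorted by similarity, descending
    assert best.get_source_label() == "当前帖子"
    assert tpc.to_dict()["statistics"]["from_global"] == 1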

+ 184 - 0
src/models/deconstruction.py

@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+解构分析数据模型
+"""
+
+from dataclasses import dataclass, field, asdict
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+
+@dataclass
+class DeconstructedFeature:
+    """解构特征"""
+    feature_name: str                    # 特征名称
+    dimension: str                       # 维度 (灵感点-全新内容/灵感点-共性差异/灵感点-共性内容/目的点/关键点)
+    dimension_detail: str                # 维度细分 (实质/形式/意图等)
+    weight: float                        # 权重
+    source_index: int                    # 在该维度中的索引
+    source_info: Dict[str, Any] = field(default_factory=dict)  # 溯源信息
+
+
+@dataclass
+class DeconstructionResult:
+    """单个帖子的解构结果"""
+    note_id: str                         # 帖子ID
+    search_word: str                     # 搜索词
+    original_feature: str                # 原始特征
+    source_word: str                     # 来源词
+    evaluation_score: float              # 评估得分
+    evaluation_type: str                 # 匹配类型
+    evaluation_confidence: str           # 置信度
+    key_matching_points: List[str]       # 关键匹配点
+
+    # 解构特征
+    inspiration_features: List[DeconstructedFeature] = field(default_factory=list)  # 灵感点特征
+    purpose_features: List[DeconstructedFeature] = field(default_factory=list)      # 目的点特征
+    key_point_features: List[DeconstructedFeature] = field(default_factory=list)    # 关键点特征
+
+    # 帖子数据
+    note_data: Dict[str, Any] = field(default_factory=dict)  # 帖子信息 (title, author, link)
+
+    # API响应
+    api_request: Dict[str, Any] = field(default_factory=dict)   # API请求
+    api_response: Dict[str, Any] = field(default_factory=dict)  # API响应
+
+    # 元数据
+    processed_at: str = ""                # 处理时间
+    processing_time_ms: float = 0.0       # 处理耗时(毫秒)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return asdict(self)
+
+    @property
+    def all_features(self) -> List[DeconstructedFeature]:
+        """获取所有特征"""
+        return (
+            self.inspiration_features +
+            self.purpose_features +
+            self.key_point_features
+        )
+
+    @property
+    def feature_count(self) -> int:
+        """特征总数"""
+        return len(self.all_features)
+
+
+@dataclass
+class PostDeconstruction:
+    """帖子解构结果集合"""
+    post_id: str                                              # 帖子ID
+    deconstruction_results: List[DeconstructionResult]        # 解构结果列表
+
+    # 元数据
+    total_matched_notes: int = 0                              # 总匹配帖子数
+    processed_notes: int = 0                                  # 已处理帖子数
+    skipped_notes: int = 0                                    # 跳过帖子数
+    success_count: int = 0                                    # 成功数
+    failed_count: int = 0                                     # 失败数
+
+    # 配置参数
+    api_url: str = ""                                         # API地址
+    min_score_threshold: float = 0.0                          # 最低分数阈值
+    sort_by: str = "score"                                    # 排序方式
+    target_features: Optional[List[str]] = None               # 目标特征列表
+
+    # 时间信息
+    created_at: str = ""                                      # 创建时间
+    processing_time_seconds: float = 0.0                      # 处理耗时(秒)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'metadata': {
+                'stage': 'deconstruction',
+                'description': '完全匹配帖子的深度解构分析',
+                'post_id': self.post_id,
+                'target_features': self.target_features if self.target_features else '全部',
+                'total_matched_notes': self.total_matched_notes,
+                'processed_notes': self.processed_notes,
+                'skipped_notes': self.skipped_notes,
+                'success_count': self.success_count,
+                'failed_count': self.failed_count,
+                'api_url': self.api_url,
+                'min_score_threshold': self.min_score_threshold,
+                'sort_by': self.sort_by,
+                'created_at': self.created_at or datetime.now().isoformat(),
+                'processing_time_seconds': round(self.processing_time_seconds, 2)
+            },
+            'results': [r.to_dict() for r in self.deconstruction_results]
+        }
+
+    @classmethod
+    def from_json_file(cls, file_path: str) -> 'PostDeconstruction':
+        """从JSON文件加载"""
+        import json
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        metadata = data['metadata']
+        results_data = data['results']
+
+        # 重建DeconstructionResult对象
+        results = []
+        for r in results_data:
+            # 重建特征列表
+            inspiration_features = [
+                DeconstructedFeature(**f) for f in r.get('inspiration_features', [])
+            ]
+            purpose_features = [
+                DeconstructedFeature(**f) for f in r.get('purpose_features', [])
+            ]
+            key_point_features = [
+                DeconstructedFeature(**f) for f in r.get('key_point_features', [])
+            ]
+
+            result = DeconstructionResult(
+                note_id=r['note_id'],
+                search_word=r['search_word'],
+                original_feature=r['original_feature'],
+                source_word=r['source_word'],
+                evaluation_score=r['evaluation_score'],
+                evaluation_type=r['evaluation_type'],
+                evaluation_confidence=r['evaluation_confidence'],
+                key_matching_points=r['key_matching_points'],
+                inspiration_features=inspiration_features,
+                purpose_features=purpose_features,
+                key_point_features=key_point_features,
+                note_data=r['note_data'],
+                api_request=r['api_request'],
+                api_response=r['api_response'],
+                processed_at=r['processed_at'],
+                processing_time_ms=r['processing_time_ms']
+            )
+            results.append(result)
+
+        return cls(
+            post_id=metadata['post_id'],
+            deconstruction_results=results,
+            total_matched_notes=metadata['total_matched_notes'],
+            processed_notes=metadata['processed_notes'],
+            skipped_notes=metadata['skipped_notes'],
+            success_count=metadata['success_count'],
+            failed_count=metadata['failed_count'],
+            api_url=metadata['api_url'],
+            min_score_threshold=metadata['min_score_threshold'],
+            sort_by=metadata['sort_by'],
+            target_features=metadata.get('target_features'),
+            created_at=metadata['created_at'],
+            processing_time_seconds=metadata['processing_time_seconds']
+        )
+
+    def get_statistics(self) -> Dict[str, Any]:
+        """获取统计信息"""
+        total_features = sum(r.feature_count for r in self.deconstruction_results)
+
+        return {
+            'total_notes': len(self.deconstruction_results),
+            'total_features': total_features,
+            'avg_features_per_note': round(total_features / len(self.deconstruction_results), 1) if self.deconstruction_results else 0,
+            'success_rate': round(self.success_count / self.processed_notes, 3) if self.processed_notes > 0 else 0
+        }
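
A round-trip sketch for the dataclasses above (field values are illustrative):

    from src.models.deconstruction import (
        DeconstructedFeature, DeconstructionResult, PostDeconstruction,
    )

    feat = DeconstructedFeature(
        feature_name="墨镜", dimension="灵感点-共性内容",
        dimension_detail="形式", weight=0.8, source_index=0,
    )
    result = DeconstructionResult(
        note_id="n1", search_word="猫咪 墨镜", original_feature="墨镜",
        source_word="猫咪 墨镜", evaluation_score=9.0, evaluation_type="完全匹配",
        evaluation_confidence="高", key_matching_points=["佩戴墨镜"],
        inspiration_features=[feat],
    )
    pd = PostDeconstruction(post_id="p1", deconstruction_results=[result],
                            processed_notes=1, success_count=1)

    assert result.feature_count == 1
    assert pd.get_statistics()["success_rate"] == 1.0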

+ 170 - 0
src/models/evaluation.py

@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+评估结果数据模型
+"""
+
+from dataclasses import dataclass, field
+from typing import List, Dict, Any, Optional
+
+
+@dataclass
+class NoteEvaluation:
+    """单个帖子的评估结果"""
+    note_id: str                        # 帖子ID
+    channel_content_id: str             # 频道内容ID
+    title: str = ""                     # 标题
+    body_text: str = ""                 # 正文
+
+    # 第一层:Query相关性
+    query_relevance: str = ""           # "相关" or "不相关"
+    query_relevance_reason: str = ""    # 判断理由
+
+    # 第二层:特征匹配度
+    match_level: str = ""               # 完全匹配/相似匹配/弱相似/无匹配
+    match_score: float = 0.0            # 综合得分
+    match_reason: str = ""              # 匹配理由
+
+    # 详细得分
+    score_details: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        return {
+            "note_id": self.note_id,
+            "channel_content_id": self.channel_content_id,
+            "title": self.title,
+            "body_text": self.body_text,
+            "query_relevance": self.query_relevance,
+            "query_relevance_reason": self.query_relevance_reason,
+            "match_level": self.match_level,
+            "match_score": self.match_score,
+            "match_reason": self.match_reason,
+            "score_details": self.score_details
+        }
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'NoteEvaluation':
+        """从字典创建"""
+        return NoteEvaluation(
+            note_id=data.get('note_id', ''),
+            channel_content_id=data.get('channel_content_id', ''),
+            title=data.get('title', ''),
+            body_text=data.get('body_text', ''),
+            query_relevance=data.get('query_relevance', ''),
+            query_relevance_reason=data.get('query_relevance_reason', ''),
+            match_level=data.get('match_level', ''),
+            match_score=data.get('match_score', 0.0),
+            match_reason=data.get('match_reason', ''),
+            score_details=data.get('score_details')
+        )
+
+    def is_relevant_to_query(self) -> bool:
+        """是否与Query相关"""
+        return self.query_relevance == "相关"
+
+    def is_high_match(self) -> bool:
+        """是否高匹配(完全匹配)"""
+        return self.match_level == "完全匹配" or self.match_score >= 0.8
+
+
+@dataclass
+class QueryEvaluation:
+    """Query的评估结果"""
+    query_text: str                     # Query文本
+    topic_point_name: str               # 所属选题点
+
+    # 统计信息
+    total_notes: int = 0                # 总帖子数
+    filtered_count: int = 0             # 第一层过滤掉的数量
+    evaluated_count: int = 0            # 第二层评估的数量
+
+    # 匹配度分布
+    match_distribution: Dict[str, int] = field(default_factory=dict)
+
+    # 详细评估结果
+    notes_evaluation: List[NoteEvaluation] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        return {
+            "query_text": self.query_text,
+            "topic_point_name": self.topic_point_name,
+            "total_notes": self.total_notes,
+            "filtered_count": self.filtered_count,
+            "evaluated_count": self.evaluated_count,
+            "match_distribution": self.match_distribution,
+            "notes_evaluation": [ne.to_dict() for ne in self.notes_evaluation]
+        }
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'QueryEvaluation':
+        """从字典创建"""
+        notes_eval = [
+            NoteEvaluation.from_dict(ne)
+            for ne in data.get('notes_evaluation', [])
+        ]
+
+        return QueryEvaluation(
+            query_text=data.get('query_text', ''),
+            topic_point_name=data.get('topic_point_name', ''),
+            total_notes=data.get('total_notes', 0),
+            filtered_count=data.get('filtered_count', 0),
+            evaluated_count=data.get('evaluated_count', 0),
+            match_distribution=data.get('match_distribution', {}),
+            notes_evaluation=notes_eval
+        )
+
+    def get_high_match_notes(self) -> List[NoteEvaluation]:
+        """获取高匹配的帖子"""
+        return [ne for ne in self.notes_evaluation if ne.is_high_match()]
+
+    def get_relevant_notes(self) -> List[NoteEvaluation]:
+        """获取与Query相关的帖子"""
+        return [ne for ne in self.notes_evaluation if ne.is_relevant_to_query()]
+
+
+@dataclass
+class PostEvaluation:
+    """帖子的所有评估结果"""
+    post_id: str                        # 帖子ID
+    query_evaluations: List[QueryEvaluation] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        total_queries = len(self.query_evaluations)
+        total_notes_evaluated = sum(qe.evaluated_count for qe in self.query_evaluations)
+        total_high_match = sum(
+            len(qe.get_high_match_notes())
+            for qe in self.query_evaluations
+        )
+
+        return {
+            "post_id": self.post_id,
+            "query_evaluations": [qe.to_dict() for qe in self.query_evaluations],
+            "statistics": {
+                "total_queries": total_queries,
+                "total_notes_evaluated": total_notes_evaluated,
+                "total_high_match_notes": total_high_match
+            }
+        }
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'PostEvaluation':
+        """从字典创建"""
+        qe_list = [
+            QueryEvaluation.from_dict(qe)
+            for qe in data.get('query_evaluations', [])
+        ]
+
+        return PostEvaluation(
+            post_id=data.get('post_id', ''),
+            query_evaluations=qe_list
+        )
+
+    def get_all_high_match_notes(self) -> List[NoteEvaluation]:
+        """获取所有高匹配的帖子"""
+        all_notes = []
+        for qe in self.query_evaluations:
+            all_notes.extend(qe.get_high_match_notes())
+        return all_notes
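
A sketch exercising the two filter helpers above (IDs and scores are placeholders):

    from src.models.evaluation import NoteEvaluation, QueryEvaluation

    qe = QueryEvaluation(
        query_text="猫咪 墨镜", topic_point_name="猫咪拟人造型",
        total_notes=2, evaluated_count=2,
        notes_evaluation=[
            NoteEvaluation("n1", "c1", query_relevance="相关",
                           match_level="完全匹配", match_score=0.9),
            NoteEvaluation("n2", "c2", query_relevance="不相关",
                           match_level="无匹配", match_score=0.1),
        ],
    )

    assert [n.note_id for n in qe.get_relevant_notes()] == ["n1"]
    assert qe.get_high_match_notes()[0].is_high_match()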

+ 158 - 0
src/models/post.py

@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+帖子数据模型
+"""
+
+from dataclasses import dataclass, field
+from typing import List, Dict, Any, Optional
+
+
+@dataclass
+class PersonaMatch:
+    """人设特征匹配结果"""
+    persona_feature_name: str           # 人设特征名称
+    persona_feature_level: str          # 人设特征层级(灵感点/关键点/目的点)
+    feature_type: str                   # 特征类型(标签/分类)
+    feature_classification: List[str]   # 特征分类路径
+    similarity: float                   # 相似度得分
+    match_reason: str                   # 匹配说明
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'PersonaMatch':
+        """从字典创建"""
+        return PersonaMatch(
+            persona_feature_name=data.get('人设特征名称', ''),
+            persona_feature_level=data.get('人设特征层级', ''),
+            feature_type=data.get('特征类型', ''),
+            feature_classification=data.get('特征分类', []),
+            similarity=data.get('相似度', 0.0),
+            match_reason=data.get('说明', '')
+        )
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        return {
+            '人设特征名称': self.persona_feature_name,
+            '人设特征层级': self.persona_feature_level,
+            '特征类型': self.feature_type,
+            '特征分类': self.feature_classification,
+            '相似度': self.similarity,
+            '说明': self.match_reason
+        }
+
+
+@dataclass
+class TopicPoint:
+    """选题点(灵感点/关键点/目的点下的每一项)"""
+    id: str                             # ID
+    name: str                           # 名称
+    level: str                          # 层级(灵感点列表/关键点列表/目的点列表)
+    type: str                           # 类型(实质/形式/场景等)
+    description: str                    # 描述
+    confidence: float                   # 置信度
+    persona_matches: List[PersonaMatch] = field(default_factory=list)  # 人设匹配结果
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any], level: str) -> 'TopicPoint':
+        """从字典创建"""
+        # 解析人设匹配结果
+        matches = [
+            PersonaMatch.from_dict(m)
+            for m in data.get('匹配人设结果', [])
+        ]
+
+        return TopicPoint(
+            id=data.get('ID', ''),
+            name=data.get('名称', ''),
+            level=level,
+            type=data.get('类型', ''),
+            description=data.get('描述', ''),
+            confidence=data.get('置信度', 0.0),
+            persona_matches=matches
+        )
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        return {
+            'ID': self.id,
+            '名称': self.name,
+            '层级': self.level,
+            '类型': self.type,
+            '描述': self.description,
+            '置信度': self.confidence,
+            '匹配人设结果': [m.to_dict() for m in self.persona_matches]
+        }
+
+
+@dataclass
+class Post:
+    """帖子模型"""
+    post_id: str                        # 帖子ID
+    post_details: Dict[str, Any]        # 帖子详情
+    topic_points: List[TopicPoint]      # 所有选题点(灵感点+关键点+目的点)
+
+    @staticmethod
+    def from_json_file(file_path: str) -> 'Post':
+        """从JSON文件加载"""
+        import json
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        return Post.from_dict(data)
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'Post':
+        """从字典创建"""
+        # 解析选题点
+        topic_points = []
+        deconstruction = data.get('解构结果', {})
+
+        # 灵感点列表
+        for item in deconstruction.get('灵感点列表', []):
+            topic_points.append(TopicPoint.from_dict(item, '灵感点列表'))
+
+        # 关键点列表
+        for item in deconstruction.get('关键点列表', []):
+            topic_points.append(TopicPoint.from_dict(item, '关键点列表'))
+
+        # 目的点列表
+        for item in deconstruction.get('目的点列表', []):
+            topic_points.append(TopicPoint.from_dict(item, '目的点列表'))
+
+        return Post(
+            post_id=data.get('帖子id', ''),
+            post_details=data.get('帖子详情', {}),
+            topic_points=topic_points
+        )
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        # 按层级重组选题点
+        inspiration_points = []
+        key_points = []
+        purpose_points = []
+
+        for tp in self.topic_points:
+            if tp.level == '灵感点列表':
+                inspiration_points.append(tp.to_dict())
+            elif tp.level == '关键点列表':
+                key_points.append(tp.to_dict())
+            elif tp.level == '目的点列表':
+                purpose_points.append(tp.to_dict())
+
+        return {
+            '帖子id': self.post_id,
+            '帖子详情': self.post_details,
+            '解构结果': {
+                '灵感点列表': inspiration_points,
+                '关键点列表': key_points,
+                '目的点列表': purpose_points
+            }
+        }
+
+    def get_topic_point_by_name(self, name: str) -> Optional[TopicPoint]:
+        """根据名称查找选题点"""
+        for tp in self.topic_points:
+            if tp.name == name:
+                return tp
+        return None
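
A sketch of the Chinese-keyed input that Post.from_dict parses (values are placeholders):

    from src.models.post import Post

    raw = {
        "帖子id": "p1",
        "帖子详情": {},
        "解构结果": {
            "灵感点列表": [{"ID": "i1", "名称": "猫咪拟人", "类型": "形式",
                            "描述": "给猫戴墨镜", "置信度": 0.9, "匹配人设结果": []}],
            "关键点列表": [],
            "目的点列表": [],
        },
    }

    post = Post.from_dict(raw)
    tp = post.get_topic_point_by_name("猫咪拟人")
    assert tp is not None and tp.level == "灵感点列表"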

+ 174 - 0
src/models/query.py

@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+搜索Query数据模型
+"""
+
+from dataclasses import dataclass, field
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+
+@dataclass
+class SearchQuery:
+    """搜索Query模型"""
+
+    # 基本信息
+    query_text: str                     # 搜索词文本
+    score: float                        # LLM评分
+    reasoning: str = ""                 # 推荐理由
+
+    # 来源信息
+    topic_point_name: str = ""          # 所属选题点
+    used_candidates: List[str] = field(default_factory=list)  # 使用的候选词
+    candidate_sources: List[str] = field(default_factory=list)  # 候选词来源
+
+    # 搜索结果(Stage 3填充)
+    search_result: Optional[Dict[str, Any]] = None
+    search_metadata: Optional[Dict[str, Any]] = None
+
+    # 评估结果(Stage 4填充)
+    evaluation_result: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        return {
+            "query_text": self.query_text,
+            "score": self.score,
+            "reasoning": self.reasoning,
+            "topic_point_name": self.topic_point_name,
+            "used_candidates": self.used_candidates,
+            "candidate_sources": self.candidate_sources,
+            "search_result": self.search_result,
+            "search_metadata": self.search_metadata,
+            "evaluation_result": self.evaluation_result
+        }
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'SearchQuery':
+        """从字典创建"""
+        return SearchQuery(
+            query_text=data.get('query_text', ''),
+            score=data.get('score', 0.0),
+            reasoning=data.get('reasoning', ''),
+            topic_point_name=data.get('topic_point_name', ''),
+            used_candidates=data.get('used_candidates', []),
+            candidate_sources=data.get('candidate_sources', []),
+            search_result=data.get('search_result'),
+            search_metadata=data.get('search_metadata'),
+            evaluation_result=data.get('evaluation_result')
+        )
+
+    def has_search_result(self) -> bool:
+        """是否有搜索结果"""
+        return self.search_result is not None
+
+    def has_evaluation(self) -> bool:
+        """是否有评估结果"""
+        return self.evaluation_result is not None
+
+    def get_note_count(self) -> int:
+        """获取搜索到的帖子数量"""
+        if not self.search_metadata:
+            return 0
+        return self.search_metadata.get('note_count', 0)
+
+
+@dataclass
+class TopicPointQueries:
+    """选题点的搜索Query集合"""
+    topic_point_name: str               # 选题点名称
+    topic_point_level: str              # 选题点层级
+    queries: List[SearchQuery] = field(default_factory=list)  # Query列表
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        return {
+            "topic_point_name": self.topic_point_name,
+            "topic_point_level": self.topic_point_level,
+            "queries": [q.to_dict() for q in self.queries],
+            "statistics": {
+                "total_queries": len(self.queries),
+                "with_search_results": sum(1 for q in self.queries if q.has_search_result()),
+                "with_evaluations": sum(1 for q in self.queries if q.has_evaluation())
+            }
+        }
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'TopicPointQueries':
+        """从字典创建"""
+        queries = [
+            SearchQuery.from_dict(q)
+            for q in data.get('queries', [])
+        ]
+
+        return TopicPointQueries(
+            topic_point_name=data.get('topic_point_name', ''),
+            topic_point_level=data.get('topic_point_level', ''),
+            queries=queries
+        )
+
+    def get_top_n_queries(self, n: int = 10) -> List[SearchQuery]:
+        """获取Top N个Query(按score降序)"""
+        return sorted(
+            self.queries,
+            key=lambda q: q.score,
+            reverse=True
+        )[:n]
+
+
+@dataclass
+class PostQueries:
+    """帖子的所有搜索Query"""
+    post_id: str                        # 帖子ID
+    topic_points_queries: List[TopicPointQueries] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转为字典"""
+        total_queries = sum(len(tpq.queries) for tpq in self.topic_points_queries)
+        total_with_results = sum(
+            sum(1 for q in tpq.queries if q.has_search_result())
+            for tpq in self.topic_points_queries
+        )
+        total_with_evaluations = sum(
+            sum(1 for q in tpq.queries if q.has_evaluation())
+            for tpq in self.topic_points_queries
+        )
+
+        return {
+            "post_id": self.post_id,
+            "topic_points_queries": [tpq.to_dict() for tpq in self.topic_points_queries],
+            "statistics": {
+                "total_topic_points": len(self.topic_points_queries),
+                "total_queries": total_queries,
+                "with_search_results": total_with_results,
+                "with_evaluations": total_with_evaluations
+            }
+        }
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'PostQueries':
+        """从字典创建"""
+        tpq_list = [
+            TopicPointQueries.from_dict(tpq)
+            for tpq in data.get('topic_points_queries', [])
+        ]
+
+        return PostQueries(
+            post_id=data.get('post_id', ''),
+            topic_points_queries=tpq_list
+        )
+
+    def get_topic_point_queries(self, topic_point_name: str) -> Optional[TopicPointQueries]:
+        """获取指定选题点的Query"""
+        for tpq in self.topic_points_queries:
+            if tpq.topic_point_name == topic_point_name:
+                return tpq
+        return None
+
+    def get_all_queries(self) -> List[SearchQuery]:
+        """获取所有Query(扁平化)"""
+        all_queries = []
+        for tpq in self.topic_points_queries:
+            all_queries.extend(tpq.queries)
+        return all_queries
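
A ranking sketch over the query helpers above:

    from src.models.query import PostQueries, SearchQuery, TopicPointQueries

    pq = PostQueries(post_id="p1", topic_points_queries=[
        TopicPointQueries("猫咪拟人造型", "灵感点列表", queries=[
            SearchQuery("猫咪 墨镜", score=0.85),
            SearchQuery("宠物 造型", score=0.60),
        ]),
    ])

    top = pq.topic_points_queries[0].get_top_n_queries(1)
    assert top[0].query_text == "猫咪 墨镜"
    assert len(pq.get_all_queries()) == 2        # Stage 3/4 results not attached yet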

+ 221 - 0
src/models/similarity.py

@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+相似度分析数据模型
+"""
+
+from dataclasses import dataclass, field, asdict
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+
+@dataclass
+class SimilarFeature:
+    """带相似度的特征"""
+    feature_name: str                    # 特征名称
+    dimension: str                       # 维度
+    dimension_detail: str                # 维度细分
+    weight: float                        # 权重
+    source_index: int                    # 来源索引
+    source_info: Dict[str, Any]          # 溯源信息
+
+    # 相似度信息
+    similarity_score: float              # 相似度得分
+    similarity_explanation: str          # 相似度说明
+
+
+@dataclass
+class SimilarityStatistics:
+    """相似度统计信息"""
+    total_features: int = 0              # 特征总数
+    max_similarity: float = 0.0          # 最高相似度
+    min_similarity: float = 0.0          # 最低相似度
+    avg_similarity: float = 0.0          # 平均相似度
+    high_similarity_count: int = 0       # 高相似度数量 (>=0.7)
+    medium_similarity_count: int = 0     # 中等相似度数量 (0.5-0.7)
+    low_similarity_count: int = 0        # 低相似度数量 (<0.5)
+    error: Optional[str] = None          # 错误信息 (如果有)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return asdict(self)
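
The analyzer that fills these counters lives elsewhere in the commit; purely as a sketch, bucketing raw scores by the thresholds noted above (>=0.7 high, 0.5-0.7 medium, <0.5 low) could look like:

from src.models.similarity import SimilarityStatistics

def build_stats(sims):
    # Aggregate a list of similarity scores into the statistics object.
    if not sims:
        return SimilarityStatistics(error="no features")
    return SimilarityStatistics(
        total_features=len(sims),
        max_similarity=max(sims),
        min_similarity=min(sims),
        avg_similarity=sum(sims) / len(sims),
        high_similarity_count=sum(1 for s in sims if s >= 0.7),
        medium_similarity_count=sum(1 for s in sims if 0.5 <= s < 0.7),
        low_similarity_count=sum(1 for s in sims if s < 0.5),
    )

print(build_stats([0.92, 0.61, 0.44]).to_dict())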
+
+
+@dataclass
+class ComprehensiveScoreDetail:
+    """综合得分P详细信息"""
+    N: int                               # 总帖子数
+    M: int                               # 完全匹配帖子数
+    total_contribution: float            # 总贡献值 Σ(a×b)
+    complete_matches: List[Dict[str, Any]]  # 完全匹配列表 (含每个的贡献)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return asdict(self)
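
How P itself and the per-match factors are derived is not shown in this diff; the following only illustrates the Σ(a×b) bookkeeping the fields imply, with a, b and the match entries entirely hypothetical:

from src.models.similarity import ComprehensiveScoreDetail

matches = [
    {"note_id": "n1", "a": 0.9, "b": 0.85},  # a, b: hypothetical per-match factors
    {"note_id": "n2", "a": 0.8, "b": 0.80},
]
detail = ComprehensiveScoreDetail(
    N=20,                # total posts considered
    M=len(matches),      # fully matched posts
    total_contribution=sum(m["a"] * m["b"] for m in matches),  # Σ(a×b) = 1.405
    complete_matches=matches,
)
print(detail.to_dict()["total_contribution"])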
+
+
+@dataclass
+class SimilarityScore:
+    """单个帖子的相似度评分"""
+    note_id: str                         # 帖子ID
+    original_feature: str                # 原始特征
+    evaluation_score: float              # Stage 4评估得分
+    search_word: str                     # 搜索词
+
+    # 帖子数据
+    note_data: Dict[str, Any] = field(default_factory=dict)  # 帖子信息
+
+    # 相似特征列表 (按相似度降序排序)
+    deconstructed_features: List[SimilarFeature] = field(default_factory=list)
+
+    # 统计信息
+    similarity_statistics: SimilarityStatistics = field(default_factory=SimilarityStatistics)
+
+    # 综合得分P (可选,如果计算了的话)
+    comprehensive_score: Optional[float] = None
+    comprehensive_score_detail: Optional[ComprehensiveScoreDetail] = None
+
+    # 处理时间
+    processing_time_seconds: float = 0.0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        result = {
+            'note_id': self.note_id,
+            'original_feature': self.original_feature,
+            'evaluation_score': self.evaluation_score,
+            'search_word': self.search_word,
+            'note_data': self.note_data,
+            'deconstructed_features': [asdict(f) for f in self.deconstructed_features],
+            'similarity_statistics': self.similarity_statistics.to_dict(),
+            'processing_time_seconds': round(self.processing_time_seconds, 2)
+        }
+
+        # 添加综合得分 (如果有)
+        if self.comprehensive_score is not None:
+            result['comprehensive_score'] = round(self.comprehensive_score, 3)
+        if self.comprehensive_score_detail is not None:
+            result['comprehensive_score_detail'] = self.comprehensive_score_detail.to_dict()
+
+        return result
+
+
+@dataclass
+class OverallSimilarityStatistics:
+    """整体相似度统计"""
+    total_notes: int                     # 总帖子数
+    total_features_extracted: int        # 提取的特征总数
+    avg_features_per_note: float         # 平均特征数/帖子
+    avg_max_similarity: float            # 平均最高相似度
+    notes_with_high_similarity: int      # 包含高相似度特征的帖子数
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return asdict(self)
+
+
+@dataclass
+class PostSimilarityScores:
+    """帖子相似度评分集合"""
+    post_id: str                                          # 帖子ID
+    similarity_scores: List[SimilarityScore]              # 相似度评分列表
+
+    # 相似度配置
+    algorithm: str = "hybrid_similarity"                  # 算法名称
+    weight_embedding: float = 0.5                         # 向量权重
+    weight_semantic: float = 0.5                          # 语义权重
+    min_similarity_threshold: float = 0.0                 # 最小相似度阈值
+
+    # 目标特征
+    target_features: Optional[List[str]] = None           # 目标特征列表
+
+    # 整体统计
+    overall_statistics: Optional[OverallSimilarityStatistics] = None
+
+    # 时间信息
+    source_file: str = ""                                 # 来源文件
+    created_at: str = ""                                  # 创建时间
+    processing_time_seconds: float = 0.0                  # 处理耗时(秒)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'metadata': {
+                'stage': 'similarity',
+                'description': '解构特征与原始特征的相似度评分',
+                'post_id': self.post_id,
+                'source_file': self.source_file,
+                'target_features': self.target_features if self.target_features else '全部',
+                'similarity_config': {
+                    'algorithm': self.algorithm,
+                    'weight_embedding': self.weight_embedding,
+                    'weight_semantic': self.weight_semantic,
+                    'min_similarity_threshold': self.min_similarity_threshold
+                },
+                'overall_statistics': self.overall_statistics.to_dict() if self.overall_statistics else None,
+                'created_at': self.created_at or datetime.now().isoformat(),
+                'processing_time_seconds': round(self.processing_time_seconds, 2)
+            },
+            'results': [s.to_dict() for s in self.similarity_scores]
+        }
+
+    @classmethod
+    def from_json_file(cls, file_path: str) -> 'PostSimilarityScores':
+        """从JSON文件加载"""
+        import json
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        metadata = data['metadata']
+        results_data = data['results']
+
+        # 重建SimilarityScore对象
+        scores = []
+        for r in results_data:
+            # 重建特征列表
+            features = [
+                SimilarFeature(**f) for f in r.get('deconstructed_features', [])
+            ]
+
+            # 重建统计信息
+            stats = SimilarityStatistics(**r['similarity_statistics'])
+
+            # 重建综合得分详情
+            comprehensive_detail = None
+            if 'comprehensive_score_detail' in r:
+                comprehensive_detail = ComprehensiveScoreDetail(**r['comprehensive_score_detail'])
+
+            score = SimilarityScore(
+                note_id=r['note_id'],
+                original_feature=r['original_feature'],
+                evaluation_score=r['evaluation_score'],
+                search_word=r['search_word'],
+                note_data=r['note_data'],
+                deconstructed_features=features,
+                similarity_statistics=stats,
+                comprehensive_score=r.get('comprehensive_score'),
+                comprehensive_score_detail=comprehensive_detail,
+                processing_time_seconds=r['processing_time_seconds']
+            )
+            scores.append(score)
+
+        # 重建整体统计
+        overall_stats = None
+        if metadata.get('overall_statistics'):
+            overall_stats = OverallSimilarityStatistics(**metadata['overall_statistics'])
+
+        similarity_config = metadata['similarity_config']
+
+        return cls(
+            post_id=metadata['post_id'],
+            similarity_scores=scores,
+            algorithm=similarity_config['algorithm'],
+            weight_embedding=similarity_config['weight_embedding'],
+            weight_semantic=similarity_config['weight_semantic'],
+            min_similarity_threshold=similarity_config['min_similarity_threshold'],
+            target_features=metadata.get('target_features'),
+            overall_statistics=overall_stats,
+            source_file=metadata['source_file'],
+            created_at=metadata['created_at'],
+            processing_time_seconds=metadata['processing_time_seconds']
+        )
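
Since from_json_file only needs the metadata keys that to_dict serializes, a minimal file round-trips cleanly; a self-contained sketch (ids and values are placeholders):

import json
import tempfile
from src.models.similarity import PostSimilarityScores

minimal = {
    "metadata": {
        "post_id": "demo_post",
        "source_file": "evaluated_results.json",
        "created_at": "2025-01-01T00:00:00",
        "processing_time_seconds": 0.0,
        "target_features": None,
        "overall_statistics": None,
        "similarity_config": {
            "algorithm": "hybrid_similarity",
            "weight_embedding": 0.5,
            "weight_semantic": 0.5,
            "min_similarity_threshold": 0.0,
        },
    },
    "results": [],
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False, encoding="utf-8") as f:
    json.dump(minimal, f, ensure_ascii=False)
    path = f.name

loaded = PostSimilarityScores.from_json_file(path)
print(loaded.post_id, len(loaded.similarity_scores))  # demo_post 0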

+ 0 - 0
src/pipeline/__init__.py


+ 1291 - 0
src/pipeline/feature_search_pipeline.py

@@ -0,0 +1,1291 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+增强搜索系统 V2
+支持LLM评估和扩展搜索的完整流程
+"""
+
+import json
+import logging
+import os
+import argparse
+import subprocess
+from typing import Dict, List, Any, Optional
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from src.clients.openrouter_client import OpenRouterClient
+from src.evaluators.llm_evaluator import LLMEvaluator
+from src.clients.xiaohongshu_search import XiaohongshuSearch
+from src.analyzers.post_deconstruction_analyzer import PostDeconstructionAnalyzer
+from src.analyzers.similarity_analyzer import SimilarityAnalyzer
+
+# 配置日志
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    handlers=[
+        logging.FileHandler('enhanced_search_v2.log', encoding='utf-8'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+class EnhancedSearchV2:
+    """增强搜索系统V2"""
+
+    def __init__(
+        self,
+        how_json_path: str,
+        openrouter_api_key: Optional[str] = None,
+        output_dir: str = "output_v2",
+        top_n: int = 10,
+        max_total_searches: Optional[int] = None,
+        search_max_workers: int = 3,
+        max_searches_per_feature: Optional[int] = None,
+        max_searches_per_base_word: Optional[int] = None,
+        enable_evaluation: bool = True,
+        evaluation_max_workers: int = 10,
+        evaluation_max_notes_per_query: int = 20,
+        enable_deep_analysis: bool = False,
+        deep_analysis_only: bool = False,
+        deep_analysis_max_workers: int = 5,
+        deep_analysis_max_notes: Optional[int] = None,
+        deep_analysis_skip_count: int = 0,
+        deep_analysis_sort_by: str = 'score',
+        deep_analysis_api_url: str = "http://192.168.245.150:7000/what/analysis/single",
+        deep_analysis_min_score: float = 0.8,
+        enable_similarity_analysis: bool = False,
+        similarity_weight_embedding: float = 0.5,
+        similarity_weight_semantic: float = 0.5,
+        similarity_max_workers: int = 5,
+        similarity_min_similarity: float = 0.0
+    ):
+        """
+        初始化系统
+
+        Args:
+            how_json_path: How解构文件路径
+            openrouter_api_key: OpenRouter API密钥
+            output_dir: 输出目录
+            top_n: 每个原始特征取评分最高的N个搜索词(默认10)
+            max_total_searches: 全局最大搜索次数限制(默认None不限制)
+            search_max_workers: 搜索并发数(默认3)
+            max_searches_per_feature: 每个原始特征的最大搜索次数(默认None不限制)
+            max_searches_per_base_word: 每个base_word的最大搜索次数(默认None不限制)
+            enable_evaluation: 是否启用结果评估(默认True)
+            evaluation_max_workers: 结果评估并发评估数(默认10)
+            evaluation_max_notes_per_query: 每个搜索结果评估的最大帖子数(默认20)
+            enable_deep_analysis: 是否启用深度解构(默认False)
+            deep_analysis_only: 只运行深度解构(从结果评估结果开始,默认False)
+            deep_analysis_max_workers: 深度解构并发数(默认5)
+            deep_analysis_max_notes: 深度解构最多处理多少个帖子(默认None不限制)
+            deep_analysis_skip_count: 深度解构跳过前N个帖子(默认0)
+            deep_analysis_sort_by: 深度解构排序方式:score/time/engagement(默认score)
+            deep_analysis_api_url: 深度解构API地址
+            deep_analysis_min_score: 深度解构处理的最低分数阈值(默认0.8,0-1分制)
+            enable_similarity_analysis: 是否启用相似度分析(默认False)
+            similarity_weight_embedding: 相似度分析向量模型权重(默认0.5)
+            similarity_weight_semantic: 相似度分析LLM模型权重(默认0.5)
+            similarity_max_workers: 相似度分析并发数(默认5)
+            similarity_min_similarity: 相似度分析最小相似度阈值(默认0.0)
+        """
+        self.how_json_path = how_json_path
+        self.output_dir = output_dir
+        self.top_n = top_n
+        self.max_total_searches = max_total_searches
+        self.search_max_workers = search_max_workers
+        self.max_searches_per_feature = max_searches_per_feature
+        self.max_searches_per_base_word = max_searches_per_base_word
+        self.enable_evaluation = enable_evaluation
+        self.evaluation_max_workers = evaluation_max_workers
+        self.evaluation_max_notes_per_query = evaluation_max_notes_per_query
+        self.enable_deep_analysis = enable_deep_analysis
+        self.deep_analysis_only = deep_analysis_only
+        self.enable_similarity_analysis = enable_similarity_analysis
+
+        # 创建输出目录
+        os.makedirs(output_dir, exist_ok=True)
+
+        # 加载数据
+        logger.info("加载数据文件...")
+        self.how_data = self._load_json(how_json_path)
+        logger.info("  ✓ 已加载 how.json")
+
+        # 初始化组件
+        logger.info("初始化组件...")
+        self.openrouter_client = OpenRouterClient(
+            api_key=openrouter_api_key,
+            model="google/gemini-2.5-flash",
+            retry_delay=5  # 增加重试延迟避免限流
+        )
+        self.llm_evaluator = LLMEvaluator(self.openrouter_client)
+        self.search_client = XiaohongshuSearch()
+
+        # 初始化深度解构分析器
+        self.deep_analyzer = PostDeconstructionAnalyzer(
+            api_url=deep_analysis_api_url,
+            max_workers=deep_analysis_max_workers,
+            max_notes=deep_analysis_max_notes,
+            min_score=deep_analysis_min_score,
+            skip_count=deep_analysis_skip_count,
+            sort_by=deep_analysis_sort_by,
+            output_dir=output_dir,
+            enable_image_download=False,  # 直接使用原始图片URL,不做代理
+            image_server_url="http://localhost:8765",  # 图片服务器URL(已弃用)
+            image_download_dir="downloaded_images"  # 图片下载目录(已弃用)
+        )
+
+        # 初始化相似度分析器
+        self.similarity_analyzer = SimilarityAnalyzer(
+            weight_embedding=similarity_weight_embedding,
+            weight_semantic=similarity_weight_semantic,
+            max_workers=similarity_max_workers,
+            min_similarity=similarity_min_similarity,
+            evaluation_results_path=os.path.join(output_dir, "evaluated_results.json"),
+            update_evaluation_scores=True  # 自动计算综合得分P
+        )
+
+        logger.info("系统初始化完成")
+
+    def _load_json(self, file_path: str) -> Any:
+        """加载JSON文件"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except Exception as e:
+            logger.error(f"加载文件失败 {file_path}: {e}")
+            raise
+
+    def _save_json(self, data: Any, file_path: str):
+        """保存JSON文件"""
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                json.dump(data, f, ensure_ascii=False, indent=2)
+            logger.info(f"已保存: {file_path}")
+        except Exception as e:
+            logger.error(f"保存文件失败 {file_path}: {e}")
+            raise
+
+    # ========== 步骤1:筛选 0.5 <= 相似度 < 0.8 的特征 ==========
+
+    def filter_medium_similarity_features(self) -> List[Dict[str, Any]]:
+        """
+        步骤1:筛选中等匹配度特征
+
+        筛选条件:0.5 <= 最高相似度 < 0.8
+
+        Returns:
+            筛选后的特征列表
+        """
+        logger.info("=" * 60)
+        logger.info("步骤1:筛选中等匹配度特征 (0.5 <= 相似度 < 0.8)")
+        logger.info("=" * 60)
+
+        results = []
+        how_result = self.how_data.get('解构结果', {})
+
+        total_features = 0
+        filtered_out_low = 0  # < 0.5
+        filtered_out_high = 0  # >= 0.8
+        selected_count = 0
+
+        # 遍历三个维度
+        for level_name, level_list in how_result.items():
+            if not isinstance(level_list, list):
+                continue
+
+            logger.info(f"\n处理 {level_name}...")
+
+            for item_idx, item in enumerate(level_list):
+                item_name = item.get('名称', f'未命名-{item_idx}')
+
+                # 新格式:直接读取点层级的匹配人设结果
+                match_results = item.get('匹配人设结果', [])
+
+                total_features += 1
+
+                if not match_results:
+                    logger.info(f"  ✗ {item_name}: 无匹配结果")
+                    continue
+
+                # 找到最高相似度(新格式:相似度是直接字段)
+                max_similarity = max(
+                    (m.get('相似度', 0) for m in match_results),
+                    default=0
+                )
+
+                # 筛选条件
+                if max_similarity < 0.5:
+                    filtered_out_low += 1
+                    logger.info(f"  ✗ {item_name}: 最高相似度 {max_similarity:.3f} < 0.5(过滤)")
+                    continue
+                elif max_similarity >= 0.8:
+                    filtered_out_high += 1
+                    logger.info(f"  ✗ {item_name}: 最高相似度 {max_similarity:.3f} >= 0.8(过滤)")
+                    continue
+
+                # 0.5 <= max_similarity < 0.8,保留
+                # 按相似度降序排序,取前3个
+                sorted_matches = sorted(
+                    match_results,
+                    key=lambda x: x.get('相似度', 0),
+                    reverse=True
+                )
+                top3_matches = sorted_matches[:3]  # 取前3个
+
+                # 构建top3匹配信息列表
+                top3_match_info = []
+                for match in top3_matches:
+                    feature_classification = match.get('特征分类', [])
+                    classification_path = self._build_classification_path(feature_classification)
+
+                    # 直接从匹配结果读取特征类型
+                    is_classification = (match.get('特征类型') == '分类')
+
+                    top3_match_info.append({
+                        '人设特征名称': match.get('人设特征名称'),
+                        '人设特征层级': match.get('人设特征层级'),
+                        '特征类型': match.get('特征类型'),
+                        '特征分类': feature_classification,
+                        '相似度': match.get('相似度', 0),  # 直接字段
+                        '匹配说明': match.get('说明', ''),  # 直接字段
+                        '是分类': is_classification,
+                        '所属分类路径': classification_path
+                    })
+
+                result_item = {
+                    '原始特征名称': item_name,  # 使用点的名称作为特征名
+                    '来源层级': level_name,
+                    '权重': 1.0,  # 新格式没有权重字段,默认1.0
+                    '所属点名称': item_name,
+                    '最高匹配信息': top3_match_info[0],  # 保留第1个用于Stage2
+                    'top3匹配信息': top3_match_info  # 新增字段
+                }
+
+                results.append(result_item)
+                selected_count += 1
+
+                # 显示top3匹配信息
+                top3_names = [m['人设特征名称'] for m in top3_match_info]
+                logger.info(f"  ✓ {item_name} → Top{len(top3_match_info)}: {', '.join(top3_names)}")
+
+        # 统计信息
+        logger.info(f"\n" + "=" * 60)
+        logger.info(f"步骤1完成")
+        logger.info(f"  总特征数: {total_features}")
+        logger.info(f"  过滤掉(<0.5): {filtered_out_low}")
+        logger.info(f"  过滤掉(>=0.8): {filtered_out_high}")
+        logger.info(f"  保留(0.5-0.8): {selected_count}")
+        logger.info("=" * 60)
+
+        # 保存结果
+        output_path = os.path.join(self.output_dir, "filtered_features.json")
+        self._save_json(results, output_path)
+
+        return results
+
+    def _build_classification_path(self, feature_classification: List[str]) -> str:
+        """
+        构建分类路径
+
+        Args:
+            feature_classification: 特征分类数组
+
+        Returns:
+            分类路径
+        """
+        if not feature_classification:
+            return ""
+
+        # 步骤1: 去掉中间元素的"实质"后缀
+        cleaned = []
+        for i, item in enumerate(feature_classification):
+            if i == len(feature_classification) - 1:  # 最后一个保留
+                cleaned.append(item)
+            elif item.endswith("实质") and i != 0:  # 中间的去掉"实质"
+                cleaned.append(item[:-2])
+            else:
+                cleaned.append(item)
+
+        # 步骤2: 反转数组
+        reversed_list = list(reversed(cleaned))
+
+        # 步骤3: 拼接路径
+        path = "/".join(reversed_list)
+
+        return path
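
Worked through on a hypothetical classification array, the three steps behave like this (inline re-implementation for illustration only):

fc = ["烘焙", "美食实质", "兴趣爱好"]  # hypothetical 特征分类, leaf to root
cleaned = [x[:-2] if (0 < i < len(fc) - 1 and x.endswith("实质")) else x
           for i, x in enumerate(fc)]
print("/".join(reversed(cleaned)))  # -> 兴趣爱好/美食/烘焙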
+
+    # ========== 步骤2:从how文件提取高相似度候选词 ==========
+
+    def extract_candidate_words(self, filtered_features: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        步骤2:从how文件提取候选词
+
+        处理流程:
+        1. 提取人设候选词:相似度 >= 0.8 的人设特征名称
+        2. 提取帖子候选词:点的名称(灵感点、目的点、关键点),要求该点与人设的最高相似度 >= 0.8
+        3. 合并两种候选词并去重
+        4. 按相似度降序排序
+        5. 为每个中心词分配候选词列表
+        6. 构造 '高相似度候选_按base_word' 结构
+
+        Args:
+            filtered_features: 特征筛选筛选的特征列表
+
+        Returns:
+            带高相似度候选的特征列表
+        """
+        logger.info("=" * 60)
+        logger.info("步骤2:从how文件提取候选词(人设+帖子)")
+        logger.info("=" * 60)
+
+        how_result = self.how_data.get('解构结果', {})
+
+        # Step 1: 提取人设候选词(相似度 >= 0.8)
+        persona_candidates_dict = {}  # {人设特征名称: {候选词信息}}
+
+        for dimension in ['灵感点列表', '关键点列表', '目的点列表']:
+            items_list = how_result.get(dimension, [])
+
+            for item in items_list:
+                item_name = item.get('名称', '')
+                matches = item.get('匹配人设结果', [])
+
+                for match in matches:
+                    similarity = match.get('相似度', 0)
+                    persona_feature_name = match.get('人设特征名称', '')
+
+                    # 筛选相似度 >= 0.8
+                    if similarity >= 0.8 and persona_feature_name:
+                        # 去重逻辑:保留最高相似度
+                        if persona_feature_name not in persona_candidates_dict or \
+                           similarity > persona_candidates_dict[persona_feature_name]['相似度']:
+                            persona_candidates_dict[persona_feature_name] = {
+                                '候选词': persona_feature_name,
+                                '候选词类型': 'persona',  # 标记为人设候选词
+                                '相似度': similarity,
+                                '特征类型': match.get('特征类型', ''),
+                                '特征分类': match.get('特征分类', []),
+                                '人设特征层级': match.get('人设特征层级', ''),
+                                '来源层级': 'persona',
+                                '来源路径': self._build_classification_path(match.get('特征分类', [])),
+                                '匹配说明': match.get('说明', ''),
+                                '来源原始特征': item_name
+                            }
+
+        # Step 2: 提取帖子候选词(点名称,要求该点与人设的最高相似度 >= 0.8)
+        post_candidates_dict = {}  # {点名称: {候选词信息}}
+
+        for dimension in ['灵感点列表', '关键点列表', '目的点列表']:
+            items_list = how_result.get(dimension, [])
+
+            for item in items_list:
+                item_name = item.get('名称', '')
+                matches = item.get('匹配人设结果', [])
+
+                if not item_name or not matches:
+                    continue
+
+                # 计算该点与人设的最高相似度
+                max_similarity = max(
+                    (m.get('相似度', 0) for m in matches),
+                    default=0
+                )
+
+                # 只有最高相似度 >= 0.8 的点才作为帖子候选词
+                if max_similarity >= 0.8:
+                    # 如果点名称已经作为人设候选词存在,跳过(优先保留人设候选词)
+                    if item_name not in persona_candidates_dict and item_name not in post_candidates_dict:
+                        post_candidates_dict[item_name] = {
+                            '候选词': item_name,
+                            '候选词类型': 'post',  # 标记为帖子候选词
+                            '相似度': 1.0,  # 帖子自身的点,相似度视为1.0
+                            '特征类型': item.get('类型', ''),
+                            '特征分类': [],
+                            '人设特征层级': '',
+                            '来源层级': dimension,
+                            '来源路径': f"帖子/{dimension}/{item_name}",
+                            '匹配说明': item.get('描述', ''),
+                            '来源原始特征': item_name,
+                            '点最高人设相似度': max_similarity  # 记录该点与人设的最高相似度
+                        }
+
+        # Step 3: 合并两种候选词
+        all_candidates_dict = {}
+        all_candidates_dict.update(persona_candidates_dict)  # 人设候选词
+        all_candidates_dict.update(post_candidates_dict)     # 帖子候选词
+
+        # Step 4: 转为列表并按相似度降序排序
+        global_candidates = sorted(
+            all_candidates_dict.values(),
+            key=lambda x: x['相似度'],
+            reverse=True
+        )
+
+        logger.info(f"候选词统计:")
+        logger.info(f"  - 人设候选词: {len(persona_candidates_dict)} 个")
+        logger.info(f"  - 帖子候选词: {len(post_candidates_dict)} 个")
+        logger.info(f"  - 总候选词: {len(global_candidates)} 个")
+
+        # 显示Top 10候选词
+        if global_candidates:
+            logger.info("\nTop 10 候选词:")
+            for i, candidate in enumerate(global_candidates[:10], 1):
+                cand_type = "人设" if candidate['候选词类型'] == 'persona' else "帖子"
+                logger.info(f"  {i}. {candidate['候选词']} (相似度: {candidate['相似度']:.3f}, 类型: {cand_type})")
+
+        # Step 5: 为每个中心词分配候选词并构造输出结构(对应流程中的5、6)
+        results = []
+        for idx, feature_data in enumerate(filtered_features, 1):
+            original_feature_name = feature_data.get('原始特征名称', '')
+            logger.info(f"\n[{idx}/{len(filtered_features)}] 处理: {original_feature_name}")
+
+            top3_matches = feature_data.get('top3匹配信息', [])
+
+            # 提取3个中心词
+            base_words = [match.get('人设特征名称', '') for match in top3_matches[:3]]
+            logger.info(f"  中心词: {', '.join(base_words)}")
+
+            # 所有中心词共享相同的候选词列表
+            high_similarity_by_base = {}
+            for base_word in base_words:
+                if base_word:
+                    high_similarity_by_base[base_word] = global_candidates.copy()
+
+            logger.info(f"  每个中心词分配 {len(global_candidates)} 个候选词")
+
+            result = {
+                '原始特征名称': original_feature_name,
+                '来源层级': feature_data.get('来源层级', ''),  # 保留元数据
+                '权重': feature_data.get('权重', 0),  # 保留元数据
+                'top3匹配信息': top3_matches,
+                '找到的关联_按base_word': {},  # 新方式不需要关联分析
+                '高相似度候选_按base_word': high_similarity_by_base
+            }
+            results.append(result)
+
+        # 保存结果
+        output_path = os.path.join(self.output_dir, 'candidate_words.json')
+        self._save_json(results, output_path)
+
+        logger.info(f"\n" + "=" * 60)
+        logger.info(f"步骤2完成")
+        logger.info(f"  提取候选词: {len(global_candidates)} 个")
+        logger.info(f"  处理特征: {len(results)} 个")
+        logger.info("=" * 60)
+
+        return results
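
Steps 1 and 2 above (and the visualizer later in this commit) reuse one small rule: keep the highest-similarity entry per candidate name, then sort descending. Distilled with hypothetical data:

def dedupe_keep_max(cands):
    # Keep only the best-scoring entry for each candidate name.
    best = {}
    for c in cands:
        name = c["候选词"]
        if name not in best or c["相似度"] > best[name]["相似度"]:
            best[name] = c
    return sorted(best.values(), key=lambda x: x["相似度"], reverse=True)

demo = [
    {"候选词": "手冲咖啡", "相似度": 0.83},
    {"候选词": "手冲咖啡", "相似度": 0.91},  # duplicate name, this one wins
    {"候选词": "拉花教学", "相似度": 0.86},
]
print([c["候选词"] for c in dedupe_keep_max(demo)])  # ['手冲咖啡', '拉花教学']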
+
+    # ========== 步骤3:多词组合 + LLM评估 ==========
+
+    def generate_search_queries(
+        self,
+        features_data: List[Dict[str, Any]],
+        max_workers: int = 4,
+        max_candidates: int = 20,
+        max_combo_length: int = 4
+    ) -> List[Dict[str, Any]]:
+        """
+        步骤3:多词组合 + LLM评估
+
+        基于步骤1的top3基础词和步骤2的高相似度候选,
+        由LLM直接基于候选词生成query,每个base_word选出Top10
+
+        Args:
+            features_data: 步骤2的数据(包含高相似度候选)
+            max_workers: 并发评估的原始特征数(默认4)
+            max_candidates: 参与组合的最大候选词数(默认20)
+            max_combo_length: 最大组合词数(保留参数,LLM生成方式下未使用)
+
+        Returns:
+            带LLM评估的数据
+        """
+        logger.info("=" * 60)
+        logger.info("步骤4:多词组合 + LLM评估")
+        logger.info(f"  最大候选词数: {max_candidates}")
+        logger.info(f"  最大组合长度: {max_combo_length} 词")
+        logger.info(f"  并发数: {max_workers} 个原始特征")
+        logger.info("=" * 60)
+
+        total_features = len(features_data)
+
+        # 使用ThreadPoolExecutor并行处理不同的原始特征
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # 提交所有任务
+            futures = []
+            for idx, feature_result in enumerate(features_data, 1):
+                future = executor.submit(
+                    self._process_single_feature_combinations,
+                    idx,
+                    total_features,
+                    feature_result,
+                    max_candidates,
+                    max_combo_length
+                )
+                futures.append((future, feature_result))
+
+            # 等待所有任务完成并收集结果
+            for future, feature_result in futures:
+                try:
+                    _ = future.result()  # 等待完成,结果已经写回到feature_result中
+                except Exception as e:
+                    logger.error(f"  评估失败: {feature_result['原始特征名称']}, 错误: {e}")
+
+        # 保存结果
+        output_path = os.path.join(self.output_dir, "search_queries_evaluated.json")
+        self._save_json(features_data, output_path)
+
+        logger.info(f"\n" + "=" * 60)
+        logger.info(f"步骤4完成")
+        logger.info("=" * 60)
+
+        return features_data
+
+    def _process_single_feature_combinations(
+        self,
+        idx: int,
+        total: int,
+        feature_result: Dict[str, Any],
+        max_candidates: int,
+        max_combo_length: int
+    ) -> None:
+        """
+        处理单个原始特征的组合生成和评估
+
+        改进: 每个base_word使用自己的候选词(而不是共享)
+
+        Steps:
+        1. Get top3 base_words from 步骤1's top3匹配信息
+        2. For each base_word:
+           a. Get candidates from 步骤2's 高相似度候选_按base_word
+           b. LLM generates queries from the candidates
+           c. Select Top 10
+        3. Save grouped results
+
+        Args:
+            idx: 特征索引
+            total: 总特征数
+            feature_result: 特征结果数据
+            max_candidates: 参与组合的最大候选词数
+            max_combo_length: 最大组合词数
+        """
+        original_feature = feature_result['原始特征名称']
+        logger.info(f"\n[{idx}/{total}] 处理: {original_feature}")
+
+        # 步骤1: 获取top3基础词
+        top3_info = feature_result.get('top3匹配信息', [])
+        if not top3_info:
+            logger.info(f"  无top3匹配信息,跳过")
+            feature_result['组合评估结果_分组'] = []
+            return
+
+        logger.info(f"  找到 {len(top3_info)} 个base_word")
+
+        # 步骤2: 获取按base_word分组的候选词
+        candidates_by_base_word = feature_result.get('高相似度候选_按base_word', {})
+
+        if not candidates_by_base_word:
+            logger.warning(f"  无按base_word分组的候选词,跳过")
+            feature_result['组合评估结果_分组'] = []
+            return
+
+        # 步骤3: 为每个base_word独立处理
+        grouped_results = []
+
+        for base_idx, base_info in enumerate(top3_info, 1):
+            base_word = base_info.get('人设特征名称', '')
+            base_similarity = base_info.get('相似度', 0)
+
+            if not base_word:
+                continue
+
+            logger.info(f"  [{base_idx}/{len(top3_info)}] Base Word: {base_word} (相似度: {base_similarity:.3f})")
+
+            # 获取该base_word的候选词
+            base_candidates = candidates_by_base_word.get(base_word, [])
+            candidates = base_candidates[:max_candidates]
+            candidate_words = [c['候选词'] for c in candidates]
+
+            if not candidate_words:
+                logger.warning(f"    该base_word无候选词,跳过")
+                grouped_results.append({
+                    'base_word': base_word,
+                    'base_word_similarity': base_similarity,
+                    'base_word_info': base_info,
+                    'top10_searches': [],
+                    'available_words': []
+                })
+                continue
+
+            logger.info(f"    候选词数量: {len(candidate_words)} (限制: {max_candidates})")
+
+            # LLM生成query(新方式:直接让LLM基于候选词生成query)
+            logger.info(f"    使用LLM生成query(中心词: {base_word})...")
+            evaluated = self.llm_evaluator.generate_queries_from_candidates(
+                original_feature=original_feature,
+                base_word=base_word,
+                candidate_words=candidate_words,
+                max_queries=10
+            )
+
+            # 选出Top 10(已经由LLM生成方法控制数量)
+            top_10 = evaluated[:10]
+            logger.info(f"    生成完成,共 {len(top_10)} 个query")
+
+            # 保存分组结果 - 每个base_word有自己的available_words
+            grouped_results.append({
+                'base_word': base_word,
+                'base_word_similarity': base_similarity,
+                'base_word_info': base_info,
+                'top10_searches': top_10,
+                'available_words': candidate_words  # 该base_word自己的候选词
+            })
+
+        # 写回结果
+        feature_result['组合评估结果_分组'] = grouped_results
+
+        total_searches = sum(len(g['top10_searches']) for g in grouped_results)
+        logger.info(f"  完成!共 {len(grouped_results)} 个base_word,{total_searches} 个搜索词")
+
+    # ========== 步骤4:执行搜索 ==========
+
+    def _execute_single_search(
+        self,
+        idx: int,
+        total: int,
+        search_word: str,
+        feature_ref: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        执行单个搜索任务(用于并发执行)
+
+        Args:
+            idx: 搜索索引
+            total: 总搜索数
+            search_word: 搜索词
+            feature_ref: 特征引用(用于写入结果)
+
+        Returns:
+            搜索结果信息
+        """
+        logger.info(f"[{idx}/{total}] 搜索: {search_word}")
+
+        try:
+            result = self.search_client.search(
+                keyword=search_word,
+                content_type='不限',
+                sort_type='综合',
+                max_retries=3,
+                use_cache=True  # 启用搜索缓存
+            )
+
+            note_count = len(result.get('data', {}).get('data', []))
+            logger.info(f"  ✓ 成功,获取 {note_count} 条帖子")
+
+            # 写入结果
+            feature_ref['search_result'] = result
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'success',
+                'note_count': note_count,
+                'search_params': {
+                    'keyword': search_word,
+                    'content_type': '图文',
+                    'sort_type': '综合'
+                }
+            }
+
+            return {'status': 'success', 'search_word': search_word, 'note_count': note_count}
+
+        except Exception as e:
+            logger.error(f"  ✗ 失败: {e}")
+            feature_ref['search_result'] = None
+            feature_ref['search_metadata'] = {
+                'searched_at': datetime.now().isoformat(),
+                'status': 'failed',
+                'note_count': 0,
+                'error': str(e)
+            }
+
+            return {'status': 'failed', 'search_word': search_word, 'error': str(e)}
+
+    def execute_search_queries(
+        self,
+        features_data: List[Dict[str, Any]],
+        search_delay: float = 2.0,
+        top_n: int = 10
+    ) -> List[Dict[str, Any]]:
+        """
+        步骤4:执行小红书搜索
+
+        Args:
+            features_data: 步骤3的数据
+            search_delay: 搜索延迟(保留参数,当前为并发执行,未使用)
+            top_n: 每个原始特征取评分最高的N个搜索词(保留参数,实际数量由max_searches_*限制控制)
+
+        Returns:
+            带搜索结果的数据
+        """
+        logger.info("=" * 60)
+        logger.info("步骤4:执行小红书搜索")
+        logger.info("=" * 60)
+
+        # 按原始特征分组收集搜索词(从搜索词生成结果中读取)
+        feature_search_groups = {}
+
+        for feature_result in features_data:
+            original_feature = feature_result['原始特征名称']
+
+            if original_feature not in feature_search_groups:
+                feature_search_groups[original_feature] = []
+
+            # 从搜索词生成结果中读取(新结构)
+            grouped_results = feature_result.get('组合评估结果_分组', [])
+
+            if grouped_results:
+                # 使用分组结构:每个base_word的top10都执行
+                for group in grouped_results:
+                    base_word = group.get('base_word', '')
+                    base_similarity = group.get('base_word_similarity', 0)
+
+                    base_word_searches = []
+                    for eval_item in group.get('top10_searches', []):
+                        sw = eval_item.get('search_word')
+                        if not sw:
+                            continue
+
+                        score = eval_item.get('score', 0.0)
+
+                        base_word_searches.append({
+                            'search_word': sw,
+                            'score': score,
+                            'base_word': base_word,
+                            'base_word_similarity': base_similarity,
+                            'feature_ref': eval_item  # 引用评估项,用于写入搜索结果
+                        })
+
+                    # 应用每个base_word的搜索次数限制
+                    if self.max_searches_per_base_word and len(base_word_searches) > self.max_searches_per_base_word:
+                        logger.info(f"  应用base_word限制: {base_word} 从 {len(base_word_searches)} 减少到 {self.max_searches_per_base_word}")
+                        base_word_searches = base_word_searches[:self.max_searches_per_base_word]
+
+                    feature_search_groups[original_feature].extend(base_word_searches)
+            else:
+                # 兼容旧结构(组合评估结果)
+                for eval_item in feature_result.get('组合评估结果', []):
+                    sw = eval_item.get('search_word')
+                    if not sw:
+                        continue
+
+                    score = eval_item.get('score', 0.0)
+
+                    feature_search_groups[original_feature].append({
+                        'search_word': sw,
+                        'score': score,
+                        'feature_ref': eval_item
+                    })
+
+            # 应用每个原始特征的搜索次数限制
+            if self.max_searches_per_feature and len(feature_search_groups[original_feature]) > self.max_searches_per_feature:
+                logger.info(f"  应用特征限制: {original_feature} 从 {len(feature_search_groups[original_feature])} 减少到 {self.max_searches_per_feature}")
+                feature_search_groups[original_feature] = feature_search_groups[original_feature][:self.max_searches_per_feature]
+
+        # 收集所有搜索任务(分组结构下执行所有base_word的top10,不再过滤)
+        all_searches = []
+        total_count = 0
+
+        for original_feature, search_list in feature_search_groups.items():
+            total_count += len(search_list)
+            all_searches.extend(search_list)
+
+            logger.info(f"  {original_feature}: {len(search_list)} 个搜索词")
+
+        # 应用全局搜索次数限制
+        if self.max_total_searches and len(all_searches) > self.max_total_searches:
+            logger.info(f"  应用全局限制:从 {len(all_searches)} 个减少到 {self.max_total_searches} 个")
+            all_searches = all_searches[:self.max_total_searches]
+
+        logger.info(f"\n共 {len(all_searches)} 个搜索任务")
+        logger.info(f"  并发执行搜索(并发数: {self.search_max_workers})")
+
+        # 使用ThreadPoolExecutor并发执行搜索
+        with ThreadPoolExecutor(max_workers=self.search_max_workers) as executor:
+            # 提交所有搜索任务
+            futures = []
+            for idx, item in enumerate(all_searches, 1):
+                future = executor.submit(
+                    self._execute_single_search,
+                    idx,
+                    len(all_searches),
+                    item['search_word'],
+                    item['feature_ref']
+                )
+                futures.append(future)
+
+            # 等待所有搜索完成
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                    # 结果已经写入feature_ref,无需额外处理
+                except Exception as e:
+                    logger.error(f"  搜索任务失败: {e}")
+
+        # 保存结果
+        output_path = os.path.join(self.output_dir, "search_results.json")
+        self._save_json(features_data, output_path)
+
+        logger.info(f"\n" + "=" * 60)
+        logger.info(f"步骤4完成")
+        logger.info("=" * 60)
+
+        return features_data
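
The three caps compose in a fixed order: per base_word inside each group, then per feature, then globally over the flattened task list. A toy illustration of that ordering:

per_base, per_feature, global_cap = 2, 3, 4  # hypothetical limits
groups = {"bw1": ["q1", "q2", "q3"], "bw2": ["q4", "q5"]}

feature_list = []
for bw, qs in groups.items():
    feature_list.extend(qs[:per_base])       # per-base_word cap -> q1, q2, q4, q5
feature_list = feature_list[:per_feature]    # per-feature cap   -> q1, q2, q4

all_searches = feature_list[:global_cap]     # global cap (4 > 3, no effect here)
print(all_searches)                          # ['q1', 'q2', 'q4']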
+
+    # ========== 步骤5:LLM评估搜索结果(两层过滤评估) ==========
+    # 注:旧的单层评估方法已移至 backup/unused_methods_from_enhanced_search_v2.py
+
+    def evaluate_search_results(
+        self,
+        features_data: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """
+        步骤5:用LLM评估搜索结果(使用两层过滤评估)
+
+        遍历所有搜索结果,使用两层评估机制:
+        1. 第一层:过滤与搜索Query无关的结果
+        2. 第二层:评估与目标特征的匹配度(0.8-1.0/0.6-0.79/0.5-0.59/≤0.4)
+
+        Args:
+            features_data: 阶段4的数据
+
+        Returns:
+            带评估结果的数据
+        """
+        logger.info("=" * 60)
+        logger.info("步骤5:LLM评估搜索结果(两层过滤评估)")
+        logger.info(f"  并发数: {self.evaluation_max_workers}")
+        logger.info(f"  每个搜索最多评估: {self.evaluation_max_notes_per_query} 个帖子")
+        logger.info("=" * 60)
+
+        # 收集所有需要评估的搜索项
+        search_items_to_evaluate = []
+
+        for feature_result in features_data:
+            original_feature = feature_result['原始特征名称']
+
+            # 从组合评估结果_分组中读取搜索结果
+            grouped_results = feature_result.get('组合评估结果_分组', [])
+
+            if grouped_results:
+                for group in grouped_results:
+                    for eval_item in group.get('top10_searches', []):
+                        # 检查是否有搜索结果
+                        if eval_item.get('search_result') and eval_item.get('search_metadata', {}).get('status') == 'success':
+                            search_items_to_evaluate.append({
+                                'original_feature': original_feature,
+                                'search_item': eval_item,
+                                'base_word': group.get('base_word', '')
+                            })
+            else:
+                # 兼容旧结构
+                for eval_item in feature_result.get('组合评估结果', []):
+                    if eval_item.get('search_result') and eval_item.get('search_metadata', {}).get('status') == 'success':
+                        search_items_to_evaluate.append({
+                            'original_feature': original_feature,
+                            'search_item': eval_item,
+                            'base_word': ''
+                        })
+
+        logger.info(f"共 {len(search_items_to_evaluate)} 个搜索结果需要评估")
+
+        # 并行评估所有搜索结果
+        with ThreadPoolExecutor(max_workers=self.evaluation_max_workers) as executor:
+            futures = []
+            for idx, item in enumerate(search_items_to_evaluate, 1):
+                future = executor.submit(
+                    self._evaluate_single_search_with_filter,
+                    idx,
+                    len(search_items_to_evaluate),
+                    item['original_feature'],
+                    item['search_item'],
+                    item['base_word']
+                )
+                futures.append((future, item))
+
+            # 收集结果
+            success_count = 0
+            failed_count = 0
+
+            for future, item in futures:
+                try:
+                    evaluation = future.result()
+                    item['search_item']['evaluation_with_filter'] = evaluation
+                    success_count += 1
+                except Exception as e:
+                    logger.error(f"  评估失败: {item['search_item'].get('search_word', 'unknown')}, 错误: {e}")
+                    item['search_item']['evaluation_with_filter'] = None
+                    failed_count += 1
+
+        logger.info(f"\n评估完成: 成功 {success_count}, 失败 {failed_count}")
+
+        # 保存结果
+        output_path = os.path.join(self.output_dir, "evaluated_results.json")
+        self._save_json(features_data, output_path)
+
+        logger.info(f"\n" + "=" * 60)
+        logger.info(f"步骤5完成")
+        logger.info("=" * 60)
+
+        return features_data
+
+    def _evaluate_single_search_with_filter(
+        self,
+        idx: int,
+        total: int,
+        original_feature: str,
+        search_item: Dict[str, Any],
+        base_word: str
+    ) -> Dict[str, Any]:
+        """
+        评估单个搜索结果(使用两层过滤)
+
+        Args:
+            idx: 索引
+            total: 总数
+            original_feature: 原始特征
+            search_item: 搜索项(包含search_word和search_result)
+            base_word: 基础词
+
+        Returns:
+            评估结果
+        """
+        search_word = search_item.get('search_word', '')
+        notes = search_item['search_result'].get('data', {}).get('data', [])
+
+        logger.info(f"[{idx}/{total}] 评估: {search_word} (帖子数: {len(notes)})")
+
+        # 调用LLM评估器的批量评估方法
+        evaluation = self.llm_evaluator.batch_evaluate_notes_with_filter(
+            search_query=search_word,
+            target_feature=original_feature,
+            notes=notes,
+            max_notes=self.evaluation_max_notes_per_query,
+            max_workers=self.evaluation_max_workers
+        )
+
+        # 统计信息
+        filtered_count = evaluation.get('filtered_count', 0)
+        evaluated_count = evaluation.get('evaluated_count', 0)
+        match_dist = evaluation.get('match_distribution', {})
+
+        logger.info(f"  ✓ 完成: 过滤 {filtered_count}, 评估 {evaluated_count}, "
+                   f"完全匹配 {match_dist.get('完全匹配(0.8-1.0)', 0)}, "
+                   f"相似匹配 {match_dist.get('相似匹配(0.6-0.79)', 0)}")
+
+        return evaluation
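
The bucketing behind match_distribution follows the score bands named in the docstring. A sketch (only the first two bucket labels appear in the log line above; the last two labels and all scores are assumptions):

scores = [0.92, 0.76, 0.55, 0.31]  # hypothetical per-note match scores, 0-1 scale
dist = {"完全匹配(0.8-1.0)": 0, "相似匹配(0.6-0.79)": 0,
        "部分匹配(0.5-0.59)": 0, "不匹配(≤0.4)": 0}
for s in scores:
    if s >= 0.8:
        dist["完全匹配(0.8-1.0)"] += 1
    elif s >= 0.6:
        dist["相似匹配(0.6-0.79)"] += 1
    elif s >= 0.5:
        dist["部分匹配(0.5-0.59)"] += 1
    else:
        dist["不匹配(≤0.4)"] += 1
print(dist)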
+
+    # ========== 主流程 ==========
+    # 注:旧的扩展搜索方法(extended_searches)已移至 backup/unused_methods_from_enhanced_search_v2.py
+
+    def run_full_pipeline(self):
+        """执行完整流程"""
+        logger.info("\n" + "=" * 60)
+        logger.info("开始执行完整流程")
+        logger.info("=" * 60)
+
+        try:
+            # 深度解构 Only 模式:只运行深度解构分析(从 结果评估 结果开始)
+            if self.deep_analysis_only:
+                logger.info("运行模式: 深度解构 Only (从 结果评估 结果开始)")
+                evaluation_path = os.path.join(self.output_dir, "evaluated_results.json")
+
+                if not os.path.exists(evaluation_path):
+                    raise FileNotFoundError(f"结果评估 结果不存在: {evaluation_path}")
+
+                with open(evaluation_path, 'r', encoding='utf-8') as f:
+                    evaluation_results = json.load(f)
+
+                deep_results = self.deep_analyzer.run(evaluation_results)
+                return deep_results
+
+            # 正常流程:从 特征筛选 开始
+            # 步骤1
+            filtered_features = self.filter_medium_similarity_features()
+
+            # 步骤2:从how文件提取候选词
+            candidates = self.extract_candidate_words(filtered_features)
+
+            # 步骤3:多词组合 + LLM评估
+            queries = self.generate_search_queries(
+                candidates,
+                max_workers=8,         # 提高并发从4到8
+                max_combo_length=3     # 降低组合长度从4到3
+            )
+
+            # 步骤4:执行搜索
+            search_results = self.execute_search_queries(queries, search_delay=2.0, top_n=self.top_n)
+
+            # 步骤5:LLM评估搜索结果 - 条件执行
+            if self.enable_evaluation:
+                evaluation_results = self.evaluate_search_results(search_results)
+            else:
+                evaluation_results = search_results
+                logger.info("\n" + "=" * 60)
+                logger.info("步骤5:跳过(未启用)")
+                logger.info("=" * 60)
+
+            # 深度解构分析 - 条件执行
+            if self.enable_deep_analysis:
+                deep_results = self.deep_analyzer.run(evaluation_results)
+                final_results = deep_results
+            else:
+                final_results = evaluation_results
+
+            # 相似度分析 - 条件执行
+            if self.enable_similarity_analysis and self.enable_deep_analysis:
+                logger.info("\n" + "=" * 60)
+                logger.info("步骤7:相似度分析(解构特征与原始特征)")
+                logger.info("=" * 60)
+
+                similarity_results = self.similarity_analyzer.run(
+                    deep_results,
+                    output_path=os.path.join(self.output_dir, "similarity_analysis_results.json")
+                )
+                final_results = similarity_results
+
+                logger.info("\n" + "=" * 60)
+                logger.info("步骤7完成")
+                logger.info("=" * 60)
+
+            logger.info("\n" + "=" * 60)
+            if self.enable_similarity_analysis and self.enable_deep_analysis:
+                logger.info("✓ 完整流程执行完成(完整流程+深度分析+相似度分析)")
+            elif self.enable_deep_analysis:
+                logger.info("✓ 完整流程执行完成(完整流程+深度分析)")
+            elif self.enable_evaluation:
+                logger.info("✓ 完整流程执行完成(完整流程)")
+            else:
+                logger.info("✓ 完整流程执行完成(基础流程)")
+            logger.info("=" * 60)
+
+            # 自动执行可视化
+            logger.info("\n" + "=" * 60)
+            logger.info("开始生成可视化...")
+            logger.info("=" * 60)
+
+            try:
+                # 使用统一的可视化脚本
+                viz_script = 'src/visualizers/search_results_visualizer.py'
+                logger.info(f"  使用可视化脚本: {viz_script}")
+
+                result = subprocess.run(
+                    ['python3', viz_script],
+                    capture_output=True,
+                    text=True,
+                    timeout=60
+                )
+
+                if result.returncode == 0:
+                    logger.info("✓ 可视化生成成功")
+                    logger.info(result.stdout)
+                else:
+                    logger.error(f"可视化生成失败: {result.stderr}")
+            except subprocess.TimeoutExpired:
+                logger.error("可视化生成超时")
+            except Exception as e:
+                logger.error(f"可视化生成异常: {e}")
+
+            return final_results
+
+        except Exception as e:
+            logger.error(f"流程执行失败: {e}")
+            raise
+
+
+def main():
+    """主函数"""
+    parser = argparse.ArgumentParser(description='增强搜索系统V2')
+    parser.add_argument(
+        '--how-json',
+        default='input/posts/690d977d0000000007036331_how.json',
+        help='How解构文件路径'
+    )
+    parser.add_argument(
+        '--api-key',
+        default=None,
+        help='OpenRouter API密钥(默认从环境变量读取)'
+    )
+    parser.add_argument(
+        '--output-dir',
+        default='output_v2',
+        help='输出目录'
+    )
+    parser.add_argument(
+        '--top-n',
+        type=int,
+        default=10,
+        help='每个原始特征取评分最高的N个搜索词(默认10)'
+    )
+    parser.add_argument(
+        '--max-total-searches',
+        type=int,
+        default=None,
+        help='全局最大搜索次数限制(默认None不限制)'
+    )
+    parser.add_argument(
+        '--search-workers',
+        type=int,
+        default=3,
+        help='搜索并发数(默认3)'
+    )
+    parser.add_argument(
+        '--max-searches-per-feature',
+        type=int,
+        default=None,
+        help='每个原始特征的最大搜索次数(默认None不限制)'
+    )
+    parser.add_argument(
+        '--max-searches-per-base-word',
+        type=int,
+        default=None,
+        help='每个base_word的最大搜索次数(默认None不限制)'
+    )
+    parser.add_argument(
+        '--enable-stage5',
+        action='store_true',
+        help='启用结果评估(默认False)'
+    )
+    parser.add_argument(
+        '--stage5-max-workers',
+        type=int,
+        default=10,
+        help='结果评估并发评估数(默认10)'
+    )
+    parser.add_argument(
+        '--stage5-max-notes',
+        type=int,
+        default=20,
+        help='每个搜索结果评估的最大帖子数(默认20)'
+    )
+    parser.add_argument(
+        '--enable-stage6',
+        action='store_true',
+        help='启用 深度解构分析'
+    )
+    parser.add_argument(
+        '--stage6-only',
+        action='store_true',
+        help='只运行 深度解构(从 结果评估 结果开始)'
+    )
+    parser.add_argument(
+        '--stage6-max-workers',
+        type=int,
+        default=5,
+        help='深度解构 并发数(默认5)'
+    )
+    parser.add_argument(
+        '--stage6-max-notes',
+        type=int,
+        default=None,
+        help='深度解构 最多处理多少个完全匹配的帖子(默认None不限制)'
+    )
+    parser.add_argument(
+        '--stage6-skip',
+        type=int,
+        default=0,
+        help='深度解构 跳过前 N 个完全匹配的帖子(默认0)'
+    )
+    parser.add_argument(
+        '--stage6-sort-by',
+        type=str,
+        choices=['score', 'time', 'engagement'],
+        default='score',
+        help='深度解构 排序方式: score(评分), time(时间), engagement(互动量)'
+    )
+    parser.add_argument(
+        '--stage6-api-url',
+        type=str,
+        default='http://192.168.245.150:7000/what/analysis/single',
+        help='深度解构 解构 API 地址'
+    )
+    parser.add_argument(
+        '--stage6-min-score',
+        type=float,
+        default=0.8,
+        help='深度解构 处理的最低分数阈值(默认0.8,0-1分制)'
+    )
+    parser.add_argument(
+        '--enable-stage8',
+        action='store_true',
+        help='启用相似度分析(默认False,需要先启用stage6)'
+    )
+    parser.add_argument(
+        '--stage8-weight-embedding',
+        type=float,
+        default=0.5,
+        help='相似度分析向量模型权重(默认0.5)'
+    )
+    parser.add_argument(
+        '--stage8-weight-semantic',
+        type=float,
+        default=0.5,
+        help='相似度分析LLM模型权重(默认0.5)'
+    )
+    parser.add_argument(
+        '--stage8-max-workers',
+        type=int,
+        default=5,
+        help='相似度分析并发数(默认5)'
+    )
+    parser.add_argument(
+        '--stage8-min-similarity',
+        type=float,
+        default=0.0,
+        help='相似度分析最小相似度阈值(默认0.0)'
+    )
+
+    args = parser.parse_args()
+
+    # 创建系统实例
+    system = EnhancedSearchV2(
+        how_json_path=args.how_json,
+        openrouter_api_key=args.api_key,
+        output_dir=args.output_dir,
+        top_n=args.top_n,
+        max_total_searches=args.max_total_searches,
+        search_max_workers=args.search_workers,
+        max_searches_per_feature=args.max_searches_per_feature,
+        max_searches_per_base_word=args.max_searches_per_base_word,
+        enable_evaluation=args.enable_stage5,
+        evaluation_max_workers=args.stage5_max_workers,
+        evaluation_max_notes_per_query=args.stage5_max_notes,
+        enable_deep_analysis=args.enable_stage6,
+        deep_analysis_only=args.stage6_only,
+        deep_analysis_max_workers=args.stage6_max_workers,
+        deep_analysis_max_notes=args.stage6_max_notes,
+        deep_analysis_skip_count=args.stage6_skip,
+        deep_analysis_sort_by=args.stage6_sort_by,
+        deep_analysis_api_url=args.stage6_api_url,
+        deep_analysis_min_score=args.stage6_min_score,
+        enable_similarity_analysis=args.enable_stage8,
+        similarity_weight_embedding=args.stage8_weight_embedding,
+        similarity_weight_semantic=args.stage8_weight_semantic,
+        similarity_max_workers=args.stage8_max_workers,
+        similarity_min_similarity=args.stage8_min_similarity
+    )
+
+    # 执行完整流程
+    system.run_full_pipeline()
+
+
+if __name__ == '__main__':
+    # 当作为主脚本运行时,添加项目根目录到Python路径
+    import sys
+    project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    if project_root not in sys.path:
+        sys.path.insert(0, project_root)
+
+    main()
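
For completeness, the equivalent programmatic entry point (mirroring the CLI above; per the --api-key help, the OpenRouter key falls back to an environment variable when omitted, and max_total_searches here is just a demo cap):

from src.pipeline.feature_search_pipeline import EnhancedSearchV2

system = EnhancedSearchV2(
    how_json_path="input/posts/690d977d0000000007036331_how.json",
    output_dir="output_v2",
    enable_evaluation=True,      # same as --enable-stage5
    enable_deep_analysis=True,   # same as --enable-stage6
    max_total_searches=20,       # cap searches for a dry run
)
system.run_full_pipeline()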

+ 0 - 0
src/visualizers/__init__.py


+ 1341 - 0
src/visualizers/cascade_search_visualizer.py

@@ -0,0 +1,1341 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+级联搜索结果可视化工具
+展示候选词 → Top3人设特征 → 搜索词 → 搜索结果的完整流程
+"""
+
+import json
+import os
+import sys
+from datetime import datetime
+from typing import List, Dict, Any, Set
+import webbrowser
+
+
+def load_json(file_path: str) -> Any:
+    """加载JSON文件"""
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def load_all_data(output_dir: str = "output_v2") -> Dict[str, Any]:
+    """
+    加载所有需要的数据文件
+
+    Returns:
+        包含所有数据的字典
+    """
+    print("正在加载数据文件...")
+
+    data = {
+        'filtered_features': load_json(os.path.join(output_dir, 'filtered_features.json')),
+        'candidate_words': load_json(os.path.join(output_dir, 'candidate_words.json')),
+        'search_queries': load_json(os.path.join(output_dir, 'search_queries_evaluated.json')),
+        'search_results': load_json(os.path.join(output_dir, 'search_results.json')),
+        'evaluated_results': load_json(os.path.join(output_dir, 'evaluated_results.json'))
+    }
+
+    # 尝试加载深度分析数据(可选)
+    deep_path = os.path.join(output_dir, 'deep_analysis_results.json')
+    similarity_path = os.path.join(output_dir, 'similarity_analysis_results.json')
+
+    if os.path.exists(deep_path):
+        deep_data = load_json(deep_path)
+        # 创建note_id到解构数据的映射
+        data['stage7_mapping'] = {}
+        for result in deep_data.get('results', []):
+            note_id = result.get('note_id')
+            if note_id:
+                data['stage7_mapping'][note_id] = result
+    else:
+        data['stage7_mapping'] = {}
+
+    if os.path.exists(similarity_path):
+        sim_data = load_json(similarity_path)
+        # 创建note_id到相似度数据的映射
+        data['stage8_mapping'] = {}
+        for result in sim_data.get('results', []):
+            note_id = result.get('note_id')
+            if note_id:
+                data['stage8_mapping'][note_id] = result
+    else:
+        data['stage8_mapping'] = {}
+
+    print(f"  ✓ 已加载 {len(data['filtered_features'])} 个原始特征")
+    print(f"  ✓ 已加载 {len(data['candidate_words'])} 个候选词数据")
+    print(f"  ✓ 已加载解构数据: {len(data['stage7_mapping'])} 个帖子")
+    print(f"  ✓ 已加载相似度数据: {len(data['stage8_mapping'])} 个帖子")
+
+    return data
+
+
+def extract_global_candidates(data: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    提取全局候选词并按相似度分类
+
+    Returns:
+        {
+            'matched': [...],      # 相似度 >= 0.8
+            'partial': [...],      # 0.5 <= 相似度 < 0.8
+            'unmatched': [...]     # 相似度 < 0.5
+        }
+    """
+    print("\n提取全局候选词...")
+
+    candidates_map = {}  # 用于去重
+
+    # 遍历所有特征的候选词
+    for feature_data in data['candidate_words']:
+        candidates_by_base = feature_data.get('高相似度候选_按base_word', {})
+
+        for base_word, candidates in candidates_by_base.items():
+            for cand in candidates:
+                cand_name = cand.get('候选词', '')
+                if not cand_name:
+                    continue
+
+                # Read the similarity score
+                similarity = cand.get('相似度', 0)
+
+                # For post candidates, use the highest persona similarity field instead
+                if cand.get('候选词类型') == 'post':
+                    similarity = cand.get('点最高人设相似度', similarity)
+
+                # Deduplicate: keep the highest similarity
+                if cand_name not in candidates_map or similarity > candidates_map[cand_name]['相似度']:
+                    candidates_map[cand_name] = {
+                        '名称': cand_name,
+                        '类型': cand.get('候选词类型', 'unknown'),
+                        '相似度': similarity,
+                        '特征类型': cand.get('特征类型', ''),
+                        '来源路径': cand.get('来源路径', ''),
+                        '匹配说明': cand.get('匹配说明', '')
+                    }
+
+    # Bucket by similarity
+    result = {
+        'matched': [],     # >= 0.8
+        'partial': [],     # 0.5 ~ 0.8
+        'unmatched': []    # < 0.5
+    }
+
+    for cand in candidates_map.values():
+        similarity = cand['相似度']
+        if similarity >= 0.8:
+            result['matched'].append(cand)
+        elif similarity >= 0.5:
+            result['partial'].append(cand)
+        else:
+            result['unmatched'].append(cand)
+
+    # Sort each bucket by similarity, descending
+    for category in result.values():
+        category.sort(key=lambda x: x['相似度'], reverse=True)
+
+    print(f"  ✓ 已匹配: {len(result['matched'])} 个")
+    print(f"  ✓ 部分匹配: {len(result['partial'])} 个")
+    print(f"  ✓ 不匹配: {len(result['unmatched'])} 个")
+
+    return result
+
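A worked example of the bucketing with a minimal, hypothetical input shaped like the fields the function reads:

    sample = {'candidate_words': [{
        '高相似度候选_按base_word': {
            '露营': [
                {'候选词': '野餐', '候选词类型': 'persona', '相似度': 0.85},
                {'候选词': '徒步', '候选词类型': 'post', '相似度': 0.30,
                 '点最高人设相似度': 0.65},
                {'候选词': '滑雪', '候选词类型': 'persona', '相似度': 0.20},
            ]
        }
    }]}
    buckets = extract_global_candidates(sample)
    # '野餐' -> matched   (0.85 >= 0.8)
    # '徒步' -> partial   (post candidates use 点最高人设相似度 = 0.65)
    # '滑雪' -> unmatched (0.20 < 0.5)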
+
+def render_left_candidates_html(global_candidates: Dict[str, List[Dict[str, Any]]]) -> str:
+    """
+    渲染左侧固定候选词区域HTML
+
+    Args:
+        global_candidates: 分类后的全局候选词
+
+    Returns:
+        HTML字符串
+    """
+    html_parts = []
+
+    html_parts.append('''
+    <div class="left-candidates-panel">
+        <div class="candidates-header">
+            <div class="candidates-title">📚 可用候选词</div>
+            <div class="candidates-hint">此区域固定展示 不随滚动</div>
+        </div>
+        <div class="candidates-content">
+    ''')
+
+    # Matched section
+    html_parts.append('''
+            <div class="candidates-section matched-section">
+                <div class="section-title">✅ 已匹配 <span class="section-count">({count})</span></div>
+                <div class="section-hint">与人设相似度 ≥ 0.8</div>
+                <div class="candidates-list">
+    '''.format(count=len(global_candidates['matched'])))
+
+    for cand in global_candidates['matched']:
+        icon = '📝' if cand['类型'] == 'post' else '👤'
+        type_label = '帖子' if cand['类型'] == 'post' else '人设'
+        html_parts.append(f'''
+                    <div class="candidate-item matched">
+                        <div class="candidate-icon">{icon}</div>
+                        <div class="candidate-info">
+                            <div class="candidate-name">{cand['名称']}</div>
+                            <div class="candidate-meta">
+                                <span class="candidate-type">{type_label}</span>
+                                <span class="candidate-similarity">{cand['相似度']:.2f}</span>
+                            </div>
+                        </div>
+                    </div>
+        ''')
+
+    html_parts.append('''
+                </div>
+            </div>
+    ''')
+
+    # Partially matched section
+    html_parts.append('''
+            <div class="candidates-section partial-section">
+                <div class="section-title">🟡 部分匹配 <span class="section-count">({count})</span></div>
+                <div class="section-hint">与人设特征相似度 0.5-0.8</div>
+                <div class="candidates-list">
+    '''.format(count=len(global_candidates['partial'])))
+
+    for cand in global_candidates['partial']:
+        icon = '📝' if cand['类型'] == 'post' else '👤'
+        type_label = '帖子' if cand['类型'] == 'post' else '人设'
+        html_parts.append(f'''
+                    <div class="candidate-item partial">
+                        <div class="candidate-icon">{icon}</div>
+                        <div class="candidate-info">
+                            <div class="candidate-name">{cand['名称']}</div>
+                            <div class="candidate-meta">
+                                <span class="candidate-type">{type_label}</span>
+                                <span class="candidate-similarity">{cand['相似度']:.2f}</span>
+                            </div>
+                        </div>
+                    </div>
+        ''')
+
+    html_parts.append('''
+                </div>
+            </div>
+    ''')
+
+    # Unmatched section
+    html_parts.append('''
+            <div class="candidates-section unmatched-section">
+                <div class="section-title">❌ 不匹配 <span class="section-count">({count})</span></div>
+                <div class="section-hint">与人设特征相似度 < 0.5</div>
+                <div class="candidates-list">
+    '''.format(count=len(global_candidates['unmatched'])))
+
+    for cand in global_candidates['unmatched']:
+        icon = '📝' if cand['类型'] == 'post' else '👤'
+        type_label = '帖子' if cand['类型'] == 'post' else '人设'
+        html_parts.append(f'''
+                    <div class="candidate-item unmatched">
+                        <div class="candidate-icon">{icon}</div>
+                        <div class="candidate-info">
+                            <div class="candidate-name">{cand['名称']}</div>
+                            <div class="candidate-meta">
+                                <span class="candidate-type">{type_label}</span>
+                                <span class="candidate-similarity">{cand['相似度']:.2f}</span>
+                            </div>
+                        </div>
+                    </div>
+        ''')
+
+    html_parts.append('''
+                </div>
+            </div>
+        </div>
+    </div>
+    ''')
+
+    return ''.join(html_parts)
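The matched/partial/unmatched sections above repeat the same markup three times; a hedged refactor sketch that renders one section from its parameters (render_candidate_section is a hypothetical helper, not in the original):

    def render_candidate_section(title: str, hint: str, css_class: str, items: list) -> str:
        parts = [
            f'<div class="candidates-section {css_class}-section">',
            f'<div class="section-title">{title} <span class="section-count">({len(items)})</span></div>',
            f'<div class="section-hint">{hint}</div>',
            '<div class="candidates-list">',
        ]
        for cand in items:
            icon = '📝' if cand['类型'] == 'post' else '👤'
            label = '帖子' if cand['类型'] == 'post' else '人设'
            parts.append(
                f'<div class="candidate-item {css_class}">'
                f'<div class="candidate-icon">{icon}</div>'
                f'<div class="candidate-info">'
                f'<div class="candidate-name">{cand["名称"]}</div>'
                f'<div class="candidate-meta">'
                f'<span class="candidate-type">{label}</span>'
                f'<span class="candidate-similarity">{cand["相似度"]:.2f}</span>'
                f'</div></div></div>'
            )
        parts.append('</div></div>')
        return ''.join(parts)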
+
+
+def render_cascade_flow_html(data: Dict[str, Any]) -> str:
+    """
+    渲染中间级联流程HTML(三层结构)
+
+    Returns:
+        HTML字符串
+    """
+    html_parts = []
+
+    html_parts.append('''
+    <div class="cascade-flow-panel">
+        <div class="cascade-header">
+            <div class="cascade-title">🔄 级联搜索流程</div>
+        </div>
+        <div class="cascade-content" id="cascadeContent">
+    ''')
+
+    # Show the first feature's cascade flow by default
+    if data['evaluated_results']:
+        first_feature = data['evaluated_results'][0]
+        html_parts.append(render_single_cascade(first_feature, 0, data))
+
+    html_parts.append('''
+        </div>
+    </div>
+    ''')
+
+    return ''.join(html_parts)
+
+
+def render_single_cascade(feature_data: Dict[str, Any], feature_idx: int, data: Dict[str, Any]) -> str:
+    """
+    渲染单个特征的级联流程
+
+    Args:
+        feature_data: 特征数据
+        feature_idx: 特征索引
+        data: 全部数据
+
+    Returns:
+        HTML字符串
+    """
+    html_parts = []
+
+    original_feature = feature_data.get('原始特征名称', '')
+    top3_matches = feature_data.get('top3匹配信息', [])
+    groups = feature_data.get('组合评估结果_分组', [])
+
+    # Layer 1: original feature
+    html_parts.append(f'''
+        <div class="cascade-layer layer-1">
+            <div class="layer-title">📌 帖子选题点</div>
+            <div class="feature-selector">
+                <div class="selected-feature">
+                    <div class="feature-name">{original_feature}</div>
+                    <div class="feature-actions">
+                        <button class="switch-feature-btn" onclick="showFeatureSelector()">切换特征</button>
+                    </div>
+                </div>
+            </div>
+        </div>
+    ''')
+
+    # Layer 2: Top3 persona features
+    html_parts.append('''
+        <div class="cascade-arrow">↓</div>
+        <div class="cascade-layer layer-2">
+            <div class="layer-title">🎯 Top1各 相似度(x)</div>
+            <div class="top3-container">
+    ''')
+
+    for idx, match in enumerate(top3_matches[:3], 1):
+        base_word = match.get('人设特征名称', '')
+        similarity = match.get('相似度', 0)
+        is_top1 = (idx == 1)
+        card_class = 'top3-card top1-card' if is_top1 else 'top3-card'
+
+        html_parts.append(f'''
+                <div class="{card_class}" data-feature-idx="{feature_idx}" data-match-idx="{idx-1}" onclick="selectBaseWord({feature_idx}, {idx-1})">
+                    <div class="top3-rank">Top{idx}</div>
+                    <div class="top3-name">{base_word}</div>
+                    <div class="top3-similarity">相似度: {similarity:.2f}</div>
+                </div>
+        ''')
+
+    html_parts.append('''
+            </div>
+        </div>
+    ''')
+
+    # Layer 3: search words (Top1 group expanded by default)
+    if groups:
+        html_parts.append('''
+            <div class="cascade-arrow">↓</div>
+            <div class="cascade-layer layer-3">
+                <div class="layer-title">🔍 搜索词生成</div>
+                <div class="search-words-container" id="searchWordsContainer">
+        ''')
+
+        # Show the first group (Top1) by default
+        html_parts.append(render_search_words_group(groups[0], feature_idx, 0))
+
+        html_parts.append('''
+                </div>
+            </div>
+        ''')
+
+    return ''.join(html_parts)
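Note that only groups[0] is rendered into the layer-3 container, so the Top2/Top3 cards have no pre-rendered search-word lists to swap in (selectBaseWord in the generated JavaScript only logs a TODO). One server-side option, sketched here under the assumption that a `hidden` CSS class is added to the stylesheet, is to render every group and toggle visibility client-side:

    for g_idx, group in enumerate(groups):
        hidden = '' if g_idx == 0 else ' hidden'
        html_parts.append(f'<div class="group-wrapper{hidden}" id="group-{feature_idx}-{g_idx}">')
        html_parts.append(render_search_words_group(group, feature_idx, g_idx))
        html_parts.append('</div>')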
+
+
+def render_search_words_group(group: Dict[str, Any], feature_idx: int, group_idx: int) -> str:
+    """
+    渲染搜索词组
+
+    Args:
+        group: 搜索词组数据
+        feature_idx: 特征索引
+        group_idx: 组索引
+
+    Returns:
+        HTML字符串
+    """
+    html_parts = []
+
+    base_word = group.get('base_word', '')
+    searches = group.get('top10_searches', [])
+    available_words = group.get('available_words', [])
+
+    html_parts.append(f'''
+        <div class="search-words-group" data-base-word="{base_word}">
+            <div class="base-word-label">中心词: <span class="base-word-value">{base_word}</span></div>
+    ''')
+
+    # Render each search word
+    for sw_idx, search in enumerate(searches):
+        html_parts.append(render_search_word_card(search, feature_idx, group_idx, sw_idx, available_words))
+
+    html_parts.append('''
+        </div>
+    ''')
+
+    return ''.join(html_parts)
+
+
+def render_search_word_card(search: Dict[str, Any], feature_idx: int, group_idx: int, sw_idx: int, available_words: List) -> str:
+    """
+    渲染单个搜索词卡片
+
+    Args:
+        search: 搜索词数据
+        feature_idx, group_idx, sw_idx: 索引
+        available_words: 可用候选词列表
+
+    Returns:
+        HTML字符串
+    """
+    search_word = search.get('search_word', '')
+    score = search.get('score', 0)
+    reasoning = search.get('reasoning', '')
+    has_result = search.get('search_result') is not None
+
+    # Check whether the search has been executed
+    status_icon = '✅' if has_result else '⏸️'
+    status_text = '已搜索' if has_result else '未搜索'
+    status_class = 'searched' if has_result else 'not-searched'
+
+    # Show candidate words (at most the first 10)
+    cand_names = [w.get('候选词', '') if isinstance(w, dict) else w for w in available_words[:10]]
+    cand_display = ', '.join(cand_names) if cand_names else '无'
+
+    html = f'''
+        <div class="search-word-card {status_class}" data-feature-idx="{feature_idx}" data-group-idx="{group_idx}" data-sw-idx="{sw_idx}" onclick="selectSearchWord({feature_idx}, {group_idx}, {sw_idx})">
+            <div class="sw-header">
+                <div class="sw-status">{status_icon} {status_text}</div>
+                <div class="sw-rank">#{sw_idx + 1}</div>
+            </div>
+
+            <div class="sw-candidates-pool">
+                <div class="sw-label">可用候选词池:</div>
+                <div class="sw-candidates">{cand_display}</div>
+            </div>
+
+            <div class="sw-arrow-container">
+                <div class="sw-arrow">
+                    <span class="arrow-line">→</span>
+                    <span class="arrow-score">score: {score:.2f}</span>
+                </div>
+            </div>
+
+            <div class="sw-result">
+                <div class="sw-query">{search_word}</div>
+            </div>
+
+            <div class="sw-reasoning">
+                <div class="reasoning-label">💡 LLM推理理由:</div>
+                <div class="reasoning-content">{reasoning}</div>
+            </div>
+        </div>
+    '''
+
+    return html
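search_word, reasoning, and the candidate names are interpolated into the markup unescaped, so any <, >, or quote characters in the source JSON would break the card. A minimal guard using the standard library (a sketch, not in the original):

    from html import escape

    search_word = escape(search.get('search_word', ''))
    reasoning = escape(search.get('reasoning', ''))
    cand_display = escape(', '.join(cand_names)) if cand_names else '无'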
+
+
+def generate_html(data: Dict[str, Any], global_candidates: Dict[str, List[Dict[str, Any]]]) -> str:
+    """
+    生成完整HTML页面
+
+    Args:
+        data: 所有数据
+        global_candidates: 全局候选词
+
+    Returns:
+        完整HTML字符串
+    """
+    print("\n正在生成HTML...")
+
+    # Prepare the data as JSON
+    data_json = json.dumps(data['evaluated_results'], ensure_ascii=False)
+    stage7_json = json.dumps(data['stage7_mapping'], ensure_ascii=False)
+    stage8_json = json.dumps(data['stage8_mapping'], ensure_ascii=False)
+
+    # Generate each HTML section
+    left_html = render_left_candidates_html(global_candidates)
+    cascade_html = render_cascade_flow_html(data)
+
+    # Assemble the full HTML
+    html_template = f'''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>级联搜索结果可视化</title>
+    <style>
+        {get_css_styles()}
+    </style>
+</head>
+<body>
+    <div class="page-header">
+        <div class="header-title">🔍 级联搜索结果可视化系统</div>
+        <div class="header-subtitle">候选词 → Top3人设特征 → 搜索词 → 搜索结果</div>
+    </div>
+
+    <div class="main-layout">
+        <!-- Left: candidate-word library -->
+        {left_html}
+
+        <!-- Middle: cascade flow -->
+        {cascade_html}
+
+        <!-- Right: search results -->
+        <div class="right-results-panel">
+            <div class="results-header">
+                <div class="results-title">📝 搜索结果卡片</div>
+                <div class="results-subtitle" id="resultsSubtitle">请选择一个搜索词查看结果</div>
+            </div>
+            <div class="results-content" id="resultsContent">
+                <div class="empty-results">
+                    <div class="empty-icon">🔍</div>
+                    <div class="empty-text">选择搜索词后,这里将显示对应的搜索结果</div>
+                </div>
+            </div>
+        </div>
+    </div>
+
+    <!-- Feature selector modal -->
+    <div class="modal-overlay" id="featureSelectorModal">
+        <div class="modal-window">
+            <div class="modal-header">
+                <div class="modal-title">选择原始特征</div>
+                <button class="modal-close-btn" onclick="closeFeatureSelector()">×</button>
+            </div>
+            <div class="modal-body">
+                <div class="feature-list" id="featureList"></div>
+            </div>
+        </div>
+    </div>
+
+    <script>
+        // Data
+        const allData = {data_json};
+        const stage7Data = {stage7_json};
+        const stage8Data = {stage8_json};
+        let currentFeatureIdx = 0;
+        let currentGroupIdx = 0;
+        let currentSwIdx = 0;
+
+        {get_javascript_code()}
+    </script>
+</body>
+</html>
+'''
+
+    print("  ✓ HTML生成完成")
+    return html_template
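One caveat: data_json, stage7_json, and stage8_json are inlined into a <script> block, so a literal '</script>' inside any JSON string would terminate the block early. A common hardening step (a sketch; escaping '</' keeps the payload a valid JavaScript literal):

    data_json = json.dumps(data['evaluated_results'], ensure_ascii=False).replace('</', '<\\/')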
+
+
+def get_css_styles() -> str:
+    """获取CSS样式"""
+    return '''
+        * {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: #f5f7fa;
+            color: #333;
+            overflow-x: hidden;
+        }
+
+        /* Page header */
+        .page-header {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 20px;
+            text-align: center;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        }
+
+        .header-title {
+            font-size: 24px;
+            font-weight: bold;
+            margin-bottom: 5px;
+        }
+
+        .header-subtitle {
+            font-size: 14px;
+            opacity: 0.9;
+        }
+
+        /* Main layout - three columns */
+        .main-layout {
+            display: flex;
+            gap: 20px;
+            padding: 20px;
+            height: calc(100vh - 100px);
+        }
+
+        /* Left candidate panel - fixed */
+        .left-candidates-panel {
+            width: 280px;
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            position: sticky;
+            top: 20px;
+            height: fit-content;
+            max-height: calc(100vh - 140px);
+            display: flex;
+            flex-direction: column;
+        }
+
+        .candidates-header {
+            padding: 15px;
+            border-bottom: 2px solid #e5e7eb;
+        }
+
+        .candidates-title {
+            font-size: 16px;
+            font-weight: 600;
+            color: #374151;
+            margin-bottom: 5px;
+        }
+
+        .candidates-hint {
+            font-size: 11px;
+            color: #ef4444;
+            font-weight: 500;
+        }
+
+        .candidates-content {
+            flex: 1;
+            overflow-y: auto;
+            padding: 10px;
+        }
+
+        .candidates-section {
+            margin-bottom: 15px;
+        }
+
+        .section-title {
+            font-size: 13px;
+            font-weight: 600;
+            margin-bottom: 5px;
+            display: flex;
+            align-items: center;
+            gap: 5px;
+        }
+
+        .section-count {
+            color: #6b7280;
+            font-size: 12px;
+        }
+
+        .section-hint {
+            font-size: 11px;
+            color: #6b7280;
+            margin-bottom: 8px;
+        }
+
+        .candidates-list {
+            display: flex;
+            flex-direction: column;
+            gap: 6px;
+        }
+
+        .candidate-item {
+            display: flex;
+            align-items: center;
+            gap: 8px;
+            padding: 8px;
+            border-radius: 6px;
+            border: 1px solid #e5e7eb;
+            transition: all 0.2s;
+            cursor: pointer;
+        }
+
+        .candidate-item:hover {
+            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+            transform: translateY(-1px);
+        }
+
+        .candidate-item.matched {
+            background: #f0fdf4;
+            border-color: #86efac;
+        }
+
+        .candidate-item.partial {
+            background: #fffbeb;
+            border-color: #fcd34d;
+        }
+
+        .candidate-item.unmatched {
+            background: #fef2f2;
+            border-color: #fca5a5;
+        }
+
+        .candidate-icon {
+            font-size: 18px;
+            flex-shrink: 0;
+        }
+
+        .candidate-info {
+            flex: 1;
+            min-width: 0;
+        }
+
+        .candidate-name {
+            font-size: 12px;
+            font-weight: 500;
+            color: #374151;
+            white-space: nowrap;
+            overflow: hidden;
+            text-overflow: ellipsis;
+        }
+
+        .candidate-meta {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            margin-top: 2px;
+        }
+
+        .candidate-type {
+            font-size: 10px;
+            color: #6b7280;
+        }
+
+        .candidate-similarity {
+            font-size: 10px;
+            font-weight: 600;
+            color: #10b981;
+        }
+
+        /* Middle cascade-flow panel */
+        .cascade-flow-panel {
+            flex: 1;
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            overflow-y: auto;
+            padding: 20px;
+        }
+
+        .cascade-header {
+            margin-bottom: 20px;
+            padding-bottom: 10px;
+            border-bottom: 2px solid #e5e7eb;
+        }
+
+        .cascade-title {
+            font-size: 18px;
+            font-weight: 600;
+            color: #374151;
+        }
+
+        .cascade-content {
+            display: flex;
+            flex-direction: column;
+            gap: 15px;
+        }
+
+        .cascade-layer {
+            background: #f9fafb;
+            border-radius: 8px;
+            padding: 15px;
+        }
+
+        .layer-title {
+            font-size: 14px;
+            font-weight: 600;
+            color: #6b7280;
+            margin-bottom: 10px;
+        }
+
+        /* Layer 1: feature selector */
+        .selected-feature {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            padding: 12px;
+            background: white;
+            border-radius: 6px;
+            border: 2px solid #667eea;
+        }
+
+        .feature-name {
+            font-size: 15px;
+            font-weight: 600;
+            color: #374151;
+        }
+
+        .switch-feature-btn {
+            padding: 6px 12px;
+            background: #667eea;
+            color: white;
+            border: none;
+            border-radius: 4px;
+            cursor: pointer;
+            font-size: 12px;
+            transition: all 0.2s;
+        }
+
+        .switch-feature-btn:hover {
+            background: #5568d3;
+        }
+
+        /* Layer 2: Top3 cards */
+        .top3-container {
+            display: flex;
+            gap: 10px;
+        }
+
+        .top3-card {
+            flex: 1;
+            padding: 12px;
+            background: white;
+            border-radius: 6px;
+            border: 2px solid #e5e7eb;
+            cursor: pointer;
+            transition: all 0.2s;
+        }
+
+        .top3-card:hover {
+            border-color: #667eea;
+            box-shadow: 0 2px 6px rgba(102, 126, 234, 0.2);
+        }
+
+        .top3-card.top1-card {
+            border-color: #10b981;
+            background: #f0fdf4;
+        }
+
+        .top3-card.top1-card:hover {
+            border-color: #059669;
+        }
+
+        .top3-card.selected {
+            border-color: #667eea;
+            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.2);
+        }
+
+        .top3-rank {
+            font-size: 11px;
+            font-weight: 600;
+            color: #6b7280;
+            margin-bottom: 4px;
+        }
+
+        .top3-name {
+            font-size: 14px;
+            font-weight: 600;
+            color: #374151;
+            margin-bottom: 4px;
+        }
+
+        .top3-similarity {
+            font-size: 12px;
+            color: #10b981;
+        }
+
+        /* Cascade arrow */
+        .cascade-arrow {
+            text-align: center;
+            font-size: 24px;
+            color: #667eea;
+            margin: 5px 0;
+        }
+
+        /* Layer 3: search words */
+        .base-word-label {
+            font-size: 13px;
+            color: #6b7280;
+            margin-bottom: 12px;
+        }
+
+        .base-word-value {
+            font-weight: 600;
+            color: #10b981;
+        }
+
+        .search-word-card {
+            background: white;
+            border-radius: 8px;
+            border: 2px solid #e5e7eb;
+            padding: 15px;
+            margin-bottom: 12px;
+            cursor: pointer;
+            transition: all 0.2s;
+        }
+
+        .search-word-card:hover {
+            border-color: #667eea;
+            box-shadow: 0 2px 6px rgba(0,0,0,0.1);
+        }
+
+        .search-word-card.searched {
+            border-color: #10b981;
+        }
+
+        .search-word-card.selected {
+            border-color: #667eea;
+            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.2);
+        }
+
+        .sw-header {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            margin-bottom: 10px;
+        }
+
+        .sw-status {
+            font-size: 12px;
+            font-weight: 600;
+            color: #10b981;
+        }
+
+        .sw-rank {
+            font-size: 11px;
+            color: #6b7280;
+        }
+
+        .sw-candidates-pool {
+            margin-bottom: 10px;
+        }
+
+        .sw-label {
+            font-size: 11px;
+            color: #6b7280;
+            margin-bottom: 4px;
+        }
+
+        .sw-candidates {
+            font-size: 12px;
+            color: #374151;
+            background: #f9fafb;
+            padding: 6px;
+            border-radius: 4px;
+        }
+
+        .sw-arrow-container {
+            text-align: center;
+            margin: 10px 0;
+        }
+
+        .sw-arrow {
+            display: inline-flex;
+            align-items: center;
+            gap: 8px;
+        }
+
+        .arrow-line {
+            font-size: 20px;
+            color: #667eea;
+        }
+
+        .arrow-score {
+            font-size: 12px;
+            font-weight: 600;
+            color: #667eea;
+            background: #ede9fe;
+            padding: 2px 8px;
+            border-radius: 4px;
+        }
+
+        .sw-result {
+            text-align: center;
+            margin-bottom: 10px;
+        }
+
+        .sw-query {
+            font-size: 16px;
+            font-weight: 600;
+            color: #374151;
+            background: #f0fdf4;
+            padding: 8px;
+            border-radius: 6px;
+            border: 1px solid #86efac;
+        }
+
+        .sw-reasoning {
+            background: #fffbeb;
+            padding: 10px;
+            border-radius: 6px;
+            border: 1px solid #fcd34d;
+        }
+
+        .reasoning-label {
+            font-size: 12px;
+            font-weight: 600;
+            color: #374151;
+            margin-bottom: 4px;
+        }
+
+        .reasoning-content {
+            font-size: 12px;
+            color: #6b7280;
+            line-height: 1.5;
+        }
+
+        /* Right results panel */
+        .right-results-panel {
+            width: 500px;
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            overflow-y: auto;
+            display: flex;
+            flex-direction: column;
+        }
+
+        .results-header {
+            padding: 15px;
+            border-bottom: 2px solid #e5e7eb;
+        }
+
+        .results-title {
+            font-size: 16px;
+            font-weight: 600;
+            color: #374151;
+            margin-bottom: 5px;
+        }
+
+        .results-subtitle {
+            font-size: 12px;
+            color: #6b7280;
+        }
+
+        .results-content {
+            flex: 1;
+            padding: 15px;
+        }
+
+        .empty-results {
+            text-align: center;
+            padding: 60px 20px;
+        }
+
+        .empty-icon {
+            font-size: 48px;
+            margin-bottom: 15px;
+        }
+
+        .empty-text {
+            font-size: 14px;
+            color: #6b7280;
+        }
+
+        /* Modal */
+        .modal-overlay {
+            display: none;
+            position: fixed;
+            top: 0;
+            left: 0;
+            right: 0;
+            bottom: 0;
+            background: rgba(0,0,0,0.5);
+            z-index: 1000;
+            align-items: center;
+            justify-content: center;
+        }
+
+        .modal-overlay.active {
+            display: flex;
+        }
+
+        .modal-window {
+            background: white;
+            border-radius: 12px;
+            box-shadow: 0 10px 40px rgba(0,0,0,0.2);
+            max-width: 600px;
+            width: 90%;
+            max-height: 80vh;
+            display: flex;
+            flex-direction: column;
+        }
+
+        .modal-header {
+            padding: 20px;
+            border-bottom: 1px solid #e5e7eb;
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+        }
+
+        .modal-title {
+            font-size: 18px;
+            font-weight: 600;
+            color: #374151;
+        }
+
+        .modal-close-btn {
+            background: none;
+            border: none;
+            font-size: 28px;
+            color: #6b7280;
+            cursor: pointer;
+            padding: 0;
+            width: 32px;
+            height: 32px;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            border-radius: 4px;
+        }
+
+        .modal-close-btn:hover {
+            background: #f3f4f6;
+        }
+
+        .modal-body {
+            flex: 1;
+            overflow-y: auto;
+            padding: 20px;
+        }
+
+        .feature-list {
+            display: flex;
+            flex-direction: column;
+            gap: 10px;
+        }
+
+        .feature-list-item {
+            padding: 12px;
+            background: #f9fafb;
+            border-radius: 6px;
+            border: 2px solid #e5e7eb;
+            cursor: pointer;
+            transition: all 0.2s;
+        }
+
+        .feature-list-item:hover {
+            border-color: #667eea;
+            background: white;
+        }
+
+        .feature-list-item.active {
+            border-color: #10b981;
+            background: #f0fdf4;
+        }
+    '''
+
+
+def get_javascript_code() -> str:
+    """获取JavaScript代码"""
+    return '''
+        // Initialization
+        document.addEventListener('DOMContentLoaded', function() {
+            console.log('页面加载完成');
+            renderFeatureList();
+        });
+
+        // Show the feature selector
+        function showFeatureSelector() {
+            const modal = document.getElementById('featureSelectorModal');
+            modal.classList.add('active');
+        }
+
+        // Close the feature selector
+        function closeFeatureSelector() {
+            const modal = document.getElementById('featureSelectorModal');
+            modal.classList.remove('active');
+        }
+
+        // Render the feature list
+        function renderFeatureList() {
+            const listEl = document.getElementById('featureList');
+            let html = '';
+
+            allData.forEach((feature, idx) => {
+                const name = feature['原始特征名称'];
+                const isActive = idx === currentFeatureIdx;
+                const activeClass = isActive ? 'active' : '';
+
+                html += `
+                    <div class="feature-list-item ${activeClass}" onclick="selectFeature(${idx})">
+                        ${name}
+                    </div>
+                `;
+            });
+
+            listEl.innerHTML = html;
+        }
+
+        // Select a feature
+        function selectFeature(featureIdx) {
+            currentFeatureIdx = featureIdx;
+            currentGroupIdx = 0;
+            currentSwIdx = 0;
+
+            closeFeatureSelector();
+            updateCascadeView();
+            renderFeatureList();
+        }
+
+        // Update the cascade view
+        function updateCascadeView() {
+            // Simplified version: the page is rendered server-side, so the
+            // cascade flow is not rebuilt client-side here. Reloading resets
+            // the page to the first feature; a full implementation would
+            // re-render the DOM for allData[currentFeatureIdx] instead.
+            location.reload();
+        }
+
+        // Select a base_word
+        function selectBaseWord(featureIdx, matchIdx) {
+            currentFeatureIdx = featureIdx;
+            currentGroupIdx = matchIdx;
+            currentSwIdx = 0;
+
+            // Clear all selected states
+            document.querySelectorAll('.top3-card').forEach(card => {
+                card.classList.remove('selected');
+            });
+
+            // Mark the clicked card as selected ('event' is the implicit window.event from the inline onclick handler)
+            event.target.closest('.top3-card').classList.add('selected');
+
+            // Update the search-word display
+            const feature = allData[currentFeatureIdx];
+            const groups = feature['组合评估结果_分组'] || [];
+            if (groups[currentGroupIdx]) {
+                // TODO: re-render the search-word list for the selected group
+                console.log('切换到group:', currentGroupIdx);
+            }
+        }
+
+        // Select a search word
+        function selectSearchWord(featureIdx, groupIdx, swIdx) {
+            currentFeatureIdx = featureIdx;
+            currentGroupIdx = groupIdx;
+            currentSwIdx = swIdx;
+
+            // Clear the selected state on all search-word cards
+            document.querySelectorAll('.search-word-card').forEach(card => {
+                card.classList.remove('selected');
+            });
+
+            // Mark the clicked card as selected
+            event.target.closest('.search-word-card').classList.add('selected');
+
+            // Show the search results
+            renderSearchResults(featureIdx, groupIdx, swIdx);
+        }
+
+        // Render the search results
+        function renderSearchResults(featureIdx, groupIdx, swIdx) {
+            const feature = allData[featureIdx];
+            const groups = feature['组合评估结果_分组'] || [];
+            const group = groups[groupIdx];
+            if (!group) return;
+
+            const searches = group['top10_searches'] || [];
+            const search = searches[swIdx];
+            if (!search) return;
+
+            const searchWord = search['search_word'] || '';
+            const searchResult = search['search_result'];
+
+            const resultsContent = document.getElementById('resultsContent');
+            const resultsSubtitle = document.getElementById('resultsSubtitle');
+
+            resultsSubtitle.textContent = `搜索词: ${searchWord}`;
+
+            if (!searchResult) {
+                resultsContent.innerHTML = `
+                    <div class="empty-results">
+                        <div class="empty-icon">❌</div>
+                        <div class="empty-text">该搜索词未执行搜索</div>
+                    </div>
+                `;
+                return;
+            }
+
+            const notes = searchResult.data?.data || [];
+
+            if (notes.length === 0) {
+                resultsContent.innerHTML = `
+                    <div class="empty-results">
+                        <div class="empty-icon">📭</div>
+                        <div class="empty-text">未找到匹配的帖子</div>
+                    </div>
+                `;
+                return;
+            }
+
+            // Render note cards (simplified)
+            let html = '<div class="notes-grid">';
+            notes.forEach((note, idx) => {
+                const card = note.note_card || {};
+                const title = card.display_title || '无标题';
+                const image = (card.image_list || [])[0] || '';
+
+                html += `
+                    <div class="note-card-simple">
+                        ${image ? `<img src="${image}" alt="${title}" loading="lazy">` : ''}
+                        <div class="note-title-simple">${title}</div>
+                    </div>
+                `;
+            });
+            html += '</div>';
+
+            resultsContent.innerHTML = html;
+        }
+    '''
+
+
+def main():
+    """主函数"""
+    print("=" * 60)
+    print("级联搜索结果可视化工具")
+    print("=" * 60)
+
+    # Load data
+    data = load_all_data()
+
+    # Extract global candidate words
+    global_candidates = extract_global_candidates(data)
+
+    # Generate HTML
+    html_content = generate_html(data, global_candidates)
+
+    # Save the HTML file
+    output_path = "visualization/cascade_search_results.html"
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(html_content)
+
+    print(f"\n✓ HTML文件已保存: {output_path}")
+
+    # Open the HTML file in a browser
+    abs_path = os.path.abspath(output_path)
+    print(f"正在打开浏览器...")
+    webbrowser.open(f'file://{abs_path}')
+
+    print("\n" + "=" * 60)
+    print("✅ 可视化生成完成!")
+    print("=" * 60)
+
+
+if __name__ == '__main__':
+    main()
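On headless machines the webbrowser.open call above either fails silently or blocks on a text browser; a small opt-out sketch, with NO_BROWSER as an assumed (hypothetical) environment variable:

    import os

    if os.environ.get('NO_BROWSER') != '1':  # hypothetical opt-out switch
        webbrowser.open(f'file://{abs_path}')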

+ 24 - 28
src/visualizers/deconstruction_visualizer.py

@@ -1313,13 +1313,13 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
         }}
 
         .baseword-item:hover {{
-            background: #f0fdf4;
-            border-left-color: #22c55e;
+            background: #fef3c7;
+            border-left-color: #f59e0b;
         }}
 
         .baseword-item.active {{
-            background: linear-gradient(90deg, #dcfce7 0%, #f0fdf4 100%);
-            border-left-color: #22c55e;
+            background: linear-gradient(90deg, #fef3c7 0%, #fefce8 100%);
+            border-left-color: #f59e0b;
             position: relative;
         }}
 
@@ -1330,7 +1330,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
             top: 50%;
             width: 25px;
             height: 2px;
-            background: #22c55e;
+            background: #f59e0b;
         }}
 
         .baseword-item.active::before {{
@@ -1341,7 +1341,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
             transform: translateY(-50%);
             width: 0;
             height: 0;
-            border-left: 6px solid #22c55e;
+            border-left: 6px solid #f59e0b;
             border-top: 4px solid transparent;
             border-bottom: 4px solid transparent;
         }}
@@ -3125,8 +3125,8 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
         }}
 
         .low-similarity-item {{
-            padding: 10px 12px;
-            margin: 8px 0;
+            padding: 12px 15px;
+            margin-bottom: 10px;
             background: white;
             border-left: 3px solid #dc2626;
             border-radius: 4px;
@@ -3138,6 +3138,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
         }}
 
         .low-feature-name {{
+            font-size: 14px;
             font-weight: 600;
             color: #991b1b;
             margin-bottom: 4px;
@@ -3145,17 +3146,18 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
 
         .low-feature-score {{
             display: inline-block;
-            font-size: 13px;
+            font-size: 12px;
             font-weight: 600;
             color: #dc2626;
             background: #fee2e2;
             padding: 2px 8px;
             border-radius: 4px;
+            margin-right: 6px;
         }}
 
         .low-feature-meta {{
             font-size: 11px;
-            color: #9ca3af;
+            color: #dc2626;
             margin-top: 4px;
         }}
     </style>
@@ -3655,7 +3657,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
                         <div class="feature-item-left ${{isActive ? 'active' : ''}}"
                              onclick="selectFeature(${{featureIdx}})"
                              id="feature-left-${{featureIdx}}">
-                            <div class="feature-name">🎯 ${{featureName}}${{postIcon}}</div>
+                            <div class="feature-name">📝 ${{featureName}}${{postIcon}}</div>
                             <div class="cascade-item-meta">
                                 <span class="partial-feature-score">相似度: ${{similarity.toFixed(2)}}</span>
                                 <span class="partial-feature-meta">${{dimension}}</span>
@@ -3683,9 +3685,11 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
 
                     html += `
                         <div class="low-similarity-item">
-                            <div class="low-feature-name">✗ ${{name}}</div>
-                            <div class="low-feature-score">${{similarity.toFixed(2)}}</div>
-                            <div class="low-feature-meta">${{dimension}}</div>
+                            <div class="low-feature-name">📝 ${{name}}</div>
+                            <div class="cascade-item-meta">
+                                <span class="low-feature-score">相似度: ${{similarity.toFixed(2)}}</span>
+                                <span class="low-feature-meta">${{dimension}}</span>
+                            </div>
                         </div>
                     `;
                 }});
@@ -3737,13 +3741,8 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
                 endY = toRect.top + toRect.height / 2 - containerRect.top;
             }}
 
-            // 使用贝塞尔曲线绘制连接线
-            const controlPoint1X = startX + (endX - startX) * 0.5;
-            const controlPoint1Y = startY;
-            const controlPoint2X = startX + (endX - startX) * 0.5;
-            const controlPoint2Y = endY;
-
-            const path = `M ${{startX}} ${{startY}} C ${{controlPoint1X}} ${{controlPoint1Y}}, ${{controlPoint2X}} ${{controlPoint2Y}}, ${{endX}} ${{endY}}`;
+            // Draw the connector as a straight line, avoiding any curves or corners
+            const path = `M ${{startX}} ${{startY}} L ${{endX}} ${{endY}}`;
 
             lineEl.setAttribute('d', path);
         }}
@@ -3872,7 +3871,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
                     <div class="baseword-item ${{isActive ? 'active' : ''}}"
                          onclick="selectBaseWord(${{featureIdx}}, ${{groupIdx}})"
                          id="baseword-${{featureIdx}}-${{groupIdx}}">
-                        <div class="cascade-item-title" style="color:#059669;">👤 ${{baseWord}}</div>
+                        <div class="cascade-item-title" style="color:#ca8a04;">👤 ${{baseWord}}</div>
                         <div class="cascade-item-meta">
                             相似度: ${{baseSimilarity.toFixed(2)}} · ${{searches.length}}个搜索词
                         </div>
@@ -4072,11 +4071,8 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
 
             // 渲染搜索结果
             let html = `
-                <div class="search-result-header" style="padding:20px;background:#f9fafb;border-bottom:2px solid #e5e7eb;">
-                    <h3 style="margin:0 0 10px 0;">📝 ${{searchWord}}</h3>
-                    <div style="font-size:12px;color:#6b7280;">
-                        组合词: ${{sourceWord}} · ${{notes.length}}个搜索结果
-                    </div>
+                <div class="search-result-header" style="position:sticky;top:0;z-index:100;padding:20px;background:#f9fafb;border-bottom:2px solid #e5e7eb;">
+                    <h3 style="margin:0;">📝 ${{searchWord}} · ${{notes.length}}个搜索结果</h3>
                 </div>
                 <div class="notes-grid" style="padding:20px;display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:15px;">
             `;
@@ -4141,7 +4137,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any],
                 html += `
                     <div class="note-card ${{matchClass}}" style="border:2px solid #fbbf24;border-radius:12px;overflow:hidden;background:white;transition:all 0.2s;cursor:pointer;" onclick="openNoteImagesModal(${{featureIdx}}, ${{baseWordIdx}}, ${{swIdx}}, ${{noteIdx}})">
                         <!-- 图片轮播区域 -->
-                        <div style="position:relative;width:100%;height:200px;background:#f3f4f6;">
+                        <div style="position:relative;width:100%;height:260px;background:#f3f4f6;">
                             ${{cover ? `<img src="${{cover}}" style="width:100%;height:100%;object-fit:cover;">` : `<div style="width:100%;height:100%;display:flex;align-items:center;justify-content:center;color:#9ca3af;">${{typeIcon}}</div>`}}
                             <div style="position:absolute;top:10px;right:10px;background:rgba(0,0,0,0.6);color:white;padding:4px 10px;border-radius:20px;font-size:12px;font-weight:600;">
                                 1/${{imageList.length || 1}}

+ 1487 - 0
src/visualizers/search_results_visualizer.py

@@ -0,0 +1,1487 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Search-results evaluation visualization tool
+An interactive HTML page that integrates the two-layer evaluation results
+"""
+
+import json
+import os
+from datetime import datetime
+from typing import List, Dict, Any
+
+
+def load_data(json_path: str) -> List[Dict[str, Any]]:
+    """加载JSON数据"""
+    with open(json_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """计算统计数据(包括评估结果)"""
+    total_features = len(data)
+    total_search_words = 0
+    searched_count = 0  # number of executed searches
+    not_searched_count = 0  # number of searches not yet executed
+    total_notes = 0
+    video_count = 0
+    normal_count = 0
+
+    # Evaluation statistics
+    total_evaluated_notes = 0
+    total_filtered = 0
+    match_complete = 0  # score 0.8-1.0
+    match_similar = 0   # score 0.6-0.79
+    match_weak = 0      # score 0.5-0.59
+    match_none = 0      # score <= 0.4
+
+    for feature in data:
+        grouped_results = feature.get('组合评估结果_分组', [])
+
+        for group in grouped_results:
+            search_items = group.get('top10_searches', [])
+            total_search_words += len(search_items)
+
+            for search_item in search_items:
+                search_result = search_item.get('search_result', {})
+
+                # Track search status
+                if search_result:
+                    searched_count += 1
+                    notes = search_result.get('data', {}).get('data', [])
+                    total_notes += len(notes)
+
+                    # Count video vs. image-text types
+                    for note in notes:
+                        note_type = note.get('note_card', {}).get('type', '')
+                        if note_type == 'video':
+                            video_count += 1
+                        else:
+                            normal_count += 1
+
+                    # Aggregate evaluation results
+                    evaluation = search_item.get('evaluation_with_filter')
+                    if evaluation:
+                        total_evaluated_notes += evaluation.get('total_notes', 0)
+                        total_filtered += evaluation.get('filtered_count', 0)
+
+                        stats = evaluation.get('statistics', {})
+                        match_complete += stats.get('完全匹配(0.8-1.0)', 0)
+                        match_similar += stats.get('相似匹配(0.6-0.79)', 0)
+                        match_weak += stats.get('弱相似(0.5-0.59)', 0)
+                        match_none += stats.get('无匹配(≤0.4)', 0)
+                else:
+                    not_searched_count += 1
+
+    # Remaining notes after filtering (percentages are derived in the dict below)
+    total_remaining = total_evaluated_notes - total_filtered if total_evaluated_notes > 0 else 0
+
+    return {
+        'total_features': total_features,
+        'total_search_words': total_search_words,
+        'searched_count': searched_count,
+        'not_searched_count': not_searched_count,
+        'searched_percentage': round(searched_count / total_search_words * 100, 1) if total_search_words > 0 else 0,
+        'total_notes': total_notes,
+        'video_count': video_count,
+        'normal_count': normal_count,
+        'video_percentage': round(video_count / total_notes * 100, 1) if total_notes > 0 else 0,
+        'normal_percentage': round(normal_count / total_notes * 100, 1) if total_notes > 0 else 0,
+
+        # Evaluation statistics
+        'total_evaluated': total_evaluated_notes,
+        'total_filtered': total_filtered,
+        'total_remaining': total_remaining,
+        'filter_rate': round(total_filtered / total_evaluated_notes * 100, 1) if total_evaluated_notes > 0 else 0,
+        'match_complete': match_complete,
+        'match_similar': match_similar,
+        'match_weak': match_weak,
+        'match_none': match_none,
+        'complete_rate': round(match_complete / total_remaining * 100, 1) if total_remaining > 0 else 0,
+        'similar_rate': round(match_similar / total_remaining * 100, 1) if total_remaining > 0 else 0,
+    }
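A tiny worked example of the aggregation, with a hypothetical one-feature input containing only the keys the function reads:

    data = [{'组合评估结果_分组': [{'top10_searches': [
        {'search_result': {'data': {'data': [
            {'note_card': {'type': 'video'}},
            {'note_card': {'type': 'normal'}},
        ]}},
         'evaluation_with_filter': {
             'total_notes': 2, 'filtered_count': 1,
             'statistics': {'完全匹配(0.8-1.0)': 1}}},
        {'search_result': None},
    ]}]}]
    stats = calculate_statistics(data)
    # searched_count=1, not_searched_count=1, total_notes=2, video_count=1,
    # normal_count=1, total_remaining=1, complete_rate=100.0, searched_percentage=50.0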
+
+
+def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path: str):
+    """生成HTML可视化页面"""
+
+    # Prepare the data as JSON (for JavaScript)
+    data_json = json.dumps(data, ensure_ascii=False, indent=2)
+
+    html_content = f'''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>搜索结果评估可视化</title>
+    <style>
+        * {{
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }}
+
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: #f5f7fa;
+            color: #333;
+            overflow-x: hidden;
+        }}
+
+        /* Top statistics panel */
+        .stats-panel {{
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 20px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        }}
+
+        .stats-container {{
+            max-width: 1400px;
+            margin: 0 auto;
+        }}
+
+        .stats-row {{
+            display: flex;
+            justify-content: space-around;
+            align-items: center;
+            flex-wrap: wrap;
+            gap: 15px;
+            margin-bottom: 15px;
+        }}
+
+        .stats-row:last-child {{
+            margin-bottom: 0;
+            padding-top: 15px;
+            border-top: 1px solid rgba(255,255,255,0.2);
+        }}
+
+        .stat-item {{
+            text-align: center;
+        }}
+
+        .stat-value {{
+            font-size: 28px;
+            font-weight: bold;
+            margin-bottom: 5px;
+        }}
+
+        .stat-label {{
+            font-size: 12px;
+            opacity: 0.9;
+        }}
+
+        .stat-item.small .stat-value {{
+            font-size: 22px;
+        }}
+
+        /* Filter control panel */
+        .filter-panel {{
+            background: white;
+            max-width: 1400px;
+            margin: 20px auto;
+            padding: 15px 20px;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            display: flex;
+            align-items: center;
+            gap: 20px;
+            flex-wrap: wrap;
+        }}
+
+        .filter-label {{
+            font-weight: 600;
+            color: #374151;
+        }}
+
+        .filter-buttons {{
+            display: flex;
+            gap: 10px;
+            flex-wrap: wrap;
+        }}
+
+        .filter-btn {{
+            padding: 6px 12px;
+            border: 2px solid #e5e7eb;
+            background: white;
+            border-radius: 6px;
+            cursor: pointer;
+            font-size: 13px;
+            font-weight: 500;
+            transition: all 0.2s;
+        }}
+
+        .filter-btn:hover {{
+            border-color: #667eea;
+            background: #f9fafb;
+        }}
+
+        .filter-btn.active {{
+            border-color: #667eea;
+            background: #667eea;
+            color: white;
+        }}
+
+        .filter-btn.complete {{
+            border-color: #10b981;
+        }}
+        .filter-btn.complete.active {{
+            background: #10b981;
+            border-color: #10b981;
+        }}
+
+        .filter-btn.similar {{
+            border-color: #f59e0b;
+        }}
+        .filter-btn.similar.active {{
+            background: #f59e0b;
+            border-color: #f59e0b;
+        }}
+
+        .filter-btn.weak {{
+            border-color: #f97316;
+        }}
+        .filter-btn.weak.active {{
+            background: #f97316;
+            border-color: #f97316;
+        }}
+
+        .filter-btn.none {{
+            border-color: #ef4444;
+        }}
+        .filter-btn.none.active {{
+            background: #ef4444;
+            border-color: #ef4444;
+        }}
+
+        .filter-btn.filtered {{
+            border-color: #6b7280;
+        }}
+        .filter-btn.filtered.active {{
+            background: #6b7280;
+            border-color: #6b7280;
+        }}
+
+        /* Main container */
+        .main-container {{
+            display: flex;
+            max-width: 1400px;
+            margin: 0 auto 20px;
+            gap: 20px;
+            padding: 0 20px;
+            height: calc(100vh - 260px);
+        }}
+
+        /* Left sidebar navigation */
+        .left-sidebar {{
+            width: 30%;
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            overflow-y: auto;
+            position: sticky;
+            top: 20px;
+            height: fit-content;
+            max-height: calc(100vh - 280px);
+        }}
+
+        .feature-group {{
+            border-bottom: 1px solid #e5e7eb;
+        }}
+
+        .feature-header {{
+            padding: 15px 20px;
+            background: #f9fafb;
+            cursor: pointer;
+            user-select: none;
+            transition: background 0.2s;
+        }}
+
+        .feature-header:hover {{
+            background: #f3f4f6;
+        }}
+
+        .feature-header.active {{
+            background: #667eea;
+            color: white;
+        }}
+
+        .feature-title {{
+            font-size: 16px;
+            font-weight: 600;
+            margin-bottom: 5px;
+        }}
+
+        .feature-meta {{
+            font-size: 12px;
+            color: #6b7280;
+        }}
+
+        .feature-header.active .feature-meta {{
+            color: rgba(255,255,255,0.8);
+        }}
+
+        .search-words-list {{
+            display: none;
+            padding: 0;
+        }}
+
+        .search-words-list.expanded {{
+            display: block;
+        }}
+
+        /* Base-word grouping layer */
+        .base-word-group {{
+            border-bottom: 1px solid #f3f4f6;
+        }}
+
+        .base-word-header {{
+            padding: 12px 20px 12px 30px;
+            background: #fafbfc;
+            cursor: pointer;
+            user-select: none;
+            transition: all 0.2s;
+            border-left: 3px solid transparent;
+        }}
+
+        .base-word-header:hover {{
+            background: #f3f4f6;
+            border-left-color: #a78bfa;
+        }}
+
+        .base-word-header.active {{
+            background: #f3f4f6;
+            border-left-color: #7c3aed;
+        }}
+
+        .base-word-title {{
+            font-size: 15px;
+            font-weight: 600;
+            color: #7c3aed;
+            margin-bottom: 4px;
+        }}
+
+        .base-word-meta {{
+            font-size: 11px;
+            color: #6b7280;
+        }}
+
+        .base-word-desc {{
+            padding: 8px 20px 8px 30px;
+            background: #fefce8;
+            font-size: 12px;
+            color: #854d0e;
+            line-height: 1.5;
+            border-left: 3px solid #fbbf24;
+            display: none;
+        }}
+
+        .base-word-desc.expanded {{
+            display: block;
+        }}
+
+        .search-words-sublist {{
+            display: none;
+        }}
+
+        .search-words-sublist.expanded {{
+            display: block;
+        }}
+
+        .search-word-item {{
+            padding: 12px 20px 12px 50px;
+            cursor: pointer;
+            border-left: 3px solid transparent;
+            transition: all 0.2s;
+        }}
+
+        .search-word-item:hover {{
+            background: #f9fafb;
+            border-left-color: #667eea;
+        }}
+
+        .search-word-item.active {{
+            background: #ede9fe;
+            border-left-color: #7c3aed;
+        }}
+
+        .search-word-text {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #374151;
+            margin-bottom: 4px;
+        }}
+
+        .search-word-score {{
+            display: inline-block;
+            padding: 2px 8px;
+            border-radius: 12px;
+            font-size: 11px;
+            font-weight: 600;
+            margin-left: 8px;
+        }}
+
+        .score-high {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .score-medium {{
+            background: #fef3c7;
+            color: #92400e;
+        }}
+
+        .score-low {{
+            background: #fee2e2;
+            color: #991b1b;
+        }}
+
+        /* Evaluation badges */
+        .eval-badge {{
+            display: inline-block;
+            padding: 2px 6px;
+            border-radius: 10px;
+            font-size: 11px;
+            font-weight: 600;
+            margin-left: 6px;
+        }}
+
+        .eval-complete {{
+            background: #d1fae5;
+            color: #065f46;
+            border: 1px solid #10b981;
+        }}
+
+        .eval-similar {{
+            background: #fef3c7;
+            color: #92400e;
+            border: 1px solid #f59e0b;
+        }}
+
+        .eval-weak {{
+            background: #fed7aa;
+            color: #9a3412;
+            border: 1px solid #f97316;
+        }}
+
+        .eval-none {{
+            background: #fee2e2;
+            color: #991b1b;
+            border: 1px solid #ef4444;
+        }}
+
+        .eval-filtered {{
+            background: #e5e7eb;
+            color: #4b5563;
+            border: 1px solid #6b7280;
+        }}
+
+        .search-word-eval {{
+            font-size: 11px;
+            color: #6b7280;
+            margin-top: 4px;
+        }}
+
+        /* Right results area */
+        .right-content {{
+            flex: 1;
+            overflow-y: auto;
+            padding-bottom: 40px;
+        }}
+
+        .result-block {{
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            margin-bottom: 30px;
+            padding: 20px;
+            scroll-margin-top: 20px;
+        }}
+
+        .result-header {{
+            margin-bottom: 20px;
+            padding-bottom: 15px;
+            border-bottom: 2px solid #e5e7eb;
+        }}
+
+        .result-title {{
+            font-size: 20px;
+            font-weight: 600;
+            color: #111827;
+            margin-bottom: 10px;
+        }}
+
+        .result-stats {{
+            display: flex;
+            gap: 10px;
+            font-size: 12px;
+            color: #6b7280;
+            flex-wrap: wrap;
+        }}
+
+        .stat-badge {{
+            background: #f3f4f6;
+            padding: 4px 10px;
+            border-radius: 4px;
+        }}
+
+        .stat-badge.eval {{
+            font-weight: 600;
+        }}
+
+        .stat-badge.eval.complete {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .stat-badge.eval.similar {{
+            background: #fef3c7;
+            color: #92400e;
+        }}
+
+        .stat-badge.eval.weak {{
+            background: #fed7aa;
+            color: #9a3412;
+        }}
+
+        .stat-badge.eval.none {{
+            background: #fee2e2;
+            color: #991b1b;
+        }}
+
+        .stat-badge.eval.filtered {{
+            background: #e5e7eb;
+            color: #4b5563;
+        }}
+
+        /* 帖子网格 */
+        .notes-grid {{
+            display: grid;
+            grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
+            gap: 20px;
+        }}
+
+        /* 空状态样式 */
+        .empty-state {{
+            text-align: center;
+            padding: 60px 40px;
+            color: #6b7280;
+        }}
+
+        .empty-icon {{
+            font-size: 48px;
+            margin-bottom: 16px;
+        }}
+
+        .empty-title {{
+            font-size: 16px;
+            font-weight: 600;
+            color: #374151;
+            margin-bottom: 8px;
+        }}
+
+        .empty-desc {{
+            font-size: 14px;
+            line-height: 1.6;
+            color: #9ca3af;
+            max-width: 400px;
+            margin: 0 auto;
+        }}
+
+        .note-card {{
+            border: 3px solid #e5e7eb;
+            border-radius: 8px;
+            overflow: hidden;
+            cursor: pointer;
+            transition: all 0.3s;
+            background: white;
+        }}
+
+        .note-card:hover {{
+            transform: translateY(-4px);
+            box-shadow: 0 10px 25px rgba(0,0,0,0.15);
+        }}
+
+        /* 根据评估分数设置边框颜色 */
+        .note-card.eval-complete {{
+            border-color: #10b981;
+        }}
+
+        .note-card.eval-similar {{
+            border-color: #f59e0b;
+        }}
+
+        .note-card.eval-weak {{
+            border-color: #f97316;
+        }}
+
+        .note-card.eval-none {{
+            border-color: #ef4444;
+        }}
+
+        .note-card.eval-filtered {{
+            border-color: #6b7280;
+            opacity: 0.6;
+        }}
+
+        /* 图片轮播 */
+        .image-carousel {{
+            position: relative;
+            width: 100%;
+            height: 280px;
+            background: #f3f4f6;
+            overflow: hidden;
+        }}
+
+        .carousel-images {{
+            display: flex;
+            height: 100%;
+            transition: transform 0.3s ease;
+        }}
+
+        .carousel-image {{
+            min-width: 100%;
+            height: 100%;
+            object-fit: cover;
+        }}
+
+        .carousel-btn {{
+            position: absolute;
+            top: 50%;
+            transform: translateY(-50%);
+            background: rgba(0,0,0,0.5);
+            color: white;
+            border: none;
+            width: 32px;
+            height: 32px;
+            border-radius: 50%;
+            cursor: pointer;
+            font-size: 16px;
+            display: none;
+            align-items: center;
+            justify-content: center;
+            transition: background 0.2s;
+            z-index: 10;
+        }}
+
+        .carousel-btn:hover {{
+            background: rgba(0,0,0,0.7);
+        }}
+
+        .carousel-btn.prev {{
+            left: 8px;
+        }}
+
+        .carousel-btn.next {{
+            right: 8px;
+        }}
+
+        .note-card:hover .carousel-btn {{
+            display: flex;
+        }}
+
+        .carousel-indicators {{
+            position: absolute;
+            bottom: 10px;
+            left: 50%;
+            transform: translateX(-50%);
+            display: flex;
+            gap: 6px;
+            z-index: 10;
+        }}
+
+        .dot {{
+            width: 8px;
+            height: 8px;
+            border-radius: 50%;
+            background: rgba(255,255,255,0.5);
+            cursor: pointer;
+            transition: all 0.2s;
+        }}
+
+        .dot.active {{
+            background: white;
+            width: 24px;
+            border-radius: 4px;
+        }}
+
+        .image-counter {{
+            position: absolute;
+            top: 10px;
+            right: 10px;
+            background: rgba(0,0,0,0.6);
+            color: white;
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 12px;
+            z-index: 10;
+        }}
+
+        /* 帖子信息 */
+        .note-info {{
+            padding: 12px;
+        }}
+
+        .note-title {{
+            font-size: 14px;
+            font-weight: 500;
+            color: #111827;
+            margin-bottom: 8px;
+            display: -webkit-box;
+            -webkit-line-clamp: 2;
+            -webkit-box-orient: vertical;
+            overflow: hidden;
+            line-height: 1.4;
+        }}
+
+        .note-meta {{
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            font-size: 12px;
+            color: #6b7280;
+            margin-bottom: 8px;
+        }}
+
+        .note-type {{
+            padding: 3px 8px;
+            border-radius: 4px;
+            font-weight: 500;
+        }}
+
+        .type-video {{
+            background: #dbeafe;
+            color: #1e40af;
+        }}
+
+        .type-normal {{
+            background: #d1fae5;
+            color: #065f46;
+        }}
+
+        .note-author {{
+            display: flex;
+            align-items: center;
+            gap: 6px;
+        }}
+
+        .author-avatar {{
+            width: 24px;
+            height: 24px;
+            border-radius: 50%;
+        }}
+
+        /* 评估信息 */
+        .note-eval {{
+            padding: 8px 12px;
+            background: #f9fafb;
+            border-top: 1px solid #e5e7eb;
+            font-size: 12px;
+        }}
+
+        .note-eval-header {{
+            display: flex;
+            align-items: center;
+            justify-content: space-between;
+            cursor: pointer;
+            user-select: none;
+        }}
+
+        .note-eval-score {{
+            font-weight: 600;
+        }}
+
+        .note-eval-toggle {{
+            color: #6b7280;
+            font-size: 10px;
+        }}
+
+        .note-eval-details {{
+            margin-top: 8px;
+            padding-top: 8px;
+            border-top: 1px solid #e5e7eb;
+            display: none;
+            line-height: 1.5;
+        }}
+
+        .note-eval-details.expanded {{
+            display: block;
+        }}
+
+        .eval-detail-label {{
+            font-weight: 600;
+            color: #374151;
+            margin-top: 6px;
+            margin-bottom: 2px;
+        }}
+
+        .eval-detail-label:first-child {{
+            margin-top: 0;
+        }}
+
+        .eval-detail-text {{
+            color: #6b7280;
+        }}
+
+        /* 滚动条样式 */
+        ::-webkit-scrollbar {{
+            width: 8px;
+            height: 8px;
+        }}
+
+        ::-webkit-scrollbar-track {{
+            background: #f1f1f1;
+        }}
+
+        ::-webkit-scrollbar-thumb {{
+            background: #888;
+            border-radius: 4px;
+        }}
+
+        ::-webkit-scrollbar-thumb:hover {{
+            background: #555;
+        }}
+
+        /* 隐藏类 */
+        .hidden {{
+            display: none !important;
+        }}
+    </style>
+</head>
+<body>
+    <!-- 统计面板 -->
+    <div class="stats-panel">
+        <div class="stats-container">
+            <div class="stats-row">
+                <div class="stat-item">
+                    <div class="stat-value">📊 {stats['total_features']}</div>
+                    <div class="stat-label">原始特征数</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">🔍 {stats['total_search_words']}</div>
+                    <div class="stat-label">搜索词总数</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">✅ {stats['searched_count']}</div>
+                    <div class="stat-label">已搜索 ({stats['searched_percentage']}%)</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">⏸️ {stats['not_searched_count']}</div>
+                    <div class="stat-label">未搜索</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">📝 {stats['total_notes']}</div>
+                    <div class="stat-label">帖子总数</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">🎬 {stats['video_count']}</div>
+                    <div class="stat-label">视频 ({stats['video_percentage']}%)</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">📷 {stats['normal_count']}</div>
+                    <div class="stat-label">图文 ({stats['normal_percentage']}%)</div>
+                </div>
+            </div>
+            <div class="stats-row">
+                <div class="stat-item small">
+                    <div class="stat-value">⚡ {stats['total_evaluated']}</div>
+                    <div class="stat-label">已评估</div>
+                </div>
+                <div class="stat-item small">
+                    <div class="stat-value">⚫ {stats['total_filtered']}</div>
+                    <div class="stat-label">已过滤 ({stats['filter_rate']}%)</div>
+                </div>
+                <div class="stat-item small">
+                    <div class="stat-value">🟢 {stats['match_complete']}</div>
+                    <div class="stat-label">完全匹配 ({stats['complete_rate']}%)</div>
+                </div>
+                <div class="stat-item small">
+                    <div class="stat-value">🟡 {stats['match_similar']}</div>
+                    <div class="stat-label">相似匹配 ({stats['similar_rate']}%)</div>
+                </div>
+                <div class="stat-item small">
+                    <div class="stat-value">🟠 {stats['match_weak']}</div>
+                    <div class="stat-label">弱相似</div>
+                </div>
+                <div class="stat-item small">
+                    <div class="stat-value">🔴 {stats['match_none']}</div>
+                    <div class="stat-label">无匹配</div>
+                </div>
+            </div>
+        </div>
+    </div>
+
+    <!-- 过滤控制面板 -->
+    <div class="filter-panel">
+        <span class="filter-label">🔍 筛选显示:</span>
+        <div class="filter-buttons">
+            <button class="filter-btn active" onclick="filterNotes('all')">全部</button>
+            <button class="filter-btn complete" onclick="filterNotes('complete')">🟢 完全匹配</button>
+            <button class="filter-btn similar" onclick="filterNotes('similar')">🟡 相似匹配</button>
+            <button class="filter-btn weak" onclick="filterNotes('weak')">🟠 弱相似</button>
+            <button class="filter-btn none" onclick="filterNotes('none')">🔴 无匹配</button>
+            <button class="filter-btn filtered" onclick="filterNotes('filtered')">⚫ 已过滤</button>
+        </div>
+    </div>
+
+    <!-- 主容器 -->
+    <div class="main-container">
+        <!-- 左侧导航 -->
+        <div class="left-sidebar" id="leftSidebar">
+            <!-- 通过JavaScript动态生成 -->
+        </div>
+
+        <!-- 右侧结果区 -->
+        <div class="right-content" id="rightContent">
+            <!-- 通过JavaScript动态生成 -->
+        </div>
+    </div>
+
+    <script>
+        // 数据
+        const data = {data_json};
+        let currentFilter = 'all';
+
+        // 创建评估映射(使用索引: "featureIdx-groupIdx-swIdx-noteIdx" -> evaluation)
+        const noteEvaluations = {{}};
+        data.forEach((feature, fIdx) => {{
+            const groups = feature['组合评估结果_分组'] || [];
+            groups.forEach((group, gIdx) => {{
+                const searches = group['top10_searches'] || [];
+                searches.forEach((search, sIdx) => {{
+                    const evaluation = search['evaluation_with_filter'];
+                    if (evaluation && evaluation.notes_evaluation) {{
+                        evaluation.notes_evaluation.forEach(noteEval => {{
+                            const key = `${{fIdx}}-${{gIdx}}-${{sIdx}}-${{noteEval.note_index}}`;
+                            noteEvaluations[key] = noteEval;
+                        }});
+                    }}
+                }});
+            }});
+        }});
+
+        // 获取评估类别
+        function getEvalCategory(noteEval) {{
+            if (!noteEval || noteEval['Query相关性'] !== '相关') {{
+                return 'filtered';
+            }}
+            const score = noteEval['综合得分'];
+            if (score >= 0.8) return 'complete';
+            if (score >= 0.6) return 'similar';
+            if (score >= 0.5) return 'weak';
+            return 'none';
+        }}
+
+        // 渲染左侧导航
+        function renderLeftSidebar() {{
+            const sidebar = document.getElementById('leftSidebar');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const groups = feature['组合评估结果_分组'] || [];
+                let totalSearches = 0;
+                groups.forEach(group => {{
+                    totalSearches += (group['top10_searches'] || []).length;
+                }});
+
+                // 层级1: 原始特征
+                html += `
+                    <div class="feature-group">
+                        <div class="feature-header" onclick="toggleFeature(${{featureIdx}})" id="feature-header-${{featureIdx}}">
+                            <div class="feature-title">${{feature['原始特征名称']}}</div>
+                            <div class="feature-meta">
+                                ${{feature['来源层级']}} · 权重: ${{feature['权重'].toFixed(2)}} · ${{totalSearches}}个搜索词
+                            </div>
+                        </div>
+                        <div class="search-words-list" id="search-words-${{featureIdx}}">
+                `;
+
+                // 层级2: Base word分组
+                groups.forEach((group, groupIdx) => {{
+                    const baseWord = group['base_word'] || '';
+                    const baseSimilarity = group['base_word_similarity'] || 0;
+                    const searches = group['top10_searches'] || [];
+
+                    // 获取相关词汇
+                    const relatedWords = feature['高相似度候选_按base_word']?.[baseWord] || [];
+                    const relatedWordNames = relatedWords.map(w => w['人设特征名称']).slice(0, 10).join('、');
+
+                    html += `
+                        <div class="base-word-group">
+                            <div class="base-word-header" onclick="toggleBaseWord(${{featureIdx}}, ${{groupIdx}})"
+                                 id="base-word-header-${{featureIdx}}-${{groupIdx}}">
+                                <div class="base-word-title">🎯 ${{baseWord}}</div>
+                                <div class="base-word-meta">相似度: ${{baseSimilarity.toFixed(2)}} · ${{searches.length}}个搜索词</div>
+                            </div>
+                            <div class="base-word-desc" id="base-word-desc-${{featureIdx}}-${{groupIdx}}">
+                                <strong>关联特征范围(可用词汇池):</strong>${{relatedWordNames || '无相关词汇'}}
+                            </div>
+                            <div class="search-words-sublist" id="search-words-sublist-${{featureIdx}}-${{groupIdx}}">
+                    `;
+
+                    // 层级3: 搜索词列表
+                    searches.forEach((sw, swIdx) => {{
+                        const score = sw.score || 0;
+                        const scoreClass = score >= 0.9 ? 'score-high' : score >= 0.7 ? 'score-medium' : 'score-low';
+                        const blockId = `block-${{featureIdx}}-${{groupIdx}}-${{swIdx}}`;
+                        const sourceWord = sw.source_word || '';
+
+                        // 获取评估统计
+                        const evaluation = sw['evaluation_with_filter'];
+                        let evalBadges = '';
+                        if (evaluation) {{
+                            const stats = evaluation.statistics || {{}};
+                            const complete = stats['完全匹配(0.8-1.0)'] || 0;
+                            const similar = stats['相似匹配(0.6-0.79)'] || 0;
+                            const weak = stats['弱相似(0.5-0.59)'] || 0;
+                            const none = stats['无匹配(≤0.4)'] || 0;
+                            const filtered = evaluation.filtered_count || 0;
+
+                            if (complete > 0) evalBadges += `<span class="eval-badge eval-complete">🟢${{complete}}</span>`;
+                            if (similar > 0) evalBadges += `<span class="eval-badge eval-similar">🟡${{similar}}</span>`;
+                            if (weak > 0) evalBadges += `<span class="eval-badge eval-weak">🟠${{weak}}</span>`;
+                            if (none > 0) evalBadges += `<span class="eval-badge eval-none">🔴${{none}}</span>`;
+                            if (filtered > 0) evalBadges += `<span class="eval-badge eval-filtered">⚫${{filtered}}</span>`;
+                        }}
+
+                        html += `
+                            <div class="search-word-item" onclick="scrollToBlock('${{blockId}}')"
+                                 id="sw-${{featureIdx}}-${{groupIdx}}-${{swIdx}}"
+                                 data-block-id="${{blockId}}">
+                                <div class="search-word-text">
+                                    🔍 ${{sw.search_word}}
+                                </div>
+                                <div class="search-word-meta" style="font-size:11px;color:#9ca3af;margin-top:2px">
+                                    来源: ${{sourceWord}}
+                                </div>
+                                <div class="search-word-eval">${{evalBadges}}</div>
+                            </div>
+                        `;
+                    }});
+
+                    html += `
+                            </div>
+                        </div>
+                    `;
+                }});
+
+                html += `
+                        </div>
+                    </div>
+                `;
+            }});
+
+            sidebar.innerHTML = html;
+        }}
+
+        // 渲染右侧结果区
+        function renderRightContent() {{
+            const content = document.getElementById('rightContent');
+            let html = '';
+
+            data.forEach((feature, featureIdx) => {{
+                const groups = feature['组合评估结果_分组'] || [];
+
+                groups.forEach((group, groupIdx) => {{
+                    const searches = group['top10_searches'] || [];
+
+                    searches.forEach((sw, swIdx) => {{
+                        const blockId = `block-${{featureIdx}}-${{groupIdx}}-${{swIdx}}`;
+                        const hasSearchResult = sw.search_result != null;
+                        const searchResult = sw.search_result || {{}};
+                        const notes = searchResult.data?.data || [];
+
+                        const videoCount = notes.filter(n => n.note_card?.type === 'video').length;
+                        const normalCount = notes.length - videoCount;
+
+                        // 获取评估统计
+                        const evaluation = sw['evaluation_with_filter'];
+                        let evalStats = '';
+                        if (evaluation) {{
+                            const stats = evaluation.statistics || {{}};
+                            const complete = stats['完全匹配(0.8-1.0)'] || 0;
+                            const similar = stats['相似匹配(0.6-0.79)'] || 0;
+                            const weak = stats['弱相似(0.5-0.59)'] || 0;
+                            const none = stats['无匹配(≤0.4)'] || 0;
+                            const filtered = evaluation.filtered_count || 0;
+
+                            if (complete > 0) evalStats += `<span class="stat-badge eval complete">🟢 完全:${{complete}}</span>`;
+                            if (similar > 0) evalStats += `<span class="stat-badge eval similar">🟡 相似:${{similar}}</span>`;
+                            if (weak > 0) evalStats += `<span class="stat-badge eval weak">🟠 弱:${{weak}}</span>`;
+                            if (none > 0) evalStats += `<span class="stat-badge eval none">🔴 无:${{none}}</span>`;
+                            if (filtered > 0) evalStats += `<span class="stat-badge eval filtered">⚫ 过滤:${{filtered}}</span>`;
+                        }}
+
+                        // 构建结果块
+                        html += `
+                            <div class="result-block" id="${{blockId}}">
+                                <div class="result-header">
+                                    <div class="result-title">${{sw.search_word}}</div>
+                                    <div class="result-stats">
+                        `;
+
+                        // 根据搜索状态显示不同的统计信息
+                        if (!hasSearchResult) {{
+                            // 未执行搜索
+                            html += `
+                                        <span class="stat-badge" style="background:#fef3c7;color:#92400e;font-weight:600">⏸️ 未执行搜索</span>
+                            `;
+                        }} else if (notes.length === 0) {{
+                            // 搜索完成但无结果
+                            html += `
+                                        <span class="stat-badge">📝 0 条帖子</span>
+                                        <span class="stat-badge" style="background:#fee2e2;color:#991b1b;font-weight:600">❌ 未找到匹配</span>
+                            `;
+                        }} else {{
+                            // 正常有结果
+                            html += `
+                                        <span class="stat-badge">📝 ${{notes.length}} 条帖子</span>
+                                        <span class="stat-badge">🎬 ${{videoCount}} 视频</span>
+                                        <span class="stat-badge">📷 ${{normalCount}} 图文</span>
+                                        ${{evalStats}}
+                            `;
+                        }}
+
+                        html += `
+                                    </div>
+                                </div>
+                        `;
+
+                        // 根据搜索状态显示不同的内容区域
+                        if (!hasSearchResult) {{
+                            // 未执行搜索 - 显示空状态消息
+                            html += `
+                                <div class="empty-state">
+                                    <div class="empty-icon">⏸️</div>
+                                    <div class="empty-title">该搜索词未执行搜索</div>
+                                    <div class="empty-desc">由于搜索次数限制(--max-searches-per-feature 和 --max-searches-per-base-word),该搜索词未被执行</div>
+                                </div>
+                            `;
+                        }} else if (notes.length === 0) {{
+                            // 搜索完成但无结果
+                            html += `
+                                <div class="empty-state">
+                                    <div class="empty-icon">❌</div>
+                                    <div class="empty-title">搜索完成,但未找到匹配的帖子</div>
+                                    <div class="empty-desc">该搜索词已执行,但小红书返回了 0 条结果</div>
+                                </div>
+                            `;
+                        }} else {{
+                            // 正常有结果 - 显示帖子网格
+                            html += `
+                                <div class="notes-grid">
+                                    ${{notes.map((note, noteIdx) => renderNoteCard(note, featureIdx, groupIdx, swIdx, noteIdx)).join('')}}
+                                </div>
+                            `;
+                        }}
+
+                        html += `
+                            </div>
+                        `;
+                    }});
+                }});
+            }});
+
+            content.innerHTML = html;
+        }}
+
+        // 渲染单个帖子卡片
+        function renderNoteCard(note, featureIdx, groupIdx, swIdx, noteIdx) {{
+            const card = note.note_card || {{}};
+            const images = card.image_list || [];
+            const title = card.display_title || '无标题';
+            const noteType = card.type || 'normal';
+            const noteId = note.id || '';
+            const user = card.user || {{}};
+            const userName = user.nick_name || '未知用户';
+            const userAvatar = user.avatar || '';
+
+            const carouselId = `carousel-${{featureIdx}}-${{groupIdx}}-${{swIdx}}-${{noteIdx}}`;
+
+            // 获取评估结果(使用索引key)
+            const evalKey = `${{featureIdx}}-${{groupIdx}}-${{swIdx}}-${{noteIdx}}`;
+            const noteEval = noteEvaluations[evalKey];
+            const evalCategory = getEvalCategory(noteEval);
+            const evalClass = `eval-${{evalCategory}}`;
+
+            let evalSection = '';
+            if (noteEval) {{
+                const score = noteEval['综合得分'];
+                const scoreEmoji = score >= 0.8 ? '🟢' : score >= 0.6 ? '🟡' : score >= 0.5 ? '🟠' : '🔴';
+                const scoreText = score >= 0.8 ? '完全匹配' : score >= 0.6 ? '相似匹配' : score >= 0.5 ? '弱相似' : '无匹配';
+                const reasoning = noteEval['评分说明'] || '无';
+                const matchingPoints = (noteEval['关键匹配点'] || []).join('、') || '无';
+
+                evalSection = `
+                    <div class="note-eval">
+                        <div class="note-eval-header" onclick="event.stopPropagation(); toggleEvalDetails('${{carouselId}}')">
+                            <span class="note-eval-score">${{scoreEmoji}} ${{scoreText}} (${{score}}分)</span>
+                            <span class="note-eval-toggle" id="${{carouselId}}-toggle">▼ 详情</span>
+                        </div>
+                        <div class="note-eval-details" id="${{carouselId}}-details">
+                            <div class="eval-detail-label">评估理由:</div>
+                            <div class="eval-detail-text">${{reasoning}}</div>
+                            <div class="eval-detail-label">匹配要点:</div>
+                            <div class="eval-detail-text">${{matchingPoints}}</div>
+                        </div>
+                    </div>
+                `;
+            }} else if (evalCategory === 'filtered') {{
+                evalSection = `
+                    <div class="note-eval">
+                        <div class="note-eval-score">⚫ 已过滤(与搜索无关)</div>
+                    </div>
+                `;
+            }}
+
+            return `
+                <div class="note-card ${{evalClass}}" data-eval-category="${{evalCategory}}" onclick="openNote('${{noteId}}')">
+                    <div class="image-carousel" id="${{carouselId}}">
+                        <div class="carousel-images">
+                            ${{images.map(img => `<img class="carousel-image" src="${{img}}" alt="帖子图片" loading="lazy">`).join('')}}
+                        </div>
+                        ${{images.length > 1 ? `
+                            <button class="carousel-btn prev" onclick="event.stopPropagation(); changeImage('${{carouselId}}', -1)">←</button>
+                            <button class="carousel-btn next" onclick="event.stopPropagation(); changeImage('${{carouselId}}', 1)">→</button>
+                            <div class="carousel-indicators">
+                                ${{images.map((_, i) => `<span class="dot ${{i === 0 ? 'active' : ''}}" onclick="event.stopPropagation(); goToImage('${{carouselId}}', ${{i}})"></span>`).join('')}}
+                            </div>
+                            <span class="image-counter">1/${{images.length}}</span>
+                        ` : ''}}
+                    </div>
+                    <div class="note-info">
+                        <div class="note-title">${{title}}</div>
+                        <div class="note-meta">
+                            <span class="note-type type-${{noteType}}">
+                                ${{noteType === 'video' ? '🎬 视频' : '📷 图文'}}
+                            </span>
+                            <div class="note-author">
+                                ${{userAvatar ? `<img class="author-avatar" src="${{userAvatar}}" alt="${{userName}}">` : ''}}
+                                <span>${{userName}}</span>
+                            </div>
+                        </div>
+                    </div>
+                    ${{evalSection}}
+                </div>
+            `;
+        }}
+
+        // 图片轮播逻辑
+        const carouselStates = {{}};
+
+        function changeImage(carouselId, direction) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const images = carousel.querySelectorAll('.carousel-image');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            let newIndex = carouselStates[carouselId].currentIndex + direction;
+            if (newIndex < 0) newIndex = images.length - 1;
+            if (newIndex >= images.length) newIndex = 0;
+
+            carouselStates[carouselId].currentIndex = newIndex;
+            imagesContainer.style.transform = `translateX(-${{newIndex * 100}}%)`;
+
+            // 更新指示器
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === newIndex);
+            }});
+
+            // 更新计数器
+            if (counter) {{
+                counter.textContent = `${{newIndex + 1}}/${{images.length}}`;
+            }}
+        }}
+
+        function goToImage(carouselId, index) {{
+            if (!carouselStates[carouselId]) {{
+                carouselStates[carouselId] = {{ currentIndex: 0 }};
+            }}
+
+            const carousel = document.getElementById(carouselId);
+            const imagesContainer = carousel.querySelector('.carousel-images');
+            const dots = carousel.querySelectorAll('.dot');
+            const counter = carousel.querySelector('.image-counter');
+
+            carouselStates[carouselId].currentIndex = index;
+            imagesContainer.style.transform = `translateX(-${{index * 100}}%)`;
+
+            // 更新指示器
+            dots.forEach((dot, i) => {{
+                dot.classList.toggle('active', i === index);
+            }});
+
+            // 更新计数器
+            if (counter) {{
+                counter.textContent = `${{index + 1}}/${{dots.length}}`;
+            }}
+        }}
+
+        // 展开/折叠特征组
+        function toggleFeature(featureIdx) {{
+            const searchWordsList = document.getElementById(`search-words-${{featureIdx}}`);
+            const featureHeader = document.getElementById(`feature-header-${{featureIdx}}`);
+
+            searchWordsList.classList.toggle('expanded');
+            featureHeader.classList.toggle('active');
+        }}
+
+        // 展开/折叠base word分组
+        function toggleBaseWord(featureIdx, groupIdx) {{
+            const baseWordHeader = document.getElementById(`base-word-header-${{featureIdx}}-${{groupIdx}}`);
+            const baseWordDesc = document.getElementById(`base-word-desc-${{featureIdx}}-${{groupIdx}}`);
+            const searchWordsSublist = document.getElementById(`search-words-sublist-${{featureIdx}}-${{groupIdx}}`);
+
+            baseWordHeader.classList.toggle('active');
+            baseWordDesc.classList.toggle('expanded');
+            searchWordsSublist.classList.toggle('expanded');
+        }}
+
+        // 滚动到指定结果块
+        function scrollToBlock(blockId) {{
+            const block = document.getElementById(blockId);
+            if (block) {{
+                block.scrollIntoView({{ behavior: 'smooth', block: 'start' }});
+
+                // 高亮对应的搜索词
+                document.querySelectorAll('.search-word-item').forEach(item => {{
+                    item.classList.remove('active');
+                }});
+
+                document.querySelectorAll(`[data-block-id="${{blockId}}"]`).forEach(item => {{
+                    item.classList.add('active');
+                }});
+            }}
+        }}
+
+        // 切换评估详情
+        function toggleEvalDetails(carouselId) {{
+            const details = document.getElementById(`${{carouselId}}-details`);
+            const toggle = document.getElementById(`${{carouselId}}-toggle`);
+
+            if (details && toggle) {{
+                details.classList.toggle('expanded');
+                toggle.textContent = details.classList.contains('expanded') ? '▲ 收起' : '▼ 详情';
+            }}
+        }}
+
+        // 过滤帖子
+        function filterNotes(category, evt) {{
+            currentFilter = category;
+
+            // 更新按钮状态(显式传入事件对象,避免依赖非标准的全局 event)
+            document.querySelectorAll('.filter-btn').forEach(btn => {{
+                btn.classList.remove('active');
+            }});
+            evt.target.classList.add('active');
+
+            // 过滤帖子卡片
+            document.querySelectorAll('.note-card').forEach(card => {{
+                const evalCategory = card.getAttribute('data-eval-category');
+                if (category === 'all' || evalCategory === category) {{
+                    card.classList.remove('hidden');
+                }} else {{
+                    card.classList.add('hidden');
+                }}
+            }});
+
+            // 隐藏空的结果块;无任何卡片的空状态块仅在"全部"视图下保留
+            document.querySelectorAll('.result-block').forEach(block => {{
+                const allCards = block.querySelectorAll('.note-card');
+                const visibleCards = block.querySelectorAll('.note-card:not(.hidden)');
+                if (allCards.length === 0) {{
+                    block.classList.toggle('hidden', category !== 'all');
+                }} else if (visibleCards.length === 0) {{
+                    block.classList.add('hidden');
+                }} else {{
+                    block.classList.remove('hidden');
+                }}
+            }});
+        }}
+
+        // 打开小红书帖子
+        function openNote(noteId) {{
+            if (noteId) {{
+                window.open(`https://www.xiaohongshu.com/explore/${{noteId}}`, '_blank');
+            }}
+        }}
+
+        // 初始化
+        document.addEventListener('DOMContentLoaded', () => {{
+            renderLeftSidebar();
+            renderRightContent();
+
+            // 默认展开第一个特征组和第一个base_word
+            if (data.length > 0) {{
+                toggleFeature(0);
+
+                // 展开第一个base_word分组
+                const firstGroups = data[0]['组合评估结果_分组'];
+                if (firstGroups && firstGroups.length > 0) {{
+                    toggleBaseWord(0, 0);
+                }}
+            }}
+        }});
+    </script>
+</body>
+</html>
+'''
+
+    # 写入文件
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(html_content)
+
+
+def main():
+    """主函数"""
+    # 配置路径 - 使用项目根目录
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    # 从 src/visualizers/ 向上回溯两级到项目根目录
+    project_root = os.path.dirname(os.path.dirname(script_dir))
+    json_path = os.path.join(project_root, 'output_v2', 'evaluated_results.json')
+    output_dir = os.path.join(project_root, 'visualization')
+    os.makedirs(output_dir, exist_ok=True)
+
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    output_path = os.path.join(output_dir, f'search_results_interactive_{timestamp}.html')
+
+    # 加载数据
+    print(f"📖 加载数据: {json_path}")
+    data = load_data(json_path)
+    print(f"✓ 加载了 {len(data)} 个原始特征")
+
+    # 计算统计
+    print("📊 计算统计数据...")
+    stats = calculate_statistics(data)
+    print(f"✓ 统计完成:")
+    print(f"  - 原始特征: {stats['total_features']}")
+    print(f"  - 搜索词总数: {stats['total_search_words']}")
+    print(f"  - 已搜索: {stats['searched_count']} ({stats['searched_percentage']}%)")
+    print(f"  - 未搜索: {stats['not_searched_count']}")
+    print(f"  - 帖子总数: {stats['total_notes']}")
+    print(f"  - 视频: {stats['video_count']} ({stats['video_percentage']}%)")
+    print(f"  - 图文: {stats['normal_count']} ({stats['normal_percentage']}%)")
+    print(f"\n  评估结果:")
+    print(f"  - 已评估: {stats['total_evaluated']}")
+    print(f"  - 已过滤: {stats['total_filtered']} ({stats['filter_rate']}%)")
+    print(f"  - 完全匹配: {stats['match_complete']} ({stats['complete_rate']}%)")
+    print(f"  - 相似匹配: {stats['match_similar']} ({stats['similar_rate']}%)")
+    print(f"  - 弱相似: {stats['match_weak']}")
+    print(f"  - 无匹配: {stats['match_none']}")
+
+    # 生成HTML
+    print(f"\n🎨 生成可视化页面...")
+    generate_html(data, stats, output_path)
+    print(f"✓ 生成完成: {output_path}")
+
+    # 打印访问提示
+    print(f"\n🌐 在浏览器中打开查看:")
+    print(f"   file://{output_path}")
+
+    return output_path
+
+
+if __name__ == '__main__':
+    main()
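For quick local use, `main()` above returns the generated HTML path, so it can be wrapped in a small launcher. A minimal sketch, assuming the module lives at src/visualizers/search_results_visualizer.py and the repo root is on PYTHONPATH; the `webbrowser` auto-open step is a convenience added here, not part of this commit:

```python
# Hypothetical launcher for the visualizer above (not part of the commit):
# main() builds the report and returns the output path, which we open directly.
import webbrowser

from src.visualizers.search_results_visualizer import main as build_report

if __name__ == '__main__':
    output_path = build_report()
    webbrowser.open(f'file://{output_path}')
```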

+ 255 - 0
tools/analyze_associations.py

@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+分析 dimension_associations_analysis.json 中的关联关系
+"""
+import json
+from collections import defaultdict, Counter
+from typing import Dict, List, Any
+
+
+def load_data(file_path: str) -> Dict:
+    """加载JSON数据"""
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def analyze_basic_info(data: Dict) -> None:
+    """分析基本信息"""
+    print("=" * 80)
+    print("📊 基本信息分析")
+    print("=" * 80)
+
+    info = data.get("分析说明", {})
+    print(f"\n分析类型: {', '.join(info.get('分析类型', []))}")
+    print(f"最小共同帖子数: {info.get('最小共同帖子数', 0)}")
+    print(f"\n维度统计:")
+    print(f"  灵感点: {info.get('灵感点分类数(全部)', 0)} 个分类 (非一级: {info.get('灵感点非一级分类数', 0)})")
+    print(f"  目的点: {info.get('目的点分类数(全部)', 0)} 个分类 (非一级: {info.get('目的点非一级分类数', 0)})")
+    print(f"  关键点: {info.get('关键点分类数(全部)', 0)} 个分类 (非一级: {info.get('关键点非一级分类数', 0)})")
+
+
+def analyze_single_dimension(data: Dict) -> None:
+    """分析单维度关联"""
+    print("\n" + "=" * 80)
+    print("🔗 单维度关联分析")
+    print("=" * 80)
+
+    single_dim = data.get("单维度关联分析", {})
+
+    for dimension_name, dimension_data in single_dim.items():
+        print(f"\n【{dimension_name}】")
+        print(f"说明: {dimension_data.get('说明', '')}")
+
+        # 统计每种关联方向
+        for direction, associations in dimension_data.items():
+            if direction == "说明":
+                continue
+
+            print(f"\n  {direction}:")
+
+            # 统计总体情况
+            total_sources = len(associations)
+            total_associations = 0
+            high_similarity = []  # 高相似度关联
+            high_overlap = []  # 高重叠系数关联
+
+            for source_name, source_data in associations.items():
+                assoc_list = source_data.get("与目的点的关联", []) or \
+                            source_data.get("与关键点的关联", []) or \
+                            source_data.get("与灵感点的关联", [])
+
+                total_associations += len(assoc_list)
+
+                # 找出高相似度和高重叠系数的关联
+                for assoc in assoc_list:
+                    jaccard = assoc.get("Jaccard相似度", 0)
+                    overlap = assoc.get("重叠系数", 0)
+
+                    if jaccard >= 0.5:
+                        high_similarity.append({
+                            "源": source_name,
+                            "目标": assoc.get("目标分类", ""),
+                            "Jaccard": jaccard,
+                            "共同帖子数": assoc.get("共同帖子数", 0)
+                        })
+
+                    if overlap >= 0.8:
+                        high_overlap.append({
+                            "源": source_name,
+                            "目标": assoc.get("目标分类", ""),
+                            "重叠系数": overlap,
+                            "共同帖子数": assoc.get("共同帖子数", 0)
+                        })
+
+            print(f"    总源分类数: {total_sources}")
+            print(f"    总关联数: {total_associations}")
+            print(f"    平均每个源分类的关联数: {total_associations/total_sources:.2f}" if total_sources > 0 else "    平均每个源分类的关联数: 0")
+
+            if high_similarity:
+                print(f"\n    🔥 高相似度关联 (Jaccard >= 0.5): {len(high_similarity)} 个")
+                for item in sorted(high_similarity, key=lambda x: x["Jaccard"], reverse=True)[:5]:
+                    print(f"       • {item['源']} → {item['目标']}")
+                    print(f"         Jaccard: {item['Jaccard']:.4f}, 共同帖子: {item['共同帖子数']}")
+
+            if high_overlap:
+                print(f"\n    🎯 高重叠系数关联 (重叠 >= 0.8): {len(high_overlap)} 个")
+                for item in sorted(high_overlap, key=lambda x: x["重叠系数"], reverse=True)[:5]:
+                    print(f"       • {item['源']} → {item['目标']}")
+                    print(f"         重叠系数: {item['重叠系数']:.4f}, 共同帖子: {item['共同帖子数']}")
+
+
+def analyze_triple_dimension(data: Dict) -> None:
+    """分析三维正交关联"""
+    print("\n" + "=" * 80)
+    print("🎲 三维正交关联分析")
+    print("=" * 80)
+
+    triple_dim = data.get("三维正交关联分析", {})
+
+    if not triple_dim:
+        print("未找到三维正交关联数据")
+        return
+
+    # 按灵感点分类组织
+    total_inspiration_classes = len(triple_dim)
+    total_orthogonal_combinations = 0
+    all_combinations = []
+
+    print(f"\n灵感点分类数: {total_inspiration_classes}")
+
+    for inspiration_class, inspiration_data in triple_dim.items():
+        orthogonal_list = inspiration_data.get("正交关联", [])
+        total_orthogonal_combinations += len(orthogonal_list)
+
+        for combo in orthogonal_list:
+            all_combinations.append({
+                "灵感点": inspiration_class,
+                "目的点": combo.get("目的点分类", ""),
+                "关键点": combo.get("关键点分类", ""),
+                "三维共同帖子数": combo.get("三维共同帖子数", 0),
+                "三维交集占灵感点比例": combo.get("三维交集占灵感点比例", 0),
+                "三维交集占目的点比例": combo.get("三维交集占目的点比例", 0),
+                "三维交集占关键点比例": combo.get("三维交集占关键点比例", 0),
+                "共同帖子ID": combo.get("三维共同帖子ID", [])
+            })
+
+    print(f"总正交组合数: {total_orthogonal_combinations}")
+    print(f"平均每个灵感点的正交组合数: {total_orthogonal_combinations/total_inspiration_classes:.2f}" if total_inspiration_classes > 0 else "平均每个灵感点的正交组合数: 0")
+
+    if all_combinations:
+        post_counts = [c["三维共同帖子数"] for c in all_combinations]
+        print(f"\n正交组合帖子数统计:")
+        print(f"  平均值: {sum(post_counts)/len(post_counts):.2f}")
+        print(f"  最大值: {max(post_counts)}")
+        print(f"  最小值: {min(post_counts)}")
+
+        # 高频组合
+        high_post_combinations = [c for c in all_combinations if c["三维共同帖子数"] >= 2]
+
+        if high_post_combinations:
+            print(f"\n🌟 高频三维正交组合 (三维共同帖子数 >= 2): {len(high_post_combinations)} 个")
+            for combo in sorted(high_post_combinations, key=lambda x: x["三维共同帖子数"], reverse=True)[:10]:
+                print(f"\n  三维共同帖子数: {combo['三维共同帖子数']}")
+                print(f"    灵感点: {combo['灵感点']}")
+                print(f"    目的点: {combo['目的点']}")
+                print(f"    关键点: {combo['关键点']}")
+                print(f"    交集占比 - 灵感:{combo['三维交集占灵感点比例']:.2f} 目的:{combo['三维交集占目的点比例']:.2f} 关键:{combo['三维交集占关键点比例']:.2f}")
+
+        # 高交集占比组合
+        high_ratio_combinations = [c for c in all_combinations if
+                                   c["三维交集占灵感点比例"] >= 0.5 and
+                                   c["三维交集占目的点比例"] >= 0.5 and
+                                   c["三维交集占关键点比例"] >= 0.5]
+        if high_ratio_combinations:
+            print(f"\n🔥 高交集占比正交组合 (三维度占比均 >= 0.5): {len(high_ratio_combinations)} 个")
+            for combo in sorted(high_ratio_combinations, key=lambda x: x["三维共同帖子数"], reverse=True)[:5]:
+                print(f"\n  三维共同帖子数: {combo['三维共同帖子数']}")
+                print(f"    灵感点: {combo['灵感点']}")
+                print(f"    目的点: {combo['目的点']}")
+                print(f"    关键点: {combo['关键点']}")
+                print(f"    交集占比 - 灵感:{combo['三维交集占灵感点比例']:.2f} 目的:{combo['三维交集占目的点比例']:.2f} 关键:{combo['三维交集占关键点比例']:.2f}")
+
+
+def analyze_association_strength(data: Dict) -> None:
+    """分析关联强度分布"""
+    print("\n" + "=" * 80)
+    print("📈 关联强度分布分析")
+    print("=" * 80)
+
+    single_dim = data.get("单维度关联分析", {})
+
+    all_jaccard = []
+    all_overlap = []
+    all_coverage_source = []
+    all_coverage_target = []
+
+    for dimension_name, dimension_data in single_dim.items():
+        for direction, associations in dimension_data.items():
+            if direction == "说明":
+                continue
+
+            for source_name, source_data in associations.items():
+                assoc_list = source_data.get("与目的点的关联", []) or \
+                            source_data.get("与关键点的关联", []) or \
+                            source_data.get("与灵感点的关联", [])
+
+                for assoc in assoc_list:
+                    all_jaccard.append(assoc.get("Jaccard相似度", 0))
+                    all_overlap.append(assoc.get("重叠系数", 0))
+
+                    # 根据direction确定覆盖率字段
+                    if "灵感点→" in direction:
+                        all_coverage_source.append(assoc.get("灵感点覆盖率", 0))
+                    elif "目的点→" in direction:
+                        all_coverage_source.append(assoc.get("目的点覆盖率", 0))
+                    elif "关键点→" in direction:
+                        all_coverage_source.append(assoc.get("关键点覆盖率", 0))
+
+                    all_coverage_target.append(assoc.get("目标维度覆盖率", 0))
+
+    if all_jaccard:
+        print(f"\nJaccard相似度分布:")
+        print(f"  平均值: {sum(all_jaccard)/len(all_jaccard):.4f}")
+        print(f"  中位数: {sorted(all_jaccard)[len(all_jaccard)//2]:.4f}")
+        print(f"  最大值: {max(all_jaccard):.4f}")
+        print(f"  最小值: {min(all_jaccard):.4f}")
+
+        # 分段统计
+        ranges = [(0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.0)]
+        for low, high in ranges:
+            count = sum(1 for j in all_jaccard if low <= j < high)
+            pct = count / len(all_jaccard) * 100
+            print(f"  [{low:.1f}, {high:.1f}): {count} ({pct:.1f}%)")
+
+    if all_overlap:
+        print(f"\n重叠系数分布:")
+        print(f"  平均值: {sum(all_overlap)/len(all_overlap):.4f}")
+        print(f"  中位数: {sorted(all_overlap)[len(all_overlap)//2]:.4f}")
+        print(f"  最大值: {max(all_overlap):.4f}")
+        print(f"  最小值: {min(all_overlap):.4f}")
+
+        # 统计完全重叠(1.0)的数量
+        perfect_overlap = sum(1 for o in all_overlap if o == 1.0)
+        print(f"  完全重叠(1.0): {perfect_overlap} ({perfect_overlap/len(all_overlap)*100:.1f}%)")
+
+
+def main():
+    file_path = "/Users/liulidong/project/pattern相关文件/optimization/dimension_associations_analysis.json"
+
+    print("🔍 加载数据...")
+    data = load_data(file_path)
+
+    # 执行各项分析
+    analyze_basic_info(data)
+    analyze_single_dimension(data)
+    analyze_triple_dimension(data)
+    analyze_association_strength(data)
+
+    print("\n" + "=" * 80)
+    print("✅ 分析完成!")
+    print("=" * 80)
+
+
+if __name__ == "__main__":
+    main()
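The script reads `Jaccard相似度` and `重叠系数` as precomputed fields from dimension_associations_analysis.json. For reference, a minimal sketch of the standard definitions those field names conventionally denote; the set contents below are illustrative, not taken from the analysis file:

```python
# Illustrative definitions only; the analysis JSON ships these values precomputed.
def jaccard_similarity(a: set, b: set) -> float:
    """Jaccard相似度: |A∩B| / |A∪B|."""
    union = a | b
    return len(a & b) / len(union) if union else 0.0

def overlap_coefficient(a: set, b: set) -> float:
    """重叠系数: |A∩B| / min(|A|,|B|); 1.0 when the smaller set is fully contained."""
    smaller = min(len(a), len(b))
    return len(a & b) / smaller if smaller else 0.0

# Containment yields overlap 1.0 but a lower Jaccard score:
assert overlap_coefficient({1, 2}, {1, 2, 3}) == 1.0
assert round(jaccard_similarity({1, 2}, {1, 2, 3}), 4) == 0.6667
```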

+ 100 - 0
tools/analyze_content_types.py

@@ -0,0 +1,100 @@
+"""
+分析搜索结果中的内容类型分布(视频 vs 图文)
+"""
+
+import json
+from collections import Counter
+
+
+def analyze_content_types(stage6_path: str):
+    """分析 Stage6 搜索结果中的内容类型"""
+
+    # 加载数据
+    with open(stage6_path, 'r', encoding='utf-8') as f:
+        stage6_data = json.load(f)
+
+    print("=" * 80)
+    print("Stage6 搜索结果内容类型分析")
+    print("=" * 80)
+
+    # 收集所有搜索结果的内容类型
+    content_type_counter = Counter()
+    feature_content_types = {}  # 原始特征 -> 内容类型分布
+
+    total_searches = 0
+    total_notes = 0
+
+    for original_feature in stage6_data:
+        feature_name = original_feature['原始特征名称']
+        feature_types = Counter()
+
+        for association in original_feature.get('找到的关联', []):
+            for feature in association.get('特征列表', []):
+                search_result = feature.get('search_result')
+
+                if search_result:
+                    total_searches += 1
+
+                    # 提取帖子数据
+                    notes = search_result.get('data', {}).get('data', [])
+                    total_notes += len(notes)
+
+                    for note in notes:
+                        note_card = note.get('note_card', {})
+                        note_type = note_card.get('type', 'unknown')
+
+                        content_type_counter[note_type] += 1
+                        feature_types[note_type] += 1
+
+        if feature_types:
+            feature_content_types[feature_name] = feature_types
+
+    # 打印总体统计
+    print(f"\n📊 总体统计:")
+    print(f"  已执行搜索: {total_searches} 次")
+    print(f"  总帖子数: {total_notes} 个")
+
+    print(f"\n📋 内容类型分布:")
+    for content_type, count in content_type_counter.most_common():
+        percentage = count / total_notes * 100
+        print(f"  {content_type}: {count} 个 ({percentage:.1f}%)")
+
+    # 打印各特征的内容类型分布
+    print(f"\n📊 各原始特征的内容类型分布:")
+    for feature_name, types in feature_content_types.items():
+        total_feature_notes = sum(types.values())
+        print(f"\n  【{feature_name}】 共 {total_feature_notes} 个帖子")
+
+        for content_type, count in types.most_common():
+            percentage = count / total_feature_notes * 100
+            print(f"    {content_type}: {count} 个 ({percentage:.1f}%)")
+
+    # 分析视频占比
+    video_count = content_type_counter.get('video', 0)
+    normal_count = content_type_counter.get('normal', 0)  # 图文类型
+
+    print(f"\n🎯 关键发现:")
+    if video_count > 0:
+        video_ratio = video_count / total_notes * 100
+        print(f"  ⚠️  发现 {video_count} 个视频帖子 (占比 {video_ratio:.1f}%)")
+        print(f"  ✓ 图文帖子: {normal_count} 个 (占比 {normal_count/total_notes*100:.1f}%)")
+        print(f"\n  问题原因分析:")
+        print(f"    - 小红书 API 的 content_type='图文' 参数可能未被严格遵守")
+        print(f"    - 或者 API 返回混合类型的内容")
+        print(f"    - 建议在客户端侧添加内容类型过滤")
+    else:
+        print(f"  ✓ 未发现视频内容,全部为图文")
+
+    print("\n" + "=" * 80)
+
+
+if __name__ == '__main__':
+    import sys
+
+    stage6_path = 'output_v2/stage6_with_evaluations.json'
+
+    if len(sys.argv) > 1:
+        stage6_path = sys.argv[1]
+
+    analyze_content_types(stage6_path)
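The "add client-side filtering" recommendation the script prints could look like the sketch below; `keep_normal_notes` is a hypothetical helper named here for illustration, not something this commit adds:

```python
from typing import Any, Dict, List

# Hypothetical client-side filter (not in this commit): drop video notes
# from an API response so downstream stages only see 图文 ("normal") content.
def keep_normal_notes(notes: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    return [n for n in notes if n.get('note_card', {}).get('type') == 'normal']
```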

+ 202 - 0
tools/analyze_feature_matches.py

@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+统计 how 解构文件中所有原始特征匹配到的分类/标签及其路径
+"""
+
+import json
+from collections import defaultdict
+from typing import Dict, List, Any, Optional
+
+
+def build_classification_path(classification_list: List[str]) -> str:
+    """构建分类路径字符串"""
+    if not classification_list:
+        return ""
+    return "/".join(classification_list)
+
+
+def analyze_feature_matches(json_file_path: str) -> Dict[str, Any]:
+    """
+    分析文件中所有原始特征的匹配情况(仅保留相似度>=0.8的匹配)
+
+    返回结构:
+    {
+        "原始特征1": {
+            "匹配的分类标签": [
+                {
+                    "名称": "...",
+                    "类型": "标签/分类",
+                    "路径": "...",
+                    "层级": "...",
+                    "相似度": 0.xxx
+                }
+            ],
+            "统计": {
+                "总匹配数": xxx,
+                "高相似度匹配数(>0.8)": xxx,
+                "中等相似度匹配数(0.5-0.8)": xxx,
+                "低相似度匹配数(<0.5)": xxx
+            }
+        }
+    }
+    """
+
+    # 读取JSON文件
+    with open(json_file_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    # 存储结果
+    feature_matches = defaultdict(lambda: {
+        "匹配的分类标签": [],
+        "统计": {
+            "高相似度匹配数(>=0.8)": 0
+        }
+    })
+
+    # 遍历 how解构结果
+    how_result = data.get('how解构结果', {})
+
+    # 处理三种列表:灵感点列表、目的点列表、关键点列表
+    for level_name in ['灵感点列表', '目的点列表', '关键点列表']:
+        level_list = how_result.get(level_name, [])
+
+        for item in level_list:
+            # 遍历how步骤列表
+            for step in item.get('how步骤列表', []):
+                # 遍历每个步骤中的特征
+                for feature in step.get('特征列表', []):
+                    feature_name = feature.get('特征名称', '')
+                    matches = feature.get('匹配结果', [])
+
+                    if not feature_name:
+                        continue
+
+                    # 处理每个匹配结果
+                    for match in matches:
+                        persona_feature_name = match.get('人设特征名称', '')
+                        feature_type = match.get('特征类型', '')
+                        classification_list = match.get('特征分类', [])
+                        feature_level = match.get('人设特征层级', '')
+                        similarity = match.get('匹配结果', {}).get('相似度', 0)
+
+                        # 只保留相似度>=0.8的匹配
+                        if similarity < 0.8:
+                            continue
+
+                        # 构建路径
+                        path = build_classification_path(classification_list)
+
+                        # 添加到结果
+                        match_info = {
+                            "名称": persona_feature_name,
+                            "类型": feature_type,
+                            "路径": path,
+                            "层级": feature_level,
+                            "相似度": round(similarity, 3)
+                        }
+
+                        feature_matches[feature_name]["匹配的分类标签"].append(match_info)
+
+                        # 更新统计
+                        stats = feature_matches[feature_name]["统计"]
+                        stats["高相似度匹配数(>=0.8)"] += 1
+
+    # 对每个原始特征的匹配结果按相似度降序排序
+    for feature_name in feature_matches:
+        feature_matches[feature_name]["匹配的分类标签"].sort(
+            key=lambda x: x["相似度"],
+            reverse=True
+        )
+
+    return dict(feature_matches)
+
+
+def print_summary(results: Dict[str, Any]):
+    """打印统计摘要"""
+    print("=" * 80)
+    print("原始特征匹配统计摘要(仅相似度>=0.8)")
+    print("=" * 80)
+
+    total_features = len(results)
+    # 统计有匹配的特征数
+    features_with_matches = sum(1 for data in results.values() if data["统计"]["高相似度匹配数(>=0.8)"] > 0)
+
+    print(f"\n总原始特征数: {total_features}")
+    print(f"有高相似度匹配的特征数: {features_with_matches}")
+    print(f"无匹配的特征数: {total_features - features_with_matches}")
+
+    # 统计总体数据
+    total_matches = 0
+
+    for feature_name, data in results.items():
+        stats = data["统计"]
+        total_matches += stats["高相似度匹配数(>=0.8)"]
+
+    print(f"\n总高相似度匹配数(>=0.8): {total_matches}")
+
+    print("\n" + "=" * 80)
+    print("各原始特征详细匹配情况")
+    print("=" * 80)
+
+
+def print_detailed_results(results: Dict[str, Any], top_n: Optional[int] = None):
+    """打印详细结果"""
+
+    for idx, (feature_name, data) in enumerate(results.items(), 1):
+        stats = data["统计"]
+        matches = data["匹配的分类标签"]
+        match_count = stats['高相似度匹配数(>=0.8)']
+
+        # 跳过没有匹配的特征
+        if match_count == 0:
+            continue
+
+        print(f"\n[{idx}] 原始特征: {feature_name}")
+        print(f"    高相似度匹配数(>=0.8): {match_count}")
+
+        # 显示所有匹配(如果指定了top_n则只显示前N个)
+        display_matches = matches[:top_n] if top_n else matches
+        print(f"    匹配列表(共{len(display_matches)}个):")
+        for i, match in enumerate(display_matches, 1):
+            print(f"      {i}. {match['名称']} ({match['相似度']:.3f})")
+            print(f"         类型: {match['类型']}, 层级: {match['层级']}")
+            if match['路径']:
+                print(f"         路径: {match['路径']}")
+            else:
+                print(f"         路径: (顶级分类)")
+
+
+def save_results(results: Dict[str, Any], output_file: str):
+    """保存结果到JSON文件"""
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
+    print(f"\n详细结果已保存到: {output_file}")
+
+
+def main():
+    # 输入文件路径
+    input_file = "/Users/liulidong/project/pattern相关文件/optimization/690d977d0000000007036331_how.json"
+
+    # 输出文件路径
+    output_file = "/Users/liulidong/project/pattern相关文件/optimization/feature_matches_analysis.json"
+
+    print("开始分析特征匹配...")
+
+    # 分析
+    results = analyze_feature_matches(input_file)
+
+    # 打印摘要
+    print_summary(results)
+
+    # 打印详细结果(显示所有匹配,不限制数量)
+    print_detailed_results(results, top_n=None)
+
+    # 保存结果
+    save_results(results, output_file)
+
+    print("\n分析完成!")
+
+
+if __name__ == "__main__":
+    main()
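Since `save_results` writes feature_matches_analysis.json with the structure documented above, downstream consumers can rank features by their match counts. A minimal sketch, using the output path and JSON keys as defined in this script:

```python
import json

# Sketch: rank features in the saved analysis by their >=0.8 match count.
with open('feature_matches_analysis.json', 'r', encoding='utf-8') as f:
    results = json.load(f)

ranked = sorted(
    results.items(),
    key=lambda kv: kv[1]['统计']['高相似度匹配数(>=0.8)'],
    reverse=True,
)
for name, data in ranked[:10]:
    print(f"{data['统计']['高相似度匹配数(>=0.8)']:>4}  {name}")
```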

+ 168 - 0
tools/analyze_specific_feature.py

@@ -0,0 +1,168 @@
+"""
+分析特定原始特征的搜索执行情况
+"""
+
+import json
+import sys
+from typing import Dict, Any, List
+
+
+def analyze_feature_searches(stage4_path: str, stage6_path: str, feature_name: str):
+    """分析指定原始特征的搜索情况"""
+
+    # 加载数据
+    with open(stage4_path, 'r', encoding='utf-8') as f:
+        stage4_data = json.load(f)
+
+    with open(stage6_path, 'r', encoding='utf-8') as f:
+        stage6_data = json.load(f)
+
+    # locate the requested feature in both stages
+    stage4_feature = None
+    stage6_feature = None
+
+    for item in stage4_data:
+        if item['原始特征名称'] == feature_name:
+            stage4_feature = item
+            break
+
+    for item in stage6_data:
+        if item['原始特征名称'] == feature_name:
+            stage6_feature = item
+            break
+
+    if not stage4_feature:
+        print(f"❌ Feature not found in Stage4: {feature_name}")
+        return
+
+    if not stage6_feature:
+        print(f"❌ Feature not found in Stage6: {feature_name}")
+        return
+
+    print("=" * 80)
+    print(f"Original feature: {feature_name}")
+    print("=" * 80)
+
+    # collect every search word generated in Stage4
+    stage4_search_words = []
+    for association in stage4_feature.get('找到的关联', []):
+        assoc_name = association.get('分类名称', '')
+        for feature in association.get('特征列表', []):
+            search_word = feature.get('search_word')
+            llm_eval = feature.get('llm_evaluation', {})
+
+            if search_word:
+                stage4_search_words.append({
+                    'search_word': search_word,
+                    'association': assoc_name,
+                    'feature_name': feature.get('特征名称', ''),
+                    'llm_score': llm_eval.get('score'),
+                    'llm_rank': llm_eval.get('rank'),
+                    'reasoning': llm_eval.get('reasoning', '')
+                })
+
+    # collect every Stage6 search word together with its execution status
+    stage6_search_words = []
+    for association in stage6_feature.get('找到的关联', []):
+        assoc_name = association.get('分类名称', '')
+        for feature in association.get('特征列表', []):
+            search_word = feature.get('search_word')
+            search_result = feature.get('search_result')
+            search_metadata = feature.get('search_metadata', {})
+            llm_eval = feature.get('llm_evaluation', {})
+
+            if search_word:
+                stage6_search_words.append({
+                    'search_word': search_word,
+                    'association': assoc_name,
+                    'feature_name': feature.get('特征名称', ''),
+                    'llm_score': llm_eval.get('score'),
+                    'llm_rank': llm_eval.get('rank'),
+                    'has_result': search_result is not None,
+                    'status': search_metadata.get('status', 'not_searched'),
+                    'note_count': search_metadata.get('note_count', 0)
+                })
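+    # (Editorial note) Both stage files are assumed to share this nested
+    # shape, inferred from the lookups above:
+    #   {"原始特征名称": ..., "找到的关联": [{"分类名称": ...,
+    #       "特征列表": [{"search_word": ..., "特征名称": ...,
+    #                     "llm_evaluation": {"score": ..., "rank": ..., "reasoning": ...},
+    #                     "search_result": ..., "search_metadata": {...}}]}]}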
+
+    # summary counts
+    total_stage4 = len(stage4_search_words)
+    total_stage6 = len(stage6_search_words)
+    searched = sum(1 for w in stage6_search_words if w['has_result'])
+    not_searched = total_stage6 - searched
+
+    print(f"\n📊 Statistics:")
+    print(f"  Search words generated in Stage4: {total_stage4}")
+    print(f"  Search words kept in Stage6: {total_stage6}")
+    print(f"  Searched: {searched}")
+    print(f"  Not searched: {not_searched}")
+    # guard against an empty Stage6 list to avoid ZeroDivisionError
+    execution_rate = searched / total_stage6 * 100 if total_stage6 else 0.0
+    print(f"  Search execution rate: {execution_rate:.1f}%")
+
+    # sort by LLM rank and display; compare against None explicitly so a
+    # legitimate rank or score of 0 is not treated as missing
+    stage6_sorted = sorted(stage6_search_words, key=lambda x: x['llm_rank'] if x['llm_rank'] is not None else 999)
+
+    print(f"\n📋 Search word details (sorted by LLM rank):")
+    print(f"{'Rank':<6} {'Score':<6} {'Status':<12} {'Notes':<8} Search word")
+    print("-" * 80)
+
+    for idx, word in enumerate(stage6_sorted, 1):
+        rank = word['llm_rank'] if word['llm_rank'] is not None else 'N/A'
+        score = f"{word['llm_score']:.2f}" if word['llm_score'] is not None else 'N/A'
+        status = '✅ searched' if word['has_result'] else '⏸️  pending'
+        note_count = word['note_count'] if word['has_result'] else '-'
+
+        print(f"{rank:<6} {score:<6} {status:<12} {note_count:<8} {word['search_word']}")
+
+    # details of the searched words; format score/rank defensively since
+    # llm_evaluation may be missing or empty for some entries
+    def _fmt_eval(word):
+        score = f"{word['llm_score']:.2f}" if word['llm_score'] is not None else 'N/A'
+        rank = word['llm_rank'] if word['llm_rank'] is not None else 'N/A'
+        return score, rank
+
+    searched_words = [w for w in stage6_sorted if w['has_result']]
+    if searched_words:
+        print(f"\n✅ {len(searched_words)} search words were executed:")
+        for idx, word in enumerate(searched_words, 1):
+            score, rank = _fmt_eval(word)
+            print(f"\n  【{idx}】 {word['search_word']}")
+            print(f"       Association: {word['association']}")
+            print(f"       Feature: {word['feature_name']}")
+            print(f"       Score: {score}, rank: #{rank}")
+            print(f"       Result: {word['note_count']} notes")
+
+    # words that were never searched
+    not_searched_words = [w for w in stage6_sorted if not w['has_result']]
+    if not_searched_words:
+        print(f"\n⏸️  {len(not_searched_words)} search words were not executed:")
+        for idx, word in enumerate(not_searched_words, 1):
+            score, rank = _fmt_eval(word)
+            print(f"\n  【{idx}】 {word['search_word']}")
+            print(f"       Association: {word['association']}")
+            print(f"       Feature: {word['feature_name']}")
+            print(f"       Score: {score}, rank: #{rank}")
+
+    # analyze why only part of the search words were executed
+    print(f"\n🔍 Search strategy analysis:")
+    if searched == 10:
+        print(f"  The run appears to use a Top-10 strategy")
+        top_10_ranks = sorted([w['llm_rank'] for w in searched_words if w['llm_rank'] is not None])
+        print(f"  Ranks actually searched: {top_10_ranks}")
+
+        # check whether the searched words are exactly the rank-ordered top 10,
+        # e.g. [1..10] vs [1..10] is strict, while [1, 2, 4, ...] vs [1, 2, 3, ...] is not
+        expected_top_10_ranks = sorted([w['llm_rank'] for w in stage6_sorted[:10] if w['llm_rank'] is not None])
+        if top_10_ranks == expected_top_10_ranks:
+            print(f"  ✓ Strict Top-10 by LLM rank")
+        else:
+            print(f"  ⚠️  Not a strict Top-10 (expected: {expected_top_10_ranks})")
+    elif searched > 0:
+        print(f"  {searched} searches were executed")
+    else:
+        print(f"  No searches have been executed for this feature yet")
+
+    print("\n" + "=" * 80)
+
+
+if __name__ == '__main__':
+    stage4_path = 'output_v2/stage4_with_llm_scores.json'
+    stage6_path = 'output_v2/stage6_with_evaluations.json'
+    feature_name = '墨镜'
+
+    if len(sys.argv) > 1:
+        feature_name = sys.argv[1]
+    if len(sys.argv) > 2:
+        stage4_path = sys.argv[2]
+    if len(sys.argv) > 3:
+        stage6_path = sys.argv[3]
+
+    analyze_feature_searches(stage4_path, stage6_path, feature_name)

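A usage sketch (editorial; the default paths and the feature name 墨镜 come from the __main__ block above):

    # as a script:
    #   python tools/analyze_specific_feature.py 墨镜
    # or programmatically, assuming tools/ is on the import path:
    from analyze_specific_feature import analyze_feature_searches

    analyze_feature_searches(
        'output_v2/stage4_with_llm_scores.json',
        'output_v2/stage6_with_evaluations.json',
        '墨镜',
    )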
+ 236 - 0
tools/analyze_stage6_results.py

@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 6 评估结果统计分析
+分析两层评估的过滤效果和匹配质量
+"""
+
+import json
+from typing import Dict, List, Any
+from collections import defaultdict
+
+
+def load_stage6_results(file_path: str) -> List[Dict[str, Any]]:
+    """Load the Stage 6 evaluation results."""
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def analyze_evaluation_results(data: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Analyze the evaluation results."""
+
+    # global counters
+    global_stats = {
+        'total_search_words': 0,
+        'total_notes_evaluated': 0,
+        'total_filtered': 0,
+        'match_distribution': {
+            '完全匹配(8-10)': 0,
+            '相似匹配(6-7)': 0,
+            '弱相似(5-6)': 0,
+            '无匹配(≤4)': 0
+        }
+    }
+
+    # per-feature statistics, grouped by original feature
+    feature_stats = defaultdict(lambda: {
+        'search_words_count': 0,
+        'total_notes': 0,
+        'total_filtered': 0,
+        'match_distribution': {
+            '完全匹配(8-10)': 0,
+            '相似匹配(6-7)': 0,
+            '弱相似(5-6)': 0,
+            '无匹配(≤4)': 0
+        },
+        'search_words': []
+    })
+
+    # detailed statistics for every search word
+    search_word_details = []
+
+    # iterate over all original features
+    for feature_result in data:
+        original_feature = feature_result.get('原始特征名称', 'Unknown')
+
+        # read from the grouped evaluation results (组合评估结果_分组)
+        grouped_results = feature_result.get('组合评估结果_分组', [])
+
+        for group in grouped_results:
+            base_word = group.get('base_word', '')
+
+            for eval_item in group.get('top10_searches', []):
+                # skip entries without an evaluation result
+                evaluation = eval_item.get('evaluation_with_filter')
+                if not evaluation:
+                    continue
+
+                search_word = eval_item.get('search_word', '')
+
+                # extract the evaluation data
+                total_notes = evaluation.get('total_notes', 0)
+                evaluated_notes = evaluation.get('evaluated_notes', 0)
+                filtered_count = evaluation.get('filtered_count', 0)
+                statistics = evaluation.get('statistics', {})
+
+                # update the global counters
+                global_stats['total_search_words'] += 1
+                global_stats['total_notes_evaluated'] += total_notes
+                global_stats['total_filtered'] += filtered_count
+
+                for key in global_stats['match_distribution']:
+                    global_stats['match_distribution'][key] += statistics.get(key, 0)
+
+                # update the per-feature counters
+                feature_stats[original_feature]['search_words_count'] += 1
+                feature_stats[original_feature]['total_notes'] += total_notes
+                feature_stats[original_feature]['total_filtered'] += filtered_count
+
+                for key in feature_stats[original_feature]['match_distribution']:
+                    feature_stats[original_feature]['match_distribution'][key] += statistics.get(key, 0)
+
+                # record the search word details
+                search_word_info = {
+                    'original_feature': original_feature,
+                    'base_word': base_word,
+                    'search_word': search_word,
+                    'total_notes': total_notes,
+                    'evaluated_notes': evaluated_notes,
+                    'filtered_count': filtered_count,
+                    'match_distribution': statistics,
+                    'high_quality_count': statistics.get('完全匹配(8-10)', 0),
+                    'similar_count': statistics.get('相似匹配(6-7)', 0)
+                }
+
+                search_word_details.append(search_word_info)
+                feature_stats[original_feature]['search_words'].append(search_word_info)
+
+    # compute the global filter rate
+    if global_stats['total_notes_evaluated'] > 0:
+        global_stats['filter_rate'] = global_stats['total_filtered'] / global_stats['total_notes_evaluated']
+    else:
+        global_stats['filter_rate'] = 0.0
+
+    # compute each feature's filter rate
+    for feature_name, stats in feature_stats.items():
+        if stats['total_notes'] > 0:
+            stats['filter_rate'] = stats['total_filtered'] / stats['total_notes']
+        else:
+            stats['filter_rate'] = 0.0
+
+    # sort search words by their number of high-quality matches
+    search_word_details.sort(key=lambda x: x['high_quality_count'], reverse=True)
+
+    return {
+        'global_stats': global_stats,
+        'feature_stats': dict(feature_stats),
+        'search_word_details': search_word_details
+    }
+
+
+def print_statistics(stats: Dict[str, Any]):
+    """Print the statistics."""
+    global_stats = stats['global_stats']
+    feature_stats = stats['feature_stats']
+    search_word_details = stats['search_word_details']
+
+    print("=" * 80)
+    print("Stage 6 evaluation statistics")
+    print("=" * 80)
+
+    # global statistics
+    print("\n【Global statistics】")
+    print(f"  Total search words: {global_stats['total_search_words']}")
+    print(f"  Total notes evaluated: {global_stats['total_notes_evaluated']}")
+    print(f"  Total notes filtered: {global_stats['total_filtered']} (filter rate: {global_stats['filter_rate']*100:.1f}%)")
+    print(f"\n  Match distribution:")
+    for match_type, count in global_stats['match_distribution'].items():
+        print(f"    {match_type}: {count} notes")
+
+    # per original feature
+    print("\n" + "=" * 80)
+    print("【Per original feature】")
+    print("=" * 80)
+
+    # use a distinct loop variable so the `stats` argument is not shadowed
+    for feature_name, f_stats in sorted(feature_stats.items()):
+        print(f"\nFeature: {feature_name}")
+        print(f"  Search words: {f_stats['search_words_count']}")
+        print(f"  Notes evaluated: {f_stats['total_notes']}")
+        print(f"  Notes filtered: {f_stats['total_filtered']} (filter rate: {f_stats['filter_rate']*100:.1f}%)")
+        print(f"  High-quality matches: {f_stats['match_distribution']['完全匹配(8-10)']} notes")
+        print(f"  Similar matches: {f_stats['match_distribution']['相似匹配(6-7)']} notes")
+
+        # the search words with the most high-quality matches for this feature
+        best_searches = sorted(f_stats['search_words'], key=lambda x: x['high_quality_count'], reverse=True)[:3]
+        if best_searches:
+            print(f"  Top 3 search words:")
+            for idx, sw in enumerate(best_searches, 1):
+                print(f"    {idx}. \"{sw['search_word']}\" - {sw['high_quality_count']} exact matches")
+
+    # top 10 search words
+    print("\n" + "=" * 80)
+    print("【Top 10 search words (by exact-match count)】")
+    print("=" * 80)
+
+    for idx, sw in enumerate(search_word_details[:10], 1):
+        print(f"\n{idx}. \"{sw['search_word']}\"")
+        print(f"   Original feature: {sw['original_feature']}")
+        print(f"   Base word: {sw['base_word']}")
+        print(f"   Notes evaluated: {sw['total_notes']}, filtered: {sw['filtered_count']}")
+        print(f"   Exact matches (8-10): {sw['high_quality_count']}")
+        print(f"   Similar matches (6-7): {sw['similar_count']}")
+
+    # filtering effectiveness
+    print("\n" + "=" * 80)
+    print("【Filtering effectiveness】")
+    print("=" * 80)
+
+    total_evaluated = global_stats['total_notes_evaluated']
+    total_filtered = global_stats['total_filtered']
+    total_remaining = total_evaluated - total_filtered
+
+    total_high_quality = global_stats['match_distribution']['完全匹配(8-10)']
+    total_similar = global_stats['match_distribution']['相似匹配(6-7)']
+    total_weak = global_stats['match_distribution']['弱相似(5-6)']
+    total_no_match = global_stats['match_distribution']['无匹配(≤4)']
+
+    print(f"  Total notes evaluated: {total_evaluated}")
+    # guard the percentages so an empty run does not raise ZeroDivisionError
+    if total_evaluated > 0:
+        print(f"  First-layer filter (query-irrelevant): {total_filtered} ({total_filtered/total_evaluated*100:.1f}%)")
+        print(f"  Notes passing the filter: {total_remaining} ({total_remaining/total_evaluated*100:.1f}%)")
+    print(f"\n  Quality distribution after filtering:")
+    if total_remaining > 0:
+        print(f"    Exact matches (8-10): {total_high_quality} ({total_high_quality/total_remaining*100:.1f}%)")
+        print(f"    Similar matches (6-7): {total_similar} ({total_similar/total_remaining*100:.1f}%)")
+        print(f"    Weakly similar (5-6): {total_weak} ({total_weak/total_remaining*100:.1f}%)")
+        print(f"    No match (≤4): {total_no_match} ({total_no_match/total_remaining*100:.1f}%)")
+
+    print("\n" + "=" * 80)
+
+
+def save_statistics(stats: Dict[str, Any], output_path: str):
+    """Save the statistics to a JSON file."""
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(stats, f, ensure_ascii=False, indent=2)
+    print(f"\nStatistics saved to: {output_path}")
+
+
+def main():
+    """Entry point."""
+    input_file = "output_v2/stage6_with_evaluations.json"
+    output_file = "output_v2/stage6_statistics.json"
+
+    print("Loading data...")
+    data = load_stage6_results(input_file)
+
+    print("Analyzing evaluation results...")
+    stats = analyze_evaluation_results(data)
+
+    # print the statistics
+    print_statistics(stats)
+
+    # save the results
+    save_statistics(stats, output_file)
+
+
+if __name__ == '__main__':
+    main()

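A sketch of the summary this tool saves (editorial; the numbers are illustrative, the keys mirror the return value of analyze_evaluation_results above):

    # stage6_statistics.json, top level:
    # {
    #   "global_stats": {
    #     "total_search_words": 42,
    #     "total_notes_evaluated": 800,
    #     "total_filtered": 200,
    #     "filter_rate": 0.25,            # total_filtered / total_notes_evaluated
    #     "match_distribution": {"完全匹配(8-10)": 120, "相似匹配(6-7)": 260,
    #                            "弱相似(5-6)": 150, "无匹配(≤4)": 70}
    #   },
    #   "feature_stats": {...},
    #   "search_word_details": [...]
    # }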
+ 104 - 0
tools/remove_association_methods.py

@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+临时脚本:删除 association 相关的方法
+"""
+
+# 需要删除的方法列表(方法名)
+METHODS_TO_REMOVE = [
+    '_is_classification',
+    '_navigate_to_node',
+    '_recursive_search',
+    '_search_classification_path',
+    'stage2_find_associations',
+    '_find_associations',
+    '_find_intra_dimension_associations',
+    '_collect_classification_info',
+    'stage3_filter_high_similarity_matches',
+    '_collect_scope_from_associations',
+    '_collect_stage2_scope',
+    '_find_features_by_path',
+]
+
+def find_method_bounds(lines, method_name):
+    """
+    Find the start and end line indices of a method.
+
+    Returns:
+        (start_line, end_line) as a half-open range, or None if not found
+    """
+    start_line = None
+    indent_level = None
+
+    # locate the method definition
+    for i, line in enumerate(lines):
+        if f'def {method_name}(' in line:
+            start_line = i
+            # record the method's indentation level
+            indent_level = len(line) - len(line.lstrip())
+            break
+
+    if start_line is None:
+        return None
+
+    # find where the method ends: the next def/class at the same or an outer
+    # indentation level, or a comment separator line
+    for i in range(start_line + 1, len(lines)):
+        line = lines[i]
+        stripped = line.lstrip()
+
+        # skip blank lines
+        if not stripped:
+            continue
+
+        current_indent = len(line) - len(stripped)
+
+        # a '# ==========' comment separator ends the method
+        if stripped.startswith('# =========='):
+            return (start_line, i)
+
+        # a def/class at the same or an outer level ends the method
+        if current_indent <= indent_level and (stripped.startswith('def ') or stripped.startswith('class ')):
+            return (start_line, i)
+
+    # the method runs to the end of the file
+    return (start_line, len(lines))
+
+
+def main():
+    input_file = 'enhanced_search_v2.py'
+    output_file = 'enhanced_search_v2_cleaned.py'
+
+    # read the source file and remember its original length
+    with open(input_file, 'r', encoding='utf-8') as f:
+        lines = f.readlines()
+    original_line_count = len(lines)
+
+    # collect the line ranges to delete
+    ranges_to_remove = []
+
+    for method_name in METHODS_TO_REMOVE:
+        result = find_method_bounds(lines, method_name)
+        if result:
+            ranges_to_remove.append(result)
+            print(f"Found method {method_name}: lines {result[0]+1} - {result[1]}")
+        else:
+            print(f"Method not found: {method_name}")
+
+    # sort by start line, descending, so deleting a later range
+    # never shifts the offsets of an earlier one
+    ranges_to_remove.sort(reverse=True)
+
+    # delete the methods, back to front
+    for start, end in ranges_to_remove:
+        print(f"Deleting lines {start+1} - {end}")
+        del lines[start:end]
+
+    # write the cleaned file
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.writelines(lines)
+
+    print(f"\n✓ Cleaned file written: {output_file}")
+    print(f"Original line count: {original_line_count}")
+    print(f"New line count: {len(lines)}")
+
+
+if __name__ == '__main__':
+    main()
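
A sanity check one might run afterwards (editorial sketch; it only assumes the cleaned file should still be syntactically valid Python):

    import ast

    with open('enhanced_search_v2_cleaned.py', 'r', encoding='utf-8') as f:
        ast.parse(f.read())  # raises SyntaxError if a deletion broke the file
    print('cleaned file parses OK')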