刘立冬 · 3 weeks ago
parent
commit
26b2f19f36
6 changed files, 1424 additions and 27 deletions
  1. image_downloader.py (+269 −0)
  2. run_stage7.py (+214 −0)
  3. stage7_analyzer.py (+550 −0)
  4. stage7_api_client.py (+253 −0)
  5. stage7_config.json (+13 −0)
  6. visualize_stage6_results.py (+125 −27)

+ 269 - 0
image_downloader.py

@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+图片下载和本地服务工具
+用于将小红书图片下载到本地,并通过HTTP服务器提供访问
+"""
+
+import os
+import hashlib
+import requests
+import time
+from pathlib import Path
+from typing import List, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class ImageDownloader:
+    """Image downloader."""
+
+    def __init__(self, download_dir: str = "downloaded_images", max_retries: int = 3):
+        """
+        Initialize the image downloader.
+
+        Args:
+            download_dir: directory to save downloaded images into
+            max_retries: maximum number of retry attempts
+        """
+        self.download_dir = Path(download_dir)
+        self.download_dir.mkdir(parents=True, exist_ok=True)
+        self.max_retries = max_retries
+
+        # Request headers that mimic a browser
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+            'Referer': 'https://www.xiaohongshu.com/'
+        }
+
+    def get_image_hash(self, url: str) -> str:
+        """
+        Derive a unique image filename from a URL.
+
+        Args:
+            url: image URL
+
+        Returns:
+            Filename (without extension)
+        """
+        return hashlib.md5(url.encode()).hexdigest()
+
+    def get_extension_from_url(self, url: str) -> str:
+        """
+        Extract the file extension from a URL.
+
+        Args:
+            url: image URL
+
+        Returns:
+            Extension (e.g. .jpg, .png, .webp)
+        """
+        # Check whether the URL specifies a format parameter
+        if 'format/jpg' in url or url.endswith('.jpg'):
+            return '.jpg'
+        elif 'format/png' in url or url.endswith('.png'):
+            return '.png'
+        elif 'format/webp' in url or url.endswith('.webp'):
+            return '.webp'
+        elif 'format/jpeg' in url or url.endswith('.jpeg'):
+            return '.jpeg'
+
+        # Default to webp
+        return '.webp'
+
+    def download_image(self, url: str) -> Optional[str]:
+        """
+        Download a single image.
+
+        Args:
+            url: image URL
+
+        Returns:
+            Local file path, or None on failure
+        """
+        if not url:
+            return None
+
+        # Build the local file path
+        file_hash = self.get_image_hash(url)
+        extension = self.get_extension_from_url(url)
+        local_path = self.download_dir / f"{file_hash}{extension}"
+
+        # If the file already exists, return it directly
+        if local_path.exists():
+            logger.debug(f"Image already exists: {local_path}")
+            return str(local_path)
+
+        # Download the image
+        for attempt in range(self.max_retries):
+            try:
+                logger.debug(f"Downloading image (attempt {attempt + 1}/{self.max_retries}): {url}")
+
+                response = requests.get(
+                    url,
+                    headers=self.headers,
+                    timeout=30,
+                    stream=True  # stream the download to avoid memory issues
+                )
+
+                if response.status_code == 200:
+                    # Write to file
+                    with open(local_path, 'wb') as f:
+                        for chunk in response.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+
+                    logger.debug(f"✓ Download succeeded: {local_path}")
+                    return str(local_path)
+                else:
+                    logger.warning(f"Download failed, status code: {response.status_code}")
+
+            except requests.Timeout:
+                logger.warning(f"Download timed out (attempt {attempt + 1}/{self.max_retries})")
+            except Exception as e:
+                logger.warning(f"Download failed: {e} (attempt {attempt + 1}/{self.max_retries})")
+
+            # Wait before retrying (exponential backoff)
+            if attempt < self.max_retries - 1:
+                wait_time = 2 ** attempt
+                time.sleep(wait_time)
+
+        logger.error(f"✗ Download failed after {self.max_retries} attempts: {url}")
+        return None
+
+    def download_images(self, urls: List[str]) -> List[Optional[str]]:
+        """
+        Download images in batch.
+
+        Args:
+            urls: list of image URLs
+
+        Returns:
+            List of local file paths (None entries for failed downloads)
+        """
+        local_paths = []
+        for url in urls:
+            local_path = self.download_image(url)
+            local_paths.append(local_path)
+
+        return local_paths
+
+
+class LocalImageServer:
+    """Configuration for a local image server."""
+
+    def __init__(self, base_url: str = "http://localhost:8765", image_dir: str = "downloaded_images"):
+        """
+        Initialize the local image server configuration.
+
+        Args:
+            base_url: base URL of the server
+            image_dir: name of the image directory
+        """
+        self.base_url = base_url.rstrip('/')
+        self.image_dir = image_dir
+
+    def get_public_url(self, local_path: str) -> str:
+        """
+        Convert a local path into a public URL.
+
+        Args:
+            local_path: local file path
+
+        Returns:
+            Publicly accessible URL
+        """
+        if not local_path:
+            return ""
+
+        # Extract the filename
+        filename = Path(local_path).name
+
+        # Build the public URL
+        return f"{self.base_url}/{filename}"
+
+    def convert_paths_to_urls(self, local_paths: List[Optional[str]]) -> List[str]:
+        """
+        Convert local paths into public URLs in batch.
+
+        Args:
+            local_paths: list of local file paths
+
+        Returns:
+            List of public URLs
+        """
+        return [self.get_public_url(path) if path else "" for path in local_paths]
+
+
+def start_simple_http_server(directory: str = "downloaded_images", port: int = 8765):
+    """
+    Start a simple HTTP file server (for development/testing).
+
+    Args:
+        directory: directory to serve
+        port: port number
+
+    Note:
+        This function blocks the current thread; run it in a separate process.
+    """
+    import http.server
+    import socketserver
+
+    os.chdir(directory)
+
+    Handler = http.server.SimpleHTTPRequestHandler
+
+    # Add CORS support
+    class CORSRequestHandler(Handler):
+        def end_headers(self):
+            self.send_header('Access-Control-Allow-Origin', '*')
+            self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
+            self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate')
+            return super().end_headers()
+
+    with socketserver.TCPServer(("", port), CORSRequestHandler) as httpd:
+        print(f"Image server running at http://localhost:{port}")
+        print(f"Serving directory: {os.getcwd()}")
+        print("Press Ctrl+C to stop the server")
+        httpd.serve_forever()
+
+
+if __name__ == '__main__':
+    import sys
+
+    if len(sys.argv) > 1 and sys.argv[1] == 'serve':
+        # HTTP server mode
+        port = int(sys.argv[2]) if len(sys.argv) > 2 else 8765
+        directory = sys.argv[3] if len(sys.argv) > 3 else "downloaded_images"
+
+        print("Starting image server...")
+        print(f"Directory: {directory}")
+        print(f"Port: {port}")
+
+        start_simple_http_server(directory, port)
+    else:
+        # Test the download functionality
+        test_url = "https://ci.xiaohongshu.com/1040g2sg31e4ln39lh0bg5p8vj7kp2skkvm4jgno?imageView2/2/w/1080/format/webp"
+
+        print("Testing image download")
+        print(f"Test URL: {test_url}")
+
+        downloader = ImageDownloader()
+        local_path = downloader.download_image(test_url)
+
+        if local_path:
+            print(f"✓ Download succeeded: {local_path}")
+
+            # Test URL conversion
+            server = LocalImageServer()
+            public_url = server.get_public_url(local_path)
+            print(f"Public URL: {public_url}")
+
+            print("\nTo start the image server, run:")
+            print("python3 image_downloader.py serve 8765")
+        else:
+            print("✗ Download failed")

+ 214 - 0
run_stage7.py

@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 7 独立运行脚本
+从 Stage 6 结果开始,进行深度解构分析
+支持指定 feature 和数量限制
+"""
+
+import os
+import json
+import logging
+import argparse
+from stage7_analyzer import Stage7DeconstructionAnalyzer
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    handlers=[
+        logging.FileHandler('stage7_standalone.log', encoding='utf-8'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Entry point."""
+    parser = argparse.ArgumentParser(
+        description='Stage 7 deep deconstruction analysis (standalone)',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog='''
+Examples:
+  # Process only the top 10 high-scoring notes for the "墨镜" feature
+  python3 run_stage7.py --feature "墨镜" --max-notes 10
+
+  # Process the "墨镜" and "耳环" features, at most 5 notes in total
+  python3 run_stage7.py --feature "墨镜" "耳环" --max-notes 5
+
+  # Process all features, sorted by time, first 20 notes
+  python3 run_stage7.py --sort-by time --max-notes 20
+
+  # Process only "墨镜", sorted by engagement, skipping the first 3
+  python3 run_stage7.py --feature "墨镜" --sort-by engagement --skip 3
+
+  # Lower the score threshold to process more notes
+  python3 run_stage7.py --feature "墨镜" --min-score 6.0 --max-notes 30
+
+  # Use a config file
+  python3 run_stage7.py --config stage7_config.json
+        '''
+    )
+
+    # Input/output configuration
+    parser.add_argument(
+        '--input',
+        default='output_v2/stage6_with_evaluations.json',
+        help='Path to the Stage 6 results file (default: output_v2/stage6_with_evaluations.json)'
+    )
+    parser.add_argument(
+        '--output',
+        default='output_v2/stage7_with_deconstruction.json',
+        help='Path to the Stage 7 output file (default: output_v2/stage7_with_deconstruction.json)'
+    )
+
+    # Feature filter (new)
+    parser.add_argument(
+        '--feature',
+        nargs='+',
+        default=None,
+        help='Original feature name(s) to process (one or more), e.g. --feature "墨镜" "耳环". If omitted, all features are processed'
+    )
+
+    # Filtering parameters
+    parser.add_argument(
+        '--min-score',
+        type=float,
+        default=8.0,
+        help='Minimum score threshold; only notes with a score >= this value are processed (default: 8.0)'
+    )
+    parser.add_argument(
+        '--skip',
+        type=int,
+        default=0,
+        help='Skip the first N notes (default: 0)'
+    )
+    parser.add_argument(
+        '--max-notes',
+        type=int,
+        default=None,
+        help='Maximum number of notes to process (default: None, unlimited)'
+    )
+    parser.add_argument(
+        '--sort-by',
+        choices=['score', 'time', 'engagement'],
+        default='score',
+        help='Sort order: score, time, or engagement (default: score)'
+    )
+
+    # API configuration
+    parser.add_argument(
+        '--api-url',
+        default='http://192.168.245.150:7000/what/analysis/single',
+        help='Deconstruction API address (default: http://192.168.245.150:7000/what/analysis/single)'
+    )
+    parser.add_argument(
+        '--timeout',
+        type=int,
+        default=30,
+        help='API timeout in seconds (default: 30)'
+    )
+    parser.add_argument(
+        '--max-retries',
+        type=int,
+        default=3,
+        help='Maximum number of API retries (default: 3)'
+    )
+
+    # Concurrency configuration
+    parser.add_argument(
+        '--max-workers',
+        type=int,
+        default=5,
+        help='Number of concurrent workers (default: 5)'
+    )
+
+    # Load from a config file
+    parser.add_argument(
+        '--config',
+        default=None,
+        help='Load parameters from a JSON config file'
+    )
+
+    args = parser.parse_args()
+
+    # If a config file was provided, load it
+    if args.config:
+        logger.info(f"Loading parameters from config file: {args.config}")
+        with open(args.config, 'r', encoding='utf-8') as f:
+            config = json.load(f)
+
+        # Values from the config file override command-line arguments
+        for key, value in config.items():
+            setattr(args, key.replace('-', '_'), value)
+
+    # Check that the input file exists
+    if not os.path.exists(args.input):
+        logger.error(f"Input file does not exist: {args.input}")
+        return
+
+    # Load the Stage 6 results
+    logger.info(f"Loading Stage 6 results: {args.input}")
+    with open(args.input, 'r', encoding='utf-8') as f:
+        stage6_results = json.load(f)
+
+    # Print the configuration
+    logger.info("=" * 60)
+    logger.info("Run configuration:")
+    logger.info(f"  Input file: {args.input}")
+    logger.info(f"  Output file: {args.output}")
+    if args.feature:
+        logger.info(f"  Target features: {', '.join(args.feature)}")
+    else:
+        logger.info("  Target features: all")
+    logger.info(f"  API address: {args.api_url}")
+    logger.info(f"  Minimum score threshold: {args.min_score}")
+    logger.info(f"  Skip first N: {args.skip}")
+    logger.info(f"  Max notes: {args.max_notes if args.max_notes else 'unlimited'}")
+    logger.info(f"  Sort by: {args.sort_by}")
+    logger.info(f"  Workers: {args.max_workers}")
+    logger.info(f"  API timeout: {args.timeout}s")
+    logger.info(f"  Max retries: {args.max_retries}")
+    logger.info("=" * 60)
+
+    # Create the analyzer
+    analyzer = Stage7DeconstructionAnalyzer(
+        api_url=args.api_url,
+        max_workers=args.max_workers,
+        max_notes=args.max_notes,
+        min_score=args.min_score,
+        skip_count=args.skip,
+        sort_by=args.sort_by,
+        timeout=args.timeout,
+        max_retries=args.max_retries,
+        output_dir=os.path.dirname(args.output) or 'output_v2',
+        target_features=args.feature  # pass the feature filter through
+    )
+
+    # Run the analysis
+    try:
+        stage7_results = analyzer.run(
+            stage6_results=stage6_results,
+            output_path=args.output
+        )
+
+        # Print a result summary
+        logger.info("\n" + "=" * 60)
+        logger.info("Done!")
+        logger.info(f"  Total matched notes: {stage7_results['metadata']['total_matched_notes']}")
+        logger.info(f"  Actually processed: {stage7_results['metadata']['processed_notes']}")
+        logger.info(f"  Succeeded: {stage7_results['metadata']['success_count']}")
+        logger.info(f"  Failed: {stage7_results['metadata']['failed_count']}")
+        logger.info(f"  Total time: {stage7_results['metadata']['processing_time_seconds']}s")
+        logger.info(f"  Results saved to: {args.output}")
+        logger.info("=" * 60)
+
+    except Exception as e:
+        logger.error(f"Run failed: {e}", exc_info=True)
+        raise
+
+
+if __name__ == '__main__':
+    main()
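
One detail of the config handling above is worth spelling out: values loaded via --config are applied after argparse has already parsed the command line, so they override any flags passed alongside them, and hyphenated JSON keys are mapped onto argparse dest names. A minimal self-contained sketch of that behavior (the inline file contents are illustrative):

    # Sketch: config values are applied after parsing, so they win over CLI flags.
    # With stage7_config.json containing {"max_notes": 10, "min-score": 7.5},
    # max_notes ends up as 10 even though --max-notes 50 was passed.
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument('--max-notes', type=int, default=None)
    parser.add_argument('--min-score', type=float, default=8.0)
    parser.add_argument('--config', default=None)
    args = parser.parse_args(['--config', 'stage7_config.json', '--max-notes', '50'])

    with open(args.config, encoding='utf-8') as f:
        for key, value in json.load(f).items():
            # 'min-score' -> 'min_score', matching argparse's dest naming
            setattr(args, key.replace('-', '_'), value)

    print(args.max_notes)  # 10, taken from the config file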

+ 550 - 0
stage7_analyzer.py

@@ -0,0 +1,550 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 7 分析器
+对 Stage 6 中完全匹配的帖子进行深度解构分析
+"""
+
+import os
+import json
+import time
+import logging
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, List, Any, Optional
+
+from stage7_api_client import DeconstructionAPIClient, map_note_to_api_format
+
+logger = logging.getLogger(__name__)
+
+try:
+    from tqdm import tqdm
+    TQDM_AVAILABLE = True
+except ImportError:
+    TQDM_AVAILABLE = False
+    logger.warning("tqdm is not installed; falling back to simple progress output. Install with: pip install tqdm")
+
+
+class Stage7DeconstructionAnalyzer:
+    """Stage 7: 完全匹配帖子的深度解构分析"""
+
+    def __init__(
+        self,
+        api_url: str = "http://192.168.245.150:7000/what/analysis/single",
+        max_workers: int = 5,
+        max_notes: Optional[int] = None,
+        min_score: float = 8.0,
+        skip_count: int = 0,
+        sort_by: str = 'score',
+        timeout: int = 30,
+        max_retries: int = 3,
+        output_dir: str = "output_v2",
+        enable_image_download: bool = True,
+        image_server_url: str = "http://localhost:8765",
+        image_download_dir: str = "downloaded_images",
+        target_features: Optional[List[str]] = None
+    ):
+        """
+        初始化 Stage 7 分析器
+
+        Args:
+            api_url: API 地址
+            max_workers: 并发数
+            max_notes: 最多处理多少个帖子(None = 不限制)
+            min_score: 最低分数阈值(只处理 >= 此分数的帖子)
+            skip_count: 跳过前 N 个
+            sort_by: 排序方式 ('score' | 'time' | 'engagement')
+            timeout: API 超时时间
+            max_retries: API 最大重试次数
+            output_dir: 输出目录
+            enable_image_download: 是否启用图片下载(下载小红书图片并转换为本地URL)
+            image_server_url: 图片服务器URL
+            image_download_dir: 图片下载目录
+            target_features: 指定要处理的原始特征列表(None = 处理所有特征)
+        """
+        self.max_workers = max_workers
+        self.max_notes = max_notes
+        self.min_score = min_score
+        self.skip_count = skip_count
+        self.sort_by = sort_by
+        self.output_dir = output_dir
+        self.enable_image_download = enable_image_download
+        self.target_features = target_features  # 新增:目标特征过滤
+
+        # Initialize the API client
+        self.api_client = DeconstructionAPIClient(
+            api_url=api_url,
+            timeout=timeout,
+            max_retries=max_retries
+        )
+
+        # Image downloading is deprecated; original image URLs are used directly.
+        # The parameters are kept for backward compatibility but no longer used.
+        if self.enable_image_download:
+            logger.warning("  Note: enable_image_download is deprecated; original image URLs will be used directly")
+
+    def extract_matched_notes_from_stage6(
+        self,
+        stage6_results: List[Dict]
+    ) -> List[Dict]:
+        """
+        从 Stage 6 结果中提取所有完全匹配的帖子
+
+        Args:
+            stage6_results: Stage 6 结果(列表)
+
+        Returns:
+            完全匹配的帖子列表
+        """
+        matched_notes = []
+
+        # Stage 6 结果是一个列表,每个元素是一个 feature_group
+        for feature_group in stage6_results:
+            original_feature = feature_group.get('原始特征名称', '')
+
+            # 如果指定了 target_features,只处理指定的特征
+            if self.target_features and original_feature not in self.target_features:
+                continue
+
+            # 遍历 组合评估结果_分组(这一层包含了 top10_searches)
+            for combo_group in feature_group.get('组合评估结果_分组', []):
+                # top10_searches 包含所有搜索结果
+                for search_item in combo_group.get('top10_searches', []):
+                    search_word = search_item.get('search_word', '')
+                    source_word = search_item.get('source_word', '')
+                    evaluation = search_item.get('evaluation_with_filter', {})
+
+                    # 检查是否有搜索结果
+                    if 'search_result' not in search_item:
+                        continue
+
+                    notes = search_item['search_result'].get('data', {}).get('data', [])
+
+                    # 遍历评估结果
+                    for note_eval in evaluation.get('notes_evaluation', []):
+                        score = note_eval.get('综合得分', 0)
+
+                        # 只处理完全匹配的(分数 >= min_score)
+                        if score >= self.min_score:
+                            note_index = note_eval.get('note_index', -1)
+                            if 0 <= note_index < len(notes):
+                                note = notes[note_index]
+
+                                matched_notes.append({
+                                    'note': note,
+                                    'note_card': note.get('note_card', {}),
+                                    'evaluation': note_eval,
+                                    'search_word': search_word,
+                                    'source_word': source_word,
+                                    'original_feature': original_feature
+                                })
+
+        return matched_notes
+
+    def sort_matched_notes(
+        self,
+        matched_notes: List[Dict]
+    ) -> List[Dict]:
+        """
+        对完全匹配的帖子进行排序
+
+        Args:
+            matched_notes: 匹配的帖子列表
+
+        Returns:
+            排序后的帖子列表
+        """
+        if self.sort_by == 'score':
+            # 按评分降序(优先处理高分帖子)
+            return sorted(
+                matched_notes,
+                key=lambda x: x['evaluation'].get('综合得分', 0),
+                reverse=True
+            )
+
+        elif self.sort_by == 'time':
+            # 按时间降序(优先处理最新帖子)
+            return sorted(
+                matched_notes,
+                key=lambda x: x['note_card'].get('publish_timestamp', 0),
+                reverse=True
+            )
+
+        elif self.sort_by == 'engagement':
+            # 按互动量降序(点赞+收藏+评论)
+            def calc_engagement(note_data):
+                interact = note_data['note_card'].get('interact_info', {})
+                return (
+                    interact.get('liked_count', 0) +
+                    interact.get('collected_count', 0) +
+                    interact.get('comment_count', 0)
+                )
+
+            return sorted(
+                matched_notes,
+                key=calc_engagement,
+                reverse=True
+            )
+
+        return matched_notes
+
+    def _save_intermediate_results(
+        self,
+        results: List[Dict],
+        output_path: str,
+        processed_count: int,
+        total_count: int,
+        start_time: float
+    ):
+        """
+        保存中间结果
+
+        Args:
+            results: 当前结果列表
+            output_path: 输出路径
+            processed_count: 已处理数量
+            total_count: 总数量
+            start_time: 开始时间
+        """
+        # 构建中间结果文件路径
+        base_dir = os.path.dirname(output_path) or 'output_v2'
+        base_name = os.path.basename(output_path)
+        name_without_ext = os.path.splitext(base_name)[0]
+
+        intermediate_path = os.path.join(
+            base_dir,
+            f"{name_without_ext}_partial_{processed_count}of{total_count}.json"
+        )
+
+        # 统计成功失败数
+        success_count = sum(1 for r in results if r['api_response']['status'] == 'success')
+        failed_count = len(results) - success_count
+
+        # 构建中间结果
+        intermediate_result = {
+            'metadata': {
+                'stage': 'stage7_partial',
+                'description': f'部分结果({processed_count}/{total_count})',
+                'processed_notes': len(results),
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'saved_at': datetime.now().isoformat(),
+                'processing_time_seconds': round(time.time() - start_time, 2)
+            },
+            'results': results
+        }
+
+        # 保存
+        os.makedirs(base_dir, exist_ok=True)
+        with open(intermediate_path, 'w', encoding='utf-8') as f:
+            json.dump(intermediate_result, f, ensure_ascii=False, indent=2)
+
+        logger.info(f"    已保存中间结果: {intermediate_path} ({processed_count}/{total_count})")
+
+    def process_single_note(
+        self,
+        matched_note_data: Dict,
+        index: int,
+        total: int
+    ) -> Dict:
+        """
+        处理单个帖子的解构分析
+
+        Args:
+            matched_note_data: 匹配的帖子数据
+            index: 当前索引(用于日志)
+            total: 总数(用于日志)
+
+        Returns:
+            处理结果
+        """
+        note = matched_note_data['note']
+        note_card = matched_note_data['note_card']
+        evaluation = matched_note_data['evaluation']
+        search_word = matched_note_data['search_word']
+        original_feature = matched_note_data['original_feature']
+
+        note_id = note.get('id', '')
+        note_title = note_card.get('display_title', '')[:30]  # 前30个字符
+
+        logger.info(f"[{index}/{total}] 解构分析: {note_id}")
+        logger.info(f"  标题: {note_title}...")
+        logger.info(f"  搜索词: {search_word}")
+        logger.info(f"  原始特征: {original_feature}")
+
+        # 构建 start_points(使用组合方案)
+        key_points = evaluation.get('关键匹配点', [])
+        start_points = [
+            original_feature,                    # 原始特征
+            search_word,                         # 搜索词
+            key_points[0] if key_points else ''  # 第一个关键匹配点
+        ]
+        start_points = [p for p in start_points if p]  # 过滤空值
+
+        logger.info(f"  start_points: {start_points}")
+
+        # 直接使用原始图片URL,不做任何处理
+        original_images = note_card.get('image_list', [])
+        if original_images:
+            logger.info(f"  图片数量: {len(original_images)}")
+
+        # 映射数据为 API 格式(直接使用原始图片URL)
+        api_payload = map_note_to_api_format(
+            note=note,
+            note_card=note_card,
+            evaluation=evaluation,
+            search_word=search_word,
+            original_feature=original_feature,
+            start_points=start_points,
+            processed_image_urls=None  # 不传递处理后的URL,使用原始URL
+        )
+
+        # 调用 API
+        start_time = time.time()
+        api_response = self.api_client.call_api(api_payload)
+        processing_time = (time.time() - start_time) * 1000  # 毫秒
+
+        # 构建结果
+        result = {
+            'note_id': note_id,
+            'search_word': search_word,
+            'original_feature': original_feature,
+            'source_word': matched_note_data['source_word'],
+            'evaluation_score': evaluation.get('综合得分', 0),
+            'evaluation_type': evaluation.get('匹配类型', ''),
+            'evaluation_confidence': evaluation.get('置信度', ''),
+            'key_matching_points': key_points,
+            'note_data': {
+                'title': note_card.get('display_title', ''),
+                'author': note_card.get('user', {}).get('nick_name', ''),
+                'link': f"https://www.xiaohongshu.com/explore/{note_id}"
+            },
+            'api_request': api_payload,
+            'api_response': api_response,
+            'processed_at': datetime.now().isoformat(),
+            'processing_time_ms': round(processing_time, 2)
+        }
+
+        if api_response['status'] == 'success':
+            logger.info(f"  ✓ 成功 ({processing_time:.0f}ms)")
+        else:
+            logger.error(f"  ✗ 失败: {api_response['error']}")
+
+        return result
+
+    def run(
+        self,
+        stage6_results: List[Dict],
+        output_path: Optional[str] = None
+    ) -> Dict:
+        """
+        Execute the Stage 7 deconstruction analysis.
+
+        Args:
+            stage6_results: Stage 6 results
+            output_path: output path (optional)
+
+        Returns:
+            Stage 7 results
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info("Stage 7: deep deconstruction analysis of fully matched notes")
+        logger.info("=" * 60)
+
+        # Print configuration parameters
+        logger.info("Configuration:")
+        logger.info(f"  API address: {self.api_client.api_url}")
+        if self.target_features:
+            logger.info(f"  Target features: {', '.join(self.target_features)}")
+        else:
+            logger.info("  Target features: all")
+        logger.info(f"  Minimum score threshold: {self.min_score}")
+        logger.info(f"  Workers: {self.max_workers}")
+        logger.info(f"  Max notes: {self.max_notes if self.max_notes else 'unlimited'}")
+        logger.info(f"  Skip first N: {self.skip_count}")
+        logger.info(f"  Sort by: {self.sort_by}")
+        logger.info(f"  API timeout: {self.api_client.timeout}s")
+        logger.info(f"  Max retries: {self.api_client.max_retries}")
+
+        # Default output path
+        if output_path is None:
+            output_path = os.path.join(self.output_dir, "stage7_with_deconstruction.json")
+
+        # 1. Extract the fully matched notes
+        matched_notes = self.extract_matched_notes_from_stage6(stage6_results)
+        total_matched = len(matched_notes)
+
+        logger.info(f"  Total fully matched notes: {total_matched} (score >= {self.min_score})")
+
+        if total_matched == 0:
+            logger.warning("  No fully matched notes found")
+            return {
+                'metadata': {
+                    'stage': 'stage7',
+                    'total_matched_notes': 0,
+                    'processed_notes': 0
+                },
+                'results': []
+            }
+
+        # 2. Sort
+        matched_notes = self.sort_matched_notes(matched_notes)
+        logger.info(f"  Sort by: {self.sort_by}")
+
+        # 3. Skip the first N
+        if self.skip_count > 0:
+            logger.info(f"  Skipping the first {self.skip_count}")
+            matched_notes = matched_notes[self.skip_count:]
+
+        # 4. Limit the count
+        if self.max_notes is not None and len(matched_notes) > self.max_notes:
+            logger.info(f"  Count limit: {self.max_notes}")
+            matched_notes = matched_notes[:self.max_notes]
+
+        to_process = len(matched_notes)
+        logger.info(f"  Actually processing: {to_process} notes")
+        logger.info(f"  Workers: {self.max_workers}")
+        logger.info(f"  API: {self.api_client.api_url}")
+
+        if to_process == 0:
+            logger.warning("  No notes to process")
+            return {
+                'metadata': {
+                    'stage': 'stage7',
+                    'total_matched_notes': total_matched,
+                    'processed_notes': 0,
+                    'skipped_notes': self.skip_count
+                },
+                'results': []
+            }
+
+        # 5. Process in parallel
+        results = []
+        start_time = time.time()
+        save_interval = 10  # save once every 10 processed notes
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = []
+            for idx, note_data in enumerate(matched_notes, start=1):
+                future = executor.submit(
+                    self.process_single_note,
+                    note_data,
+                    idx,
+                    to_process
+                )
+                futures.append(future)
+
+            # Collect results (with progress display)
+            if TQDM_AVAILABLE:
+                # Use a tqdm progress bar
+                logger.info("  Using a progress bar...")
+                iterator = tqdm(
+                    as_completed(futures),
+                    total=len(futures),
+                    desc="  Progress",
+                    unit="note",
+                    ncols=100
+                )
+            else:
+                # Simple progress display
+                iterator = as_completed(futures)
+
+            processed_count = 0
+            for future in iterator:
+                try:
+                    result = future.result()
+                    results.append(result)
+                    processed_count += 1
+
+                    # Incremental save (every save_interval notes)
+                    if processed_count % save_interval == 0:
+                        self._save_intermediate_results(
+                            results,
+                            output_path,
+                            processed_count,
+                            to_process,
+                            start_time
+                        )
+
+                    # Simple progress display (when tqdm is unavailable)
+                    if not TQDM_AVAILABLE and processed_count % 5 == 0:
+                        logger.info(f"  Progress: {processed_count}/{to_process}")
+
+                except Exception as e:
+                    logger.error(f"  Processing failed: {e}")
+
+        processing_time = time.time() - start_time
+
+        # 6. Statistics
+        success_count = sum(1 for r in results if r['api_response']['status'] == 'success')
+        failed_count = len(results) - success_count
+
+        logger.info(f"\n  Total time: {processing_time:.1f}s")
+        logger.info(f"  Succeeded: {success_count}")
+        logger.info(f"  Failed: {failed_count}")
+
+        # 7. Build the final result
+        final_result = {
+            'metadata': {
+                'stage': 'stage7',
+                'description': 'Deep deconstruction analysis of fully matched notes',
+                'target_features': self.target_features if self.target_features else 'all',
+                'total_matched_notes': total_matched,
+                'processed_notes': len(results),
+                'skipped_notes': self.skip_count,
+                'max_notes_limit': self.max_notes,
+                'sort_by': self.sort_by,
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'api_url': self.api_client.api_url,
+                'min_score_threshold': self.min_score,
+                'created_at': datetime.now().isoformat(),
+                'processing_time_seconds': round(processing_time, 2)
+            },
+            'results': results
+        }
+
+        # 8. Save the results ('or .' guards against a bare filename, where dirname is empty)
+        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(final_result, f, ensure_ascii=False, indent=2)
+
+        logger.info(f"  Results saved to: {output_path}")
+
+        return final_result
+
+
+def test_stage7_analyzer():
+    """Smoke-test the Stage 7 analyzer."""
+    # Read the Stage 6 results
+    stage6_path = "output_v2/stage6_with_evaluations.json"
+
+    if not os.path.exists(stage6_path):
+        print(f"Stage 6 results not found: {stage6_path}")
+        return
+
+    with open(stage6_path, 'r', encoding='utf-8') as f:
+        stage6_results = json.load(f)
+
+    # Create the analyzer
+    analyzer = Stage7DeconstructionAnalyzer(
+        max_workers=3,
+        max_notes=5,  # test with only 5 notes
+        skip_count=0,
+        sort_by='score'
+    )
+
+    # Run the analysis
+    stage7_results = analyzer.run(stage6_results)
+
+    print(f"\nProcessed {stage7_results['metadata']['processed_notes']} notes")
+    print(f"Succeeded: {stage7_results['metadata']['success_count']}")
+    print(f"Failed: {stage7_results['metadata']['failed_count']}")
+
+
+if __name__ == '__main__':
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    test_stage7_analyzer()
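
extract_matched_notes_from_stage6 walks a deeply nested structure. A minimal sketch of the Stage 6 shape it assumes, reconstructed from the .get() chains above; all values are illustrative, while the Chinese keys are the actual field names in the Stage 6 JSON:

    # Sketch: minimal Stage 6 input accepted by extract_matched_notes_from_stage6.
    stage6_results = [
        {
            "原始特征名称": "墨镜",            # original feature name
            "组合评估结果_分组": [              # grouped combination evaluations
                {
                    "top10_searches": [
                        {
                            "search_word": "猫咪服饰造型元素",
                            "source_word": "墨镜",
                            "search_result": {"data": {"data": [
                                {"id": "note-1", "note_card": {"display_title": "..."}},
                            ]}},
                            "evaluation_with_filter": {"notes_evaluation": [
                                # note_index points into search_result.data.data
                                {"note_index": 0, "综合得分": 9.0, "关键匹配点": ["..."]},
                            ]},
                        },
                    ],
                },
            ],
        },
    ]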

+ 253 - 0
stage7_api_client.py

@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 7 API 客户端
+用于调用深度解构分析 API
+"""
+
+import time
+import logging
+import requests
+from datetime import datetime
+from typing import Dict, List, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+
+def map_note_to_api_format(
+    note: Dict,
+    note_card: Dict,
+    evaluation: Dict,
+    search_word: str,
+    original_feature: str,
+    start_points: List[str],
+    processed_image_urls: Optional[List[str]] = None
+) -> Dict:
+    """
+    将小红书笔记数据映射为 API 所需格式
+
+    Args:
+        note: 笔记原始数据
+        note_card: 笔记卡片信息
+        evaluation: 评估结果
+        search_word: 搜索词
+        original_feature: 原始特征
+        start_points: 起点列表
+        processed_image_urls: 处理后的图片URL列表(如果提供,将替代原始URL)
+
+    Returns:
+        API 请求格式的数据
+    """
+    # 构建小红书链接
+    note_id = note.get('id', '')
+    link = f"https://www.xiaohongshu.com/explore/{note_id}"
+
+    # 获取用户信息
+    user = note_card.get('user', {})
+    interact_info = note_card.get('interact_info', {})
+
+    # 获取发布时间(需要转换为毫秒时间戳)
+    publish_ts = note_card.get('publish_timestamp', 0)
+    publish_ts_ms = publish_ts * 1000 if publish_ts else 0
+
+    # 格式化发布日期
+    publish_date = ''
+    if publish_ts:
+        try:
+            publish_date = datetime.fromtimestamp(publish_ts).strftime('%Y-%m-%d %H:%M:%S')
+        except:
+            publish_date = ''
+
+    # 使用处理后的图片URL,如果没有则使用原始URL
+    image_urls = processed_image_urls if processed_image_urls else note_card.get('image_list', [])
+
+    return {
+        "post_data": {
+            "channel_content_id": note_id,
+            "link": link,
+            "xsec_token": "",  # 通常为空
+            "comment_count": interact_info.get('comment_count', 0),
+            "images": image_urls,
+            "like_count": interact_info.get('liked_count', 0),
+            "body_text": note_card.get('desc', ''),
+            "title": note_card.get('display_title', ''),
+            "collect_count": interact_info.get('collected_count', 0),
+            "channel_account_id": user.get('user_id', ''),
+            "channel_account_name": user.get('nick_name', ''),
+            "publish_timestamp": publish_ts_ms,
+            "modify_timestamp": publish_ts_ms,
+            "update_timestamp": int(time.time() * 1000),
+            "publish_date": publish_date,
+            "content_type": "note",
+            "video": {}  # 图文类型无视频
+        },
+        "question_data": {
+            "target": original_feature,      # 例如: "墨镜"
+            "start_points": start_points,    # 例如: ["墨镜", "猫咪服饰造型元素", "图片中猫咪佩戴墨镜"]
+            "query": search_word             # 例如: "猫咪服饰造型元素"
+        }
+    }
+
+
+class DeconstructionAPIClient:
+    """解构分析 API 客户端"""
+
+    def __init__(
+        self,
+        api_url: str = "http://192.168.245.150:7000/what/analysis/single",
+        timeout: int = 30,
+        max_retries: int = 3
+    ):
+        """
+        初始化 API 客户端
+
+        Args:
+            api_url: API 地址
+            timeout: 超时时间(秒)
+            max_retries: 最大重试次数
+        """
+        self.api_url = api_url
+        self.timeout = timeout
+        self.max_retries = max_retries
+
+    def call_api(
+        self,
+        api_payload: Dict
+    ) -> Dict:
+        """
+        调用解构 API(带重试机制)
+
+        Args:
+            api_payload: API 请求数据
+
+        Returns:
+            {
+                'status': 'success' | 'failed',
+                'result': API响应数据(成功时),
+                'error': 错误信息(失败时)
+            }
+        """
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.post(
+                    self.api_url,
+                    json=api_payload,
+                    headers={'Content-Type': 'application/json'},
+                    timeout=self.timeout
+                )
+
+                if response.status_code == 200:
+                    return {
+                        'status': 'success',
+                        'result': response.json(),
+                        'error': None
+                    }
+                else:
+                    error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
+
+                    # Retry if attempts remain
+                    if attempt < self.max_retries - 1:
+                        wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s
+                        logger.warning(f"    API call failed, retrying in {wait_time}s ({attempt + 1}/{self.max_retries})")
+                        time.sleep(wait_time)
+                        continue
+
+                    # The last retry failed as well
+                    return {
+                        'status': 'failed',
+                        'result': None,
+                        'error': error_msg
+                    }
+
+            except requests.Timeout:
+                if attempt < self.max_retries - 1:
+                    wait_time = 2 ** attempt
+                    logger.warning(f"    API 超时,{wait_time}s 后重试 ({attempt + 1}/{self.max_retries})")
+                    time.sleep(wait_time)
+                    continue
+
+                return {
+                    'status': 'failed',
+                    'result': None,
+                    'error': f'API timeout after {self.timeout}s'
+                }
+
+            except Exception as e:
+                if attempt < self.max_retries - 1:
+                    wait_time = 2 ** attempt
+                    logger.warning(f"    API 异常,{wait_time}s 后重试 ({attempt + 1}/{self.max_retries}): {e}")
+                    time.sleep(wait_time)
+                    continue
+
+                return {
+                    'status': 'failed',
+                    'result': None,
+                    'error': f'Exception: {str(e)}'
+                }
+
+        # Should be unreachable
+        return {
+            'status': 'failed',
+            'result': None,
+            'error': 'Max retries exceeded'
+        }
+
+
+def test_api_client():
+    """Test the API client."""
+    # Mock data
+    test_note = {
+        'id': '68ba3a27000000001c00f8fc'
+    }
+
+    test_note_card = {
+        'display_title': 'Test title',
+        'desc': 'Test body text',
+        'image_list': [
+            'https://example.com/image1.jpg',
+            'https://example.com/image2.jpg'
+        ],
+        'user': {
+            'user_id': '123456',
+            'nick_name': 'Test user'
+        },
+        'interact_info': {
+            'liked_count': 100,
+            'collected_count': 50,
+            'comment_count': 10
+        },
+        'publish_timestamp': 1640000000
+    }
+
+    test_evaluation = {
+        '综合得分': 9.0,
+        '关键匹配点': ['test matching point 1', 'test matching point 2']
+    }
+
+    # Test the data mapping
+    api_payload = map_note_to_api_format(
+        note=test_note,
+        note_card=test_note_card,
+        evaluation=test_evaluation,
+        search_word='test search word',
+        original_feature='test feature',
+        start_points=['start point 1', 'start point 2']
+    )
+
+    print("API Payload:")
+    import json
+    print(json.dumps(api_payload, ensure_ascii=False, indent=2))
+
+    # API call test (requires a live API service)
+    # client = DeconstructionAPIClient()
+    # result = client.call_api(api_payload)
+    # print("\nAPI Result:")
+    # print(json.dumps(result, ensure_ascii=False, indent=2))
+
+
+if __name__ == '__main__':
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    test_api_client()
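
A minimal usage sketch of the client, following the return contract documented in call_api (it assumes a reachable API endpoint and a payload already built by map_note_to_api_format):

    # Sketch: call the deconstruction API and branch on the documented contract.
    client = DeconstructionAPIClient(
        api_url="http://192.168.245.150:7000/what/analysis/single",
        timeout=30,
        max_retries=3,
    )
    result = client.call_api(api_payload)  # api_payload from map_note_to_api_format
    if result['status'] == 'success':
        print(result['result'])            # parsed JSON body returned by the API
    else:
        print(f"Deconstruction failed: {result['error']}")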

+ 13 - 0
stage7_config.json

@@ -0,0 +1,13 @@
+{
+  "input": "output_v2/stage6_with_evaluations.json",
+  "output": "output_v2/stage7_with_deconstruction.json",
+  "feature": ["墨镜"],
+  "min_score": 8.0,
+  "skip": 0,
+  "max_notes": 10,
+  "sort_by": "score",
+  "api_url": "http://192.168.245.150:7000/what/analysis/single",
+  "timeout": 30,
+  "max_retries": 3,
+  "max_workers": 5
+}

+ 125 - 27
visualize_stage6_results.py

@@ -21,6 +21,8 @@ def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
     """计算统计数据(包括评估结果)"""
     total_features = len(data)
     total_search_words = 0
+    searched_count = 0  # number of search words that were executed
+    not_searched_count = 0  # number of search words that were not executed
     total_notes = 0
     video_count = 0
     normal_count = 0
@@ -42,28 +44,34 @@ def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
 
             for search_item in search_items:
                 search_result = search_item.get('search_result', {})
-                notes = search_result.get('data', {}).get('data', [])
-                total_notes += len(notes)
-
-                # Count video vs. image note types
-                for note in notes:
-                    note_type = note.get('note_card', {}).get('type', '')
-                    if note_type == 'video':
-                        video_count += 1
-                    else:
-                        normal_count += 1
-
-                # Tally evaluation results
-                evaluation = search_item.get('evaluation_with_filter')
-                if evaluation:
-                    total_evaluated_notes += evaluation.get('total_notes', 0)
-                    total_filtered += evaluation.get('filtered_count', 0)
-
-                    stats = evaluation.get('statistics', {})
-                    match_complete += stats.get('完全匹配(8-10)', 0)
-                    match_similar += stats.get('相似匹配(6-7)', 0)
-                    match_weak += stats.get('弱相似(5-6)', 0)
-                    match_none += stats.get('无匹配(≤4)', 0)
+
+                # Track search status
+                if search_result:
+                    searched_count += 1
+                    notes = search_result.get('data', {}).get('data', [])
+                    total_notes += len(notes)
+
+                    # Count video vs. image note types
+                    for note in notes:
+                        note_type = note.get('note_card', {}).get('type', '')
+                        if note_type == 'video':
+                            video_count += 1
+                        else:
+                            normal_count += 1
+
+                    # Tally evaluation results
+                    evaluation = search_item.get('evaluation_with_filter')
+                    if evaluation:
+                        total_evaluated_notes += evaluation.get('total_notes', 0)
+                        total_filtered += evaluation.get('filtered_count', 0)
+
+                        stats = evaluation.get('statistics', {})
+                        match_complete += stats.get('完全匹配(8-10)', 0)
+                        match_similar += stats.get('相似匹配(6-7)', 0)
+                        match_weak += stats.get('弱相似(5-6)', 0)
+                        match_none += stats.get('无匹配(≤4)', 0)
+                else:
+                    not_searched_count += 1
 
    # Compute percentages
     total_remaining = total_evaluated_notes - total_filtered if total_evaluated_notes > 0 else 0
@@ -71,6 +79,9 @@ def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
     return {
         'total_features': total_features,
         'total_search_words': total_search_words,
+        'searched_count': searched_count,
+        'not_searched_count': not_searched_count,
+        'searched_percentage': round(searched_count / total_search_words * 100, 1) if total_search_words > 0 else 0,
         'total_notes': total_notes,
         'video_count': video_count,
         'normal_count': normal_count,
@@ -550,6 +561,33 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
             gap: 20px;
         }}
 
+        /* Empty-state styles */
+        .empty-state {{
+            text-align: center;
+            padding: 60px 40px;
+            color: #6b7280;
+        }}
+
+        .empty-icon {{
+            font-size: 48px;
+            margin-bottom: 16px;
+        }}
+
+        .empty-title {{
+            font-size: 16px;
+            font-weight: 600;
+            color: #374151;
+            margin-bottom: 8px;
+        }}
+
+        .empty-desc {{
+            font-size: 14px;
+            line-height: 1.6;
+            color: #9ca3af;
+            max-width: 400px;
+            margin: 0 auto;
+        }}
+
         .note-card {{
             border: 3px solid #e5e7eb;
             border-radius: 8px;
@@ -821,7 +859,15 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                 </div>
                 <div class="stat-item">
                     <div class="stat-value">🔍 {stats['total_search_words']}</div>
-                    <div class="stat-label">搜索词数</div>
+                    <div class="stat-label">搜索词总数</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">✅ {stats['searched_count']}</div>
+                    <div class="stat-label">已搜索 ({stats['searched_percentage']}%)</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">⏸️ {stats['not_searched_count']}</div>
+                    <div class="stat-label">未搜索</div>
                 </div>
                 <div class="stat-item">
                     <div class="stat-value">📝 {stats['total_notes']}</div>
@@ -829,11 +875,11 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                 </div>
                 <div class="stat-item">
                     <div class="stat-value">🎬 {stats['video_count']}</div>
-                    <div class="stat-label">视频类型 ({stats['video_percentage']}%)</div>
+                    <div class="stat-label">视频 ({stats['video_percentage']}%)</div>
                 </div>
                 <div class="stat-item">
                     <div class="stat-value">📷 {stats['normal_count']}</div>
-                    <div class="stat-label">图文类型 ({stats['normal_percentage']}%)</div>
+                    <div class="stat-label">图文 ({stats['normal_percentage']}%)</div>
                 </div>
             </div>
             <div class="stats-row">
@@ -1004,7 +1050,6 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                                  data-block-id="${{blockId}}">
                                 <div class="search-word-text">
                                     🔍 ${{sw.search_word}}
-                                    <span class="search-word-score ${{scoreClass}}">${{score.toFixed(2)}}</span>
                                 </div>
                                 <div class="search-word-meta" style="font-size:11px;color:#9ca3af;margin-top:2px">
                                    Source: ${{sourceWord}}
@@ -1042,6 +1087,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
 
                     searches.forEach((sw, swIdx) => {{
                         const blockId = `block-${{featureIdx}}-${{groupIdx}}-${{swIdx}}`;
+                        const hasSearchResult = sw.search_result != null;
                         const searchResult = sw.search_result || {{}};
                         const notes = searchResult.data?.data || [];
 
@@ -1066,20 +1112,70 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                             if (filtered > 0) evalStats += `<span class="stat-badge eval filtered">⚫ 过滤:${{filtered}}</span>`;
                         }}
 
+                        // Build the result block
                         html += `
                             <div class="result-block" id="${{blockId}}">
                                 <div class="result-header">
                                     <div class="result-title">${{sw.search_word}}</div>
                                     <div class="result-stats">
+                        `;
+
+                        // Show different stats depending on search status
+                        if (!hasSearchResult) {{
+                            // Search was not executed
+                            html += `
+                                        <span class="stat-badge" style="background:#fef3c7;color:#92400e;font-weight:600">⏸️ Search not executed</span>
+                            `;
+                        }} else if (notes.length === 0) {{
+                            // Search completed but returned no results
+                            html += `
+                                        <span class="stat-badge">📝 0 notes</span>
+                                        <span class="stat-badge" style="background:#fee2e2;color:#991b1b;font-weight:600">❌ No matches found</span>
+                            `;
+                        }} else {{
+                            // Results present
+                            html += `
+                                        <span class="stat-badge">📝 ${{notes.length}} notes</span>
+                                        <span class="stat-badge">🎬 ${{videoCount}} videos</span>
+                                        <span class="stat-badge">📷 ${{normalCount}} images</span>
                                         ${{evalStats}}
+                            `;
+                        }}
+
+                        html += `
                                     </div>
                                 </div>
+                        `;
+
+                        // Show different content areas depending on search status
+                        if (!hasSearchResult) {{
+                            // Search was not executed - show an empty-state message
+                            html += `
+                                <div class="empty-state">
+                                    <div class="empty-icon">⏸️</div>
+                                    <div class="empty-title">This search word was not executed</div>
+                                    <div class="empty-desc">Skipped because of the search-count limits (--max-searches-per-feature and --max-searches-per-base-word)</div>
+                                </div>
+                            `;
+                        }} else if (notes.length === 0) {{
+                            // Search completed but returned no results
+                            html += `
+                                <div class="empty-state">
+                                    <div class="empty-icon">❌</div>
+                                    <div class="empty-title">Search completed, but no matching notes were found</div>
+                                    <div class="empty-desc">This search word was executed, but Xiaohongshu returned 0 results</div>
+                                </div>
+                            `;
+                        }} else {{
+                            // Results present - render the notes grid
+                            html += `
                                 <div class="notes-grid">
                                     ${{notes.map((note, noteIdx) => renderNoteCard(note, featureIdx, groupIdx, swIdx, noteIdx)).join('')}}
                                 </div>
+                            `;
+                        }}
+
+                        html += `
                             </div>
                         `;
                     }});
@@ -1359,7 +1455,9 @@ def main():
     stats = calculate_statistics(data)
     print(f"✓ 统计完成:")
     print(f"  - 原始特征: {stats['total_features']}")
-    print(f"  - 搜索词: {stats['total_search_words']}")
+    print(f"  - 搜索词总数: {stats['total_search_words']}")
+    print(f"  - 已搜索: {stats['searched_count']} ({stats['searched_percentage']}%)")
+    print(f"  - 未搜索: {stats['not_searched_count']}")
     print(f"  - 帖子总数: {stats['total_notes']}")
     print(f"  - 视频: {stats['video_count']} ({stats['video_percentage']}%)")
     print(f"  - 图文: {stats['normal_count']} ({stats['normal_percentage']}%)")