刘立冬 · 3 weeks ago
parent
commit
26b2f19f36
6 changed files, 1424 additions and 27 deletions
  1. image_downloader.py (+269 −0)
  2. run_stage7.py (+214 −0)
  3. stage7_analyzer.py (+550 −0)
  4. stage7_api_client.py (+253 −0)
  5. stage7_config.json (+13 −0)
  6. visualize_stage6_results.py (+125 −27)

+ 269 - 0
image_downloader.py

@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+图片下载和本地服务工具
+用于将小红书图片下载到本地,并通过HTTP服务器提供访问
+"""
+
+import os
+import hashlib
+import requests
+import time
+from pathlib import Path
+from typing import List, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class ImageDownloader:
+    """Image downloader."""
+
+    def __init__(self, download_dir: str = "downloaded_images", max_retries: int = 3):
+        """
+        Initialize the image downloader.
+
+        Args:
+            download_dir: directory to save downloaded images into
+            max_retries: maximum number of retry attempts
+        """
+        self.download_dir = Path(download_dir)
+        self.download_dir.mkdir(parents=True, exist_ok=True)
+        self.max_retries = max_retries
+
+        # Request headers that mimic a browser
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+            'Referer': 'https://www.xiaohongshu.com/'
+        }
+
+    def get_image_hash(self, url: str) -> str:
+        """
+        Derive a unique image filename from a URL.
+
+        Args:
+            url: image URL
+
+        Returns:
+            Filename (without extension)
+        """
+        return hashlib.md5(url.encode()).hexdigest()
+
+    def get_extension_from_url(self, url: str) -> str:
+        """
+        Extract the file extension from a URL.
+
+        Args:
+            url: image URL
+
+        Returns:
+            Extension (e.g. .jpg, .png, .webp)
+        """
+        # Check whether the URL specifies a format parameter
+        if 'format/jpg' in url or url.endswith('.jpg'):
+            return '.jpg'
+        elif 'format/png' in url or url.endswith('.png'):
+            return '.png'
+        elif 'format/webp' in url or url.endswith('.webp'):
+            return '.webp'
+        elif 'format/jpeg' in url or url.endswith('.jpeg'):
+            return '.jpeg'
+
+        # Default to webp
+        return '.webp'
+
+    def download_image(self, url: str) -> Optional[str]:
+        """
+        Download a single image.
+
+        Args:
+            url: image URL
+
+        Returns:
+            Local file path, or None on failure
+        """
+        if not url:
+            return None
+
+        # Build the local file path
+        file_hash = self.get_image_hash(url)
+        extension = self.get_extension_from_url(url)
+        local_path = self.download_dir / f"{file_hash}{extension}"
+
+        # If the file already exists, return it directly
+        if local_path.exists():
+            logger.debug(f"Image already exists: {local_path}")
+            return str(local_path)
+
+        # Download the image
+        for attempt in range(self.max_retries):
+            try:
+                logger.debug(f"Downloading image (attempt {attempt + 1}/{self.max_retries}): {url}")
+
+                response = requests.get(
+                    url,
+                    headers=self.headers,
+                    timeout=30,
+                    stream=True  # stream the download to avoid memory issues
+                )
+
+                if response.status_code == 200:
+                    # Write to file
+                    with open(local_path, 'wb') as f:
+                        for chunk in response.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+
+                    logger.debug(f"✓ Download succeeded: {local_path}")
+                    return str(local_path)
+                else:
+                    logger.warning(f"Download failed, status code: {response.status_code}")
+
+            except requests.Timeout:
+                logger.warning(f"Download timed out (attempt {attempt + 1}/{self.max_retries})")
+            except Exception as e:
+                logger.warning(f"Download failed: {e} (attempt {attempt + 1}/{self.max_retries})")
+
+            # Wait before retrying (exponential backoff)
+            if attempt < self.max_retries - 1:
+                wait_time = 2 ** attempt
+                time.sleep(wait_time)
+
+        logger.error(f"✗ Download failed after {self.max_retries} attempts: {url}")
+        return None
+
+    def download_images(self, urls: List[str]) -> List[Optional[str]]:
+        """
+        Download images in batch.
+
+        Args:
+            urls: list of image URLs
+
+        Returns:
+            List of local file paths (None entries for failed downloads)
+        """
+        local_paths = []
+        for url in urls:
+            local_path = self.download_image(url)
+            local_paths.append(local_path)
+
+        return local_paths
+
+
+class LocalImageServer:
+    """Configuration for a local image server."""
+
+    def __init__(self, base_url: str = "http://localhost:8765", image_dir: str = "downloaded_images"):
+        """
+        Initialize the local image server configuration.
+
+        Args:
+            base_url: base URL of the server
+            image_dir: name of the image directory
+        """
+        self.base_url = base_url.rstrip('/')
+        self.image_dir = image_dir
+
+    def get_public_url(self, local_path: str) -> str:
+        """
+        Convert a local path into a public URL.
+
+        Args:
+            local_path: local file path
+
+        Returns:
+            Publicly accessible URL
+        """
+        if not local_path:
+            return ""
+
+        # Extract the filename
+        filename = Path(local_path).name
+
+        # Build the public URL
+        return f"{self.base_url}/{filename}"
+
+    def convert_paths_to_urls(self, local_paths: List[Optional[str]]) -> List[str]:
+        """
+        Convert local paths into public URLs in batch.
+
+        Args:
+            local_paths: list of local file paths
+
+        Returns:
+            List of public URLs
+        """
+        return [self.get_public_url(path) if path else "" for path in local_paths]
+
+
+def start_simple_http_server(directory: str = "downloaded_images", port: int = 8765):
+    """
+    Start a simple HTTP file server (for development/testing).
+
+    Args:
+        directory: directory to serve
+        port: port number
+
+    Note:
+        This function blocks the current thread; run it in a separate process.
+    """
+    import http.server
+    import socketserver
+
+    os.chdir(directory)
+
+    Handler = http.server.SimpleHTTPRequestHandler
+
+    # Add CORS support
+    class CORSRequestHandler(Handler):
+        def end_headers(self):
+            self.send_header('Access-Control-Allow-Origin', '*')
+            self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
+            self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate')
+            return super().end_headers()
+
+    with socketserver.TCPServer(("", port), CORSRequestHandler) as httpd:
+        print(f"Image server running at http://localhost:{port}")
+        print(f"Serving directory: {os.getcwd()}")
+        print("Press Ctrl+C to stop the server")
+        httpd.serve_forever()
+
+
+if __name__ == '__main__':
+    import sys
+
+    if len(sys.argv) > 1 and sys.argv[1] == 'serve':
+        # HTTP server mode
+        port = int(sys.argv[2]) if len(sys.argv) > 2 else 8765
+        directory = sys.argv[3] if len(sys.argv) > 3 else "downloaded_images"
+
+        print("Starting image server...")
+        print(f"Directory: {directory}")
+        print(f"Port: {port}")
+
+        start_simple_http_server(directory, port)
+    else:
+        # Test the download functionality
+        test_url = "https://ci.xiaohongshu.com/1040g2sg31e4ln39lh0bg5p8vj7kp2skkvm4jgno?imageView2/2/w/1080/format/webp"
+
+        print("Testing image download")
+        print(f"Test URL: {test_url}")
+
+        downloader = ImageDownloader()
+        local_path = downloader.download_image(test_url)
+
+        if local_path:
+            print(f"✓ Download succeeded: {local_path}")
+
+            # Test URL conversion
+            server = LocalImageServer()
+            public_url = server.get_public_url(local_path)
+            print(f"Public URL: {public_url}")
+
+            print("\nTo start the image server, run:")
+            print("python3 image_downloader.py serve 8765")
+        else:
+            print("✗ Download failed")

+ 214 - 0
run_stage7.py

@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 7 独立运行脚本
+从 Stage 6 结果开始,进行深度解构分析
+支持指定 feature 和数量限制
+"""
+
+import os
+import json
+import logging
+import argparse
+from stage7_analyzer import Stage7DeconstructionAnalyzer
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    handlers=[
+        logging.FileHandler('stage7_standalone.log', encoding='utf-8'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Entry point."""
+    parser = argparse.ArgumentParser(
+        description='Stage 7 deep deconstruction analysis (standalone)',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog='''
+Examples:
+  # Process only the top 10 high-scoring notes for the "墨镜" feature
+  python3 run_stage7.py --feature "墨镜" --max-notes 10
+
+  # Process the "墨镜" and "耳环" features, at most 5 notes in total
+  python3 run_stage7.py --feature "墨镜" "耳环" --max-notes 5
+
+  # Process all features, sorted by time, first 20 notes
+  python3 run_stage7.py --sort-by time --max-notes 20
+
+  # Process only "墨镜", sorted by engagement, skipping the first 3
+  python3 run_stage7.py --feature "墨镜" --sort-by engagement --skip 3
+
+  # Lower the score threshold to process more notes
+  python3 run_stage7.py --feature "墨镜" --min-score 6.0 --max-notes 30
+
+  # Use a config file
+  python3 run_stage7.py --config stage7_config.json
+        '''
+    )
+
+    # Input/output configuration
+    parser.add_argument(
+        '--input',
+        default='output_v2/stage6_with_evaluations.json',
+        help='Path to the Stage 6 results file (default: output_v2/stage6_with_evaluations.json)'
+    )
+    parser.add_argument(
+        '--output',
+        default='output_v2/stage7_with_deconstruction.json',
+        help='Path to the Stage 7 output file (default: output_v2/stage7_with_deconstruction.json)'
+    )
+
+    # Feature filter (new)
+    parser.add_argument(
+        '--feature',
+        nargs='+',
+        default=None,
+        help='Original feature name(s) to process (one or more), e.g. --feature "墨镜" "耳环". If omitted, all features are processed'
+    )
+
+    # Filtering parameters
+    parser.add_argument(
+        '--min-score',
+        type=float,
+        default=8.0,
+        help='Minimum score threshold; only notes with a score >= this value are processed (default: 8.0)'
+    )
+    parser.add_argument(
+        '--skip',
+        type=int,
+        default=0,
+        help='Skip the first N notes (default: 0)'
+    )
+    parser.add_argument(
+        '--max-notes',
+        type=int,
+        default=None,
+        help='Maximum number of notes to process (default: None, unlimited)'
+    )
+    parser.add_argument(
+        '--sort-by',
+        choices=['score', 'time', 'engagement'],
+        default='score',
+        help='Sort order: score, time, or engagement (default: score)'
+    )
+
+    # API configuration
+    parser.add_argument(
+        '--api-url',
+        default='http://192.168.245.150:7000/what/analysis/single',
+        help='Deconstruction API address (default: http://192.168.245.150:7000/what/analysis/single)'
+    )
+    parser.add_argument(
+        '--timeout',
+        type=int,
+        default=30,
+        help='API timeout in seconds (default: 30)'
+    )
+    parser.add_argument(
+        '--max-retries',
+        type=int,
+        default=3,
+        help='Maximum number of API retries (default: 3)'
+    )
+
+    # Concurrency configuration
+    parser.add_argument(
+        '--max-workers',
+        type=int,
+        default=5,
+        help='Number of concurrent workers (default: 5)'
+    )
+
+    # Load from a config file
+    parser.add_argument(
+        '--config',
+        default=None,
+        help='Load parameters from a JSON config file'
+    )
+
+    args = parser.parse_args()
+
+    # If a config file was provided, load it
+    if args.config:
+        logger.info(f"Loading parameters from config file: {args.config}")
+        with open(args.config, 'r', encoding='utf-8') as f:
+            config = json.load(f)
+
+        # Values from the config file override command-line arguments
+        for key, value in config.items():
+            setattr(args, key.replace('-', '_'), value)
+
+    # Check that the input file exists
+    if not os.path.exists(args.input):
+        logger.error(f"Input file does not exist: {args.input}")
+        return
+
+    # Load the Stage 6 results
+    logger.info(f"Loading Stage 6 results: {args.input}")
+    with open(args.input, 'r', encoding='utf-8') as f:
+        stage6_results = json.load(f)
+
+    # Print the configuration
+    logger.info("=" * 60)
+    logger.info("Run configuration:")
+    logger.info(f"  Input file: {args.input}")
+    logger.info(f"  Output file: {args.output}")
+    if args.feature:
+        logger.info(f"  Target features: {', '.join(args.feature)}")
+    else:
+        logger.info("  Target features: all")
+    logger.info(f"  API address: {args.api_url}")
+    logger.info(f"  Minimum score threshold: {args.min_score}")
+    logger.info(f"  Skip first N: {args.skip}")
+    logger.info(f"  Max notes: {args.max_notes if args.max_notes else 'unlimited'}")
+    logger.info(f"  Sort by: {args.sort_by}")
+    logger.info(f"  Workers: {args.max_workers}")
+    logger.info(f"  API timeout: {args.timeout}s")
+    logger.info(f"  Max retries: {args.max_retries}")
+    logger.info("=" * 60)
+
+    # Create the analyzer
+    analyzer = Stage7DeconstructionAnalyzer(
+        api_url=args.api_url,
+        max_workers=args.max_workers,
+        max_notes=args.max_notes,
+        min_score=args.min_score,
+        skip_count=args.skip,
+        sort_by=args.sort_by,
+        timeout=args.timeout,
+        max_retries=args.max_retries,
+        output_dir=os.path.dirname(args.output) or 'output_v2',
+        target_features=args.feature  # pass the feature filter through
+    )
+
+    # Run the analysis
+    try:
+        stage7_results = analyzer.run(
+            stage6_results=stage6_results,
+            output_path=args.output
+        )
+
+        # Print a result summary
+        logger.info("\n" + "=" * 60)
+        logger.info("Done!")
+        logger.info(f"  Total matched notes: {stage7_results['metadata']['total_matched_notes']}")
+        logger.info(f"  Actually processed: {stage7_results['metadata']['processed_notes']}")
+        logger.info(f"  Succeeded: {stage7_results['metadata']['success_count']}")
+        logger.info(f"  Failed: {stage7_results['metadata']['failed_count']}")
+        logger.info(f"  Total time: {stage7_results['metadata']['processing_time_seconds']}s")
+        logger.info(f"  Results saved to: {args.output}")
+        logger.info("=" * 60)
+
+    except Exception as e:
+        logger.error(f"Run failed: {e}", exc_info=True)
+        raise
+
+
+if __name__ == '__main__':
+    main()
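
One detail of the config handling above is worth spelling out: values loaded via --config are applied after argparse has already parsed the command line, so they override any flags passed alongside them, and hyphenated JSON keys are mapped onto argparse dest names. A minimal self-contained sketch of that behavior (the inline file contents are illustrative):

    # Sketch: config values are applied after parsing, so they win over CLI flags.
    # With stage7_config.json containing {"max_notes": 10, "min-score": 7.5},
    # max_notes ends up as 10 even though --max-notes 50 was passed.
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument('--max-notes', type=int, default=None)
    parser.add_argument('--min-score', type=float, default=8.0)
    parser.add_argument('--config', default=None)
    args = parser.parse_args(['--config', 'stage7_config.json', '--max-notes', '50'])

    with open(args.config, encoding='utf-8') as f:
        for key, value in json.load(f).items():
            # 'min-score' -> 'min_score', matching argparse's dest naming
            setattr(args, key.replace('-', '_'), value)

    print(args.max_notes)  # 10, taken from the config file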

+ 550 - 0
stage7_analyzer.py

@@ -0,0 +1,550 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 7 分析器
+对 Stage 6 中完全匹配的帖子进行深度解构分析
+"""
+
+import os
+import json
+import time
+import logging
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, List, Any, Optional
+
+from stage7_api_client import DeconstructionAPIClient, map_note_to_api_format
+
+logger = logging.getLogger(__name__)
+
+try:
+    from tqdm import tqdm
+    TQDM_AVAILABLE = True
+except ImportError:
+    TQDM_AVAILABLE = False
+    logger.warning("tqdm is not installed; falling back to simple progress output. Install with: pip install tqdm")
+
+
+class Stage7DeconstructionAnalyzer:
+    """Stage 7: 完全匹配帖子的深度解构分析"""
+
+    def __init__(
+        self,
+        api_url: str = "http://192.168.245.150:7000/what/analysis/single",
+        max_workers: int = 5,
+        max_notes: Optional[int] = None,
+        min_score: float = 8.0,
+        skip_count: int = 0,
+        sort_by: str = 'score',
+        timeout: int = 30,
+        max_retries: int = 3,
+        output_dir: str = "output_v2",
+        enable_image_download: bool = True,
+        image_server_url: str = "http://localhost:8765",
+        image_download_dir: str = "downloaded_images",
+        target_features: Optional[List[str]] = None
+    ):
+        """
+        初始化 Stage 7 分析器
+
+        Args:
+            api_url: API 地址
+            max_workers: 并发数
+            max_notes: 最多处理多少个帖子(None = 不限制)
+            min_score: 最低分数阈值(只处理 >= 此分数的帖子)
+            skip_count: 跳过前 N 个
+            sort_by: 排序方式 ('score' | 'time' | 'engagement')
+            timeout: API 超时时间
+            max_retries: API 最大重试次数
+            output_dir: 输出目录
+            enable_image_download: 是否启用图片下载(下载小红书图片并转换为本地URL)
+            image_server_url: 图片服务器URL
+            image_download_dir: 图片下载目录
+            target_features: 指定要处理的原始特征列表(None = 处理所有特征)
+        """
+        self.max_workers = max_workers
+        self.max_notes = max_notes
+        self.min_score = min_score
+        self.skip_count = skip_count
+        self.sort_by = sort_by
+        self.output_dir = output_dir
+        self.enable_image_download = enable_image_download
+        self.target_features = target_features  # 新增:目标特征过滤
+
+        # Initialize the API client
+        self.api_client = DeconstructionAPIClient(
+            api_url=api_url,
+            timeout=timeout,
+            max_retries=max_retries
+        )
+
+        # Image downloading is deprecated; original image URLs are used directly.
+        # The parameters are kept for backward compatibility but no longer used.
+        if self.enable_image_download:
+            logger.warning("  Note: enable_image_download is deprecated; original image URLs will be used directly")
+
+    def extract_matched_notes_from_stage6(
+        self,
+        stage6_results: List[Dict]
+    ) -> List[Dict]:
+        """
+        从 Stage 6 结果中提取所有完全匹配的帖子
+
+        Args:
+            stage6_results: Stage 6 结果(列表)
+
+        Returns:
+            完全匹配的帖子列表
+        """
+        matched_notes = []
+
+        # Stage 6 结果是一个列表,每个元素是一个 feature_group
+        for feature_group in stage6_results:
+            original_feature = feature_group.get('原始特征名称', '')
+
+            # 如果指定了 target_features,只处理指定的特征
+            if self.target_features and original_feature not in self.target_features:
+                continue
+
+            # 遍历 组合评估结果_分组(这一层包含了 top10_searches)
+            for combo_group in feature_group.get('组合评估结果_分组', []):
+                # top10_searches 包含所有搜索结果
+                for search_item in combo_group.get('top10_searches', []):
+                    search_word = search_item.get('search_word', '')
+                    source_word = search_item.get('source_word', '')
+                    evaluation = search_item.get('evaluation_with_filter', {})
+
+                    # 检查是否有搜索结果
+                    if 'search_result' not in search_item:
+                        continue
+
+                    notes = search_item['search_result'].get('data', {}).get('data', [])
+
+                    # 遍历评估结果
+                    for note_eval in evaluation.get('notes_evaluation', []):
+                        score = note_eval.get('综合得分', 0)
+
+                        # 只处理完全匹配的(分数 >= min_score)
+                        if score >= self.min_score:
+                            note_index = note_eval.get('note_index', -1)
+                            if 0 <= note_index < len(notes):
+                                note = notes[note_index]
+
+                                matched_notes.append({
+                                    'note': note,
+                                    'note_card': note.get('note_card', {}),
+                                    'evaluation': note_eval,
+                                    'search_word': search_word,
+                                    'source_word': source_word,
+                                    'original_feature': original_feature
+                                })
+
+        return matched_notes
+
+    def sort_matched_notes(
+        self,
+        matched_notes: List[Dict]
+    ) -> List[Dict]:
+        """
+        对完全匹配的帖子进行排序
+
+        Args:
+            matched_notes: 匹配的帖子列表
+
+        Returns:
+            排序后的帖子列表
+        """
+        if self.sort_by == 'score':
+            # 按评分降序(优先处理高分帖子)
+            return sorted(
+                matched_notes,
+                key=lambda x: x['evaluation'].get('综合得分', 0),
+                reverse=True
+            )
+
+        elif self.sort_by == 'time':
+            # 按时间降序(优先处理最新帖子)
+            return sorted(
+                matched_notes,
+                key=lambda x: x['note_card'].get('publish_timestamp', 0),
+                reverse=True
+            )
+
+        elif self.sort_by == 'engagement':
+            # 按互动量降序(点赞+收藏+评论)
+            def calc_engagement(note_data):
+                interact = note_data['note_card'].get('interact_info', {})
+                return (
+                    interact.get('liked_count', 0) +
+                    interact.get('collected_count', 0) +
+                    interact.get('comment_count', 0)
+                )
+
+            return sorted(
+                matched_notes,
+                key=calc_engagement,
+                reverse=True
+            )
+
+        return matched_notes
+
+    def _save_intermediate_results(
+        self,
+        results: List[Dict],
+        output_path: str,
+        processed_count: int,
+        total_count: int,
+        start_time: float
+    ):
+        """
+        保存中间结果
+
+        Args:
+            results: 当前结果列表
+            output_path: 输出路径
+            processed_count: 已处理数量
+            total_count: 总数量
+            start_time: 开始时间
+        """
+        # 构建中间结果文件路径
+        base_dir = os.path.dirname(output_path) or 'output_v2'
+        base_name = os.path.basename(output_path)
+        name_without_ext = os.path.splitext(base_name)[0]
+
+        intermediate_path = os.path.join(
+            base_dir,
+            f"{name_without_ext}_partial_{processed_count}of{total_count}.json"
+        )
+
+        # 统计成功失败数
+        success_count = sum(1 for r in results if r['api_response']['status'] == 'success')
+        failed_count = len(results) - success_count
+
+        # 构建中间结果
+        intermediate_result = {
+            'metadata': {
+                'stage': 'stage7_partial',
+                'description': f'部分结果({processed_count}/{total_count})',
+                'processed_notes': len(results),
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'saved_at': datetime.now().isoformat(),
+                'processing_time_seconds': round(time.time() - start_time, 2)
+            },
+            'results': results
+        }
+
+        # 保存
+        os.makedirs(base_dir, exist_ok=True)
+        with open(intermediate_path, 'w', encoding='utf-8') as f:
+            json.dump(intermediate_result, f, ensure_ascii=False, indent=2)
+
+        logger.info(f"    已保存中间结果: {intermediate_path} ({processed_count}/{total_count})")
+
+    def process_single_note(
+        self,
+        matched_note_data: Dict,
+        index: int,
+        total: int
+    ) -> Dict:
+        """
+        处理单个帖子的解构分析
+
+        Args:
+            matched_note_data: 匹配的帖子数据
+            index: 当前索引(用于日志)
+            total: 总数(用于日志)
+
+        Returns:
+            处理结果
+        """
+        note = matched_note_data['note']
+        note_card = matched_note_data['note_card']
+        evaluation = matched_note_data['evaluation']
+        search_word = matched_note_data['search_word']
+        original_feature = matched_note_data['original_feature']
+
+        note_id = note.get('id', '')
+        note_title = note_card.get('display_title', '')[:30]  # 前30个字符
+
+        logger.info(f"[{index}/{total}] 解构分析: {note_id}")
+        logger.info(f"  标题: {note_title}...")
+        logger.info(f"  搜索词: {search_word}")
+        logger.info(f"  原始特征: {original_feature}")
+
+        # 构建 start_points(使用组合方案)
+        key_points = evaluation.get('关键匹配点', [])
+        start_points = [
+            original_feature,                    # 原始特征
+            search_word,                         # 搜索词
+            key_points[0] if key_points else ''  # 第一个关键匹配点
+        ]
+        start_points = [p for p in start_points if p]  # 过滤空值
+
+        logger.info(f"  start_points: {start_points}")
+
+        # 直接使用原始图片URL,不做任何处理
+        original_images = note_card.get('image_list', [])
+        if original_images:
+            logger.info(f"  图片数量: {len(original_images)}")
+
+        # 映射数据为 API 格式(直接使用原始图片URL)
+        api_payload = map_note_to_api_format(
+            note=note,
+            note_card=note_card,
+            evaluation=evaluation,
+            search_word=search_word,
+            original_feature=original_feature,
+            start_points=start_points,
+            processed_image_urls=None  # 不传递处理后的URL,使用原始URL
+        )
+
+        # 调用 API
+        start_time = time.time()
+        api_response = self.api_client.call_api(api_payload)
+        processing_time = (time.time() - start_time) * 1000  # 毫秒
+
+        # 构建结果
+        result = {
+            'note_id': note_id,
+            'search_word': search_word,
+            'original_feature': original_feature,
+            'source_word': matched_note_data['source_word'],
+            'evaluation_score': evaluation.get('综合得分', 0),
+            'evaluation_type': evaluation.get('匹配类型', ''),
+            'evaluation_confidence': evaluation.get('置信度', ''),
+            'key_matching_points': key_points,
+            'note_data': {
+                'title': note_card.get('display_title', ''),
+                'author': note_card.get('user', {}).get('nick_name', ''),
+                'link': f"https://www.xiaohongshu.com/explore/{note_id}"
+            },
+            'api_request': api_payload,
+            'api_response': api_response,
+            'processed_at': datetime.now().isoformat(),
+            'processing_time_ms': round(processing_time, 2)
+        }
+
+        if api_response['status'] == 'success':
+            logger.info(f"  ✓ 成功 ({processing_time:.0f}ms)")
+        else:
+            logger.error(f"  ✗ 失败: {api_response['error']}")
+
+        return result
+
+    def run(
+        self,
+        stage6_results: List[Dict],
+        output_path: Optional[str] = None
+    ) -> Dict:
+        """
+        Execute the Stage 7 deconstruction analysis.
+
+        Args:
+            stage6_results: Stage 6 results
+            output_path: output path (optional)
+
+        Returns:
+            Stage 7 results
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info("Stage 7: deep deconstruction analysis of fully matched notes")
+        logger.info("=" * 60)
+
+        # Print configuration parameters
+        logger.info("Configuration:")
+        logger.info(f"  API address: {self.api_client.api_url}")
+        if self.target_features:
+            logger.info(f"  Target features: {', '.join(self.target_features)}")
+        else:
+            logger.info("  Target features: all")
+        logger.info(f"  Minimum score threshold: {self.min_score}")
+        logger.info(f"  Workers: {self.max_workers}")
+        logger.info(f"  Max notes: {self.max_notes if self.max_notes else 'unlimited'}")
+        logger.info(f"  Skip first N: {self.skip_count}")
+        logger.info(f"  Sort by: {self.sort_by}")
+        logger.info(f"  API timeout: {self.api_client.timeout}s")
+        logger.info(f"  Max retries: {self.api_client.max_retries}")
+
+        # Default output path
+        if output_path is None:
+            output_path = os.path.join(self.output_dir, "stage7_with_deconstruction.json")
+
+        # 1. Extract the fully matched notes
+        matched_notes = self.extract_matched_notes_from_stage6(stage6_results)
+        total_matched = len(matched_notes)
+
+        logger.info(f"  Total fully matched notes: {total_matched} (score >= {self.min_score})")
+
+        if total_matched == 0:
+            logger.warning("  No fully matched notes found")
+            return {
+                'metadata': {
+                    'stage': 'stage7',
+                    'total_matched_notes': 0,
+                    'processed_notes': 0
+                },
+                'results': []
+            }
+
+        # 2. Sort
+        matched_notes = self.sort_matched_notes(matched_notes)
+        logger.info(f"  Sort by: {self.sort_by}")
+
+        # 3. Skip the first N
+        if self.skip_count > 0:
+            logger.info(f"  Skipping the first {self.skip_count}")
+            matched_notes = matched_notes[self.skip_count:]
+
+        # 4. Limit the count
+        if self.max_notes is not None and len(matched_notes) > self.max_notes:
+            logger.info(f"  Count limit: {self.max_notes}")
+            matched_notes = matched_notes[:self.max_notes]
+
+        to_process = len(matched_notes)
+        logger.info(f"  Actually processing: {to_process} notes")
+        logger.info(f"  Workers: {self.max_workers}")
+        logger.info(f"  API: {self.api_client.api_url}")
+
+        if to_process == 0:
+            logger.warning("  No notes to process")
+            return {
+                'metadata': {
+                    'stage': 'stage7',
+                    'total_matched_notes': total_matched,
+                    'processed_notes': 0,
+                    'skipped_notes': self.skip_count
+                },
+                'results': []
+            }
+
+        # 5. Process in parallel
+        results = []
+        start_time = time.time()
+        save_interval = 10  # save once every 10 processed notes
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = []
+            for idx, note_data in enumerate(matched_notes, start=1):
+                future = executor.submit(
+                    self.process_single_note,
+                    note_data,
+                    idx,
+                    to_process
+                )
+                futures.append(future)
+
+            # Collect results (with progress display)
+            if TQDM_AVAILABLE:
+                # Use a tqdm progress bar
+                logger.info("  Using a progress bar...")
+                iterator = tqdm(
+                    as_completed(futures),
+                    total=len(futures),
+                    desc="  Progress",
+                    unit="note",
+                    ncols=100
+                )
+            else:
+                # Simple progress display
+                iterator = as_completed(futures)
+
+            processed_count = 0
+            for future in iterator:
+                try:
+                    result = future.result()
+                    results.append(result)
+                    processed_count += 1
+
+                    # Incremental save (every save_interval notes)
+                    if processed_count % save_interval == 0:
+                        self._save_intermediate_results(
+                            results,
+                            output_path,
+                            processed_count,
+                            to_process,
+                            start_time
+                        )
+
+                    # Simple progress display (when tqdm is unavailable)
+                    if not TQDM_AVAILABLE and processed_count % 5 == 0:
+                        logger.info(f"  Progress: {processed_count}/{to_process}")
+
+                except Exception as e:
+                    logger.error(f"  Processing failed: {e}")
+
+        processing_time = time.time() - start_time
+
+        # 6. Statistics
+        success_count = sum(1 for r in results if r['api_response']['status'] == 'success')
+        failed_count = len(results) - success_count
+
+        logger.info(f"\n  Total time: {processing_time:.1f}s")
+        logger.info(f"  Succeeded: {success_count}")
+        logger.info(f"  Failed: {failed_count}")
+
+        # 7. Build the final result
+        final_result = {
+            'metadata': {
+                'stage': 'stage7',
+                'description': 'Deep deconstruction analysis of fully matched notes',
+                'target_features': self.target_features if self.target_features else 'all',
+                'total_matched_notes': total_matched,
+                'processed_notes': len(results),
+                'skipped_notes': self.skip_count,
+                'max_notes_limit': self.max_notes,
+                'sort_by': self.sort_by,
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'api_url': self.api_client.api_url,
+                'min_score_threshold': self.min_score,
+                'created_at': datetime.now().isoformat(),
+                'processing_time_seconds': round(processing_time, 2)
+            },
+            'results': results
+        }
+
+        # 8. Save the results ('or .' guards against a bare filename, where dirname is empty)
+        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(final_result, f, ensure_ascii=False, indent=2)
+
+        logger.info(f"  Results saved to: {output_path}")
+
+        return final_result
+
+
+def test_stage7_analyzer():
+    """Smoke-test the Stage 7 analyzer."""
+    # Read the Stage 6 results
+    stage6_path = "output_v2/stage6_with_evaluations.json"
+
+    if not os.path.exists(stage6_path):
+        print(f"Stage 6 results not found: {stage6_path}")
+        return
+
+    with open(stage6_path, 'r', encoding='utf-8') as f:
+        stage6_results = json.load(f)
+
+    # Create the analyzer
+    analyzer = Stage7DeconstructionAnalyzer(
+        max_workers=3,
+        max_notes=5,  # test with only 5 notes
+        skip_count=0,
+        sort_by='score'
+    )
+
+    # Run the analysis
+    stage7_results = analyzer.run(stage6_results)
+
+    print(f"\nProcessed {stage7_results['metadata']['processed_notes']} notes")
+    print(f"Succeeded: {stage7_results['metadata']['success_count']}")
+    print(f"Failed: {stage7_results['metadata']['failed_count']}")
+
+
+if __name__ == '__main__':
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    test_stage7_analyzer()
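
extract_matched_notes_from_stage6 walks a deeply nested structure. A minimal sketch of the Stage 6 shape it assumes, reconstructed from the .get() chains above; all values are illustrative, while the Chinese keys are the actual field names in the Stage 6 JSON:

    # Sketch: minimal Stage 6 input accepted by extract_matched_notes_from_stage6.
    stage6_results = [
        {
            "原始特征名称": "墨镜",            # original feature name
            "组合评估结果_分组": [              # grouped combination evaluations
                {
                    "top10_searches": [
                        {
                            "search_word": "猫咪服饰造型元素",
                            "source_word": "墨镜",
                            "search_result": {"data": {"data": [
                                {"id": "note-1", "note_card": {"display_title": "..."}},
                            ]}},
                            "evaluation_with_filter": {"notes_evaluation": [
                                # note_index points into search_result.data.data
                                {"note_index": 0, "综合得分": 9.0, "关键匹配点": ["..."]},
                            ]},
                        },
                    ],
                },
            ],
        },
    ]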

+ 253 - 0
stage7_api_client.py

@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Stage 7 API 客户端
+用于调用深度解构分析 API
+"""
+
+import time
+import logging
+import requests
+from datetime import datetime
+from typing import Dict, List, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+
+def map_note_to_api_format(
+    note: Dict,
+    note_card: Dict,
+    evaluation: Dict,
+    search_word: str,
+    original_feature: str,
+    start_points: List[str],
+    processed_image_urls: Optional[List[str]] = None
+) -> Dict:
+    """
+    将小红书笔记数据映射为 API 所需格式
+
+    Args:
+        note: 笔记原始数据
+        note_card: 笔记卡片信息
+        evaluation: 评估结果
+        search_word: 搜索词
+        original_feature: 原始特征
+        start_points: 起点列表
+        processed_image_urls: 处理后的图片URL列表(如果提供,将替代原始URL)
+
+    Returns:
+        API 请求格式的数据
+    """
+    # 构建小红书链接
+    note_id = note.get('id', '')
+    link = f"https://www.xiaohongshu.com/explore/{note_id}"
+
+    # 获取用户信息
+    user = note_card.get('user', {})
+    interact_info = note_card.get('interact_info', {})
+
+    # 获取发布时间(需要转换为毫秒时间戳)
+    publish_ts = note_card.get('publish_timestamp', 0)
+    publish_ts_ms = publish_ts * 1000 if publish_ts else 0
+
+    # 格式化发布日期
+    publish_date = ''
+    if publish_ts:
+        try:
+            publish_date = datetime.fromtimestamp(publish_ts).strftime('%Y-%m-%d %H:%M:%S')
+        except:
+            publish_date = ''
+
+    # 使用处理后的图片URL,如果没有则使用原始URL
+    image_urls = processed_image_urls if processed_image_urls else note_card.get('image_list', [])
+
+    return {
+        "post_data": {
+            "channel_content_id": note_id,
+            "link": link,
+            "xsec_token": "",  # 通常为空
+            "comment_count": interact_info.get('comment_count', 0),
+            "images": image_urls,
+            "like_count": interact_info.get('liked_count', 0),
+            "body_text": note_card.get('desc', ''),
+            "title": note_card.get('display_title', ''),
+            "collect_count": interact_info.get('collected_count', 0),
+            "channel_account_id": user.get('user_id', ''),
+            "channel_account_name": user.get('nick_name', ''),
+            "publish_timestamp": publish_ts_ms,
+            "modify_timestamp": publish_ts_ms,
+            "update_timestamp": int(time.time() * 1000),
+            "publish_date": publish_date,
+            "content_type": "note",
+            "video": {}  # 图文类型无视频
+        },
+        "question_data": {
+            "target": original_feature,      # 例如: "墨镜"
+            "start_points": start_points,    # 例如: ["墨镜", "猫咪服饰造型元素", "图片中猫咪佩戴墨镜"]
+            "query": search_word             # 例如: "猫咪服饰造型元素"
+        }
+    }
+
+
+class DeconstructionAPIClient:
+    """解构分析 API 客户端"""
+
+    def __init__(
+        self,
+        api_url: str = "http://192.168.245.150:7000/what/analysis/single",
+        timeout: int = 30,
+        max_retries: int = 3
+    ):
+        """
+        初始化 API 客户端
+
+        Args:
+            api_url: API 地址
+            timeout: 超时时间(秒)
+            max_retries: 最大重试次数
+        """
+        self.api_url = api_url
+        self.timeout = timeout
+        self.max_retries = max_retries
+
+    def call_api(
+        self,
+        api_payload: Dict
+    ) -> Dict:
+        """
+        调用解构 API(带重试机制)
+
+        Args:
+            api_payload: API 请求数据
+
+        Returns:
+            {
+                'status': 'success' | 'failed',
+                'result': API响应数据(成功时),
+                'error': 错误信息(失败时)
+            }
+        """
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.post(
+                    self.api_url,
+                    json=api_payload,
+                    headers={'Content-Type': 'application/json'},
+                    timeout=self.timeout
+                )
+
+                if response.status_code == 200:
+                    return {
+                        'status': 'success',
+                        'result': response.json(),
+                        'error': None
+                    }
+                else:
+                    error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
+
+                    # Retry if attempts remain
+                    if attempt < self.max_retries - 1:
+                        wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s
+                        logger.warning(f"    API call failed, retrying in {wait_time}s ({attempt + 1}/{self.max_retries})")
+                        time.sleep(wait_time)
+                        continue
+
+                    # The last retry failed as well
+                    return {
+                        'status': 'failed',
+                        'result': None,
+                        'error': error_msg
+                    }
+
+            except requests.Timeout:
+                if attempt < self.max_retries - 1:
+                    wait_time = 2 ** attempt
+                    logger.warning(f"    API 超时,{wait_time}s 后重试 ({attempt + 1}/{self.max_retries})")
+                    time.sleep(wait_time)
+                    continue
+
+                return {
+                    'status': 'failed',
+                    'result': None,
+                    'error': f'API timeout after {self.timeout}s'
+                }
+
+            except Exception as e:
+                if attempt < self.max_retries - 1:
+                    wait_time = 2 ** attempt
+                    logger.warning(f"    API 异常,{wait_time}s 后重试 ({attempt + 1}/{self.max_retries}): {e}")
+                    time.sleep(wait_time)
+                    continue
+
+                return {
+                    'status': 'failed',
+                    'result': None,
+                    'error': f'Exception: {str(e)}'
+                }
+
+        # Should be unreachable
+        return {
+            'status': 'failed',
+            'result': None,
+            'error': 'Max retries exceeded'
+        }
+
+
+def test_api_client():
+    """Test the API client."""
+    # Mock data
+    test_note = {
+        'id': '68ba3a27000000001c00f8fc'
+    }
+
+    test_note_card = {
+        'display_title': 'Test title',
+        'desc': 'Test body text',
+        'image_list': [
+            'https://example.com/image1.jpg',
+            'https://example.com/image2.jpg'
+        ],
+        'user': {
+            'user_id': '123456',
+            'nick_name': 'Test user'
+        },
+        'interact_info': {
+            'liked_count': 100,
+            'collected_count': 50,
+            'comment_count': 10
+        },
+        'publish_timestamp': 1640000000
+    }
+
+    test_evaluation = {
+        '综合得分': 9.0,
+        '关键匹配点': ['test matching point 1', 'test matching point 2']
+    }
+
+    # Test the data mapping
+    api_payload = map_note_to_api_format(
+        note=test_note,
+        note_card=test_note_card,
+        evaluation=test_evaluation,
+        search_word='test search word',
+        original_feature='test feature',
+        start_points=['start point 1', 'start point 2']
+    )
+
+    print("API Payload:")
+    import json
+    print(json.dumps(api_payload, ensure_ascii=False, indent=2))
+
+    # API call test (requires a live API service)
+    # client = DeconstructionAPIClient()
+    # result = client.call_api(api_payload)
+    # print("\nAPI Result:")
+    # print(json.dumps(result, ensure_ascii=False, indent=2))
+
+
+if __name__ == '__main__':
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    test_api_client()
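
A minimal usage sketch of the client, following the return contract documented in call_api (it assumes a reachable API endpoint and a payload already built by map_note_to_api_format):

    # Sketch: call the deconstruction API and branch on the documented contract.
    client = DeconstructionAPIClient(
        api_url="http://192.168.245.150:7000/what/analysis/single",
        timeout=30,
        max_retries=3,
    )
    result = client.call_api(api_payload)  # api_payload from map_note_to_api_format
    if result['status'] == 'success':
        print(result['result'])            # parsed JSON body returned by the API
    else:
        print(f"Deconstruction failed: {result['error']}")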

+ 13 - 0
stage7_config.json

@@ -0,0 +1,13 @@
+{
+  "input": "output_v2/stage6_with_evaluations.json",
+  "output": "output_v2/stage7_with_deconstruction.json",
+  "feature": ["墨镜"],
+  "min_score": 8.0,
+  "skip": 0,
+  "max_notes": 10,
+  "sort_by": "score",
+  "api_url": "http://192.168.245.150:7000/what/analysis/single",
+  "timeout": 30,
+  "max_retries": 3,
+  "max_workers": 5
+}

+ 125 - 27
visualize_stage6_results.py

@@ -21,6 +21,8 @@ def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
     """计算统计数据(包括评估结果)"""
     total_features = len(data)
     total_search_words = 0
+    searched_count = 0  # number of search words that were executed
+    not_searched_count = 0  # number of search words that were not executed
     total_notes = 0
     video_count = 0
     normal_count = 0
@@ -42,28 +44,34 @@ def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
 
             for search_item in search_items:
                 search_result = search_item.get('search_result', {})
-                notes = search_result.get('data', {}).get('data', [])
-                total_notes += len(notes)
-
-                # Count video vs. image note types
-                for note in notes:
-                    note_type = note.get('note_card', {}).get('type', '')
-                    if note_type == 'video':
-                        video_count += 1
-                    else:
-                        normal_count += 1
-
-                # Tally evaluation results
-                evaluation = search_item.get('evaluation_with_filter')
-                if evaluation:
-                    total_evaluated_notes += evaluation.get('total_notes', 0)
-                    total_filtered += evaluation.get('filtered_count', 0)
-
-                    stats = evaluation.get('statistics', {})
-                    match_complete += stats.get('完全匹配(8-10)', 0)
-                    match_similar += stats.get('相似匹配(6-7)', 0)
-                    match_weak += stats.get('弱相似(5-6)', 0)
-                    match_none += stats.get('无匹配(≤4)', 0)
+
+                # Track search status
+                if search_result:
+                    searched_count += 1
+                    notes = search_result.get('data', {}).get('data', [])
+                    total_notes += len(notes)
+
+                    # Count video vs. image note types
+                    for note in notes:
+                        note_type = note.get('note_card', {}).get('type', '')
+                        if note_type == 'video':
+                            video_count += 1
+                        else:
+                            normal_count += 1
+
+                    # Tally evaluation results
+                    evaluation = search_item.get('evaluation_with_filter')
+                    if evaluation:
+                        total_evaluated_notes += evaluation.get('total_notes', 0)
+                        total_filtered += evaluation.get('filtered_count', 0)
+
+                        stats = evaluation.get('statistics', {})
+                        match_complete += stats.get('完全匹配(8-10)', 0)
+                        match_similar += stats.get('相似匹配(6-7)', 0)
+                        match_weak += stats.get('弱相似(5-6)', 0)
+                        match_none += stats.get('无匹配(≤4)', 0)
+                else:
+                    not_searched_count += 1
 
    # Compute percentages
     total_remaining = total_evaluated_notes - total_filtered if total_evaluated_notes > 0 else 0
@@ -71,6 +79,9 @@ def calculate_statistics(data: List[Dict[str, Any]]) -> Dict[str, Any]:
     return {
         'total_features': total_features,
         'total_search_words': total_search_words,
+        'searched_count': searched_count,
+        'not_searched_count': not_searched_count,
+        'searched_percentage': round(searched_count / total_search_words * 100, 1) if total_search_words > 0 else 0,
         'total_notes': total_notes,
         'video_count': video_count,
         'normal_count': normal_count,
@@ -550,6 +561,33 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
             gap: 20px;
         }}
 
+        /* Empty-state styles */
+        .empty-state {{
+            text-align: center;
+            padding: 60px 40px;
+            color: #6b7280;
+        }}
+
+        .empty-icon {{
+            font-size: 48px;
+            margin-bottom: 16px;
+        }}
+
+        .empty-title {{
+            font-size: 16px;
+            font-weight: 600;
+            color: #374151;
+            margin-bottom: 8px;
+        }}
+
+        .empty-desc {{
+            font-size: 14px;
+            line-height: 1.6;
+            color: #9ca3af;
+            max-width: 400px;
+            margin: 0 auto;
+        }}
+
         .note-card {{
             border: 3px solid #e5e7eb;
             border-radius: 8px;
@@ -821,7 +859,15 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                 </div>
                 <div class="stat-item">
                     <div class="stat-value">🔍 {stats['total_search_words']}</div>
-                    <div class="stat-label">搜索词数</div>
+                    <div class="stat-label">搜索词总数</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">✅ {stats['searched_count']}</div>
+                    <div class="stat-label">已搜索 ({stats['searched_percentage']}%)</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value">⏸️ {stats['not_searched_count']}</div>
+                    <div class="stat-label">未搜索</div>
                 </div>
                 <div class="stat-item">
                     <div class="stat-value">📝 {stats['total_notes']}</div>
@@ -829,11 +875,11 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                 </div>
                 <div class="stat-item">
                     <div class="stat-value">🎬 {stats['video_count']}</div>
-                    <div class="stat-label">视频类型 ({stats['video_percentage']}%)</div>
+                    <div class="stat-label">视频 ({stats['video_percentage']}%)</div>
                 </div>
                 <div class="stat-item">
                     <div class="stat-value">📷 {stats['normal_count']}</div>
-                    <div class="stat-label">图文类型 ({stats['normal_percentage']}%)</div>
+                    <div class="stat-label">图文 ({stats['normal_percentage']}%)</div>
                 </div>
             </div>
             <div class="stats-row">
@@ -1004,7 +1050,6 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                                  data-block-id="${{blockId}}">
                                 <div class="search-word-text">
                                     🔍 ${{sw.search_word}}
-                                    <span class="search-word-score ${{scoreClass}}">${{score.toFixed(2)}}</span>
                                 </div>
                                 <div class="search-word-meta" style="font-size:11px;color:#9ca3af;margin-top:2px">
                                    Source: ${{sourceWord}}
@@ -1042,6 +1087,7 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
 
                     searches.forEach((sw, swIdx) => {{
                         const blockId = `block-${{featureIdx}}-${{groupIdx}}-${{swIdx}}`;
+                        const hasSearchResult = sw.search_result != null;
                         const searchResult = sw.search_result || {{}};
                         const notes = searchResult.data?.data || [];
 
@@ -1066,20 +1112,70 @@ def generate_html(data: List[Dict[str, Any]], stats: Dict[str, Any], output_path
                             if (filtered > 0) evalStats += `<span class="stat-badge eval filtered">⚫ 过滤:${{filtered}}</span>`;
                         }}
 
+                        // Build the result block
                         html += `
                             <div class="result-block" id="${{blockId}}">
                                 <div class="result-header">
                                     <div class="result-title">${{sw.search_word}}</div>
                                     <div class="result-stats">
+                        `;
+
+                        // Show different stats depending on search status
+                        if (!hasSearchResult) {{
+                            // Search was not executed
+                            html += `
+                                        <span class="stat-badge" style="background:#fef3c7;color:#92400e;font-weight:600">⏸️ Search not executed</span>
+                            `;
+                        }} else if (notes.length === 0) {{
+                            // Search completed but returned no results
+                            html += `
+                                        <span class="stat-badge">📝 0 notes</span>
+                                        <span class="stat-badge" style="background:#fee2e2;color:#991b1b;font-weight:600">❌ No matches found</span>
+                            `;
+                        }} else {{
+                            // Results present
+                            html += `
+                                        <span class="stat-badge">📝 ${{notes.length}} notes</span>
+                                        <span class="stat-badge">🎬 ${{videoCount}} videos</span>
+                                        <span class="stat-badge">📷 ${{normalCount}} images</span>
                                         ${{evalStats}}
+                            `;
+                        }}
+
+                        html += `
                                     </div>
                                 </div>
+                        `;
+
+                        // Show different content areas depending on search status
+                        if (!hasSearchResult) {{
+                            // Search was not executed - show an empty-state message
+                            html += `
+                                <div class="empty-state">
+                                    <div class="empty-icon">⏸️</div>
+                                    <div class="empty-title">This search word was not executed</div>
+                                    <div class="empty-desc">Skipped because of the search-count limits (--max-searches-per-feature and --max-searches-per-base-word)</div>
+                                </div>
+                            `;
+                        }} else if (notes.length === 0) {{
+                            // Search completed but returned no results
+                            html += `
+                                <div class="empty-state">
+                                    <div class="empty-icon">❌</div>
+                                    <div class="empty-title">Search completed, but no matching notes were found</div>
+                                    <div class="empty-desc">This search word was executed, but Xiaohongshu returned 0 results</div>
+                                </div>
+                            `;
+                        }} else {{
+                            // Results present - render the notes grid
+                            html += `
                                 <div class="notes-grid">
                                     ${{notes.map((note, noteIdx) => renderNoteCard(note, featureIdx, groupIdx, swIdx, noteIdx)).join('')}}
                                 </div>
+                            `;
+                        }}
+
+                        html += `
                             </div>
                         `;
                     }});
@@ -1359,7 +1455,9 @@ def main():
     stats = calculate_statistics(data)
     print(f"✓ 统计完成:")
     print(f"  - 原始特征: {stats['total_features']}")
-    print(f"  - 搜索词: {stats['total_search_words']}")
+    print(f"  - 搜索词总数: {stats['total_search_words']}")
+    print(f"  - 已搜索: {stats['searched_count']} ({stats['searched_percentage']}%)")
+    print(f"  - 未搜索: {stats['not_searched_count']}")
     print(f"  - 帖子总数: {stats['total_notes']}")
     print(f"  - 视频: {stats['video_count']} ({stats['video_percentage']}%)")
     print(f"  - 图文: {stats['normal_count']} ({stats['normal_percentage']}%)")