
feat: add data processing module (data_processing)

Add two data-processing scripts that extract and transform Xiaohongshu post data:

1. extract_features_from_posts.py
   - Extracts feature names and source information from post deconstruction results
   - Automatically fetches post details (title, body text, like count, etc.)
   - Supports time filtering so data published later than the current posts is never used (prevents look-ahead leakage)
   - Outputs the full mapping from features to their source posts

2. extract_feature_categories.py
   - Extracts feature category information from pattern aggregation results
   - Builds the category hierarchy (level depth, leaf nodes, child nodes)
   - Supports time filtering and prints warnings for filtered posts
   - Outputs the feature-to-category mapping and the hierarchy structure

Features:
- Automatic path resolution; runs from any directory
- Time filtering that prevents look-ahead (time-travel) leakage
- Caching to avoid repeated API requests
- Detailed logging with progress and warnings
- Full module documentation (README.md)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
yangxiaohui, 2 weeks ago
Commit 9863bdabf3

+ 160 - 0
script/data_processing/README.md

@@ -0,0 +1,160 @@
+# Data Processing Module
+
+This module provides data extraction and transformation utilities for feature extraction and category mapping of Xiaohongshu posts.
+
+## Scripts
+
+### 1. extract_features_from_posts.py
+
+Extracts feature names and their source information from the `过去帖子_what解构结果` directory.
+
+**What it does:**
+- Extracts features from each post's three-point deconstruction (inspiration points, purpose points, key points)
+- Automatically fetches post details (title, body text, images, like count, etc.)
+- Filters data by the earliest publish time among the current posts, preventing look-ahead leakage
+- Outputs a mapping from feature names to their source posts
+
+**Input:**
+- `data/data_1117/过去帖子_what解构结果/*.json` - deconstruction results of past posts
+- `data/data_1117/当前帖子_what解构结果/*.json` - deconstruction results of the current posts (used to derive the time threshold)
+
+**Output:**
+- `data/data_1117/特征名称_帖子来源.json` - mapping of feature names to their source posts
+
+**Usage:**
+```bash
+# Run from the project root
+python script/data_processing/extract_features_from_posts.py
+
+# Or run from any directory
+python /path/to/script/data_processing/extract_features_from_posts.py
+```
+
+**Output format:**
+```json
+{
+    "灵感点": [
+        {
+            "特征名称": "猫咪",
+            "特征来源": [
+                {
+                    "点的名称": "猫咪照片拍出了专业模特感",
+                    "点的描述": "...",
+                    "帖子id": "69114f150000000007001f30",
+                    "帖子详情": {
+                        "title": "老师 我家孩子可以做童模吗",
+                        "body_text": "...",
+                        "like_count": 765,
+                        "publish_time": "2025-11-10 10:33:58",
+                        ...
+                    }
+                }
+            ]
+        }
+    ],
+    "目的点": [...],
+    "关键点": [...]
+}
+```
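Downstream code can consume this file by iterating the per-category arrays; a minimal sketch, using an inline sample that mirrors the schema above instead of reading the real output file:

```python
# Inline sample mirroring the documented schema of 特征名称_帖子来源.json;
# in practice you would json.load() the real file instead.
sample = {
    "灵感点": [
        {
            "特征名称": "猫咪",
            "特征来源": [
                {"点的名称": "猫咪照片拍出了专业模特感", "帖子id": "69114f150000000007001f30"}
            ],
        }
    ],
    "目的点": [],
    "关键点": [],
}

# Count features and sources per category, like the script's own summary.
for category, features in sample.items():
    source_count = sum(len(f["特征来源"]) for f in features)
    print(f"{category}: {len(features)} features, {source_count} sources")
```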
+
+---
+
+### 2. extract_feature_categories.py
+
+Extracts feature names and their category hierarchy from `过去帖子_pattern聚合结果.json`.
+
+**What it does:**
+- Extracts the mapping from feature names to their categories
+- Builds the category hierarchy (level depth, whether a node is a leaf, child nodes, etc.)
+- Filters data by the earliest publish time among the current posts
+- Prints a warning for every post that is filtered out
+
+**Input:**
+- `data/data_1117/过去帖子_pattern聚合结果.json` - pattern aggregation results
+- `data/data_1117/当前帖子_what解构结果/*.json` - the current posts (used to derive the time threshold)
+
+**Output:**
+- `data/data_1117/特征名称_分类映射.json` - mapping of feature names to categories
+- `data/data_1117/分类层级映射.json` - category hierarchy
+
+**Usage:**
+```bash
+# Run from the project root
+python script/data_processing/extract_feature_categories.py
+
+# Or run from any directory
+python /path/to/script/data_processing/extract_feature_categories.py
+```
+
+**Output format 1 - 特征名称_分类映射.json:**
+```json
+{
+    "灵感点": {
+        "猫咪": {
+            "所属分类": ["物体", "实质"]
+        }
+    },
+    "目的点": {...},
+    "关键点": {...}
+}
+```
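Looking up one feature's categories is a plain nested-dict access; the inline dict below just echoes the example above:

```python
# Inline sample echoing 特征名称_分类映射.json above.
mapping = {
    "灵感点": {
        "猫咪": {"所属分类": ["物体", "实质"]},
    },
}

# Category path of a single feature.
cats = mapping["灵感点"]["猫咪"]["所属分类"]
print(cats)
```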
+
+**Output format 2 - 分类层级映射.json:**
+```json
+{
+    "灵感点": {
+        "形式": {
+            "几级分类": 1,
+            "是否是叶子分类": false,
+            "下一级": [
+                {
+                    "节点类型": "分类",
+                    "节点名称": "概念"
+                },
+                {
+                    "节点类型": "分类",
+                    "节点名称": "方式"
+                }
+            ]
+        }
+    },
+    "目的点": {...},
+    "关键点": {...}
+}
+```
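Because every category of a section is stored flat in one dict keyed by name, queries such as "all leaf categories" reduce to one comprehension; a sketch over an inline sample (the level-2 entries are illustrative additions):

```python
# Inline sample mirroring 分类层级映射.json above; the level-2 nodes
# ("概念", "方式") are made up here for illustration.
hierarchy = {
    "形式": {
        "几级分类": 1,
        "是否是叶子分类": False,
        "下一级": [
            {"节点类型": "分类", "节点名称": "概念"},
            {"节点类型": "分类", "节点名称": "方式"},
        ],
    },
    "概念": {"几级分类": 2, "是否是叶子分类": True, "下一级": []},
    "方式": {"几级分类": 2, "是否是叶子分类": True, "下一级": []},
}

# Leaf categories are the ones the scripts mark with 是否是叶子分类 = true.
leaves = [name for name, info in hierarchy.items() if info["是否是叶子分类"]]
print(leaves)
```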
+
+---
+
+## Time-Filtering Mechanism
+
+Both scripts implement time filtering to avoid using "future" data (look-ahead leakage):
+
+1. **Derive the threshold**: take the earliest publish time among the posts in `当前帖子_what解构结果`
+2. **Filter rule**: keep only past posts whose publish time is **strictly earlier** than the threshold
+3. **Warnings**: print the ID and publish time of every filtered post
+
+**Example output:**
+```
+当前帖子最早发布时间: 2025-11-07 15:08:59
+
+正在应用时间过滤 (< 2025-11-07 15:08:59),避免使用晚于当前帖子的数据...
+  ⚠️  过滤掉帖子 69114f150000000007001f30 (发布时间: 2025-11-10 10:33:58,晚于阈值)
+  ⚠️  过滤掉帖子 6915dfc400000000070224d9 (发布时间: 2025-11-14 10:22:16,晚于阈值)
+
+过滤掉 2 个帖子(穿越),保留 17 个帖子
+```
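The comparison itself needs no datetime parsing: zero-padded "YYYY-MM-DD HH:MM:SS" strings sort chronologically, so both scripts compare them directly. A sketch with made-up post IDs and times:

```python
# Timestamps in "YYYY-MM-DD HH:MM:SS" format compare chronologically as
# plain strings, so no datetime parsing is needed.
threshold = "2025-11-07 15:08:59"   # earliest current-post time
posts = {                           # made-up past posts
    "post_a": "2025-11-05 09:00:00",
    "post_b": "2025-11-10 10:33:58",
}

kept = {pid for pid, t in posts.items() if t < threshold}   # strictly earlier
dropped = set(posts) - kept
print(f"kept={kept}, dropped={dropped}")
```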
+
+---
+
+## Dependencies
+
+- `script.detail.get_xiaohongshu_detail` - fetches Xiaohongshu post details
+
+---
+
+## Notes
+
+1. The scripts reuse cached post details to avoid repeated requests
+2. If the current-posts directory is missing or empty, time filtering is disabled
+3. After filtering, only features that still have at least one source are kept, so no empty entries are written
+4. All paths are resolved relative to the project root, so the scripts can be run from any location
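The last point holds because each script derives the project root from its own file location rather than the working directory; a sketch of the same resolution using a hypothetical install path:

```python
from pathlib import Path

# Hypothetical location of the script on disk.
script_path = Path("/srv/project/script/data_processing/extract_features_from_posts.py")

# Three .parent hops: file -> data_processing -> script -> project root
# (the scripts do the same with Path(__file__).parent.parent.parent).
project_root = script_path.parent.parent.parent
data_dir = project_root / "data" / "data_1117"
print(data_dir)
```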

+ 14 - 0
script/data_processing/__init__.py

@@ -0,0 +1,14 @@
+"""
+Data processing module
+
+Provides data extraction and transformation utilities
+"""
+
+from .extract_features_from_posts import main as extract_features_main
+from .extract_feature_categories import main as extract_categories_main
+
+__all__ = [
+    'extract_features_main',
+    'extract_categories_main',
+]
+__version__ = '1.0.0'

+ 466 - 0
script/data_processing/extract_feature_categories.py

@@ -0,0 +1,466 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Extract feature names and their category hierarchy from 过去帖子_pattern聚合结果.json
+"""
+
+import json
+from pathlib import Path
+from typing import Dict, List, Any, Optional, Set
+import sys
+import re
+
+# Add the project root to sys.path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from script.detail import get_xiaohongshu_detail
+
+
+def extract_post_id_from_filename(filename: str) -> str:
+    """从文件名中提取帖子ID"""
+    match = re.match(r'^([^_]+)_', filename)
+    if match:
+        return match.group(1)
+    return ""
+
+
+def get_post_detail(post_id: str) -> Optional[Dict]:
+    """获取帖子详情"""
+    try:
+        detail = get_xiaohongshu_detail(post_id)
+        return detail
+    except Exception as e:
+        print(f"  警告: 获取帖子 {post_id} 详情失败: {e}")
+        return None
+
+
+def get_earliest_publish_time(current_posts_dir: Path) -> Optional[str]:
+    """
+    获取当前帖子目录中最早的发布时间
+
+    Args:
+        current_posts_dir: 当前帖子目录路径
+
+    Returns:
+        最早的发布时间字符串,格式为 "YYYY-MM-DD HH:MM:SS"
+    """
+    if not current_posts_dir.exists():
+        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
+        return None
+
+    json_files = list(current_posts_dir.glob("*.json"))
+    if not json_files:
+        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
+        return None
+
+    print(f"\n正在获取当前帖子的发布时间...")
+    print(f"找到 {len(json_files)} 个当前帖子")
+
+    earliest_time = None
+    for file_path in json_files:
+        post_id = extract_post_id_from_filename(file_path.name)
+        if not post_id:
+            continue
+
+        try:
+            detail = get_post_detail(post_id)
+            if detail and 'publish_time' in detail:
+                publish_time = detail['publish_time']
+                if earliest_time is None or publish_time < earliest_time:
+                    earliest_time = publish_time
+                    print(f"  更新最早时间: {publish_time} (帖子: {post_id})")
+        except Exception as e:
+            print(f"  警告: 获取帖子 {post_id} 发布时间失败: {e}")
+
+    if earliest_time:
+        print(f"\n当前帖子最早发布时间: {earliest_time}")
+    else:
+        print("\n警告: 未能获取到任何当前帖子的发布时间")
+
+    return earliest_time
+
+
+def collect_all_post_ids(data: Dict) -> Set[str]:
+    """
+    收集数据中的所有帖子ID
+
+    Args:
+        data: 聚合结果数据
+
+    Returns:
+        帖子ID集合
+    """
+    post_ids = set()
+
+    def traverse_node(node):
+        if isinstance(node, dict):
+            # Collect IDs from an explicit post list
+            if "帖子列表" in node and isinstance(node["帖子列表"], list):
+                post_ids.update(node["帖子列表"])
+
+            # Collect IDs from the feature list
+            if "特征列表" in node and isinstance(node["特征列表"], list):
+                for feature in node["特征列表"]:
+                    if "帖子id" in feature:
+                        post_ids.add(feature["帖子id"])
+
+            # Recurse into child values
+            for key, value in node.items():
+                if key not in ["_meta", "帖子数", "特征数", "帖子列表"]:
+                    traverse_node(value)
+        elif isinstance(node, list):
+            for item in node:
+                traverse_node(item)
+
+    for category in ["灵感点列表", "目的点", "关键点列表"]:
+        if category in data:
+            traverse_node(data[category])
+
+    return post_ids
+
+
+def filter_data_by_time(data: Dict, time_filter: str) -> tuple[Dict, Set[str]]:
+    """
+    根据发布时间过滤数据
+
+    Args:
+        data: 原始聚合结果数据
+        time_filter: 时间过滤阈值
+
+    Returns:
+        (过滤后的数据, 被过滤掉的帖子ID集合)
+    """
+    # Collect all post IDs
+    all_post_ids = collect_all_post_ids(data)
+    print(f"\n数据中包含 {len(all_post_ids)} 个不同的帖子")
+
+    # Fetch details for every post
+    print("正在获取帖子详情...")
+    post_details = {}
+    for i, post_id in enumerate(all_post_ids, 1):
+        print(f"[{i}/{len(all_post_ids)}] 获取帖子 {post_id} 的详情...")
+        detail = get_post_detail(post_id)
+        if detail:
+            post_details[post_id] = detail
+
+    # Filter by time: drop posts published at or after the threshold (prevents look-ahead)
+    print(f"\n正在应用时间过滤 (< {time_filter}),避免使用晚于当前帖子的数据...")
+    filtered_post_ids = set()
+    valid_post_ids = set()
+
+    for post_id, detail in post_details.items():
+        publish_time = detail.get('publish_time', '')
+        if publish_time < time_filter:
+            valid_post_ids.add(post_id)
+        else:
+            filtered_post_ids.add(post_id)
+            print(f"  ⚠️  过滤掉帖子 {post_id} (发布时间: {publish_time},晚于阈值)")
+
+    print(f"\n过滤统计: 过滤掉 {len(filtered_post_ids)} 个帖子(穿越),保留 {len(valid_post_ids)} 个帖子")
+
+    # Prune the data tree
+    filtered_data = filter_node_by_post_ids(data, valid_post_ids)
+
+    return filtered_data, filtered_post_ids
+
+
+def filter_node_by_post_ids(node: Any, valid_post_ids: Set[str]) -> Any:
+    """
+    递归过滤节点,只保留有效帖子的数据
+
+    Args:
+        node: 当前节点
+        valid_post_ids: 有效的帖子ID集合
+
+    Returns:
+        过滤后的节点
+    """
+    if isinstance(node, dict):
+        filtered_node = {}
+
+        # Filter the feature list
+        if "特征列表" in node:
+            filtered_features = []
+            for feature in node["特征列表"]:
+                if "帖子id" in feature and feature["帖子id"] in valid_post_ids:
+                    filtered_features.append(feature)
+
+            if filtered_features:
+                filtered_node["特征列表"] = filtered_features
+                # Carry over and recompute the metadata
+                if "_meta" in node:
+                    filtered_node["_meta"] = node["_meta"].copy()
+                filtered_node["帖子数"] = len(set(f["帖子id"] for f in filtered_features if "帖子id" in f))
+                filtered_node["特征数"] = len(filtered_features)
+
+                # Rebuild the post list
+                filtered_node["帖子列表"] = list(set(f["帖子id"] for f in filtered_features if "帖子id" in f))
+
+        # Recurse into child nodes
+        for key, value in node.items():
+            if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
+                continue
+
+            filtered_child = filter_node_by_post_ids(value, valid_post_ids)
+            if filtered_child:  # keep only non-empty children
+                filtered_node[key] = filtered_child
+
+        return filtered_node if filtered_node else None
+
+    elif isinstance(node, list):
+        # Drop list items that were filtered down to nothing
+        return [filtered_item
+                for filtered_item in (filter_node_by_post_ids(item, valid_post_ids) for item in node)
+                if filtered_item is not None]
+
+    else:
+        return node
+
+
+def extract_categories_from_node(node: Dict, current_path: List[str], result: Dict[str, Dict]):
+    """
+    递归遍历树形结构,提取特征名称及其分类路径
+
+    Args:
+        node: 当前节点
+        current_path: 当前分类路径(从下到上)
+        result: 结果字典,用于存储特征名称到分类的映射
+    """
+    # If the current node carries a feature list, record each feature's path
+    if "特征列表" in node:
+        for feature in node["特征列表"]:
+            feature_name = feature.get("特征名称")
+            if feature_name:
+                # Store the category path for this feature
+                result[feature_name] = {
+                    "所属分类": current_path.copy()
+                }
+
+    # Recurse into child nodes
+    for key, value in node.items():
+        # Skip metadata fields
+        if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
+            continue
+
+        # Dict values are sub-categories: prepend the key and recurse
+        if isinstance(value, dict):
+            new_path = [key] + current_path
+            extract_categories_from_node(value, new_path, result)
+
+
+def process_category(category_data: Dict, category_key: str) -> Dict[str, Dict]:
+    """
+    处理单个分类(灵感点列表/目的点/关键点列表)
+
+    Args:
+        category_data: 分类数据
+        category_key: 分类键名
+
+    Returns:
+        特征名称到分类的映射字典
+    """
+    result = {}
+
+    if isinstance(category_data, dict):
+        extract_categories_from_node(category_data, [], result)
+
+    return result
+
+
+def build_category_hierarchy_from_node(
+    node: Dict,
+    category_hierarchy: Dict[str, Dict],
+    current_level: int = 1,
+    parent_categories: List[str] = None
+):
+    """
+    递归构建分类层级结构
+
+    Args:
+        node: 当前节点
+        category_hierarchy: 分类层级字典
+        current_level: 当前层级(从1开始)
+        parent_categories: 父级分类列表(从顶到下)
+    """
+    if parent_categories is None:
+        parent_categories = []
+
+    # Walk every key of the current node
+    for key, value in node.items():
+        # Skip metadata fields
+        if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
+            continue
+
+        if isinstance(value, dict):
+            # Initialize this category's entry
+            if key not in category_hierarchy:
+                category_hierarchy[key] = {
+                    "几级分类": current_level,
+                    "是否是叶子分类": False,
+                    "下一级": []
+                }
+
+            # Collect child category names and feature names
+            next_level_items = []
+
+            # Check for sub-categories
+            has_sub_categories = False
+            for sub_key, sub_value in value.items():
+                if sub_key not in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
+                    if isinstance(sub_value, dict):
+                        has_sub_categories = True
+                        next_level_items.append({
+                            "节点类型": "分类",
+                            "节点名称": sub_key
+                        })
+
+            # Add feature names from the feature list, if any
+            if "特征列表" in value:
+                for feature in value["特征列表"]:
+                    feature_name = feature.get("特征名称")
+                    if feature_name:
+                        next_level_items.append({
+                            "节点类型": "特征",
+                            "节点名称": feature_name
+                        })
+
+            # Record the children
+            category_hierarchy[key]["下一级"] = next_level_items
+
+            # No sub-categories means this node is a leaf
+            if not has_sub_categories:
+                category_hierarchy[key]["是否是叶子分类"] = True
+
+            # Recurse into the child node
+            new_parent_categories = parent_categories + [key]
+            build_category_hierarchy_from_node(
+                value,
+                category_hierarchy,
+                current_level + 1,
+                new_parent_categories
+            )
+
+
+def build_category_hierarchy(category_data: Dict) -> Dict[str, Dict]:
+    """
+    构建分类名称到下一级的映射关系
+
+    Args:
+        category_data: 分类数据
+
+    Returns:
+        分类层级映射字典
+    """
+    category_hierarchy = {}
+
+    if isinstance(category_data, dict):
+        build_category_hierarchy_from_node(category_data, category_hierarchy)
+
+    return category_hierarchy
+
+
+def main():
+    # Input/output paths (defaults to data/data_1117 under the project root)
+    script_dir = Path(__file__).parent
+    project_root = script_dir.parent.parent
+    data_dir = project_root / "data" / "data_1117"
+
+    input_file = data_dir / "过去帖子_pattern聚合结果.json"
+    current_posts_dir = data_dir / "当前帖子_what解构结果"
+    output_file_1 = data_dir / "特征名称_分类映射.json"
+    output_file_2 = data_dir / "分类层级映射.json"
+
+    # Get the earliest publish time of the current posts
+    earliest_time = get_earliest_publish_time(current_posts_dir)
+
+    # Read the input file
+    print(f"\n正在读取文件: {input_file}")
+    with open(input_file, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    # Apply time filtering when a threshold is available
+    filtered_post_ids = set()
+    if earliest_time:
+        print("\n" + "="*60)
+        print("开始应用时间过滤...")
+        data, filtered_post_ids = filter_data_by_time(data, earliest_time)
+
+        if filtered_post_ids:
+            print(f"\n⚠️  警告: 以下 {len(filtered_post_ids)} 个帖子因发布时间晚于阈值被过滤:")
+            for post_id in sorted(filtered_post_ids):
+                print(f"  - {post_id}")
+    else:
+        print("\n未启用时间过滤")
+
+    # Output 1: mapping from feature names to categories
+    output_1 = {}
+
+    if "灵感点列表" in data:
+        print("正在处理: 灵感点列表 (特征名称映射)")
+        output_1["灵感点"] = process_category(data["灵感点列表"], "灵感点列表")
+        print(f"  提取了 {len(output_1['灵感点'])} 个特征")
+
+    if "目的点" in data:
+        print("正在处理: 目的点 (特征名称映射)")
+        output_1["目的点"] = process_category(data["目的点"], "目的点")
+        print(f"  提取了 {len(output_1['目的点'])} 个特征")
+
+    if "关键点列表" in data:
+        print("正在处理: 关键点列表 (特征名称映射)")
+        output_1["关键点"] = process_category(data["关键点列表"], "关键点列表")
+        print(f"  提取了 {len(output_1['关键点'])} 个特征")
+
+    # Save output 1
+    print(f"\n正在保存结果到: {output_file_1}")
+    with open(output_file_1, "w", encoding="utf-8") as f:
+        json.dump(output_1, f, ensure_ascii=False, indent=4)
+
+    print("完成!")
+    if earliest_time:
+        print(f"\n总计 (特征名称映射,已过滤掉发布时间 >= {earliest_time} 的帖子):")
+    else:
+        print(f"\n总计 (特征名称映射):")
+    for category, features in output_1.items():
+        print(f"  {category}: {len(features)} 个特征")
+
+    # Output 2: the category hierarchy
+    print("\n" + "="*60)
+    print("开始生成分类层级映射...")
+    output_2 = {}
+
+    if "灵感点列表" in data:
+        print("正在处理: 灵感点列表 (分类层级)")
+        output_2["灵感点"] = build_category_hierarchy(data["灵感点列表"])
+        print(f"  提取了 {len(output_2['灵感点'])} 个分类")
+
+    if "目的点" in data:
+        print("正在处理: 目的点 (分类层级)")
+        output_2["目的点"] = build_category_hierarchy(data["目的点"])
+        print(f"  提取了 {len(output_2['目的点'])} 个分类")
+
+    if "关键点列表" in data:
+        print("正在处理: 关键点列表 (分类层级)")
+        output_2["关键点"] = build_category_hierarchy(data["关键点列表"])
+        print(f"  提取了 {len(output_2['关键点'])} 个分类")
+
+    # Save output 2
+    print(f"\n正在保存结果到: {output_file_2}")
+    with open(output_file_2, "w", encoding="utf-8") as f:
+        json.dump(output_2, f, ensure_ascii=False, indent=4)
+
+    print("完成!")
+    if earliest_time:
+        print(f"\n总计 (分类层级映射,已过滤掉发布时间 >= {earliest_time} 的帖子):")
+    else:
+        print(f"\n总计 (分类层级映射):")
+    for category, hierarchies in output_2.items():
+        print(f"  {category}: {len(hierarchies)} 个分类")
+
+
+if __name__ == "__main__":
+    main()

+ 409 - 0
script/data_processing/extract_features_from_posts.py

@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+从过去帖子_what解构结果目录中提取特征名称及其来源信息
+"""
+
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+import re
+import sys
+
+# Add the project root to sys.path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from script.detail import get_xiaohongshu_detail
+
+
+def extract_post_id_from_filename(filename: str) -> str:
+    """从文件名中提取帖子ID"""
+    match = re.match(r'^([^_]+)_', filename)
+    if match:
+        return match.group(1)
+    return ""
+
+
+def get_post_detail(post_id: str) -> Optional[Dict]:
+    """
+    获取帖子详情
+
+    Args:
+        post_id: 帖子ID
+
+    Returns:
+        帖子详情字典,如果获取失败则返回None
+    """
+    try:
+        detail = get_xiaohongshu_detail(post_id)
+        return detail
+    except Exception as e:
+        print(f"  警告: 获取帖子 {post_id} 详情失败: {e}")
+        return None
+
+
+def extract_features_from_point(point_data: Dict, post_id: str, point_name: str, point_description: str) -> List[Dict]:
+    """
+    从单个点(灵感点/目的点/关键点)中提取特征信息
+
+    Args:
+        point_data: 点的数据
+        post_id: 帖子ID
+        point_name: 点的名称
+        point_description: 点的描述
+
+    Returns:
+        特征列表
+    """
+    features = []
+
+    # Check for an extracted-features field ("提取的特征")
+    if "提取的特征" in point_data and isinstance(point_data["提取的特征"], list):
+        for feature in point_data["提取的特征"]:
+            if "特征名称" in feature:
+                features.append({
+                    "特征名称": feature["特征名称"],
+                    "点的名称": point_name,
+                    "点的描述": point_description,
+                    "帖子id": post_id
+                })
+
+    return features
+
+
+def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
+    """
+    处理单个JSON文件,提取所有特征信息
+
+    Args:
+        file_path: JSON文件路径
+
+    Returns:
+        包含灵感点、目的点、关键点的特征字典
+    """
+    result = {
+        "灵感点": {},
+        "目的点": {},
+        "关键点": {}
+    }
+
+    # Extract the post ID from the file name
+    post_id = extract_post_id_from_filename(file_path.name)
+
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+
+        # Pull out the three-point deconstruction
+        if "三点解构" not in data:
+            return result
+
+        three_points = data["三点解构"]
+
+        # Process inspiration points: the three sections share one structure,
+        # so handle 全新内容 / 共性差异 / 共性内容 in a single loop
+        if "灵感点" in three_points:
+            inspiration = three_points["灵感点"]
+
+            for section in ["全新内容", "共性差异", "共性内容"]:
+                if section in inspiration and isinstance(inspiration[section], list):
+                    for item in inspiration[section]:
+                        point_name = item.get("灵感点", "")
+                        point_desc = item.get("描述", "")
+                        features = extract_features_from_point(item, post_id, point_name, point_desc)
+
+                        for feature in features:
+                            feature_name = feature["特征名称"]
+                            if feature_name not in result["灵感点"]:
+                                result["灵感点"][feature_name] = []
+                            result["灵感点"][feature_name].append({
+                                "点的名称": feature["点的名称"],
+                                "点的描述": feature["点的描述"],
+                                "帖子id": feature["帖子id"]
+                            })
+
+        # Process purpose points
+        if "目的点" in three_points:
+            purpose = three_points["目的点"]
+
+            if "purposes" in purpose and isinstance(purpose["purposes"], list):
+                for item in purpose["purposes"]:
+                    point_name = item.get("目的点", "")
+                    point_desc = item.get("描述", "")
+                    features = extract_features_from_point(item, post_id, point_name, point_desc)
+
+                    for feature in features:
+                        feature_name = feature["特征名称"]
+                        if feature_name not in result["目的点"]:
+                            result["目的点"][feature_name] = []
+                        result["目的点"][feature_name].append({
+                            "点的名称": feature["点的名称"],
+                            "点的描述": feature["点的描述"],
+                            "帖子id": feature["帖子id"]
+                        })
+
+        # Process key points
+        if "关键点" in three_points:
+            key_points = three_points["关键点"]
+
+            if "key_points" in key_points and isinstance(key_points["key_points"], list):
+                for item in key_points["key_points"]:
+                    point_name = item.get("关键点", "")
+                    point_desc = item.get("描述", "")
+                    features = extract_features_from_point(item, post_id, point_name, point_desc)
+
+                    for feature in features:
+                        feature_name = feature["特征名称"]
+                        if feature_name not in result["关键点"]:
+                            result["关键点"][feature_name] = []
+                        result["关键点"][feature_name].append({
+                            "点的名称": feature["点的名称"],
+                            "点的描述": feature["点的描述"],
+                            "帖子id": feature["帖子id"]
+                        })
+
+    except Exception as e:
+        print(f"处理文件 {file_path.name} 时出错: {e}")
+
+    return result
+
+
+def merge_results(all_results: List[Dict]) -> Dict:
+    """
+    合并所有文件的提取结果
+
+    Args:
+        all_results: 所有文件的结果列表
+
+    Returns:
+        合并后的结果
+    """
+    merged = {
+        "灵感点": {},
+        "目的点": {},
+        "关键点": {}
+    }
+
+    for result in all_results:
+        for category in ["灵感点", "目的点", "关键点"]:
+            for feature_name, sources in result[category].items():
+                if feature_name not in merged[category]:
+                    merged[category][feature_name] = {"来源": []}
+                merged[category][feature_name]["来源"].extend(sources)
+
+    return merged
+
+
+def convert_to_array_format(merged_dict: Dict, fetch_details: bool = True, time_filter: Optional[str] = None) -> Dict:
+    """
+    将字典格式转换为数组格式,并添加帖子详情
+
+    Args:
+        merged_dict: 字典格式的结果
+        fetch_details: 是否获取帖子详情,默认为True
+        time_filter: 时间过滤阈值,只保留发布时间>=该时间的帖子,格式为 "YYYY-MM-DD HH:MM:SS"
+
+    Returns:
+        数组格式的结果
+    """
+    result = {
+        "灵感点": [],
+        "目的点": [],
+        "关键点": []
+    }
+
+    # Collect all post IDs whose details are needed
+    post_ids = set()
+    if fetch_details:
+        for category in ["灵感点", "目的点", "关键点"]:
+            for feature_name, data in merged_dict[category].items():
+                for source in data["来源"]:
+                    post_ids.add(source["帖子id"])
+
+        # Fetch post details in bulk
+        print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
+        post_details = {}
+        for i, post_id in enumerate(post_ids, 1):
+            print(f"[{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
+            detail = get_post_detail(post_id)
+            if detail:
+                post_details[post_id] = detail
+
+        print(f"成功获取 {len(post_details)} 个帖子详情")
+
+        # If time filtering is enabled, drop posts published at or after the threshold (prevents look-ahead)
+        if time_filter:
+            print(f"\n正在应用时间过滤 (< {time_filter}),避免使用晚于当前帖子的数据...")
+            kept_post_ids = set()
+            filtered_count = 0
+            for post_id, detail in post_details.items():
+                publish_time = detail.get('publish_time', '')
+                if publish_time < time_filter:
+                    kept_post_ids.add(post_id)
+                else:
+                    filtered_count += 1
+                    print(f"  ⚠️  过滤掉帖子 {post_id} (发布时间: {publish_time},晚于阈值)")
+
+            print(f"过滤掉 {filtered_count} 个帖子(穿越),保留 {len(kept_post_ids)} 个帖子")
+            # Keep only the details that pass the time filter
+            post_details = {pid: detail for pid, detail in post_details.items() if pid in kept_post_ids}
+
+    # Convert to array format and attach post details
+    for category in ["灵感点", "目的点", "关键点"]:
+        for feature_name, data in merged_dict[category].items():
+            # Attach the post detail to each source
+            enhanced_sources = []
+            for source in data["来源"]:
+                # With time filtering on, skip sources whose post was filtered out
+                if fetch_details and time_filter and source["帖子id"] not in post_details:
+                    continue
+
+                enhanced_source = source.copy()
+                if fetch_details and source["帖子id"] in post_details:
+                    enhanced_source["帖子详情"] = post_details[source["帖子id"]]
+                enhanced_sources.append(enhanced_source)
+
+            # Keep only features that still have at least one source
+            if enhanced_sources:
+                result[category].append({
+                    "特征名称": feature_name,
+                    "特征来源": enhanced_sources
+                })
+
+    return result
+
+
+def get_earliest_publish_time(current_posts_dir: Path) -> Optional[str]:
+    """
+    获取当前帖子目录中最早的发布时间
+
+    Args:
+        current_posts_dir: 当前帖子目录路径
+
+    Returns:
+        最早的发布时间字符串,格式为 "YYYY-MM-DD HH:MM:SS"
+    """
+    if not current_posts_dir.exists():
+        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
+        return None
+
+    json_files = list(current_posts_dir.glob("*.json"))
+    if not json_files:
+        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
+        return None
+
+    print(f"\n正在获取当前帖子的发布时间...")
+    print(f"找到 {len(json_files)} 个当前帖子")
+
+    earliest_time = None
+    for file_path in json_files:
+        post_id = extract_post_id_from_filename(file_path.name)
+        if not post_id:
+            continue
+
+        try:
+            detail = get_post_detail(post_id)
+            if detail and 'publish_time' in detail:
+                publish_time = detail['publish_time']
+                if earliest_time is None or publish_time < earliest_time:
+                    earliest_time = publish_time
+                    print(f"  更新最早时间: {publish_time} (帖子: {post_id})")
+        except Exception as e:
+            print(f"  警告: 获取帖子 {post_id} 发布时间失败: {e}")
+
+    if earliest_time:
+        print(f"\n当前帖子最早发布时间: {earliest_time}")
+    else:
+        print("\n警告: 未能获取到任何当前帖子的发布时间")
+
+    return earliest_time
+
+
+def main():
+    # Input/output paths (defaults to data/data_1117 under the project root)
+    script_dir = Path(__file__).parent
+    project_root = script_dir.parent.parent
+    data_dir = project_root / "data" / "data_1117"
+
+    input_dir = data_dir / "过去帖子_what解构结果"
+    current_posts_dir = data_dir / "当前帖子_what解构结果"
+    output_file = data_dir / "特征名称_帖子来源.json"
+
+    # Get the earliest publish time of the current posts
+    earliest_time = get_earliest_publish_time(current_posts_dir)
+
+    print(f"\n正在扫描目录: {input_dir}")
+
+    # Collect all JSON files
+    json_files = list(input_dir.glob("*.json"))
+    print(f"找到 {len(json_files)} 个JSON文件")
+
+    # Process every file
+    all_results = []
+    for i, file_path in enumerate(json_files, 1):
+        print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
+        result = process_single_file(file_path)
+        all_results.append(result)
+
+    # Merge the results
+    print("\n正在合并结果...")
+    merged_result = merge_results(all_results)
+
+    # Convert to array format (with time filtering)
+    print("正在转换为数组格式...")
+    final_result = convert_to_array_format(merged_result, fetch_details=True, time_filter=earliest_time)
+
+    # Summary statistics
+    if earliest_time:
+        print(f"\n提取统计 (已过滤掉发布时间 >= {earliest_time} 的帖子):")
+    else:
+        print(f"\n提取统计:")
+    for category in ["灵感点", "目的点", "关键点"]:
+        feature_count = len(final_result[category])
+        source_count = sum(len(item["特征来源"]) for item in final_result[category])
+        print(f"  {category}: {feature_count} 个特征, {source_count} 个来源")
+
+    # Save the result
+    print(f"\n正在保存结果到: {output_file}")
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(final_result, f, ensure_ascii=False, indent=4)
+
+    print("完成!")
+
+
+if __name__ == "__main__":
+    main()