2 周之前 · 9863bdabf3
--- a/script/data_processing/README.md
+++ b/script/data_processing/README.md
@@ -0,0 +1,160 @@
 
															+# 数据处理模块
														
 
															+
														
 
															+本模块提供数据提取和转换功能，用于处理小红书帖子的特征提取和分类映射。
														
 
															+
														
 
															+## 脚本说明
														
 
															+
														
 
															+### 1. extract_features_from_posts.py
														
 
															+
														
 
															+从 `过去帖子_what解构结果` 目录中提取特征名称及其来源信息。
														
 
															+
														
 
															+**功能：**
														
 
															+- 从帖子的三点解构（灵感点、目的点、关键点）中提取特征
														
 
															+- 自动获取帖子详情（标题、正文、图片、点赞数等）
														
 
															+- 根据当前帖子的最早发布时间过滤数据，避免时间穿越
														
 
															+- 输出特征名称到帖子来源的映射关系
														
 
															+
														
 
															+**输入：**
														
 
															+- `data/data_1117/过去帖子_what解构结果/*.json` - 过去帖子的解构结果
														
 
															+- `data/data_1117/当前帖子_what解构结果/*.json` - 当前帖子的解构结果（用于获取时间阈值）
														
 
															+
														
 
															+**输出：**
														
 
															+- `data/data_1117/特征名称_帖子来源.json` - 特征名称及其帖子来源映射
														
 
															+
														
 
															+**使用方法：**
														
 
															+```bash
														
 
															+# 从项目根目录运行
														
 
															+python script/data_processing/extract_features_from_posts.py
														
 
															+
														
 
															+# 或者从任意目录运行
														
 
															+python /path/to/script/data_processing/extract_features_from_posts.py
														
 
															+```
														
 
															+
														
 
															+**输出格式：**
														
 
															+```json
														
 
															+{
														
 
															+    "灵感点": [
														
 
															+        {
														
 
															+            "特征名称": "猫咪",
														
 
															+            "特征来源": [
														
 
															+                {
														
 
															+                    "点的名称": "猫咪照片拍出了专业模特感",
														
 
															+                    "点的描述": "...",
														
 
															+                    "帖子id": "69114f150000000007001f30",
														
 
															+                    "帖子详情": {
														
 
															+                        "title": "老师 我家孩子可以做童模吗",
														
 
															+                        "body_text": "...",
														
 
															+                        "like_count": 765,
														
 
															+                        "publish_time": "2025-11-10 10:33:58",
														
 
															+                        ...
														
 
															+                    }
														
 
															+                }
														
 
															+            ]
														
 
															+        }
														
 
															+    ],
														
 
															+    "目的点": [...],
														
 
															+    "关键点": [...]
														
 
															+}
														
 
															+```
														
 
															+
														
 
															+---
														
 
															+
														
 
															+### 2. extract_feature_categories.py
														
 
															+
														
 
															+从 `过去帖子_pattern聚合结果.json` 中提取特征名称及其分类层级信息。
														
 
															+
														
 
															+**功能：**
														
 
															+- 提取特征名称到分类的映射关系
														
 
															+- 构建分类层级结构（包括层级深度、是否叶子节点、下一级节点等）
														
 
															+- 根据当前帖子的最早发布时间过滤数据
														
 
															+- 打印被过滤掉的帖子警告信息
														
 
															+
														
 
															+**输入：**
														
 
															+- `data/data_1117/过去帖子_pattern聚合结果.json` - Pattern聚合结果
														
 
															+- `data/data_1117/当前帖子_what解构结果/*.json` - 当前帖子（用于获取时间阈值）
														
 
															+
														
 
															+**输出：**
														
 
															+- `data/data_1117/特征名称_分类映射.json` - 特征名称到分类的映射
														
 
															+- `data/data_1117/分类层级映射.json` - 分类层级结构
														
 
															+
														
 
															+**使用方法：**
														
 
															+```bash
														
 
															+# 从项目根目录运行
														
 
															+python script/data_processing/extract_feature_categories.py
														
 
															+
														
 
															+# 或者从任意目录运行
														
 
															+python /path/to/script/data_processing/extract_feature_categories.py
														
 
															+```
														
 
															+
														
 
															+**输出格式1 - 特征名称_分类映射.json：**
														
 
															+```json
														
 
															+{
														
 
															+    "灵感点": {
														
 
															+        "猫咪": {
														
 
															+            "所属分类": ["物体", "实质"]
														
 
															+        }
														
 
															+    },
														
 
															+    "目的点": {...},
														
 
															+    "关键点": {...}
														
 
															+}
														
 
															+```
														
 
															+
														
 
															+**输出格式2 - 分类层级映射.json：**
														
 
															+```json
														
 
															+{
														
 
															+    "灵感点": {
														
 
															+        "形式": {
														
 
															+            "几级分类": 1,
														
 
															+            "是否是叶子分类": false,
														
 
															+            "下一级": [
														
 
															+                {
														
 
															+                    "节点类型": "分类",
														
 
															+                    "节点名称": "概念"
														
 
															+                },
														
 
															+                {
														
 
															+                    "节点类型": "分类",
														
 
															+                    "节点名称": "方式"
														
 
															+                }
														
 
															+            ]
														
 
															+        }
														
 
															+    },
														
 
															+    "目的点": {...},
														
 
															+    "关键点": {...}
														
 
															+}
														
 
															+```
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## 时间过滤机制
														
 
															+
														
 
															+两个脚本都实现了时间过滤功能，避免使用"未来"的数据（时间穿越）：
														
 
															+
														
 
															+1. **获取时间阈值**：从 `当前帖子_what解构结果` 目录中获取最早的帖子发布时间
														
 
															+2. **过滤条件**：只保留发布时间**早于**阈值的过去帖子
														
 
															+3. **警告信息**：打印被过滤掉的帖子ID和发布时间
														
 
															+
														
 
															+**示例输出：**
														
 
															+```
														
 
															+当前帖子最早发布时间: 2025-11-07 15:08:59
														
 
															+
														
 
															+正在应用时间过滤 (< 2025-11-07 15:08:59)，避免使用晚于当前帖子的数据...
														
 
															+  ⚠️  过滤掉帖子 69114f150000000007001f30 (发布时间: 2025-11-10 10:33:58，晚于阈值)
														
 
															+  ⚠️  过滤掉帖子 6915dfc400000000070224d9 (发布时间: 2025-11-14 10:22:16，晚于阈值)
														
 
															+
														
 
															+过滤统计: 过滤掉 2 个帖子（穿越），保留 17 个帖子
														
 
															+```
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## 依赖
														
 
															+
														
 
															+- `script.detail.get_xiaohongshu_detail` - 获取小红书帖子详情
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## 注意事项
														
 
															+
														
 
															+1. 脚本会自动使用缓存的帖子详情，避免重复请求
														
 
															+2. 如果当前帖子目录不存在、为空，或无法获取任何当前帖子的发布时间，将不启用时间过滤
														
 
															+3. 过滤后只保留有来源的特征，避免空数据
														
 
															+4. 所有路径都相对于项目根目录，可以从任意位置运行脚本
														
--- a/script/data_processing/__init__.py
+++ b/script/data_processing/__init__.py
@@ -0,0 +1,14 @@
 
															+"""
														
 
															+数据处理模块
														
 
															+
														
 
															+提供数据提取和转换功能
														
 
															+"""
														
 
															+
														
 
# Re-export each script's ``main`` under a distinct alias so both entry
# points can be imported from the package without a name clash.
from .extract_features_from_posts import main as extract_features_main
from .extract_feature_categories import main as extract_categories_main

# Public names of this package.
__all__ = [
    'extract_features_main',
    'extract_categories_main',
]
# Package version string.
__version__ = '1.0.0'
														
--- a/script/data_processing/extract_feature_categories.py
+++ b/script/data_processing/extract_feature_categories.py
@@ -0,0 +1,466 @@
 
															+#!/usr/bin/env python3
														
 
															+# -*- coding: utf-8 -*-
														
 
															+"""
														
 
															+从过去帖子_pattern聚合结果.json中提取特征名称及其对应的分类层级
														
 
															+"""
														
 
															+
														
 
															+import json
														
 
															+from pathlib import Path
														
 
															+from typing import Dict, List, Any, Optional, Set
														
 
															+import sys
														
 
															+import re
														
 
															+
														
 
															+# 添加项目根目录到路径
														
 
															+project_root = Path(__file__).parent.parent.parent
														
 
															+sys.path.insert(0, str(project_root))
														
 
															+
														
 
															+from script.detail import get_xiaohongshu_detail
														
 
															+
														
 
															+
														
 
															+def extract_post_id_from_filename(filename: str) -> str:
														
 
															+    """从文件名中提取帖子ID"""
														
 
															+    match = re.match(r'^([^_]+)_', filename)
														
 
															+    if match:
														
 
															+        return match.group(1)
														
 
															+    return ""
														
 
															+
														
 
															+
														
 
															+def get_post_detail(post_id: str) -> Optional[Dict]:
														
 
															+    """获取帖子详情"""
														
 
															+    try:
														
 
															+        detail = get_xiaohongshu_detail(post_id)
														
 
															+        return detail
														
 
															+    except Exception as e:
														
 
															+        print(f"  警告: 获取帖子 {post_id} 详情失败: {e}")
														
 
															+        return None
														
 
															+
														
 
															+
														
 
															+def get_earliest_publish_time(current_posts_dir: Path) -> Optional[str]:
														
 
															+    """
														
 
															+    获取当前帖子目录中最早的发布时间
														
 
															+
														
 
															+    Args:
														
 
															+        current_posts_dir: 当前帖子目录路径
														
 
															+
														
 
															+    Returns:
														
 
															+        最早的发布时间字符串，格式为 "YYYY-MM-DD HH:MM:SS"
														
 
															+    """
														
 
															+    if not current_posts_dir.exists():
														
 
															+        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
														
 
															+        return None
														
 
															+
														
 
															+    json_files = list(current_posts_dir.glob("*.json"))
														
 
															+    if not json_files:
														
 
															+        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
														
 
															+        return None
														
 
															+
														
 
															+    print(f"\n正在获取当前帖子的发布时间...")
														
 
															+    print(f"找到 {len(json_files)} 个当前帖子")
														
 
															+
														
 
															+    earliest_time = None
														
 
															+    for file_path in json_files:
														
 
															+        post_id = extract_post_id_from_filename(file_path.name)
														
 
															+        if not post_id:
														
 
															+            continue
														
 
															+
														
 
															+        try:
														
 
															+            detail = get_post_detail(post_id)
														
 
															+            if detail and 'publish_time' in detail:
														
 
															+                publish_time = detail['publish_time']
														
 
															+                if earliest_time is None or publish_time < earliest_time:
														
 
															+                    earliest_time = publish_time
														
 
															+                    print(f"  更新最早时间: {publish_time} (帖子: {post_id})")
														
 
															+        except Exception as e:
														
 
															+            print(f"  警告: 获取帖子 {post_id} 发布时间失败: {e}")
														
 
															+
														
 
															+    if earliest_time:
														
 
															+        print(f"\n当前帖子最早发布时间: {earliest_time}")
														
 
															+    else:
														
 
															+        print("\n警告: 未能获取到任何当前帖子的发布时间")
														
 
															+
														
 
															+    return earliest_time
														
 
															+
														
 
															+
														
 
															+def collect_all_post_ids(data: Dict) -> Set[str]:
														
 
															+    """
														
 
															+    收集数据中的所有帖子ID
														
 
															+
														
 
															+    Args:
														
 
															+        data: 聚合结果数据
														
 
															+
														
 
															+    Returns:
														
 
															+        帖子ID集合
														
 
															+    """
														
 
															+    post_ids = set()
														
 
															+
														
 
															+    def traverse_node(node):
														
 
															+        if isinstance(node, dict):
														
 
															+            # 检查是否有帖子列表
														
 
															+            if "帖子列表" in node and isinstance(node["帖子列表"], list):
														
 
															+                post_ids.update(node["帖子列表"])
														
 
															+
														
 
															+            # 检查是否有特征列表
														
 
															+            if "特征列表" in node and isinstance(node["特征列表"], list):
														
 
															+                for feature in node["特征列表"]:
														
 
															+                    if "帖子id" in feature:
														
 
															+                        post_ids.add(feature["帖子id"])
														
 
															+
														
 
															+            # 递归遍历
														
 
															+            for key, value in node.items():
														
 
															+                if key not in ["_meta", "帖子数", "特征数", "帖子列表"]:
														
 
															+                    traverse_node(value)
														
 
															+        elif isinstance(node, list):
														
 
															+            for item in node:
														
 
															+                traverse_node(item)
														
 
															+
														
 
															+    for category in ["灵感点列表", "目的点", "关键点列表"]:
														
 
															+        if category in data:
														
 
															+            traverse_node(data[category])
														
 
															+
														
 
															+    return post_ids
														
 
															+
														
 
															+
														
 
															+def filter_data_by_time(data: Dict, time_filter: str) -> tuple[Dict, Set[str]]:
														
 
															+    """
														
 
															+    根据发布时间过滤数据
														
 
															+
														
 
															+    Args:
														
 
															+        data: 原始聚合结果数据
														
 
															+        time_filter: 时间过滤阈值
														
 
															+
														
 
															+    Returns:
														
 
															+        (过滤后的数据, 被过滤掉的帖子ID集合)
														
 
															+    """
														
 
															+    # 收集所有帖子ID
														
 
															+    all_post_ids = collect_all_post_ids(data)
														
 
															+    print(f"\n数据中包含 {len(all_post_ids)} 个不同的帖子")
														
 
															+
														
 
															+    # 获取所有帖子的详情
														
 
															+    print("正在获取帖子详情...")
														
 
															+    post_details = {}
														
 
															+    for i, post_id in enumerate(all_post_ids, 1):
														
 
															+        print(f"[{i}/{len(all_post_ids)}] 获取帖子 {post_id} 的详情...")
														
 
															+        detail = get_post_detail(post_id)
														
 
															+        if detail:
														
 
															+            post_details[post_id] = detail
														
 
															+
														
 
															+    # 根据时间过滤（过滤掉发布时间晚于等于阈值的帖子，避免穿越）
														
 
															+    print(f"\n正在应用时间过滤 (< {time_filter})，避免使用晚于当前帖子的数据...")
														
 
															+    filtered_post_ids = set()
														
 
															+    valid_post_ids = set()
														
 
															+
														
 
															+    for post_id, detail in post_details.items():
														
 
															+        publish_time = detail.get('publish_time', '')
														
 
															+        if publish_time < time_filter:
														
 
															+            valid_post_ids.add(post_id)
														
 
															+        else:
														
 
															+            filtered_post_ids.add(post_id)
														
 
															+            print(f"  ⚠️  过滤掉帖子 {post_id} (发布时间: {publish_time}，晚于阈值)")
														
 
															+
														
 
															+    print(f"\n过滤统计: 过滤掉 {len(filtered_post_ids)} 个帖子（穿越），保留 {len(valid_post_ids)} 个帖子")
														
 
															+
														
 
															+    # 过滤数据
														
 
															+    filtered_data = filter_node_by_post_ids(data, valid_post_ids)
														
 
															+
														
 
															+    return filtered_data, filtered_post_ids
														
 
															+
														
 
															+
														
 
															+def filter_node_by_post_ids(node: Any, valid_post_ids: Set[str]) -> Any:
														
 
															+    """
														
 
															+    递归过滤节点，只保留有效帖子的数据
														
 
															+
														
 
															+    Args:
														
 
															+        node: 当前节点
														
 
															+        valid_post_ids: 有效的帖子ID集合
														
 
															+
														
 
															+    Returns:
														
 
															+        过滤后的节点
														
 
															+    """
														
 
															+    if isinstance(node, dict):
														
 
															+        filtered_node = {}
														
 
															+
														
 
															+        # 处理特征列表
														
 
															+        if "特征列表" in node:
														
 
															+            filtered_features = []
														
 
															+            for feature in node["特征列表"]:
														
 
															+                if "帖子id" in feature and feature["帖子id"] in valid_post_ids:
														
 
															+                    filtered_features.append(feature)
														
 
															+
														
 
															+            if filtered_features:
														
 
															+                filtered_node["特征列表"] = filtered_features
														
 
															+                # 更新元数据
														
 
															+                if "_meta" in node:
														
 
															+                    filtered_node["_meta"] = node["_meta"].copy()
														
 
															+                filtered_node["帖子数"] = len(set(f["帖子id"] for f in filtered_features if "帖子id" in f))
														
 
															+                filtered_node["特征数"] = len(filtered_features)
														
 
															+
														
 
															+                # 更新帖子列表
														
 
															+                filtered_node["帖子列表"] = list(set(f["帖子id"] for f in filtered_features if "帖子id" in f))
														
 
															+
														
 
															+        # 递归处理子节点
														
 
															+        for key, value in node.items():
														
 
															+            if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
														
 
															+                continue
														
 
															+
														
 
															+            filtered_child = filter_node_by_post_ids(value, valid_post_ids)
														
 
															+            if filtered_child:  # 只添加非空的子节点
														
 
															+                filtered_node[key] = filtered_child
														
 
															+
														
 
															+        return filtered_node if filtered_node else None
														
 
															+
														
 
															+    elif isinstance(node, list):
														
 
															+        return [filter_node_by_post_ids(item, valid_post_ids) for item in node]
														
 
															+
														
 
															+    else:
														
 
															+        return node
														
 
															+
														
 
															+
														
 
															+def extract_categories_from_node(node: Dict, current_path: List[str], result: Dict[str, Dict]):
														
 
															+    """
														
 
															+    递归遍历树形结构，提取特征名称及其分类路径
														
 
															+
														
 
															+    Args:
														
 
															+        node: 当前节点
														
 
															+        current_path: 当前分类路径（从下到上）
														
 
															+        result: 结果字典，用于存储特征名称到分类的映射
														
 
															+    """
														
 
															+    # 如果当前节点包含"特征列表"
														
 
															+    if "特征列表" in node:
														
 
															+        for feature in node["特征列表"]:
														
 
															+            feature_name = feature.get("特征名称")
														
 
															+            if feature_name:
														
 
															+                # 将分类路径存储到结果中
														
 
															+                result[feature_name] = {
														
 
															+                    "所属分类": current_path.copy()
														
 
															+                }
														
 
															+
														
 
															+    # 递归处理子节点
														
 
															+    for key, value in node.items():
														
 
															+        # 跳过特殊字段
														
 
															+        if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
														
 
															+            continue
														
 
															+
														
 
															+        # 如果值是字典，继续递归
														
 
															+        if isinstance(value, dict):
														
 
															+            # 将当前key添加到路径中
														
 
															+            new_path = [key] + current_path
														
 
															+            extract_categories_from_node(value, new_path, result)
														
 
															+
														
 
															+
														
 
															+def process_category(category_data: Dict, category_key: str) -> Dict[str, Dict]:
														
 
															+    """
														
 
															+    处理单个分类（灵感点列表/目的点/关键点列表）
														
 
															+
														
 
															+    Args:
														
 
															+        category_data: 分类数据
														
 
															+        category_key: 分类键名
														
 
															+
														
 
															+    Returns:
														
 
															+        特征名称到分类的映射字典
														
 
															+    """
														
 
															+    result = {}
														
 
															+
														
 
															+    if isinstance(category_data, dict):
														
 
															+        extract_categories_from_node(category_data, [], result)
														
 
															+
														
 
															+    return result
														
 
															+
														
 
															+
														
 
															+def build_category_hierarchy_from_node(
														
 
															+    node: Dict,
														
 
															+    category_hierarchy: Dict[str, Dict],
														
 
															+    current_level: int = 1,
														
 
															+    parent_categories: List[str] = None
														
 
															+):
														
 
															+    """
														
 
															+    递归构建分类层级结构
														
 
															+
														
 
															+    Args:
														
 
															+        node: 当前节点
														
 
															+        category_hierarchy: 分类层级字典
														
 
															+        current_level: 当前层级（从1开始）
														
 
															+        parent_categories: 父级分类列表（从顶到下）
														
 
															+    """
														
 
															+    if parent_categories is None:
														
 
															+        parent_categories = []
														
 
															+
														
 
															+    # 遍历当前节点的所有键
														
 
															+    for key, value in node.items():
														
 
															+        # 跳过特殊字段
														
 
															+        if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
														
 
															+            continue
														
 
															+
														
 
															+        if isinstance(value, dict):
														
 
															+            # 初始化当前分类的信息
														
 
															+            if key not in category_hierarchy:
														
 
															+                category_hierarchy[key] = {
														
 
															+                    "几级分类": current_level,
														
 
															+                    "是否是叶子分类": False,
														
 
															+                    "下一级": []
														
 
															+                }
														
 
															+
														
 
															+            # 收集下一级的分类名称和特征名称
														
 
															+            next_level_items = []
														
 
															+
														
 
															+            # 检查是否有子分类
														
 
															+            has_sub_categories = False
														
 
															+            for sub_key, sub_value in value.items():
														
 
															+                if sub_key not in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
														
 
															+                    if isinstance(sub_value, dict):
														
 
															+                        has_sub_categories = True
														
 
															+                        next_level_items.append({
														
 
															+                            "节点类型": "分类",
														
 
															+                            "节点名称": sub_key
														
 
															+                        })
														
 
															+
														
 
															+            # 如果有特征列表，添加特征名称
														
 
															+            if "特征列表" in value:
														
 
															+                for feature in value["特征列表"]:
														
 
															+                    feature_name = feature.get("特征名称")
														
 
															+                    if feature_name:
														
 
															+                        next_level_items.append({
														
 
															+                            "节点类型": "特征",
														
 
															+                            "节点名称": feature_name
														
 
															+                        })
														
 
															+
														
 
															+            # 更新下一级列表
														
 
															+            category_hierarchy[key]["下一级"] = next_level_items
														
 
															+
														
 
															+            # 如果没有子分类，标记为叶子分类
														
 
															+            if not has_sub_categories:
														
 
															+                category_hierarchy[key]["是否是叶子分类"] = True
														
 
															+
														
 
															+            # 递归处理子节点
														
 
															+            new_parent_categories = parent_categories + [key]
														
 
															+            build_category_hierarchy_from_node(
														
 
															+                value,
														
 
															+                category_hierarchy,
														
 
															+                current_level + 1,
														
 
															+                new_parent_categories
														
 
															+            )
														
 
															+
														
 
															+
														
 
															+def build_category_hierarchy(category_data: Dict) -> Dict[str, Dict]:
														
 
															+    """
														
 
															+    构建分类名称到下一级的映射关系
														
 
															+
														
 
															+    Args:
														
 
															+        category_data: 分类数据
														
 
															+
														
 
															+    Returns:
														
 
															+        分类层级映射字典
														
 
															+    """
														
 
															+    category_hierarchy = {}
														
 
															+
														
 
															+    if isinstance(category_data, dict):
														
 
															+        build_category_hierarchy_from_node(category_data, category_hierarchy)
														
 
															+
														
 
															+    return category_hierarchy
														
 
															+
														
 
															+
														
 
															+def main():
														
 
															+    # 输入输出文件路径（默认使用项目根目录下的 data/data_1117 目录）
														
 
															+    script_dir = Path(__file__).parent
														
 
															+    project_root = script_dir.parent.parent
														
 
															+    data_dir = project_root / "data" / "data_1117"
														
 
															+
														
 
															+    input_file = data_dir / "过去帖子_pattern聚合结果.json"
														
 
															+    current_posts_dir = data_dir / "当前帖子_what解构结果"
														
 
															+    output_file_1 = data_dir / "特征名称_分类映射.json"
														
 
															+    output_file_2 = data_dir / "分类层级映射.json"
														
 
															+
														
 
															+    # 获取当前帖子的最早发布时间
														
 
															+    earliest_time = get_earliest_publish_time(current_posts_dir)
														
 
															+
														
 
															+    # 读取输入文件
														
 
															+    print(f"\n正在读取文件: {input_file}")
														
 
															+    with open(input_file, "r", encoding="utf-8") as f:
														
 
															+        data = json.load(f)
														
 
															+
														
 
															+    # 如果有时间过滤，应用过滤
														
 
															+    filtered_post_ids = set()
														
 
															+    if earliest_time:
														
 
															+        print("\n" + "="*60)
														
 
															+        print("开始应用时间过滤...")
														
 
															+        data, filtered_post_ids = filter_data_by_time(data, earliest_time)
														
 
															+
														
 
															+        if filtered_post_ids:
														
 
															+            print(f"\n⚠️  警告: 以下 {len(filtered_post_ids)} 个帖子因发布时间晚于阈值被过滤:")
														
 
															+            for post_id in sorted(filtered_post_ids):
														
 
															+                print(f"  - {post_id}")
														
 
															+    else:
														
 
															+        print("\n未启用时间过滤")
														
 
															+
														
 
															+    # 处理结果1: 特征名称到分类的映射
														
 
															+    output_1 = {}
														
 
															+
														
 
															+    # 处理灵感点列表
														
 
															+    if "灵感点列表" in data:
														
 
															+        print("正在处理: 灵感点列表 (特征名称映射)")
														
 
															+        output_1["灵感点"] = process_category(data["灵感点列表"], "灵感点列表")
														
 
															+        print(f"  提取了 {len(output_1['灵感点'])} 个特征")
														
 
															+
														
 
															+    # 处理目的点
														
 
															+    if "目的点" in data:
														
 
															+        print("正在处理: 目的点 (特征名称映射)")
														
 
															+        output_1["目的点"] = process_category(data["目的点"], "目的点")
														
 
															+        print(f"  提取了 {len(output_1['目的点'])} 个特征")
														
 
															+
														
 
															+    # 处理关键点列表
														
 
															+    if "关键点列表" in data:
														
 
															+        print("正在处理: 关键点列表 (特征名称映射)")
														
 
															+        output_1["关键点"] = process_category(data["关键点列表"], "关键点列表")
														
 
															+        print(f"  提取了 {len(output_1['关键点'])} 个特征")
														
 
															+
														
 
															+    # 保存结果1
														
 
															+    print(f"\n正在保存结果到: {output_file_1}")
														
 
															+    with open(output_file_1, "w", encoding="utf-8") as f:
														
 
															+        json.dump(output_1, f, ensure_ascii=False, indent=4)
														
 
															+
														
 
															+    print("完成!")
														
 
															+    if earliest_time:
														
 
															+        print(f"\n总计 (特征名称映射，已过滤掉发布时间 >= {earliest_time} 的帖子):")
														
 
															+    else:
														
 
															+        print(f"\n总计 (特征名称映射):")
														
 
															+    for category, features in output_1.items():
														
 
															+        print(f"  {category}: {len(features)} 个特征")
														
 
															+
														
 
															+    # 处理结果2: 分类层级映射
														
 
															+    print("\n" + "="*60)
														
 
															+    print("开始生成分类层级映射...")
														
 
															+    output_2 = {}
														
 
															+
														
 
															+    # 处理灵感点列表
														
 
															+    if "灵感点列表" in data:
														
 
															+        print("正在处理: 灵感点列表 (分类层级)")
														
 
															+        output_2["灵感点"] = build_category_hierarchy(data["灵感点列表"])
														
 
															+        print(f"  提取了 {len(output_2['灵感点'])} 个分类")
														
 
															+
														
 
															+    # 处理目的点
														
 
															+    if "目的点" in data:
														
 
															+        print("正在处理: 目的点 (分类层级)")
														
 
															+        output_2["目的点"] = build_category_hierarchy(data["目的点"])
														
 
															+        print(f"  提取了 {len(output_2['目的点'])} 个分类")
														
 
															+
														
 
															+    # 处理关键点列表
														
 
															+    if "关键点列表" in data:
														
 
															+        print("正在处理: 关键点列表 (分类层级)")
														
 
															+        output_2["关键点"] = build_category_hierarchy(data["关键点列表"])
														
 
															+        print(f"  提取了 {len(output_2['关键点'])} 个分类")
														
 
															+
														
 
															+    # 保存结果2
														
 
															+    print(f"\n正在保存结果到: {output_file_2}")
														
 
															+    with open(output_file_2, "w", encoding="utf-8") as f:
														
 
															+        json.dump(output_2, f, ensure_ascii=False, indent=4)
														
 
															+
														
 
															+    print("完成!")
														
 
															+    if earliest_time:
														
 
															+        print(f"\n总计 (分类层级映射，已过滤掉发布时间 >= {earliest_time} 的帖子):")
														
 
															+    else:
														
 
															+        print(f"\n总计 (分类层级映射):")
														
 
															+    for category, hierarchies in output_2.items():
														
 
															+        print(f"  {category}: {len(hierarchies)} 个分类")
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    main()
														
--- a/script/data_processing/extract_features_from_posts.py
+++ b/script/data_processing/extract_features_from_posts.py
@@ -0,0 +1,409 @@
 
															+#!/usr/bin/env python3
														
 
															+# -*- coding: utf-8 -*-
														
 
															+"""
														
 
															+从过去帖子_what解构结果目录中提取特征名称及其来源信息
														
 
															+"""
														
 
															+
														
 
															+import json
														
 
															+from pathlib import Path
														
 
															+from typing import Dict, List, Optional
														
 
															+import re
														
 
															+import sys
														
 
															+
														
 
															+# 添加项目根目录到路径
														
 
															+project_root = Path(__file__).parent.parent.parent
														
 
															+sys.path.insert(0, str(project_root))
														
 
															+
														
 
															+from script.detail import get_xiaohongshu_detail
														
 
															+
														
 
															+
														
 
															+def extract_post_id_from_filename(filename: str) -> str:
														
 
															+    """从文件名中提取帖子ID"""
														
 
															+    match = re.match(r'^([^_]+)_', filename)
														
 
															+    if match:
														
 
															+        return match.group(1)
														
 
															+    return ""
														
 
															+
														
 
															+
														
 
															+def get_post_detail(post_id: str) -> Optional[Dict]:
														
 
															+    """
														
 
															+    获取帖子详情
														
 
															+
														
 
															+    Args:
														
 
															+        post_id: 帖子ID
														
 
															+
														
 
															+    Returns:
														
 
															+        帖子详情字典，如果获取失败则返回None
														
 
															+    """
														
 
															+    try:
														
 
															+        detail = get_xiaohongshu_detail(post_id)
														
 
															+        return detail
														
 
															+    except Exception as e:
														
 
															+        print(f"  警告: 获取帖子 {post_id} 详情失败: {e}")
														
 
															+        return None
														
 
															+
														
 
															+
														
 
															+def extract_features_from_point(point_data: Dict, post_id: str, point_name: str, point_description: str) -> List[Dict]:
														
 
															+    """
														
 
															+    从单个点（灵感点/目的点/关键点）中提取特征信息
														
 
															+
														
 
															+    Args:
														
 
															+        point_data: 点的数据
														
 
															+        post_id: 帖子ID
														
 
															+        point_name: 点的名称
														
 
															+        point_description: 点的描述
														
 
															+
														
 
															+    Returns:
														
 
															+        特征列表
														
 
															+    """
														
 
															+    features = []
														
 
															+
														
 
															+    # 检查是否有"提取的特征"字段
														
 
															+    if "提取的特征" in point_data and isinstance(point_data["提取的特征"], list):
														
 
															+        for feature in point_data["提取的特征"]:
														
 
															+            if "特征名称" in feature:
														
 
															+                features.append({
														
 
															+                    "特征名称": feature["特征名称"],
														
 
															+                    "点的名称": point_name,
														
 
															+                    "点的描述": point_description,
														
 
															+                    "帖子id": post_id
														
 
															+                })
														
 
															+
														
 
															+    return features
														
 
															+
														
 
															+
														
 
															+def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
														
 
															+    """
														
 
															+    处理单个JSON文件，提取所有特征信息
														
 
															+
														
 
															+    Args:
														
 
															+        file_path: JSON文件路径
														
 
															+
														
 
															+    Returns:
														
 
															+        包含灵感点、目的点、关键点的特征字典
														
 
															+    """
														
 
															+    result = {
														
 
															+        "灵感点": {},
														
 
															+        "目的点": {},
														
 
															+        "关键点": {}
														
 
															+    }
														
 
															+
														
 
															+    # 从文件名提取帖子ID
														
 
															+    post_id = extract_post_id_from_filename(file_path.name)
														
 
															+
														
 
															+    try:
														
 
															+        with open(file_path, "r", encoding="utf-8") as f:
														
 
															+            data = json.load(f)
														
 
															+
														
 
															+        # 提取三点解构数据
														
 
															+        if "三点解构" not in data:
														
 
															+            return result
														
 
															+
														
 
															+        three_points = data["三点解构"]
														
 
															+
														
 
															+        # 处理灵感点
														
 
															+        if "灵感点" in three_points:
														
 
															+            inspiration = three_points["灵感点"]
														
 
															+
														
 
															+            # 处理全新内容
														
 
															+            if "全新内容" in inspiration and isinstance(inspiration["全新内容"], list):
														
 
															+                for item in inspiration["全新内容"]:
														
 
															+                    point_name = item.get("灵感点", "")
														
 
															+                    point_desc = item.get("描述", "")
														
 
															+                    features = extract_features_from_point(item, post_id, point_name, point_desc)
														
 
															+
														
 
															+                    for feature in features:
														
 
															+                        feature_name = feature["特征名称"]
														
 
															+                        if feature_name not in result["灵感点"]:
														
 
															+                            result["灵感点"][feature_name] = []
														
 
															+                        result["灵感点"][feature_name].append({
														
 
															+                            "点的名称": feature["点的名称"],
														
 
															+                            "点的描述": feature["点的描述"],
														
 
															+                            "帖子id": feature["帖子id"]
														
 
															+                        })
														
 
															+
														
 
															+            # 处理共性差异
														
 
															+            if "共性差异" in inspiration and isinstance(inspiration["共性差异"], list):
														
 
															+                for item in inspiration["共性差异"]:
														
 
															+                    point_name = item.get("灵感点", "")
														
 
															+                    point_desc = item.get("描述", "")
														
 
															+                    features = extract_features_from_point(item, post_id, point_name, point_desc)
														
 
															+
														
 
															+                    for feature in features:
														
 
															+                        feature_name = feature["特征名称"]
														
 
															+                        if feature_name not in result["灵感点"]:
														
 
															+                            result["灵感点"][feature_name] = []
														
 
															+                        result["灵感点"][feature_name].append({
														
 
															+                            "点的名称": feature["点的名称"],
														
 
															+                            "点的描述": feature["点的描述"],
														
 
															+                            "帖子id": feature["帖子id"]
														
 
															+                        })
														
 
															+
														
 
															+            # 处理共性内容
														
 
															+            if "共性内容" in inspiration and isinstance(inspiration["共性内容"], list):
														
 
															+                for item in inspiration["共性内容"]:
														
 
															+                    point_name = item.get("灵感点", "")
														
 
															+                    point_desc = item.get("描述", "")
														
 
															+                    features = extract_features_from_point(item, post_id, point_name, point_desc)
														
 
															+
														
 
															+                    for feature in features:
														
 
															+                        feature_name = feature["特征名称"]
														
 
															+                        if feature_name not in result["灵感点"]:
														
 
															+                            result["灵感点"][feature_name] = []
														
 
															+                        result["灵感点"][feature_name].append({
														
 
															+                            "点的名称": feature["点的名称"],
														
 
															+                            "点的描述": feature["点的描述"],
														
 
															+                            "帖子id": feature["帖子id"]
														
 
															+                        })
														
 
															+
														
 
															+        # 处理目的点
														
 
															+        if "目的点" in three_points:
														
 
															+            purpose = three_points["目的点"]
														
 
															+
														
 
															+            if "purposes" in purpose and isinstance(purpose["purposes"], list):
														
 
															+                for item in purpose["purposes"]:
														
 
															+                    point_name = item.get("目的点", "")
														
 
															+                    point_desc = item.get("描述", "")
														
 
															+                    features = extract_features_from_point(item, post_id, point_name, point_desc)
														
 
															+
														
 
															+                    for feature in features:
														
 
															+                        feature_name = feature["特征名称"]
														
 
															+                        if feature_name not in result["目的点"]:
														
 
															+                            result["目的点"][feature_name] = []
														
 
															+                        result["目的点"][feature_name].append({
														
 
															+                            "点的名称": feature["点的名称"],
														
 
															+                            "点的描述": feature["点的描述"],
														
 
															+                            "帖子id": feature["帖子id"]
														
 
															+                        })
														
 
															+
														
 
															+        # 处理关键点
														
 
															+        if "关键点" in three_points:
														
 
															+            key_points = three_points["关键点"]
														
 
															+
														
 
															+            if "key_points" in key_points and isinstance(key_points["key_points"], list):
														
 
															+                for item in key_points["key_points"]:
														
 
															+                    point_name = item.get("关键点", "")
														
 
															+                    point_desc = item.get("描述", "")
														
 
															+                    features = extract_features_from_point(item, post_id, point_name, point_desc)
														
 
															+
														
 
															+                    for feature in features:
														
 
															+                        feature_name = feature["特征名称"]
														
 
															+                        if feature_name not in result["关键点"]:
														
 
															+                            result["关键点"][feature_name] = []
														
 
															+                        result["关键点"][feature_name].append({
														
 
															+                            "点的名称": feature["点的名称"],
														
 
															+                            "点的描述": feature["点的描述"],
														
 
															+                            "帖子id": feature["帖子id"]
														
 
															+                        })
														
 
															+
														
 
															+    except Exception as e:
														
 
															+        print(f"处理文件 {file_path.name} 时出错: {e}")
														
 
															+
														
 
															+    return result
														
 
															+
														
 
															+
														
 
															+def merge_results(all_results: List[Dict]) -> Dict:
														
 
															+    """
														
 
															+    合并所有文件的提取结果
														
 
															+
														
 
															+    Args:
														
 
															+        all_results: 所有文件的结果列表
														
 
															+
														
 
															+    Returns:
														
 
															+        合并后的结果
														
 
															+    """
														
 
															+    merged = {
														
 
															+        "灵感点": {},
														
 
															+        "目的点": {},
														
 
															+        "关键点": {}
														
 
															+    }
														
 
															+
														
 
															+    for result in all_results:
														
 
															+        for category in ["灵感点", "目的点", "关键点"]:
														
 
															+            for feature_name, sources in result[category].items():
														
 
															+                if feature_name not in merged[category]:
														
 
															+                    merged[category][feature_name] = {"来源": []}
														
 
															+                merged[category][feature_name]["来源"].extend(sources)
														
 
															+
														
 
															+    return merged
														
 
															+
														
 
															+
														
 
															+def convert_to_array_format(merged_dict: Dict, fetch_details: bool = True, time_filter: Optional[str] = None) -> Dict:
														
 
															+    """
														
 
															+    将字典格式转换为数组格式，并添加帖子详情
														
 
															+
														
 
															+    Args:
														
 
															+        merged_dict: 字典格式的结果
														
 
															+        fetch_details: 是否获取帖子详情，默认为True
														
 
															+        time_filter: 时间过滤阈值，只保留发布时间>=该时间的帖子，格式为 "YYYY-MM-DD HH:MM:SS"
														
 
															+
														
 
															+    Returns:
														
 
															+        数组格式的结果
														
 
															+    """
														
 
															+    result = {
														
 
															+        "灵感点": [],
														
 
															+        "目的点": [],
														
 
															+        "关键点": []
														
 
															+    }
														
 
															+
														
 
															+    # 收集所有需要获取详情的帖子ID
														
 
															+    post_ids = set()
														
 
															+    if fetch_details:
														
 
															+        for category in ["灵感点", "目的点", "关键点"]:
														
 
															+            for feature_name, data in merged_dict[category].items():
														
 
															+                for source in data["来源"]:
														
 
															+                    post_ids.add(source["帖子id"])
														
 
															+
														
 
															+        # 批量获取帖子详情
														
 
															+        print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
														
 
															+        post_details = {}
														
 
															+        for i, post_id in enumerate(post_ids, 1):
														
 
															+            print(f"[{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
														
 
															+            detail = get_post_detail(post_id)
														
 
															+            if detail:
														
 
															+                post_details[post_id] = detail
														
 
															+
														
 
															+        print(f"成功获取 {len(post_details)} 个帖子详情")
														
 
															+
														
 
															+        # 如果启用时间过滤，过滤帖子（过滤掉发布时间晚于等于阈值的帖子，避免穿越）
														
 
															+        if time_filter:
														
 
															+            print(f"\n正在应用时间过滤 (< {time_filter})，避免使用晚于当前帖子的数据...")
														
 
															+            filtered_post_ids = set()
														
 
															+            filtered_count = 0
														
 
															+            for post_id, detail in post_details.items():
														
 
															+                publish_time = detail.get('publish_time', '')
														
 
															+                if publish_time < time_filter:
														
 
															+                    filtered_post_ids.add(post_id)
														
 
															+                else:
														
 
															+                    filtered_count += 1
														
 
															+                    print(f"  ⚠️  过滤掉帖子 {post_id} (发布时间: {publish_time}，晚于阈值)")
														
 
															+
														
 
															+            print(f"过滤掉 {filtered_count} 个帖子（穿越），保留 {len(filtered_post_ids)} 个帖子")
														
 
															+            # 更新post_details，只保留符合时间条件的
														
 
															+            post_details = {pid: detail for pid, detail in post_details.items() if pid in filtered_post_ids}
														
 
															+
														
 
															+    # 转换为数组格式并添加帖子详情
														
 
															+    for category in ["灵感点", "目的点", "关键点"]:
														
 
															+        for feature_name, data in merged_dict[category].items():
														
 
															+            # 为每个来源添加帖子详情
														
 
															+            enhanced_sources = []
														
 
															+            for source in data["来源"]:
														
 
															+                # 如果启用时间过滤，跳过不符合时间条件的帖子
														
 
															+                if fetch_details and time_filter and source["帖子id"] not in post_details:
														
 
															+                    continue
														
 
															+
														
 
															+                enhanced_source = source.copy()
														
 
															+                if fetch_details and source["帖子id"] in post_details:
														
 
															+                    enhanced_source["帖子详情"] = post_details[source["帖子id"]]
														
 
															+                enhanced_sources.append(enhanced_source)
														
 
															+
														
 
															+            # 只添加有来源的特征
														
 
															+            if enhanced_sources:
														
 
															+                result[category].append({
														
 
															+                    "特征名称": feature_name,
														
 
															+                    "特征来源": enhanced_sources
														
 
															+                })
														
 
															+
														
 
															+    return result
														
 
															+
														
 
															+
														
 
															+def get_earliest_publish_time(current_posts_dir: Path) -> Optional[str]:
														
 
															+    """
														
 
															+    获取当前帖子目录中最早的发布时间
														
 
															+
														
 
															+    Args:
														
 
															+        current_posts_dir: 当前帖子目录路径
														
 
															+
														
 
															+    Returns:
														
 
															+        最早的发布时间字符串，格式为 "YYYY-MM-DD HH:MM:SS"
														
 
															+    """
														
 
															+    if not current_posts_dir.exists():
														
 
															+        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
														
 
															+        return None
														
 
															+
														
 
															+    json_files = list(current_posts_dir.glob("*.json"))
														
 
															+    if not json_files:
														
 
															+        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
														
 
															+        return None
														
 
															+
														
 
															+    print(f"\n正在获取当前帖子的发布时间...")
														
 
															+    print(f"找到 {len(json_files)} 个当前帖子")
														
 
															+
														
 
															+    earliest_time = None
														
 
															+    for file_path in json_files:
														
 
															+        post_id = extract_post_id_from_filename(file_path.name)
														
 
															+        if not post_id:
														
 
															+            continue
														
 
															+
														
 
															+        try:
														
 
															+            detail = get_post_detail(post_id)
														
 
															+            if detail and 'publish_time' in detail:
														
 
															+                publish_time = detail['publish_time']
														
 
															+                if earliest_time is None or publish_time < earliest_time:
														
 
															+                    earliest_time = publish_time
														
 
															+                    print(f"  更新最早时间: {publish_time} (帖子: {post_id})")
														
 
															+        except Exception as e:
														
 
															+            print(f"  警告: 获取帖子 {post_id} 发布时间失败: {e}")
														
 
															+
														
 
															+    if earliest_time:
														
 
															+        print(f"\n当前帖子最早发布时间: {earliest_time}")
														
 
															+    else:
														
 
															+        print("\n警告: 未能获取到任何当前帖子的发布时间")
														
 
															+
														
 
															+    return earliest_time
														
 
															+
														
 
															+
														
 
															+def main():
														
 
															+    # 输入输出路径（默认使用项目根目录下的 data/data_1117 目录）
														
 
															+    script_dir = Path(__file__).parent
														
 
															+    project_root = script_dir.parent.parent
														
 
															+    data_dir = project_root / "data" / "data_1117"
														
 
															+
														
 
															+    input_dir = data_dir / "过去帖子_what解构结果"
														
 
															+    current_posts_dir = data_dir / "当前帖子_what解构结果"
														
 
															+    output_file = data_dir / "特征名称_帖子来源.json"
														
 
															+
														
 
															+    # 获取当前帖子的最早发布时间
														
 
															+    earliest_time = get_earliest_publish_time(current_posts_dir)
														
 
															+
														
 
															+    print(f"\n正在扫描目录: {input_dir}")
														
 
															+
														
 
															+    # 获取所有JSON文件
														
 
															+    json_files = list(input_dir.glob("*.json"))
														
 
															+    print(f"找到 {len(json_files)} 个JSON文件")
														
 
															+
														
 
															+    # 处理所有文件
														
 
															+    all_results = []
														
 
															+    for i, file_path in enumerate(json_files, 1):
														
 
															+        print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
														
 
															+        result = process_single_file(file_path)
														
 
															+        all_results.append(result)
														
 
															+
														
 
															+    # 合并结果
														
 
															+    print("\n正在合并结果...")
														
 
															+    merged_result = merge_results(all_results)
														
 
															+
														
 
															+    # 转换为数组格式（带时间过滤）
														
 
															+    print("正在转换为数组格式...")
														
 
															+    final_result = convert_to_array_format(merged_result, fetch_details=True, time_filter=earliest_time)
														
 
															+
														
 
															+    # 统计信息
														
 
															+    if earliest_time:
														
 
															+        print(f"\n提取统计 (已过滤掉发布时间 >= {earliest_time} 的帖子):")
														
 
															+    else:
														
 
															+        print(f"\n提取统计:")
														
 
															+    for category in ["灵感点", "目的点", "关键点"]:
														
 
															+        feature_count = len(final_result[category])
														
 
															+        source_count = sum(len(item["特征来源"]) for item in final_result[category])
														
 
															+        print(f"  {category}: {feature_count} 个特征, {source_count} 个来源")
														
 
															+
														
 
															+    # 保存结果
														
 
															+    print(f"\n正在保存结果到: {output_file}")
														
 
															+    with open(output_file, "w", encoding="utf-8") as f:
														
 
															+        json.dump(final_result, f, ensure_ascii=False, indent=4)
														
 
															+
														
 
															+    print("完成!")
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    main()