2 周之前 · 4712e854a3
--- a/script/data_processing/README.md
+++ b/script/data_processing/README.md
@@ -4,7 +4,63 @@
 
				 
			
 
				 ## 脚本说明
			
 
				 
			
 
				-### 1. extract_features_from_posts.py
			
 
				+### 1. extract_current_posts.py
			
 
				+
			
 
				+从 `当前帖子_what解构结果` 目录中提取当前帖子的解构任务列表。
			
 
				+
			
 
				+**功能：**
			
 
				+- 从当前帖子的解构结果中提取灵感点、目的点、关键点
			
 
				+- 自动获取帖子详情（标题、正文、图片、点赞数等）；获取失败时仅输出警告，该帖子的 `帖子详情` 为空对象 `{}`
			
 
				+- 提取每个点的名称、描述和特征列表
			
 
				+- 生成统一格式的解构任务列表
			
 
				+
			
 
				+**输入：**
			
 
				+- `data/data_1117/当前帖子_what解构结果/*.json` - 当前帖子的解构结果（文件名需以 `<帖子id>_` 开头，脚本取第一个下划线之前的部分作为帖子id；不符合该格式的文件会被跳过）
			
 
				+
			
 
				+**输出：**
			
 
				+- `data/data_1117/当前帖子_解构任务列表.json` - 当前帖子的解构任务列表
			
 
				+
			
 
				+**使用方法：**
			
 
				+```bash
			
 
				+# 从项目根目录运行
			
 
				+python script/data_processing/extract_current_posts.py
			
 
				+
			
 
				+# 或者从任意目录运行
			
 
				+python /path/to/script/data_processing/extract_current_posts.py
			
 
				+```
			
 
				+
			
 
				+**输出格式：**
			
 
				+```json
			
 
				+{
			
 
				+    "解构任务列表": [
			
 
				+        {
			
 
				+            "帖子id": "690d977d0000000007036331",
			
 
				+            "帖子详情": {
			
 
				+                "title": "你不会无缘无故刷到的",
			
 
				+                "body_text": "...",
			
 
				+                "like_count": 123,
			
 
				+                "publish_time": "2025-11-07 15:08:59",
			
 
				+                ...
			
 
				+            },
			
 
				+            "what解构结果": {
			
 
				+                "灵感点列表": [
			
 
				+                    {
			
 
				+                        "名称": "发现立冬和教资查分是同一天",
			
 
				+                        "描述": "创作者在构思内容时...",
			
 
				+                        "特征列表": ["立冬", "教资查分", "时间巧合"]
			
 
				+                    }
			
 
				+                ],
			
 
				+                "目的点列表": [...],
			
 
				+                "关键点列表": [...]
			
 
				+            }
			
 
				+        }
			
 
				+    ]
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+### 2. extract_features_from_posts.py
			
 
				 
			
 
				 从 `过去帖子_what解构结果` 目录中提取特征名称及其来源信息。
			
 
				 
			
@@ -59,7 +115,7 @@ python /path/to/script/data_processing/extract_features_from_posts.py
 
				 
			
 
				 ---
			
 
				 
			
 
				-### 2. extract_feature_categories.py
			
 
				+### 3. extract_feature_categories.py
			
 
				 
			
 
				 从 `过去帖子_pattern聚合结果.json` 中提取特征名称及其分类层级信息。
			
 
				 
			
--- a/script/data_processing/__init__.py
+++ b/script/data_processing/__init__.py
@@ -6,9 +6,11 @@
 
				 
			
 
				 from .extract_features_from_posts import main as extract_features_main
			
 
				 from .extract_feature_categories import main as extract_categories_main
			
 
				+from .extract_current_posts import main as extract_current_posts_main
			
 
				 
			
 
				 __all__ = [
			
 
				     'extract_features_main',
			
 
				     'extract_categories_main',
			
 
				+    'extract_current_posts_main',
			
 
				 ]
			
 
				 __version__ = '1.0.0'
			
--- a/script/data_processing/extract_current_posts.py
+++ b/script/data_processing/extract_current_posts.py
@@ -0,0 +1,242 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+从当前帖子_what解构结果目录中提取解构任务列表
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+from pathlib import Path
			
 
				+from typing import Dict, List, Optional
			
 
				+import sys
			
 
				+import re
			
 
				+
			
 
				+# 添加项目根目录到路径
			
 
				+project_root = Path(__file__).parent.parent.parent
			
 
				+sys.path.insert(0, str(project_root))
			
 
				+
			
 
				+from script.detail import get_xiaohongshu_detail
			
 
				+
			
 
				+
			
 
				+def extract_post_id_from_filename(filename: str) -> str:
			
 
				+    """从文件名中提取帖子ID"""
			
 
				+    match = re.match(r'^([^_]+)_', filename)
			
 
				+    if match:
			
 
				+        return match.group(1)
			
 
				+    return ""
			
 
				+
			
 
				+
			
 
				+def get_post_detail(post_id: str) -> Optional[Dict]:
			
 
				+    """获取帖子详情"""
			
 
				+    try:
			
 
				+        detail = get_xiaohongshu_detail(post_id)
			
 
				+        return detail
			
 
				+    except Exception as e:
			
 
				+        print(f"  警告: 获取帖子 {post_id} 详情失败: {e}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def extract_features_from_point(point_data: Dict) -> List[str]:
			
 
				+    """
			
 
				+    从点数据中提取特征名称列表
			
 
				+
			
 
				+    Args:
			
 
				+        point_data: 点的数据（包含"提取的特征"字段）
			
 
				+
			
 
				+    Returns:
			
 
				+        特征名称列表
			
 
				+    """
			
 
				+    features = []
			
 
				+    if "提取的特征" in point_data and isinstance(point_data["提取的特征"], list):
			
 
				+        for feature in point_data["提取的特征"]:
			
 
				+            if "特征名称" in feature:
			
 
				+                features.append(feature["特征名称"])
			
 
				+    return features
			
 
				+
			
 
				+
			
 
				+def process_inspiration_points(inspiration_data: Dict) -> List[Dict]:
			
 
				+    """
			
 
				+    处理灵感点数据
			
 
				+
			
 
				+    Args:
			
 
				+        inspiration_data: 灵感点数据
			
 
				+
			
 
				+    Returns:
			
 
				+        灵感点列表
			
 
				+    """
			
 
				+    result = []
			
 
				+
			
 
				+    # 处理三个维度：全新内容、共性差异、共性内容
			
 
				+    for dimension in ["全新内容", "共性差异", "共性内容"]:
			
 
				+        if dimension in inspiration_data and isinstance(inspiration_data[dimension], list):
			
 
				+            for item in inspiration_data[dimension]:
			
 
				+                point_item = {
			
 
				+                    "名称": item.get("灵感点", ""),
			
 
				+                    "描述": item.get("描述", ""),
			
 
				+                    "特征列表": extract_features_from_point(item)
			
 
				+                }
			
 
				+                result.append(point_item)
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def process_purpose_points(purpose_data: Dict) -> List[Dict]:
			
 
				+    """
			
 
				+    处理目的点数据
			
 
				+
			
 
				+    Args:
			
 
				+        purpose_data: 目的点数据
			
 
				+
			
 
				+    Returns:
			
 
				+        目的点列表
			
 
				+    """
			
 
				+    result = []
			
 
				+
			
 
				+    if "purposes" in purpose_data and isinstance(purpose_data["purposes"], list):
			
 
				+        for item in purpose_data["purposes"]:
			
 
				+            point_item = {
			
 
				+                "名称": item.get("目的点", ""),
			
 
				+                "描述": item.get("描述", ""),
			
 
				+                "特征列表": extract_features_from_point(item)
			
 
				+            }
			
 
				+            result.append(point_item)
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def process_key_points(key_data: Dict) -> List[Dict]:
			
 
				+    """
			
 
				+    处理关键点数据
			
 
				+
			
 
				+    Args:
			
 
				+        key_data: 关键点数据
			
 
				+
			
 
				+    Returns:
			
 
				+        关键点列表
			
 
				+    """
			
 
				+    result = []
			
 
				+
			
 
				+    if "key_points" in key_data and isinstance(key_data["key_points"], list):
			
 
				+        for item in key_data["key_points"]:
			
 
				+            point_item = {
			
 
				+                "名称": item.get("关键点", ""),
			
 
				+                "描述": item.get("描述", ""),
			
 
				+                "特征列表": extract_features_from_point(item)
			
 
				+            }
			
 
				+            result.append(point_item)
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def process_single_file(file_path: Path) -> Optional[Dict]:
			
 
				+    """
			
 
				+    处理单个JSON文件
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: JSON文件路径
			
 
				+
			
 
				+    Returns:
			
 
				+        解构任务字典，如果处理失败则返回None
			
 
				+    """
			
 
				+    # 从文件名提取帖子ID
			
 
				+    post_id = extract_post_id_from_filename(file_path.name)
			
 
				+    if not post_id:
			
 
				+        print(f"  警告: 无法从文件名提取帖子ID: {file_path.name}")
			
 
				+        return None
			
 
				+
			
 
				+    try:
			
 
				+        # 读取文件
			
 
				+        with open(file_path, "r", encoding="utf-8") as f:
			
 
				+            data = json.load(f)
			
 
				+
			
 
				+        # 获取帖子详情
			
 
				+        print(f"  获取帖子 {post_id} 的详情...")
			
 
				+        post_detail = get_post_detail(post_id)
			
 
				+        if not post_detail:
			
 
				+            print(f"  警告: 未能获取帖子 {post_id} 的详情")
			
 
				+
			
 
				+        # 提取三点解构数据
			
 
				+        three_points = data.get("三点解构", {})
			
 
				+
			
 
				+        # 处理灵感点
			
 
				+        inspiration_points = []
			
 
				+        if "灵感点" in three_points:
			
 
				+            inspiration_points = process_inspiration_points(three_points["灵感点"])
			
 
				+
			
 
				+        # 处理目的点
			
 
				+        purpose_points = []
			
 
				+        if "目的点" in three_points:
			
 
				+            purpose_points = process_purpose_points(three_points["目的点"])
			
 
				+
			
 
				+        # 处理关键点
			
 
				+        key_points = []
			
 
				+        if "关键点" in three_points:
			
 
				+            key_points = process_key_points(three_points["关键点"])
			
 
				+
			
 
				+        # 构建结果
			
 
				+        task_item = {
			
 
				+            "帖子id": post_id,
			
 
				+            "帖子详情": post_detail if post_detail else {},
			
 
				+            "what解构结果": {
			
 
				+                "灵感点列表": inspiration_points,
			
 
				+                "目的点列表": purpose_points,
			
 
				+                "关键点列表": key_points
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        return task_item
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        print(f"  错误: 处理文件 {file_path.name} 时出错: {e}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    # 输入输出路径（默认使用项目根目录下的 data/data_1117 目录）
			
 
				+    script_dir = Path(__file__).parent
			
 
				+    project_root = script_dir.parent.parent
			
 
				+    data_dir = project_root / "data" / "data_1117"
			
 
				+
			
 
				+    input_dir = data_dir / "当前帖子_what解构结果"
			
 
				+    output_file = data_dir / "当前帖子_解构任务列表.json"
			
 
				+
			
 
				+    print(f"正在扫描目录: {input_dir}")
			
 
				+
			
 
				+    # 获取所有JSON文件
			
 
				+    json_files = list(input_dir.glob("*.json"))
			
 
				+    print(f"找到 {len(json_files)} 个JSON文件\n")
			
 
				+
			
 
				+    # 处理所有文件
			
 
				+    task_list = []
			
 
				+    for i, file_path in enumerate(json_files, 1):
			
 
				+        print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
			
 
				+        task_item = process_single_file(file_path)
			
 
				+        if task_item:
			
 
				+            task_list.append(task_item)
			
 
				+            print(f"  ✓ 成功提取")
			
 
				+        print()
			
 
				+
			
 
				+    # 构建最终结果
			
 
				+    final_result = {
			
 
				+        "解构任务列表": task_list
			
 
				+    }
			
 
				+
			
 
				+    # 统计信息
			
 
				+    print(f"提取统计:")
			
 
				+    print(f"  总帖子数: {len(task_list)}")
			
 
				+    total_inspiration = sum(len(task["what解构结果"]["灵感点列表"]) for task in task_list)
			
 
				+    total_purpose = sum(len(task["what解构结果"]["目的点列表"]) for task in task_list)
			
 
				+    total_key = sum(len(task["what解构结果"]["关键点列表"]) for task in task_list)
			
 
				+    print(f"  总灵感点: {total_inspiration} 个")
			
 
				+    print(f"  总目的点: {total_purpose} 个")
			
 
				+    print(f"  总关键点: {total_key} 个")
			
 
				+
			
 
				+    # 保存结果
			
 
				+    print(f"\n正在保存结果到: {output_file}")
			
 
				+    with open(output_file, "w", encoding="utf-8") as f:
			
 
				+        json.dump(final_result, f, ensure_ascii=False, indent=4)
			
 
				+
			
 
				+    print("完成!")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()