Browse Source

refactor: 适配新数据结构的数据处理脚本

- extract_current_posts.py: 简化当前帖子提取逻辑(适配新版数据结构;输出由 {"解构任务列表": [...]} 改为直接输出任务列表数组)
- extract_feature_categories.py: 调整特征分类提取
- extract_features_from_posts.py: 简化特征帖子来源提取(移除 time_based 时间过滤模式,仅保留按当前帖子ID排除)
- extract_nodes_and_edges.py: 优化节点边提取(标签提取适配新版数据结构,新增支撑边、关联意图边)
- build_persona_graph.py: 调整图谱构建(关联分析输入文件由 optimization 目录改为 detail 目录)
- config/accounts.json: 更新账号配置

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
yangxiaohui 1 day ago
parent
commit
ea27fad17a

+ 6 - 21
config/accounts.json

@@ -2,37 +2,22 @@
   "data_root": "../data",
   "accounts": [
     {
-      "name": "阿里多多酱",
+      "name": "阿里多多酱_1125",
       "enabled": false,
       "description": "旧目录结构(已停用)"
     },
     {
-      "name": "阿里多多酱1",
-      "enabled": false,
-      "description": "新目录结构(已停用)"
-    },
-    {
-      "name": "阿里多多酱2",
-      "enabled": false,
-      "description": "新目录结构(已停用)"
-    },
-    {
-      "name": "阿里多多酱3",
+      "name": "阿里多多酱_1203",
       "enabled": true,
-      "description": "新目录结构"
+      "description": "新目录结构(1203版本)"
     },
     {
       "name": "摸鱼阿希",
-      "enabled": true,
-      "description": "新目录结构"
-    },
-    {
-      "name": "示例账号2",
       "enabled": false,
-      "description": "未启用的示例账号"
+      "description": "新目录结构"
     }
   ],
-  "default_account": "阿里多多酱_1125",
+  "default_account": "阿里多多酱_1203",
   "comment": "数据根目录可通过 data_root 配置(支持绝对路径、~、环境变量),也可通过 DATA_ROOT 环境变量覆盖",
   "filter_mode": "exclude_current_posts",
   "filter_modes": {
@@ -54,7 +39,7 @@
     "input": {
       "current_posts": "解构内容/what单独",
       "historical_posts": "解构内容/pattern聚类",
-      "pattern_cluster": "pattern相关文件/optimization/optimized_clustered_data_gemini-3-pro-preview.json"
+      "pattern_cluster": "pattern相关文件/detail/optimized_clustered_data_gemini-3-pro-preview_concurrent.json"
     },
     "output": {
       "intermediate": "how解构_outputs/{version}/intermediate",

+ 3 - 2
script/data_processing/build_persona_graph.py

@@ -887,8 +887,9 @@ def main():
 
     # 输入文件路径
     pattern_file = config.pattern_cluster_file
-    associations_file = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
-    intra_associations_file = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
+    # 使用新的 detail 目录
+    associations_file = config.account_dir / "pattern相关文件/detail/dimension_associations_analysis.json"
+    intra_associations_file = config.account_dir / "pattern相关文件/detail/intra_dimension_associations_analysis.json"
     historical_posts_dir = config.historical_posts_dir
 
     # 输出文件路径

+ 87 - 142
script/data_processing/extract_current_posts.py

@@ -1,14 +1,14 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-从当前帖子_what解构结果目录中提取解构任务列表
+从当前帖子目录中提取解构任务列表
+仅支持新版数据结构(inspiration_final_result, purpose_final_result, keypoint_final)
 """
 
 import json
 from pathlib import Path
 from typing import Dict, List, Optional
 import sys
-import re
 
 # 添加项目根目录到路径
 project_root = Path(__file__).parent.parent.parent
@@ -19,11 +19,10 @@ from script.data_processing.path_config import PathConfig
 
 
 def extract_post_id_from_filename(filename: str) -> str:
-    """从文件名中提取帖子ID"""
-    match = re.match(r'^([^_]+)_', filename)
-    if match:
-        return match.group(1)
-    return ""
+    """从文件名中提取帖子ID
+    格式: 68a6b96f000000001d006058.json
+    """
+    return filename.replace('.json', '')
 
 
 def get_post_detail(post_id: str) -> Optional[Dict]:
@@ -36,148 +35,107 @@ def get_post_detail(post_id: str) -> Optional[Dict]:
         return None
 
 
-def extract_features_from_point(point_data: Dict) -> List[Dict]:
-    """
-    从点数据中提取特征信息列表(包含名称和权重)
-
-    Args:
-        point_data: 点的数据(包含"提取的特征"字段)
-
-    Returns:
-        特征信息列表,每项包含 {"特征名称": str, "权重": float}
-    """
-    features = []
-    if "提取的特征" in point_data and isinstance(point_data["提取的特征"], list):
-        for feature in point_data["提取的特征"]:
-            if "特征名称" in feature:
-                feature_item = {
-                    "特征名称": feature["特征名称"],
-                    "权重": feature.get("权重", 1.0)  # 默认权重为1.0
-                }
-                features.append(feature_item)
-    return features
-
-
-def process_inspiration_points(inspiration_data: Dict) -> List[Dict]:
-    """
-    处理灵感点数据
-
-    Args:
-        inspiration_data: 灵感点数据
-
-    Returns:
-        灵感点列表
-    """
+def process_inspiration_points(data: Dict) -> List[Dict]:
+    """处理灵感点数据"""
     result = []
-
-    # 处理三个维度:全新内容、共性差异、共性内容
-    for dimension in ["全新内容", "共性差异", "共性内容"]:
-        if dimension in inspiration_data and isinstance(inspiration_data[dimension], list):
-            for item in inspiration_data[dimension]:
-                point_item = {
-                    "名称": item.get("灵感点", ""),
-                    "描述": item.get("描述", ""),
-                    "特征列表": extract_features_from_point(item)
-                }
-                result.append(point_item)
-
+    if "inspiration_final_result" not in data:
+        return result
+
+    inspiration_data = data["inspiration_final_result"]
+    for item in inspiration_data.get("最终灵感点列表", []):
+        point_item = {
+            "ID": item.get("id", ""),
+            "名称": item.get("灵感点", ""),
+            "类型": item.get("类型", ""),
+            "描述": item.get("描述", ""),
+            "置信度": item.get("置信度", ""),
+            "支撑的ID": [],
+            "关联的ID": []
+        }
+        result.append(point_item)
     return result
 
 
-def process_purpose_points(purpose_data: Dict) -> List[Dict]:
-    """
-    处理目的点数据
-
-    Args:
-        purpose_data: 目的点数据
-
-    Returns:
-        目的点列表
-    """
+def process_purpose_points(data: Dict) -> List[Dict]:
+    """处理目的点数据(意图+实质)"""
     result = []
-
-    if "purposes" in purpose_data and isinstance(purpose_data["purposes"], list):
-        for item in purpose_data["purposes"]:
-            point_item = {
-                "名称": item.get("目的点", ""),
-                "描述": item.get("描述", ""),
-                "特征列表": extract_features_from_point(item)
-            }
-            result.append(point_item)
+    if "purpose_final_result" not in data:
+        return result
+
+    purpose_data = data["purpose_final_result"]
+
+    # 处理意图列表
+    for item in purpose_data.get("最终意图列表", []):
+        point_item = {
+            "ID": item.get("意图ID", ""),
+            "名称": item.get("目的点", ""),
+            "类型": "意图",
+            "描述": item.get("描述", ""),
+            "置信度": item.get("置信度", ""),
+            "支撑的ID": [],
+            "关联的ID": []
+        }
+        result.append(point_item)
+
+    # 处理实质列表
+    for item in purpose_data.get("最终实质列表", []):
+        related_id = item.get("关联意图ID", "")
+        point_item = {
+            "ID": item.get("实质ID", ""),
+            "名称": item.get("目的点", ""),
+            "类型": "实质",
+            "描述": item.get("描述", ""),
+            "置信度": item.get("置信度", ""),
+            "支撑的ID": [],
+            "关联的ID": [related_id] if related_id else []
+        }
+        result.append(point_item)
 
     return result
 
 
-def process_key_points(key_data: Dict) -> List[Dict]:
-    """
-    处理关键点数据
-
-    Args:
-        key_data: 关键点数据
-
-    Returns:
-        关键点列表
-    """
+def process_key_points(data: Dict) -> List[Dict]:
+    """处理关键点数据"""
     result = []
-
-    if "key_points" in key_data and isinstance(key_data["key_points"], list):
-        for item in key_data["key_points"]:
-            point_item = {
-                "名称": item.get("关键点", ""),
-                "描述": item.get("描述", ""),
-                "特征列表": extract_features_from_point(item)
-            }
-            result.append(point_item)
-
+    if "keypoint_final" not in data:
+        return result
+
+    keypoint_data = data["keypoint_final"]
+    for item in keypoint_data.get("最终关键点列表", []):
+        point_item = {
+            "ID": item.get("关键点ID", ""),
+            "名称": item.get("关键点", ""),
+            "类型": item.get("类型", ""),
+            "描述": item.get("描述", ""),
+            "置信度": item.get("置信度", ""),
+            "支撑的ID": item.get("支撑的ID", []),
+            "关联的ID": []
+        }
+        result.append(point_item)
     return result
 
 
 def process_single_file(file_path: Path) -> Optional[Dict]:
-    """
-    处理单个JSON文件
-
-    Args:
-        file_path: JSON文件路径
-
-    Returns:
-        解构任务字典,如果处理失败则返回None
-    """
-    # 从文件名提取帖子ID
+    """处理单个JSON文件"""
     post_id = extract_post_id_from_filename(file_path.name)
     if not post_id:
         print(f"  警告: 无法从文件名提取帖子ID: {file_path.name}")
         return None
 
     try:
-        # 读取文件
         with open(file_path, "r", encoding="utf-8") as f:
             data = json.load(f)
 
-        # 获取帖子详情
         print(f"  获取帖子 {post_id} 的详情...")
         post_detail = get_post_detail(post_id)
         if not post_detail:
             print(f"  警告: 未能获取帖子 {post_id} 的详情")
 
-        # 提取三点解构数据
-        three_points = data.get("三点解构", {})
+        # 提取三点数据
+        inspiration_points = process_inspiration_points(data)
+        purpose_points = process_purpose_points(data)
+        key_points = process_key_points(data)
 
-        # 处理灵感点
-        inspiration_points = []
-        if "灵感点" in three_points:
-            inspiration_points = process_inspiration_points(three_points["灵感点"])
-
-        # 处理目的点
-        purpose_points = []
-        if "目的点" in three_points:
-            purpose_points = process_purpose_points(three_points["目的点"])
-
-        # 处理关键点
-        key_points = []
-        if "关键点" in three_points:
-            key_points = process_key_points(three_points["关键点"])
-
-        # 构建结果
         task_item = {
             "帖子id": post_id,
             "帖子详情": post_detail if post_detail else {},
@@ -196,56 +154,43 @@ def process_single_file(file_path: Path) -> Optional[Dict]:
 
 
 def main():
-    # 使用路径配置
     config = PathConfig()
-
-    # 确保输出目录存在
     config.ensure_dirs()
 
-    # 获取路径
     input_dir = config.current_posts_dir
     output_file = config.task_list_file
 
     print(f"账号: {config.account_name}")
     print(f"当前帖子目录: {input_dir}")
     print(f"输出文件: {output_file}")
-    print()
 
-    print(f"正在扫描目录: {input_dir}")
+    print(f"\n正在扫描目录: {input_dir}")
 
-    # 获取所有JSON文件
     json_files = list(input_dir.glob("*.json"))
-    print(f"找到 {len(json_files)} 个JSON文件\n")
+    print(f"找到 {len(json_files)} 个JSON文件")
 
-    # 处理所有文件
     task_list = []
     for i, file_path in enumerate(json_files, 1):
-        print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
+        print(f"\n处理文件 [{i}/{len(json_files)}]: {file_path.name}")
         task_item = process_single_file(file_path)
         if task_item:
             task_list.append(task_item)
             print(f"  ✓ 成功提取")
-        print()
 
-    # 构建最终结果
-    final_result = {
-        "解构任务列表": task_list
-    }
+    # 统计
+    total_inspiration = sum(len(t["what解构结果"]["灵感点列表"]) for t in task_list)
+    total_purpose = sum(len(t["what解构结果"]["目的点列表"]) for t in task_list)
+    total_key = sum(len(t["what解构结果"]["关键点列表"]) for t in task_list)
 
-    # 统计信息
-    print(f"提取统计:")
+    print(f"\n提取统计:")
     print(f"  总帖子数: {len(task_list)}")
-    total_inspiration = sum(len(task["what解构结果"]["灵感点列表"]) for task in task_list)
-    total_purpose = sum(len(task["what解构结果"]["目的点列表"]) for task in task_list)
-    total_key = sum(len(task["what解构结果"]["关键点列表"]) for task in task_list)
     print(f"  总灵感点: {total_inspiration} 个")
     print(f"  总目的点: {total_purpose} 个")
     print(f"  总关键点: {total_key} 个")
 
-    # 保存结果
     print(f"\n正在保存结果到: {output_file}")
     with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(final_result, f, ensure_ascii=False, indent=4)
+        json.dump(task_list, f, ensure_ascii=False, indent=2)
 
     print("完成!")
 

+ 4 - 5
script/data_processing/extract_feature_categories.py

@@ -19,11 +19,10 @@ from script.data_processing.path_config import PathConfig
 
 
 def extract_post_id_from_filename(filename: str) -> str:
-    """从文件名中提取帖子ID"""
-    match = re.match(r'^([^_]+)_', filename)
-    if match:
-        return match.group(1)
-    return ""
+    """从文件名中提取帖子ID
+    格式: 68a6b96f000000001d006058.json
+    """
+    return filename.replace('.json', '')
 
 
 def get_post_detail(post_id: str) -> Optional[Dict]:

+ 80 - 267
script/data_processing/extract_features_from_posts.py

@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-从过去帖子_what解构结果目录中提取特征名称及其来源信息
+从过去帖子解构结果目录中提取特征名称及其来源信息
+仅支持新版数据结构(inspiration_final_result, purpose_final_result, keypoint_final)
 """
 
 import json
@@ -19,23 +20,14 @@ from script.data_processing.path_config import PathConfig
 
 
 def extract_post_id_from_filename(filename: str) -> str:
-    """从文件名中提取帖子ID"""
-    match = re.match(r'^([^_]+)_', filename)
-    if match:
-        return match.group(1)
-    return ""
-
-
-def get_post_detail(post_id: str) -> Optional[Dict]:
+    """从文件名中提取帖子ID
+    支持格式: 68a6b96f000000001d006058.json
     """
-    获取帖子详情
+    return filename.replace('.json', '')
 
-    Args:
-        post_id: 帖子ID
 
-    Returns:
-        帖子详情字典,如果获取失败则返回None
-    """
+def get_post_detail(post_id: str) -> Optional[Dict]:
+    """获取帖子详情"""
     try:
         detail = get_xiaohongshu_detail(post_id)
         return detail
@@ -44,35 +36,6 @@ def get_post_detail(post_id: str) -> Optional[Dict]:
         return None
 
 
-def extract_features_from_point(point_data: Dict, post_id: str, point_name: str, point_description: str) -> List[Dict]:
-    """
-    从单个点(灵感点/目的点/关键点)中提取特征信息
-
-    Args:
-        point_data: 点的数据
-        post_id: 帖子ID
-        point_name: 点的名称
-        point_description: 点的描述
-
-    Returns:
-        特征列表
-    """
-    features = []
-
-    # 检查是否有"提取的特征"字段
-    if "提取的特征" in point_data and isinstance(point_data["提取的特征"], list):
-        for feature in point_data["提取的特征"]:
-            if "特征名称" in feature:
-                features.append({
-                    "特征名称": feature["特征名称"],
-                    "点的名称": point_name,
-                    "点的描述": point_description,
-                    "帖子id": post_id
-                })
-
-    return features
-
-
 def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
     """
     处理单个JSON文件,提取所有特征信息
@@ -89,113 +52,81 @@ def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
         "关键点": {}
     }
 
-    # 从文件名提取帖子ID
     post_id = extract_post_id_from_filename(file_path.name)
 
     try:
         with open(file_path, "r", encoding="utf-8") as f:
             data = json.load(f)
 
-        # 提取三点解构数据
-        if "三点解构" not in data:
-            return result
+        # 处理灵感点
+        if "inspiration_final_result" in data:
+            inspiration_data = data["inspiration_final_result"]
+            for item in inspiration_data.get("最终灵感点列表", []):
+                feature_name = item.get("灵感点", "")
+                if not feature_name:
+                    continue
+                if feature_name not in result["灵感点"]:
+                    result["灵感点"][feature_name] = []
+                result["灵感点"][feature_name].append({
+                    "点的名称": feature_name,
+                    "点的描述": item.get("描述", ""),
+                    "帖子id": post_id,
+                    "点ID": item.get("id", ""),
+                    "类型": item.get("类型", "")
+                })
+
+        # 处理目的点(意图+实质)
+        if "purpose_final_result" in data:
+            purpose_data = data["purpose_final_result"]
 
-        three_points = data["三点解构"]
+            # 处理意图列表
+            for item in purpose_data.get("最终意图列表", []):
+                feature_name = item.get("目的点", "")
+                if not feature_name:
+                    continue
+                if feature_name not in result["目的点"]:
+                    result["目的点"][feature_name] = []
+                result["目的点"][feature_name].append({
+                    "点的名称": feature_name,
+                    "点的描述": item.get("描述", ""),
+                    "帖子id": post_id,
+                    "点ID": item.get("意图ID", ""),
+                    "类型": "意图"
+                })
 
-        # 处理灵感点
-        if "灵感点" in three_points:
-            inspiration = three_points["灵感点"]
-
-            # 处理全新内容
-            if "全新内容" in inspiration and isinstance(inspiration["全新内容"], list):
-                for item in inspiration["全新内容"]:
-                    point_name = item.get("灵感点", "")
-                    point_desc = item.get("描述", "")
-                    features = extract_features_from_point(item, post_id, point_name, point_desc)
-
-                    for feature in features:
-                        feature_name = feature["特征名称"]
-                        if feature_name not in result["灵感点"]:
-                            result["灵感点"][feature_name] = []
-                        result["灵感点"][feature_name].append({
-                            "点的名称": feature["点的名称"],
-                            "点的描述": feature["点的描述"],
-                            "帖子id": feature["帖子id"]
-                        })
-
-            # 处理共性差异
-            if "共性差异" in inspiration and isinstance(inspiration["共性差异"], list):
-                for item in inspiration["共性差异"]:
-                    point_name = item.get("灵感点", "")
-                    point_desc = item.get("描述", "")
-                    features = extract_features_from_point(item, post_id, point_name, point_desc)
-
-                    for feature in features:
-                        feature_name = feature["特征名称"]
-                        if feature_name not in result["灵感点"]:
-                            result["灵感点"][feature_name] = []
-                        result["灵感点"][feature_name].append({
-                            "点的名称": feature["点的名称"],
-                            "点的描述": feature["点的描述"],
-                            "帖子id": feature["帖子id"]
-                        })
-
-            # 处理共性内容
-            if "共性内容" in inspiration and isinstance(inspiration["共性内容"], list):
-                for item in inspiration["共性内容"]:
-                    point_name = item.get("灵感点", "")
-                    point_desc = item.get("描述", "")
-                    features = extract_features_from_point(item, post_id, point_name, point_desc)
-
-                    for feature in features:
-                        feature_name = feature["特征名称"]
-                        if feature_name not in result["灵感点"]:
-                            result["灵感点"][feature_name] = []
-                        result["灵感点"][feature_name].append({
-                            "点的名称": feature["点的名称"],
-                            "点的描述": feature["点的描述"],
-                            "帖子id": feature["帖子id"]
-                        })
-
-        # 处理目的点
-        if "目的点" in three_points:
-            purpose = three_points["目的点"]
-
-            if "purposes" in purpose and isinstance(purpose["purposes"], list):
-                for item in purpose["purposes"]:
-                    point_name = item.get("目的点", "")
-                    point_desc = item.get("描述", "")
-                    features = extract_features_from_point(item, post_id, point_name, point_desc)
-
-                    for feature in features:
-                        feature_name = feature["特征名称"]
-                        if feature_name not in result["目的点"]:
-                            result["目的点"][feature_name] = []
-                        result["目的点"][feature_name].append({
-                            "点的名称": feature["点的名称"],
-                            "点的描述": feature["点的描述"],
-                            "帖子id": feature["帖子id"]
-                        })
+            # 处理实质列表
+            for item in purpose_data.get("最终实质列表", []):
+                feature_name = item.get("目的点", "")
+                if not feature_name:
+                    continue
+                if feature_name not in result["目的点"]:
+                    result["目的点"][feature_name] = []
+                result["目的点"][feature_name].append({
+                    "点的名称": feature_name,
+                    "点的描述": item.get("描述", ""),
+                    "帖子id": post_id,
+                    "点ID": item.get("实质ID", ""),
+                    "类型": "实质",
+                    "关联意图ID": item.get("关联意图ID", "")
+                })
 
         # 处理关键点
-        if "关键点" in three_points:
-            key_points = three_points["关键点"]
-
-            if "key_points" in key_points and isinstance(key_points["key_points"], list):
-                for item in key_points["key_points"]:
-                    point_name = item.get("关键点", "")
-                    point_desc = item.get("描述", "")
-                    features = extract_features_from_point(item, post_id, point_name, point_desc)
-
-                    for feature in features:
-                        feature_name = feature["特征名称"]
-                        if feature_name not in result["关键点"]:
-                            result["关键点"][feature_name] = []
-                        result["关键点"][feature_name].append({
-                            "点的名称": feature["点的名称"],
-                            "点的描述": feature["点的描述"],
-                            "帖子id": feature["帖子id"]
-                        })
+        if "keypoint_final" in data:
+            keypoint_data = data["keypoint_final"]
+            for item in keypoint_data.get("最终关键点列表", []):
+                feature_name = item.get("关键点", "")
+                if not feature_name:
+                    continue
+                if feature_name not in result["关键点"]:
+                    result["关键点"][feature_name] = []
+                result["关键点"][feature_name].append({
+                    "点的名称": feature_name,
+                    "点的描述": item.get("描述", ""),
+                    "帖子id": post_id,
+                    "点ID": item.get("关键点ID", ""),
+                    "类型": item.get("类型", ""),
+                    "支撑的ID": item.get("支撑的ID", [])
+                })
 
     except Exception as e:
         print(f"处理文件 {file_path.name} 时出错: {e}")
@@ -204,15 +135,7 @@ def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
 
 
 def merge_results(all_results: List[Dict]) -> Dict:
-    """
-    合并所有文件的提取结果
-
-    Args:
-        all_results: 所有文件的结果列表
-
-    Returns:
-        合并后的结果
-    """
+    """合并所有文件的提取结果"""
     merged = {
         "灵感点": {},
         "目的点": {},
@@ -232,21 +155,9 @@ def merge_results(all_results: List[Dict]) -> Dict:
 def convert_to_array_format(
     merged_dict: Dict,
     fetch_details: bool = True,
-    time_filter: Optional[str] = None,
     exclude_post_ids: Optional[Set[str]] = None
 ) -> Dict:
-    """
-    将字典格式转换为数组格式,并添加帖子详情
-
-    Args:
-        merged_dict: 字典格式的结果
-        fetch_details: 是否获取帖子详情,默认为True
-        time_filter: 时间过滤阈值,只保留发布时间<该时间的帖子,格式为 "YYYY-MM-DD HH:MM:SS"
-        exclude_post_ids: 要排除的帖子ID集合
-
-    Returns:
-        数组格式的结果
-    """
+    """将字典格式转换为数组格式,并添加帖子详情"""
     result = {
         "灵感点": [],
         "目的点": [],
@@ -272,10 +183,7 @@ def convert_to_array_format(
 
         print(f"成功获取 {len(post_details)} 个帖子详情")
 
-        # 应用过滤规则
-        filtered_count = 0
-
-        # 1. 如果启用帖子ID过滤
+        # 应用帖子ID过滤
         if exclude_post_ids:
             print(f"\n正在应用帖子ID过滤,排除 {len(exclude_post_ids)} 个当前帖子...")
             before_count = len(post_details)
@@ -285,38 +193,18 @@ def convert_to_array_format(
                 print(f"  ⚠️  过滤掉 {filtered_count} 个当前帖子")
             print(f"保留 {len(post_details)} 个帖子")
 
-        # 2. 如果启用时间过滤(过滤掉发布时间晚于等于阈值的帖子,避免穿越)
-        elif time_filter:
-            print(f"\n正在应用时间过滤 (< {time_filter}),避免使用晚于当前帖子的数据...")
-            filtered_post_ids = set()
-            for post_id, detail in post_details.items():
-                publish_time = detail.get('publish_time', '')
-                if publish_time < time_filter:
-                    filtered_post_ids.add(post_id)
-                else:
-                    filtered_count += 1
-                    print(f"  ⚠️  过滤掉帖子 {post_id} (发布时间: {publish_time},晚于阈值)")
-
-            print(f"过滤掉 {filtered_count} 个帖子(穿越),保留 {len(filtered_post_ids)} 个帖子")
-            # 更新post_details,只保留符合时间条件的
-            post_details = {pid: detail for pid, detail in post_details.items() if pid in filtered_post_ids}
-
     # 转换为数组格式并添加帖子详情
     for category in ["灵感点", "目的点", "关键点"]:
         for feature_name, data in merged_dict[category].items():
-            # 为每个来源添加帖子详情
             enhanced_sources = []
             for source in data["来源"]:
-                # 如果启用过滤,跳过不符合条件的帖子
-                if fetch_details and (time_filter or exclude_post_ids) and source["帖子id"] not in post_details:
+                if fetch_details and exclude_post_ids and source["帖子id"] not in post_details:
                     continue
-
                 enhanced_source = source.copy()
                 if fetch_details and source["帖子id"] in post_details:
                     enhanced_source["帖子详情"] = post_details[source["帖子id"]]
                 enhanced_sources.append(enhanced_source)
 
-            # 只添加有来源的特征
             if enhanced_sources:
                 result[category].append({
                     "特征名称": feature_name,
@@ -327,15 +215,7 @@ def convert_to_array_format(
 
 
 def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
-    """
-    获取当前帖子目录中的所有帖子ID
-
-    Args:
-        current_posts_dir: 当前帖子目录路径
-
-    Returns:
-        当前帖子ID集合
-    """
+    """获取当前帖子目录中的所有帖子ID"""
     if not current_posts_dir.exists():
         print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
         return set()
@@ -358,60 +238,10 @@ def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
     return post_ids
 
 
-def get_earliest_publish_time(current_posts_dir: Path) -> Optional[str]:
-    """
-    获取当前帖子目录中最早的发布时间
-
-    Args:
-        current_posts_dir: 当前帖子目录路径
-
-    Returns:
-        最早的发布时间字符串,格式为 "YYYY-MM-DD HH:MM:SS"
-    """
-    if not current_posts_dir.exists():
-        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
-        return None
-
-    json_files = list(current_posts_dir.glob("*.json"))
-    if not json_files:
-        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
-        return None
-
-    print(f"\n正在获取当前帖子的发布时间...")
-    print(f"找到 {len(json_files)} 个当前帖子")
-
-    earliest_time = None
-    for file_path in json_files:
-        post_id = extract_post_id_from_filename(file_path.name)
-        if not post_id:
-            continue
-
-        try:
-            detail = get_post_detail(post_id)
-            if detail and 'publish_time' in detail:
-                publish_time = detail['publish_time']
-                if earliest_time is None or publish_time < earliest_time:
-                    earliest_time = publish_time
-                    print(f"  更新最早时间: {publish_time} (帖子: {post_id})")
-        except Exception as e:
-            print(f"  警告: 获取帖子 {post_id} 发布时间失败: {e}")
-
-    if earliest_time:
-        print(f"\n当前帖子最早发布时间: {earliest_time}")
-    else:
-        print("\n警告: 未能获取到任何当前帖子的发布时间")
-
-    return earliest_time
-
-
 def main():
-    # 使用路径配置
     config = PathConfig()
-
-    # 确保输出目录存在
     config.ensure_dirs()
 
-    # 获取路径
     input_dir = config.historical_posts_dir
     current_posts_dir = config.current_posts_dir
     output_file = config.feature_source_mapping_file
@@ -425,56 +255,39 @@ def main():
 
     print(f"\n正在扫描目录: {input_dir}")
 
-    # 获取所有JSON文件
     json_files = list(input_dir.glob("*.json"))
     print(f"找到 {len(json_files)} 个JSON文件")
 
-    # 处理所有文件
     all_results = []
     for i, file_path in enumerate(json_files, 1):
         print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
         result = process_single_file(file_path)
         all_results.append(result)
 
-    # 合并结果
     print("\n正在合并结果...")
     merged_result = merge_results(all_results)
 
-    # 根据配置的过滤模式应用过滤
-    filter_mode = config.filter_mode
-    time_filter = None
+    # 过滤当前帖子
     exclude_post_ids = None
-
-    if filter_mode == "exclude_current_posts":
-        # 新规则:排除当前帖子ID
+    if config.filter_mode == "exclude_current_posts":
         print("\n应用过滤规则: 排除当前帖子ID")
         exclude_post_ids = get_current_post_ids(current_posts_dir)
-    elif filter_mode == "time_based":
-        # 旧规则:基于发布时间
-        print("\n应用过滤规则: 基于发布时间")
-        time_filter = get_earliest_publish_time(current_posts_dir)
-    elif filter_mode == "none":
+    elif config.filter_mode == "none":
         print("\n过滤模式: none,不应用任何过滤")
-    else:
-        print(f"\n警告: 未知的过滤模式 '{filter_mode}',不应用过滤")
 
-    # 转换为数组格式(带过滤)
     print("正在转换为数组格式...")
     final_result = convert_to_array_format(
         merged_result,
         fetch_details=True,
-        time_filter=time_filter,
         exclude_post_ids=exclude_post_ids
     )
 
-    # 统计信息
     print(f"\n提取统计:")
     for category in ["灵感点", "目的点", "关键点"]:
         feature_count = len(final_result[category])
         source_count = sum(len(item["特征来源"]) for item in final_result[category])
         print(f"  {category}: {feature_count} 个特征, {source_count} 个来源")
 
-    # 保存结果
     print(f"\n正在保存结果到: {output_file}")
     with open(output_file, "w", encoding="utf-8") as f:
         json.dump(final_result, f, ensure_ascii=False, indent=4)

+ 227 - 48
script/data_processing/extract_nodes_and_edges.py

@@ -58,11 +58,10 @@ def build_node_id(dimension: str, node_type: str, name: str) -> str:
 
 
 def extract_post_id_from_filename(filename: str) -> str:
-    """从文件名中提取帖子ID"""
-    match = re.match(r'^([^_]+)_', filename)
-    if match:
-        return match.group(1)
-    return ""
+    """从文件名中提取帖子ID
+    格式: 68a6b96f000000001d006058.json
+    """
+    return filename.replace('.json', '')
 
 
 def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
@@ -111,11 +110,17 @@ def collect_all_post_ids_from_edges(edges: List[Dict]) -> Set[str]:
     """从边列表中收集所有帖子ID"""
     post_ids = set()
     for edge in edges:
-        if edge.get("边类型") in ("分类共现(跨点)", "标签共现"):
-            edge_details = edge.get("边详情", {})
+        edge_type = edge.get("边类型", "")
+        edge_details = edge.get("边详情", {})
+
+        if edge_type in ("分类共现(跨点)", "标签共现"):
             common_post_ids = edge_details.get("共同帖子ID", [])
             post_ids.update(common_post_ids)
-        # 点内共现边不包含帖子ID
+        elif edge_type in ("支撑", "关联意图"):
+            # 新边类型使用帖子ID列表
+            post_id_list = edge_details.get("帖子ID列表", [])
+            post_ids.update(post_id_list)
+        # 点内共现边、属于边、包含边不包含帖子ID
     return post_ids
 
 
@@ -468,44 +473,33 @@ def extract_tags_from_post(post_data: Dict) -> Dict[str, List[str]]:
         "关键点": []
     }
 
-    if "三点解构" not in post_data:
-        return tags_by_dimension
-
-    three_points = post_data["三点解构"]
-
-    # 提取灵感点的特征
-    if "灵感点" in three_points:
-        inspiration = three_points["灵感点"]
-        for section in ["全新内容", "共性差异", "共性内容"]:
-            if section in inspiration and isinstance(inspiration[section], list):
-                for item in inspiration[section]:
-                    if "提取的特征" in item and isinstance(item["提取的特征"], list):
-                        for feature in item["提取的特征"]:
-                            tag_name = feature.get("特征名称", "")
-                            if tag_name:
-                                tags_by_dimension["灵感点"].append(tag_name)
-
-    # 提取目的点的特征
-    if "目的点" in three_points:
-        purpose = three_points["目的点"]
-        if "purposes" in purpose and isinstance(purpose["purposes"], list):
-            for item in purpose["purposes"]:
-                if "提取的特征" in item and isinstance(item["提取的特征"], list):
-                    for feature in item["提取的特征"]:
-                        tag_name = feature.get("特征名称", "")
-                        if tag_name:
-                            tags_by_dimension["目的点"].append(tag_name)
-
-    # 提取关键点的特征
-    if "关键点" in three_points:
-        key_points = three_points["关键点"]
-        if "key_points" in key_points and isinstance(key_points["key_points"], list):
-            for item in key_points["key_points"]:
-                if "提取的特征" in item and isinstance(item["提取的特征"], list):
-                    for feature in item["提取的特征"]:
-                        tag_name = feature.get("特征名称", "")
-                        if tag_name:
-                            tags_by_dimension["关键点"].append(tag_name)
+    # 提取灵感点
+    if "inspiration_final_result" in post_data:
+        inspiration_data = post_data["inspiration_final_result"]
+        for item in inspiration_data.get("最终灵感点列表", []):
+            tag_name = item.get("灵感点", "")
+            if tag_name:
+                tags_by_dimension["灵感点"].append(tag_name)
+
+    # 提取目的点(意图+实质)
+    if "purpose_final_result" in post_data:
+        purpose_data = post_data["purpose_final_result"]
+        for item in purpose_data.get("最终意图列表", []):
+            tag_name = item.get("目的点", "")
+            if tag_name:
+                tags_by_dimension["目的点"].append(tag_name)
+        for item in purpose_data.get("最终实质列表", []):
+            tag_name = item.get("目的点", "")
+            if tag_name:
+                tags_by_dimension["目的点"].append(tag_name)
+
+    # 提取关键点
+    if "keypoint_final" in post_data:
+        keypoint_data = post_data["keypoint_final"]
+        for item in keypoint_data.get("最终关键点列表", []):
+            tag_name = item.get("关键点", "")
+            if tag_name:
+                tags_by_dimension["关键点"].append(tag_name)
 
     return tags_by_dimension
 
@@ -596,6 +590,173 @@ def extract_tag_cooccurrence_edges(historical_posts_dir: Path, exclude_post_ids:
     return edges
 
 
+# ========== 支撑边和关联意图边提取(新版数据结构)==========
+
+def _merge_post_edge(edges, edge_index, source_id, target_id, edge_type, source_type, target_type, post_id):
+    """
+    Record that post_id produced the edge source_id -> target_id.
+
+    A new edge dict is appended to edges (and indexed) the first time a node pair
+    is seen; afterwards only the post ID is added to the existing edge, whose
+    type fields keep the values from the first occurrence.
+
+    Args:
+        edges: Output edge list, appended to in place.
+        edge_index: Dict mapping (source_id, target_id) to the edge dict in edges.
+        source_id: Source node ID.
+        target_id: Target node ID.
+        edge_type: Value stored under "边类型".
+        source_type: Value stored under "边详情" -> "源类型".
+        target_type: Value stored under "边详情" -> "目标类型".
+        post_id: ID of the post the edge was observed in.
+    """
+    edge = edge_index.get((source_id, target_id))
+    if edge is None:
+        edge = {
+            "源节点ID": source_id,
+            "目标节点ID": target_id,
+            "边类型": edge_type,
+            "边详情": {
+                "源类型": source_type,
+                "目标类型": target_type,
+                "帖子ID列表": [post_id]
+            }
+        }
+        edge_index[(source_id, target_id)] = edge
+        edges.append(edge)
+    elif post_id not in edge["边详情"]["帖子ID列表"]:
+        edge["边详情"]["帖子ID列表"].append(post_id)
+
+
+def extract_support_and_intent_edges(historical_posts_dir: Path, exclude_post_ids: Set[str] = None) -> tuple[List[Dict], List[Dict]]:
+    """
+    Extract support edges and related-intent edges from historical post files.
+
+    Only files in the new data structure are used, i.e. JSON content that has a
+    "keypoint_final" or "purpose_final_result" key; other files are skipped.
+
+    Support edge: key point -> inspiration point / intent / substance
+    Related-intent edge: substance -> intent
+
+    Each (source node, target node) pair yields a single edge; every post that
+    produces the pair is listed once in that edge's "帖子ID列表".
+
+    Args:
+        historical_posts_dir: Directory whose *.json files are scanned.
+        exclude_post_ids: Post IDs to skip; None means no post is excluded.
+
+    Returns:
+        (support edge list, related-intent edge list). Both are empty when the
+        directory does not exist.
+    """
+    if exclude_post_ids is None:
+        exclude_post_ids = set()
+
+    support_edges = []
+    intent_edges = []
+    # (source node ID, target node ID) -> edge dict, so a repeated pair is merged
+    # with one dict lookup instead of a scan over the whole edge list.
+    support_index = {}
+    intent_index = {}
+
+    if not historical_posts_dir.exists():
+        print(f"警告: 历史帖子目录不存在: {historical_posts_dir}")
+        return [], []
+
+    json_files = list(historical_posts_dir.glob("*.json"))
+    print(f"找到 {len(json_files)} 个历史帖子文件")
+
+    for file_path in json_files:
+        # Use the file stem when no post ID is extracted from the filename
+        post_id = extract_post_id_from_filename(file_path.name) or file_path.stem
+        if post_id in exclude_post_ids:
+            continue
+
+        try:
+            with open(file_path, "r", encoding="utf-8") as f:
+                post_data = json.load(f)
+
+            # Only the new data structure is handled
+            if "keypoint_final" not in post_data and "purpose_final_result" not in post_data:
+                continue
+
+            # Lookup tables rebuilt for every post: item ID -> name / type
+            id_to_name = {}
+            id_to_type = {}  # values: "灵感点", "意图" or "实质"
+
+            # Inspiration points
+            if "inspiration_final_result" in post_data:
+                for item in post_data["inspiration_final_result"].get("最终灵感点列表", []):
+                    item_id = item.get("id", "")
+                    item_name = item.get("灵感点", "")
+                    if item_id and item_name:
+                        id_to_name[item_id] = item_name
+                        id_to_type[item_id] = "灵感点"
+
+            if "purpose_final_result" in post_data:
+                purpose_data = post_data["purpose_final_result"]
+
+                # Intents are registered first so substances can resolve them below
+                for item in purpose_data.get("最终意图列表", []):
+                    item_id = item.get("意图ID", "")
+                    item_name = item.get("目的点", "")
+                    if item_id and item_name:
+                        id_to_name[item_id] = item_name
+                        id_to_type[item_id] = "意图"
+
+                # Substances, plus the related-intent edge: substance -> intent
+                for item in purpose_data.get("最终实质列表", []):
+                    item_id = item.get("实质ID", "")
+                    item_name = item.get("目的点", "")
+                    # An unnamed substance would yield a node ID built from "",
+                    # so it is neither registered nor linked.
+                    if not item_id or not item_name:
+                        continue
+                    id_to_name[item_id] = item_name
+                    id_to_type[item_id] = "实质"
+
+                    # The referenced ID must be an intent: the edge is labelled with
+                    # target type "意图", so an inspiration/substance ID is rejected.
+                    related_intent_id = item.get("关联意图ID", "")
+                    if id_to_type.get(related_intent_id) != "意图":
+                        continue
+                    # Substances and intents both live in the "目的点" dimension
+                    _merge_post_edge(
+                        intent_edges, intent_index,
+                        build_node_id("目的点", "标签", item_name),
+                        build_node_id("目的点", "标签", id_to_name[related_intent_id]),
+                        "关联意图", "实质", "意图", post_id,
+                    )
+
+            # Support edges: key point -> every resolvable ID in "支撑的ID"
+            if "keypoint_final" in post_data:
+                for item in post_data["keypoint_final"].get("最终关键点列表", []):
+                    kp_name = item.get("关键点", "")
+                    support_ids = item.get("支撑的ID", [])
+                    if not kp_name or not support_ids:
+                        continue
+
+                    kp_node_id = build_node_id("关键点", "标签", kp_name)
+                    for support_id in support_ids:
+                        # IDs that do not resolve within this post are ignored
+                        if support_id not in id_to_name:
+                            continue
+                        target_type = id_to_type[support_id]
+                        # Intents and substances both belong to the "目的点" dimension
+                        target_dimension = "灵感点" if target_type == "灵感点" else "目的点"
+                        target_node_id = build_node_id(target_dimension, "标签", id_to_name[support_id])
+                        _merge_post_edge(
+                            support_edges, support_index, kp_node_id, target_node_id,
+                            "支撑", "关键点", target_type, post_id,
+                        )
+
+        except Exception as e:
+            # Best effort: a file that fails is reported and the scan continues
+            print(f"  警告: 处理文件 {file_path.name} 时出错: {e}")
+
+    return support_edges, intent_edges
+
+
 # ========== 分类-分类边提取 ==========
 
 def extract_category_edges_from_associations(associations_data: Dict) -> List[Dict]:
@@ -762,8 +923,15 @@ def main():
 
     # 输入文件路径
     pattern_file = config.pattern_cluster_file
-    associations_file = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
-    intra_associations_file = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
+    # 尝试新路径,如果不存在则使用旧路径
+    associations_file_new = config.account_dir / "pattern相关文件/detail/dimension_associations_analysis.json"
+    associations_file_old = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
+    associations_file = associations_file_new if associations_file_new.exists() else associations_file_old
+
+    intra_associations_file_new = config.account_dir / "pattern相关文件/detail/intra_dimension_associations_analysis.json"
+    intra_associations_file_old = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
+    intra_associations_file = intra_associations_file_new if intra_associations_file_new.exists() else intra_associations_file_old
+
     current_posts_dir = config.current_posts_dir
 
     # 输出文件路径
@@ -906,11 +1074,22 @@ def main():
     all_edges.extend(tag_cooccurrence_edges)
     print(f"  标签-标签共现边: {len(tag_cooccurrence_edges)} 条")
 
+    # ===== 提取支撑边和关联意图边(新版数据结构)=====
+    print("\n" + "="*60)
+    print("提取支撑边和关联意图边(新版数据结构)...")
+    support_edges, intent_edges = extract_support_and_intent_edges(historical_posts_dir, exclude_post_ids)
+    all_edges.extend(support_edges)
+    all_edges.extend(intent_edges)
+    print(f"  支撑边: {len(support_edges)} 条")
+    print(f"  关联意图边: {len(intent_edges)} 条")
+
     # 更新总计
     print(f"\n总计: {len(all_edges)} 条边")
     print(f"  分类共现(跨点)边: {len(category_edges)}")
     print(f"  分类共现(点内)边: {len(intra_category_edges)}")
     print(f"  标签共现边: {len(tag_cooccurrence_edges)}")
+    print(f"  支撑边: {len(support_edges)}")
+    print(f"  关联意图边: {len(intent_edges)}")
     print(f"  属于边: {belong_count}")
     print(f"  包含边: {contain_count}")