1
0

6 Commits 7f86de3a26 ... d3022c2454

Autor SHA1 Nachricht Datum
  yangxiaohui d3022c2454 Merge remote-tracking branch 'origin/how_1125_v2' into how_1125_v1 vor 5 Tagen
  yangxiaohui 8d67e969d1 feat: 添加帖子树可视化,优化布局为三层结构 vor 5 Tagen
  yangxiaohui a70cd23ef9 feat: 添加跨层边提取和图谱流程一键脚本 vor 5 Tagen
  yangxiaohui 81f6ab587f refactor: 重命名图层标签 vor 5 Tagen
  yangxiaohui c47cfcbbd9 feat: 关系图展示完整路径并支持点击交互 vor 5 Tagen
  yangxiaohui 36e1132d87 fix: 统一使用预计算路径节点高亮帖子标签边 vor 5 Tagen

+ 26 - 2
script/data_processing/build_match_graph.py

@@ -645,11 +645,30 @@ def process_filtered_result(
     useful_category_edges = [e for e in category_edges
                             if e["源节点ID"] in useful_expanded_ids and e["目标节点ID"] in useful_expanded_ids]
 
+    # 5. 获取直接匹配层(第2层)和扩展层(第3层)之间的所有边(不仅仅是属于边)
+    # 这些边连接了直接匹配的人设节点和扩展的分类节点
+    cross_layer_edges = []
+    for edge in edges_data.get("边列表", []):
+        src, tgt = edge["源节点ID"], edge["目标节点ID"]
+        edge_type = edge["边类型"]
+        # 跳过已经收集的属于边(避免重复)
+        if edge_type == "属于":
+            continue
+        # 一端在直接匹配层,另一端在扩展层
+        src_in_direct = src in persona_node_ids
+        src_in_expanded = src in useful_expanded_ids
+        tgt_in_direct = tgt in persona_node_ids
+        tgt_in_expanded = tgt in useful_expanded_ids
+        if (src_in_direct and tgt_in_expanded) or (src_in_expanded and tgt_in_direct):
+            cross_layer_edges.append(edge)
+
     # 合并节点列表
     all_nodes = post_nodes + persona_nodes + useful_expanded_nodes
 
     # 合并边列表(加入帖子内的属于边)
-    all_edges = post_belong_edges + match_edges + persona_edges + post_edges + useful_expanded_edges + useful_category_edges + post_edges_via_expanded
+    all_edges = (post_belong_edges + match_edges + persona_edges + post_edges +
+                 useful_expanded_edges + useful_category_edges + cross_layer_edges +
+                 post_edges_via_expanded)
     # 去重边
     seen_edges = set()
     unique_edges = []
@@ -709,6 +728,7 @@ def process_filtered_result(
                 "匹配边数": len(match_edges),
                 "人设节点间边数": len(persona_edges),
                 "扩展边数(有效)": len(useful_expanded_edges),
+                "跨层边数": len(cross_layer_edges),
                 "帖子镜像边数(直接)": len(post_edges),
                 "帖子镜像边数(二阶)": len(post_edges_via_expanded),
                 "总节点数": len(all_nodes),
@@ -724,6 +744,7 @@ def process_filtered_result(
         "匹配边列表": match_edges,
         "人设节点间边列表": persona_edges,
         "扩展边列表": useful_expanded_edges,
+        "跨层边列表": cross_layer_edges,
         "帖子镜像边列表(直接)": post_edges,
         "帖子镜像边列表(二阶)": post_edges_via_expanded,
         "节点列表": all_nodes,
@@ -748,6 +769,7 @@ def process_filtered_result(
         "匹配边数": len(match_edges),
         "人设边数": len(persona_edges),
         "扩展边数": len(useful_expanded_edges),
+        "跨层边数": len(cross_layer_edges),
         "帖子边数(直接)": len(post_edges),
         "帖子边数(二阶)": len(post_edges_via_expanded),
         "总节点数": len(all_nodes),
@@ -805,7 +827,7 @@ def main():
         result = process_filtered_result(filtered_file, nodes_data, edges_data, output_dir)
         results.append(result)
         print(f"  帖子节点: {result['帖子节点数']}, 人设节点: {result['人设节点数']}, 扩展节点: {result['扩展节点数']}")
-        print(f"  匹配边: {result['匹配边数']}, 人设边: {result['人设边数']}, 扩展边: {result['扩展边数']}")
+        print(f"  匹配边: {result['匹配边数']}, 人设边: {result['人设边数']}, 扩展边: {result['扩展边数']}, 跨层边: {result['跨层边数']}")
         print(f"  帖子边(直接): {result['帖子边数(直接)']}, 帖子边(二阶): {result['帖子边数(二阶)']}")
 
     # 汇总统计
@@ -819,6 +841,7 @@ def main():
     total_match = sum(r['匹配边数'] for r in results)
     total_persona_edges = sum(r['人设边数'] for r in results)
     total_expanded_edges = sum(r['扩展边数'] for r in results)
+    total_cross_layer_edges = sum(r['跨层边数'] for r in results)
     total_post_edges_direct = sum(r['帖子边数(直接)'] for r in results)
     total_post_edges_2hop = sum(r['帖子边数(二阶)'] for r in results)
     print(f"  总帖子节点: {total_post}")
@@ -827,6 +850,7 @@ def main():
     print(f"  总匹配边: {total_match}")
     print(f"  总人设边: {total_persona_edges}")
     print(f"  总扩展边: {total_expanded_edges}")
+    print(f"  总跨层边: {total_cross_layer_edges}")
     print(f"  总帖子边(直接): {total_post_edges_direct}")
     print(f"  总帖子边(二阶): {total_post_edges_2hop}")
     print(f"\n输出目录: {output_dir}")

+ 185 - 0
script/data_processing/build_post_tree.py

@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+构建帖子树的中间数据
+
+输入:match_graph/*.json, results/*.json
+输出:match_graph/post_trees.json(包含所有帖子的树结构)
+"""
+
+import json
+from pathlib import Path
+import sys
+
+# 添加项目根目录到路径
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from script.data_processing.path_config import PathConfig
+
+
+def build_post_trees():
+    """构建所有帖子的树数据"""
+    config = PathConfig()
+
+    print(f"账号: {config.account_name}")
+    print(f"输出版本: {config.output_version}")
+    print()
+
+    match_graph_dir = config.intermediate_dir / "match_graph"
+    results_dir = config.intermediate_dir.parent / "results"
+    output_file = match_graph_dir / "post_trees.json"
+
+    # 读取所有匹配图谱文件
+    graph_files = sorted(match_graph_dir.glob("*_match_graph.json"))
+    print(f"找到 {len(graph_files)} 个匹配图谱文件")
+
+    all_post_trees = []
+
+    for i, graph_file in enumerate(graph_files, 1):
+        print(f"\n[{i}/{len(graph_files)}] 处理: {graph_file.name}")
+
+        with open(graph_file, "r", encoding="utf-8") as f:
+            match_graph_data = json.load(f)
+
+        post_id = match_graph_data["说明"]["帖子ID"]
+        post_title = match_graph_data["说明"].get("帖子标题", "")
+
+        # 读取完整帖子详情
+        post_detail = {
+            "title": post_title,
+            "post_id": post_id
+        }
+        how_file = results_dir / f"{post_id}_how.json"
+        if how_file.exists():
+            with open(how_file, "r", encoding="utf-8") as f:
+                how_data = json.load(f)
+                if "帖子详情" in how_data:
+                    post_detail = how_data["帖子详情"]
+                    post_detail["post_id"] = post_id
+            print(f"  读取帖子详情: {how_file.name}")
+
+        # 获取帖子点和帖子标签
+        post_points = match_graph_data.get("帖子点节点列表", [])
+        post_tags = match_graph_data.get("帖子标签节点列表", [])
+        belong_edges = match_graph_data.get("帖子属于边列表", [])
+
+        print(f"  帖子点: {len(post_points)}, 帖子标签: {len(post_tags)}, 属于边: {len(belong_edges)}")
+
+        # 构建树结构
+        # 维度颜色
+        dim_colors = {
+            "灵感点": "#f39c12",
+            "目的点": "#3498db",
+            "关键点": "#9b59b6"
+        }
+
+        # 构建节点映射
+        point_map = {}
+        for n in post_points:
+            point_map[n["节点ID"]] = {
+                "id": n["节点ID"],
+                "name": n["节点名称"],
+                "nodeType": "点",
+                "level": n.get("节点层级", ""),
+                "dimColor": dim_colors.get(n.get("节点层级", ""), "#888"),
+                "description": n.get("描述", ""),
+                "children": []
+            }
+
+        tag_map = {}
+        for n in post_tags:
+            tag_map[n["节点ID"]] = {
+                "id": n["节点ID"],
+                "name": n["节点名称"],
+                "nodeType": "标签",
+                "level": n.get("节点层级", ""),
+                "dimColor": dim_colors.get(n.get("节点层级", ""), "#888"),
+                "weight": n.get("权重", 0),
+                "children": []
+            }
+
+        # 根据属于边,把标签挂到点下面
+        for e in belong_edges:
+            tag_node = tag_map.get(e["源节点ID"])
+            point_node = point_map.get(e["目标节点ID"])
+            if tag_node and point_node:
+                point_node["children"].append(tag_node)
+
+        # 按维度分组点节点
+        dimensions = ["灵感点", "目的点", "关键点"]
+        dimension_children = []
+
+        for dim in dimensions:
+            dim_points = [
+                point_map[n["节点ID"]]
+                for n in post_points
+                if n.get("节点层级") == dim and n["节点ID"] in point_map
+            ]
+
+            if dim_points:
+                dim_node = {
+                    "id": f"dim_{dim}",
+                    "name": dim,
+                    "nodeType": "维度",
+                    "isDimension": True,
+                    "dimColor": dim_colors[dim],
+                    "children": dim_points
+                }
+                dimension_children.append(dim_node)
+
+        # 根节点(帖子)
+        root_node = {
+            "id": f"post_{post_id}",
+            "name": post_title[:20] + "..." if len(post_title) > 20 else post_title,
+            "nodeType": "帖子",
+            "isRoot": True,
+            "postDetail": post_detail,
+            "children": dimension_children
+        }
+
+        # 统计节点数
+        total_nodes = 1 + len(dimension_children)  # 根节点 + 维度节点
+        for dim_node in dimension_children:
+            total_nodes += len(dim_node["children"])  # 点节点
+            for point_node in dim_node["children"]:
+                total_nodes += len(point_node["children"])  # 标签节点
+
+        post_tree = {
+            "postId": post_id,
+            "postTitle": post_title,
+            "postDetail": post_detail,
+            "root": root_node,
+            "stats": {
+                "totalNodes": total_nodes,
+                "pointCount": len(post_points),
+                "tagCount": len(post_tags)
+            }
+        }
+
+        all_post_trees.append(post_tree)
+        print(f"  构建完成: {total_nodes} 个节点")
+
+    # 输出
+    output_data = {
+        "说明": {
+            "描述": "帖子树结构数据(每个帖子一棵树)",
+            "帖子数": len(all_post_trees)
+        },
+        "postTrees": all_post_trees
+    }
+
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(output_data, f, ensure_ascii=False, indent=2)
+
+    print()
+    print("=" * 60)
+    print(f"构建完成!")
+    print(f"  帖子数: {len(all_post_trees)}")
+    print(f"  输出文件: {output_file}")
+
+    return output_file
+
+
+if __name__ == "__main__":
+    build_post_trees()

+ 129 - 0
script/data_processing/run_graph_pipeline.sh

@@ -0,0 +1,129 @@
+#!/bin/bash
+# 图谱构建与可视化流程(步骤5-9)
+#
+# 依赖前置步骤(1-4)已执行完成:
+#   1. extract_feature_categories.py
+#   2. extract_features_from_posts.py
+#   3. extract_current_posts.py
+#   4. match_inspiration_features.py
+#
+# 本脚本执行:
+#   5. filter_how_results.py      - 过滤how解构结果
+#   6. extract_nodes_and_edges.py - 提取节点和边
+#   7. build_persona_tree.py      - 构建人设树
+#   8. build_match_graph.py       - 构建匹配图谱
+#   9. visualize_match_graph.py   - 生成可视化HTML
+#
+# 使用方式:
+#   ./run_graph_pipeline.sh              # 使用默认账号
+#   ./run_graph_pipeline.sh 阿里多多酱    # 指定账号
+#   ACCOUNT_NAME=xxx ./run_graph_pipeline.sh
+
+set -e  # 遇到错误立即退出
+
+# 获取脚本所在目录的绝对路径
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+cd "$PROJECT_ROOT"
+
+# 颜色定义
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+print_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
+print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
+print_error() { echo -e "${RED}[ERROR]${NC} $1"; }
+print_step() { echo -e "${YELLOW}[$1]${NC} $2"; }
+
+# Run one pipeline stage and report its outcome.
+#   $1  stage index, shown as "<index>/5"
+#   $2  human-readable stage name
+#   $3  script file name under script/data_processing/
+# Returns 0 when the script exits successfully, 1 otherwise.
+run_step() {
+    local index=$1
+    local label=$2
+    local script_file=$3
+
+    print_step "$index/5" "$label"
+
+    # Guard clause: report the failure and bail out early.
+    if ! python "script/data_processing/$script_file"; then
+        print_error "$label 失败"
+        return 1
+    fi
+
+    print_success "$label 完成"
+    echo ""
+    return 0
+}
+
+# 主处理函数
+process_account() {
+    local account_name=$1
+
+    echo ""
+    echo "=========================================="
+    echo "图谱构建与可视化流程"
+    echo "账号: $account_name"
+    echo "项目: $PROJECT_ROOT"
+    echo "=========================================="
+    echo ""
+
+    # 设置环境变量
+    export ACCOUNT_NAME="$account_name"
+
+    # 步骤5: 过滤how解构结果
+    run_step 1 "过滤how解构结果" "filter_how_results.py" || return 1
+
+    # 步骤6: 提取节点和边
+    run_step 2 "提取节点和边" "extract_nodes_and_edges.py" || return 1
+
+    # 步骤7: 构建人设树
+    run_step 3 "构建人设树" "build_persona_tree.py" || return 1
+
+    # 步骤8: 构建匹配图谱
+    run_step 4 "构建匹配图谱" "build_match_graph.py" || return 1
+
+    # 步骤9: 生成可视化HTML
+    run_step 5 "生成可视化HTML" "visualize_match_graph.py" || return 1
+
+    echo "=========================================="
+    print_success "图谱构建与可视化流程完成!"
+    echo "=========================================="
+}
+
+# 获取默认账号
+get_default_account() {
+    python -c "
+import json
+from pathlib import Path
+config_file = Path('config/accounts.json')
+with open(config_file) as f:
+    config = json.load(f)
+print(config.get('default_account', ''))
+"
+}
+
+# Entry point: pick the account name and run the pipeline for it.
+# Precedence: first positional argument, then the ACCOUNT_NAME environment
+# variable, then the default account from the config file.
+main() {
+    local account_name=""
+
+    if [ -n "$1" ]; then
+        account_name="$1"
+    elif [ -n "$ACCOUNT_NAME" ]; then
+        account_name="$ACCOUNT_NAME"
+    else
+        # With "set -e" active, a failing lookup (e.g. missing config file)
+        # would abort the script right here, before the usage message below
+        # can be printed. Treat a failed lookup as "no default account".
+        account_name=$(get_default_account) || account_name=""
+        if [ -z "$account_name" ]; then
+            print_error "未指定账号,请通过参数或环境变量指定"
+            echo "用法: $0 <账号名>"
+            exit 1
+        fi
+        print_info "使用默认账号: $account_name"
+    fi
+
+    process_account "$account_name"
+}
+
+main "$@"

Datei-Diff unterdrückt, da er zu groß ist
+ 772 - 150
script/data_processing/visualize_match_graph.py


Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.