4 dias atrás · d3022c2454
--- a/script/data_processing/build_match_graph.py
+++ b/script/data_processing/build_match_graph.py
@@ -645,11 +645,30 @@ def process_filtered_result(
 
				     useful_category_edges = [e for e in category_edges
			
 
				                             if e["源节点ID"] in useful_expanded_ids and e["目标节点ID"] in useful_expanded_ids]
			
 
				 
			
 
				+    # 5. 获取直接匹配层（第2层）和扩展层（第3层）之间的所有边（不仅仅是属于边）
			
 
				+    # 这些边连接了直接匹配的人设节点和扩展的分类节点
			
 
				+    cross_layer_edges = []
			
 
				+    for edge in edges_data.get("边列表", []):
			
 
				+        src, tgt = edge["源节点ID"], edge["目标节点ID"]
			
 
				+        edge_type = edge["边类型"]
			
 
				+        # 跳过已经收集的属于边（避免重复）
			
 
				+        if edge_type == "属于":
			
 
				+            continue
			
 
				+        # 一端在直接匹配层，另一端在扩展层
			
 
				+        src_in_direct = src in persona_node_ids
			
 
				+        src_in_expanded = src in useful_expanded_ids
			
 
				+        tgt_in_direct = tgt in persona_node_ids
			
 
				+        tgt_in_expanded = tgt in useful_expanded_ids
			
 
				+        if (src_in_direct and tgt_in_expanded) or (src_in_expanded and tgt_in_direct):
			
 
				+            cross_layer_edges.append(edge)
			
 
				+
			
 
				     # 合并节点列表
			
 
				     all_nodes = post_nodes + persona_nodes + useful_expanded_nodes
			
 
				 
			
 
				     # 合并边列表（加入帖子内的属于边）
			
 
				-    all_edges = post_belong_edges + match_edges + persona_edges + post_edges + useful_expanded_edges + useful_category_edges + post_edges_via_expanded
			
 
				+    all_edges = (post_belong_edges + match_edges + persona_edges + post_edges +
			
 
				+                 useful_expanded_edges + useful_category_edges + cross_layer_edges +
			
 
				+                 post_edges_via_expanded)
			
 
				     # 去重边
			
 
				     seen_edges = set()
			
 
				     unique_edges = []
			
@@ -709,6 +728,7 @@ def process_filtered_result(
 
				                 "匹配边数": len(match_edges),
			
 
				                 "人设节点间边数": len(persona_edges),
			
 
				                 "扩展边数（有效）": len(useful_expanded_edges),
			
 
				+                "跨层边数": len(cross_layer_edges),
			
 
				                 "帖子镜像边数（直接）": len(post_edges),
			
 
				                 "帖子镜像边数（二阶）": len(post_edges_via_expanded),
			
 
				                 "总节点数": len(all_nodes),
			
@@ -724,6 +744,7 @@ def process_filtered_result(
 
				         "匹配边列表": match_edges,
			
 
				         "人设节点间边列表": persona_edges,
			
 
				         "扩展边列表": useful_expanded_edges,
			
 
				+        "跨层边列表": cross_layer_edges,
			
 
				         "帖子镜像边列表（直接）": post_edges,
			
 
				         "帖子镜像边列表（二阶）": post_edges_via_expanded,
			
 
				         "节点列表": all_nodes,
			
@@ -748,6 +769,7 @@ def process_filtered_result(
 
				         "匹配边数": len(match_edges),
			
 
				         "人设边数": len(persona_edges),
			
 
				         "扩展边数": len(useful_expanded_edges),
			
 
				+        "跨层边数": len(cross_layer_edges),
			
 
				         "帖子边数（直接）": len(post_edges),
			
 
				         "帖子边数（二阶）": len(post_edges_via_expanded),
			
 
				         "总节点数": len(all_nodes),
			
@@ -805,7 +827,7 @@ def main():
 
				         result = process_filtered_result(filtered_file, nodes_data, edges_data, output_dir)
			
 
				         results.append(result)
			
 
				         print(f"  帖子节点: {result['帖子节点数']}, 人设节点: {result['人设节点数']}, 扩展节点: {result['扩展节点数']}")
			
 
				-        print(f"  匹配边: {result['匹配边数']}, 人设边: {result['人设边数']}, 扩展边: {result['扩展边数']}")
			
 
				+        print(f"  匹配边: {result['匹配边数']}, 人设边: {result['人设边数']}, 扩展边: {result['扩展边数']}, 跨层边: {result['跨层边数']}")
			
 
				         print(f"  帖子边(直接): {result['帖子边数（直接）']}, 帖子边(二阶): {result['帖子边数（二阶）']}")
			
 
				 
			
 
				     # 汇总统计
			
@@ -819,6 +841,7 @@ def main():
 
				     total_match = sum(r['匹配边数'] for r in results)
			
 
				     total_persona_edges = sum(r['人设边数'] for r in results)
			
 
				     total_expanded_edges = sum(r['扩展边数'] for r in results)
			
 
				+    total_cross_layer_edges = sum(r['跨层边数'] for r in results)
			
 
				     total_post_edges_direct = sum(r['帖子边数（直接）'] for r in results)
			
 
				     total_post_edges_2hop = sum(r['帖子边数（二阶）'] for r in results)
			
 
				     print(f"  总帖子节点: {total_post}")
			
@@ -827,6 +850,7 @@ def main():
 
				     print(f"  总匹配边: {total_match}")
			
 
				     print(f"  总人设边: {total_persona_edges}")
			
 
				     print(f"  总扩展边: {total_expanded_edges}")
			
 
				+    print(f"  总跨层边: {total_cross_layer_edges}")
			
 
				     print(f"  总帖子边(直接): {total_post_edges_direct}")
			
 
				     print(f"  总帖子边(二阶): {total_post_edges_2hop}")
			
 
				     print(f"\n输出目录: {output_dir}")
			
--- a/script/data_processing/run_graph_pipeline.sh
+++ b/script/data_processing/run_graph_pipeline.sh
@@ -0,0 +1,129 @@
 
				+#!/bin/bash
			
 
				+# 图谱构建与可视化流程（步骤5-9）
			
 
				+#
			
 
				+# 依赖前置步骤（1-4）已执行完成：
			
 
				+#   1. extract_feature_categories.py
			
 
				+#   2. extract_features_from_posts.py
			
 
				+#   3. extract_current_posts.py
			
 
				+#   4. match_inspiration_features.py
			
 
				+#
			
 
				+# 本脚本执行：
			
 
				+#   5. filter_how_results.py      - 过滤how解构结果
			
 
				+#   6. extract_nodes_and_edges.py - 提取节点和边
			
 
				+#   7. build_persona_tree.py      - 构建人设树
			
 
				+#   8. build_match_graph.py       - 构建匹配图谱
			
 
				+#   9. visualize_match_graph.py   - 生成可视化HTML
			
 
				+#
			
 
				+# 使用方式：
			
 
				+#   ./run_graph_pipeline.sh              # 使用默认账号
			
 
				+#   ./run_graph_pipeline.sh 阿里多多酱    # 指定账号
			
 
				+#   ACCOUNT_NAME=xxx ./run_graph_pipeline.sh
			
 
				+
			
 
				# Abort the whole run as soon as any command exits non-zero.
				set -e

				# ANSI color sequences for the log prefixes. They are stored as literal
				# backslash escapes and only turned into control characters by the
				# print helpers at output time.
				NC='\033[0m'
				BLUE='\033[0;34m'
				GREEN='\033[0;32m'
				YELLOW='\033[1;33m'
				RED='\033[0;31m'

				# Absolute directory of this script, and the project root two levels above it.
				SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
				PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

				# Everything below uses paths relative to the project root.
				cd "$PROJECT_ROOT"
			
 
				+
			
 
				# Colored log helpers. Only the color codes go through %b (escape
				# expansion); the message is printed with %s so backslashes in paths or
				# account names are emitted verbatim instead of being interpreted.
				#   print_info    <message>          -> "[INFO] message" on stdout
				#   print_success <message>          -> "[SUCCESS] message" on stdout
				#   print_error   <message>          -> "[ERROR] message" on stderr
				#   print_step    <label> <message>  -> "[label] message" on stdout
				print_info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"; }
				print_success() { printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"; }
				print_error() { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2; }
				print_step() { printf '%b[%s]%b %s\n' "$YELLOW" "$1" "$NC" "$2"; }
			
 
				+
			
 
				+# 执行单个步骤
			
 
				run_step() {
				    # Run one pipeline script and report the outcome.
				    #   $1 - position of this step in the run (displayed as "<n>/5")
				    #   $2 - human-readable step name used in the log lines
				    #   $3 - script file name under script/data_processing/
				    # Returns 0 when the script exits successfully, 1 otherwise.
				    local index="$1"
				    local label="$2"
				    local script="$3"

				    print_step "${index}/5" "$label"

				    # Guard clause: report the failure and bail out early.
				    if ! python "script/data_processing/${script}"; then
				        print_error "${label} 失败"
				        return 1
				    fi

				    print_success "${label} 完成"
				    echo ""
				    return 0
				}
			
 
				+
			
 
				+# 主处理函数
			
 
				process_account() {
				    # Run the five graph-building steps, in order, for one account.
				    #   $1 - account name; exported as ACCOUNT_NAME so child processes inherit it
				    # Returns 1 as soon as any step fails, 0 after all steps succeed.
				    local account_name="$1"
				    local rule="=========================================="

				    # Banner: blank line, rule, title, account, project root, rule, blank line.
				    printf '%s\n' \
				        "" \
				        "$rule" \
				        "图谱构建与可视化流程" \
				        "账号: $account_name" \
				        "项目: $PROJECT_ROOT" \
				        "$rule" \
				        ""

				    export ACCOUNT_NAME="$account_name"

				    # One entry per step, formatted as "<step name>|<script file>".
				    # These correspond to steps 5-9 of the overall workflow.
				    local steps=(
				        "过滤how解构结果|filter_how_results.py"
				        "提取节点和边|extract_nodes_and_edges.py"
				        "构建人设树|build_persona_tree.py"
				        "构建匹配图谱|build_match_graph.py"
				        "生成可视化HTML|visualize_match_graph.py"
				    )

				    local i entry
				    for i in "${!steps[@]}"; do
				        entry="${steps[$i]}"
				        # Text before the first "|" is the name, text after it is the script.
				        run_step "$((i + 1))" "${entry%%|*}" "${entry#*|}" || return 1
				    done

				    echo "$rule"
				    print_success "图谱构建与可视化流程完成！"
				    echo "$rule"
				}
			
 
				+
			
 
				+# 获取默认账号
			
 
				get_default_account() {
				    # Print the "default_account" value from config/accounts.json (path is
				    # relative to the current directory) on stdout.
				    # Prints nothing when the file is missing, cannot be parsed, or the
				    # value is absent/null. Always returns 0 so that a caller running
				    # under `set -e` can test for an empty result instead of being aborted.
				    local config_file="config/accounts.json"

				    # No config file simply means "no default account".
				    [ -f "$config_file" ] || return 0

				    # The file is read explicitly as UTF-8 so non-ASCII account names do
				    # not depend on the locale's default encoding. `or ""` keeps a JSON
				    # null from being printed as the literal text "None". On a parse
				    # error Python's traceback still reaches stderr; `|| true` only
				    # neutralizes the exit status.
				    python -c 'import json, pathlib, sys; print(json.loads(pathlib.Path(sys.argv[1]).read_text(encoding="utf-8")).get("default_account") or "")' "$config_file" || true
				}
			
 
				+
			
 
				+# 主逻辑
			
 
				main() {
				    # Resolve which account to process and run the pipeline for it.
				    # Precedence: first positional argument, then the ACCOUNT_NAME
				    # environment variable, then the default from get_default_account.
				    # Exits with status 1 when no account can be determined.
				    local account_name="${1:-${ACCOUNT_NAME:-}}"

				    if [ -z "$account_name" ]; then
				        # With `set -e`, a failing command substitution in a plain
				        # assignment would abort the script right here, before the
				        # message below is printed; fall back to an empty name instead.
				        account_name=$(get_default_account) || account_name=""

				        if [ -z "$account_name" ]; then
				            print_error "未指定账号，请通过参数或环境变量指定"
				            echo "用法: $0 <账号名>" >&2
				            exit 1
				        fi
				        print_info "使用默认账号: $account_name"
				    fi

				    process_account "$account_name"
				}

				main "$@"