Ver código fonte

Merge remote-tracking branch 'origin/how_1125_v2' into how_1125_v1

yangxiaohui 4 dias atrás
pai
commit
d3022c2454

+ 26 - 2
script/data_processing/build_match_graph.py

@@ -645,11 +645,30 @@ def process_filtered_result(
     useful_category_edges = [e for e in category_edges
                             if e["源节点ID"] in useful_expanded_ids and e["目标节点ID"] in useful_expanded_ids]
 
+    # 5. 获取直接匹配层(第2层)和扩展层(第3层)之间的所有边(不仅仅是属于边)
+    # 这些边连接了直接匹配的人设节点和扩展的分类节点
+    cross_layer_edges = []
+    for edge in edges_data.get("边列表", []):
+        src, tgt = edge["源节点ID"], edge["目标节点ID"]
+        edge_type = edge["边类型"]
+        # 跳过已经收集的属于边(避免重复)
+        if edge_type == "属于":
+            continue
+        # 一端在直接匹配层,另一端在扩展层
+        src_in_direct = src in persona_node_ids
+        src_in_expanded = src in useful_expanded_ids
+        tgt_in_direct = tgt in persona_node_ids
+        tgt_in_expanded = tgt in useful_expanded_ids
+        if (src_in_direct and tgt_in_expanded) or (src_in_expanded and tgt_in_direct):
+            cross_layer_edges.append(edge)
+
     # 合并节点列表
     all_nodes = post_nodes + persona_nodes + useful_expanded_nodes
 
     # 合并边列表(加入帖子内的属于边)
-    all_edges = post_belong_edges + match_edges + persona_edges + post_edges + useful_expanded_edges + useful_category_edges + post_edges_via_expanded
+    all_edges = (post_belong_edges + match_edges + persona_edges + post_edges +
+                 useful_expanded_edges + useful_category_edges + cross_layer_edges +
+                 post_edges_via_expanded)
     # 去重边
     seen_edges = set()
     unique_edges = []
@@ -709,6 +728,7 @@ def process_filtered_result(
                 "匹配边数": len(match_edges),
                 "人设节点间边数": len(persona_edges),
                 "扩展边数(有效)": len(useful_expanded_edges),
+                "跨层边数": len(cross_layer_edges),
                 "帖子镜像边数(直接)": len(post_edges),
                 "帖子镜像边数(二阶)": len(post_edges_via_expanded),
                 "总节点数": len(all_nodes),
@@ -724,6 +744,7 @@ def process_filtered_result(
         "匹配边列表": match_edges,
         "人设节点间边列表": persona_edges,
         "扩展边列表": useful_expanded_edges,
+        "跨层边列表": cross_layer_edges,
         "帖子镜像边列表(直接)": post_edges,
         "帖子镜像边列表(二阶)": post_edges_via_expanded,
         "节点列表": all_nodes,
@@ -748,6 +769,7 @@ def process_filtered_result(
         "匹配边数": len(match_edges),
         "人设边数": len(persona_edges),
         "扩展边数": len(useful_expanded_edges),
+        "跨层边数": len(cross_layer_edges),
         "帖子边数(直接)": len(post_edges),
         "帖子边数(二阶)": len(post_edges_via_expanded),
         "总节点数": len(all_nodes),
@@ -805,7 +827,7 @@ def main():
         result = process_filtered_result(filtered_file, nodes_data, edges_data, output_dir)
         results.append(result)
         print(f"  帖子节点: {result['帖子节点数']}, 人设节点: {result['人设节点数']}, 扩展节点: {result['扩展节点数']}")
-        print(f"  匹配边: {result['匹配边数']}, 人设边: {result['人设边数']}, 扩展边: {result['扩展边数']}")
+        print(f"  匹配边: {result['匹配边数']}, 人设边: {result['人设边数']}, 扩展边: {result['扩展边数']}, 跨层边: {result['跨层边数']}")
         print(f"  帖子边(直接): {result['帖子边数(直接)']}, 帖子边(二阶): {result['帖子边数(二阶)']}")
 
     # 汇总统计
@@ -819,6 +841,7 @@ def main():
     total_match = sum(r['匹配边数'] for r in results)
     total_persona_edges = sum(r['人设边数'] for r in results)
     total_expanded_edges = sum(r['扩展边数'] for r in results)
+    total_cross_layer_edges = sum(r['跨层边数'] for r in results)
     total_post_edges_direct = sum(r['帖子边数(直接)'] for r in results)
     total_post_edges_2hop = sum(r['帖子边数(二阶)'] for r in results)
     print(f"  总帖子节点: {total_post}")
@@ -827,6 +850,7 @@ def main():
     print(f"  总匹配边: {total_match}")
     print(f"  总人设边: {total_persona_edges}")
     print(f"  总扩展边: {total_expanded_edges}")
+    print(f"  总跨层边: {total_cross_layer_edges}")
     print(f"  总帖子边(直接): {total_post_edges_direct}")
     print(f"  总帖子边(二阶): {total_post_edges_2hop}")
     print(f"\n输出目录: {output_dir}")

+ 129 - 0
script/data_processing/run_graph_pipeline.sh

@@ -0,0 +1,129 @@
+#!/bin/bash
+# Graph construction and visualization pipeline (steps 5-9)
+#
+# Requires the preceding steps (1-4) to have been run already:
+#   1. extract_feature_categories.py
+#   2. extract_features_from_posts.py
+#   3. extract_current_posts.py
+#   4. match_inspiration_features.py
+#
+# This script runs:
+#   5. filter_how_results.py      - filter the "how" deconstruction results
+#   6. extract_nodes_and_edges.py - extract nodes and edges
+#   7. build_persona_tree.py      - build the persona tree
+#   8. build_match_graph.py       - build the match graph
+#   9. visualize_match_graph.py   - generate the visualization HTML
+#
+# Usage:
+#   ./run_graph_pipeline.sh              # use the default account
+#   ./run_graph_pipeline.sh 阿里多多酱    # use the given account
+#   ACCOUNT_NAME=xxx ./run_graph_pipeline.sh
+
+set -e  # exit immediately when a command fails
+
+# Absolute path of the directory that contains this script
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+cd "$PROJECT_ROOT"  # the relative paths used below (script/..., config/...) resolve from here
+
+# Color definitions (escape sequences, interpreted by `echo -e` in the print helpers)
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'  # emitted after each colored tag to end the colored span
+
+print_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
+print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
+print_error() { echo -e "${RED}[ERROR]${NC} $1"; }
+print_step() { echo -e "${YELLOW}[$1]${NC} $2"; }
+
+# Run one pipeline step.
+#   $1 - position of the step, shown as "<n>/5"
+#   $2 - human-readable step title used in the progress messages
+#   $3 - script file name under script/data_processing/
+# Returns 0 when the script exits successfully, 1 otherwise.
+run_step() {
+    local index="$1"
+    local title="$2"
+    local script_file="$3"
+
+    print_step "$index/5" "$title"
+
+    # Guard clause: report the failure and bail out early.
+    if ! python "script/data_processing/$script_file"; then
+        print_error "$title 失败"
+        return 1
+    fi
+
+    print_success "$title 完成"
+    echo ""
+    return 0
+}
+
+# 主处理函数
+process_account() {
+    local account_name=$1
+
+    echo ""
+    echo "=========================================="
+    echo "图谱构建与可视化流程"
+    echo "账号: $account_name"
+    echo "项目: $PROJECT_ROOT"
+    echo "=========================================="
+    echo ""
+
+    # 设置环境变量
+    export ACCOUNT_NAME="$account_name"
+
+    # 步骤5: 过滤how解构结果
+    run_step 1 "过滤how解构结果" "filter_how_results.py" || return 1
+
+    # 步骤6: 提取节点和边
+    run_step 2 "提取节点和边" "extract_nodes_and_edges.py" || return 1
+
+    # 步骤7: 构建人设树
+    run_step 3 "构建人设树" "build_persona_tree.py" || return 1
+
+    # 步骤8: 构建匹配图谱
+    run_step 4 "构建匹配图谱" "build_match_graph.py" || return 1
+
+    # 步骤9: 生成可视化HTML
+    run_step 5 "生成可视化HTML" "visualize_match_graph.py" || return 1
+
+    echo "=========================================="
+    print_success "图谱构建与可视化流程完成!"
+    echo "=========================================="
+}
+
+# 获取默认账号
+get_default_account() {
+    python -c "
+import json
+from pathlib import Path
+config_file = Path('config/accounts.json')
+with open(config_file) as f:
+    config = json.load(f)
+print(config.get('default_account', ''))
+"
+}
+
+# Entry point: pick the account and run the pipeline for it.
+# Precedence: first positional argument, then $ACCOUNT_NAME, then the
+# default account reported by get_default_account.
+main() {
+    # A non-empty argument wins over a non-empty environment variable.
+    local chosen="${1:-${ACCOUNT_NAME:-}}"
+
+    if [[ -z "$chosen" ]]; then
+        # Nothing given explicitly: fall back to the configured default.
+        chosen=$(get_default_account)
+        if [[ -z "$chosen" ]]; then
+            print_error "未指定账号,请通过参数或环境变量指定"
+            echo "用法: $0 <账号名>"
+            exit 1
+        fi
+        print_info "使用默认账号: $chosen"
+    fi
+
+    process_account "$chosen"
+}
+
+main "$@"