#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract the node list and edge relations from the source data files.

Inputs:
1. 过去帖子_pattern聚合结果.json - category nodes, tag-category edges
2. 过去帖子_what解构结果目录 - source of the tag nodes
3. dimension_associations_analysis.json - category-category (co-occurrence) edges

Outputs:
1. 节点列表.json
2. 边关系.json
"""
import json
from pathlib import Path
from typing import Dict, List, Any, Set, Optional
import sys
import re

# Add the project root to sys.path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig
from script.detail import get_xiaohongshu_detail

def get_post_detail(post_id: str) -> Optional[Dict]:
    """Fetch the detail of a single post."""
    try:
        detail = get_xiaohongshu_detail(post_id)
        return detail
    except Exception as e:
        print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
        return None

def get_last_segment(path: str) -> str:
    """Return the last segment of a '/'-separated path."""
    return path.split("/")[-1]

def build_node_id(dimension: str, node_type: str, name: str) -> str:
    """
    Build a node ID.

    Args:
        dimension: node dimension (灵感点, 目的点, 关键点)
        node_type: node type (分类 or 标签)
        name: node name

    Returns:
        Node ID in the form {dimension}_{type}_{name}
    """
    return f"{dimension}_{node_type}_{name}"

def extract_post_id_from_filename(filename: str) -> str:
    """Extract the post ID from a file name (the part before the first underscore)."""
    match = re.match(r'^([^_]+)_', filename)
    if match:
        return match.group(1)
    return ""

def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
    """
    Collect all post IDs from the current-posts directory.

    Args:
        current_posts_dir: path to the current-posts directory

    Returns:
        Set of current post IDs
    """
    if not current_posts_dir.exists():
        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
        return set()
    json_files = list(current_posts_dir.glob("*.json"))
    if not json_files:
        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
        return set()
    print(f"找到 {len(json_files)} 个当前帖子")
    post_ids = set()
    for file_path in json_files:
        post_id = extract_post_id_from_filename(file_path.name)
        if post_id:
            post_ids.add(post_id)
    print(f"提取到 {len(post_ids)} 个帖子ID")
    return post_ids

def collect_all_post_ids_from_nodes(nodes: List[Dict]) -> Set[str]:
    """Collect all post IDs referenced by the node list."""
    post_ids = set()
    for node in nodes:
        for source in node.get("节点来源", []):
            post_id = source.get("帖子ID", "")
            if post_id:
                post_ids.add(post_id)
    return post_ids

def collect_all_post_ids_from_edges(edges: List[Dict]) -> Set[str]:
    """Collect all post IDs referenced by the edge list."""
    post_ids = set()
    for edge in edges:
        if edge.get("边类型") in ("分类共现(跨点)", "标签共现"):
            edge_details = edge.get("边详情", {})
            common_post_ids = edge_details.get("共同帖子ID", [])
            post_ids.update(common_post_ids)
        # Intra-dimension co-occurrence edges carry no post IDs
    return post_ids

def fetch_post_details(post_ids: Set[str]) -> Dict[str, Dict]:
    """
    Fetch post details in batch.

    Args:
        post_ids: set of post IDs

    Returns:
        Mapping of post ID -> post detail
    """
    print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
    post_details = {}
    for i, post_id in enumerate(sorted(post_ids), 1):
        print(f" [{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
        detail = get_post_detail(post_id)
        if detail:
            post_details[post_id] = detail
    print(f"成功获取 {len(post_details)} 个帖子详情")
    return post_details

def filter_nodes_by_post_ids(nodes: List[Dict], exclude_post_ids: Set[str]) -> List[Dict]:
    """
    Filter nodes, dropping sources that come from the excluded post IDs.

    Args:
        nodes: node list
        exclude_post_ids: post IDs to exclude

    Returns:
        Filtered node list
    """
    filtered_nodes = []
    for node in nodes:
        # Drop sources that belong to excluded posts
        filtered_sources = [
            source for source in node.get("节点来源", [])
            if source.get("帖子ID", "") not in exclude_post_ids
        ]
        # Keep only nodes that still have at least one source
        if filtered_sources:
            node_copy = node.copy()
            node_copy["节点来源"] = filtered_sources
            # Recompute the post count
            unique_post_ids = set(s.get("帖子ID", "") for s in filtered_sources if s.get("帖子ID"))
            node_copy["帖子数"] = len(unique_post_ids)
            filtered_nodes.append(node_copy)
    return filtered_nodes

def filter_edges_by_post_ids(edges: List[Dict], exclude_post_ids: Set[str]) -> List[Dict]:
    """
    Filter edges, dropping co-occurrence evidence from the excluded post IDs.

    Args:
        edges: edge list
        exclude_post_ids: post IDs to exclude

    Returns:
        Filtered edge list
    """
    filtered_edges = []
    for edge in edges:
        edge_type = edge["边类型"]
        if edge_type in ("分类共现(跨点)", "标签共现"):
            # Filter the list of shared post IDs
            edge_details = edge.get("边详情", {})
            common_post_ids = edge_details.get("共同帖子ID", [])
            filtered_post_ids = [pid for pid in common_post_ids if pid not in exclude_post_ids]
            if filtered_post_ids:
                edge_copy = edge.copy()
                edge_copy["边详情"] = edge_details.copy()
                edge_copy["边详情"]["共同帖子ID"] = filtered_post_ids
                edge_copy["边详情"]["共同帖子数"] = len(filtered_post_ids)
                filtered_edges.append(edge_copy)
        elif edge_type == "分类共现(点内)":
            # Intra-dimension co-occurrence edges carry no post IDs; keep them as-is
            filtered_edges.append(edge)
        else:
            # 属于/包含 edges do not need filtering
            filtered_edges.append(edge)
    return filtered_edges

# ========== Category node extraction ==========
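# The pattern aggregation file is expected to hold a nested category tree per
# dimension, roughly (a sketch inferred from the traversal below, not a spec):
#   {"灵感点列表": {"分类A": {"子分类B": {"特征列表": [...], "帖子列表": [...]}, ...}, ...}, ...}
# Bookkeeping keys ("特征列表", "_meta", "帖子数", "特征数", "帖子列表") are skipped during
# recursion; every remaining dict-valued key is treated as a category.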

def extract_category_nodes_from_pattern(
    pattern_data: Dict,
    dimension_key: str,
    dimension_name: str
) -> List[Dict]:
    """
    Extract category nodes from the pattern aggregation result.

    Args:
        pattern_data: pattern aggregation data
        dimension_key: dimension key (灵感点列表, 目的点, 关键点列表)
        dimension_name: dimension name (灵感点, 目的点, 关键点)

    Returns:
        List of category nodes
    """
    nodes = []
    if dimension_key not in pattern_data:
        return nodes

    def traverse_node(node: Dict, parent_categories: List[str]):
        """Recursively traverse the category tree."""
        for key, value in node.items():
            if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
                continue
            if isinstance(value, dict):
                # The current key is a category
                current_path = parent_categories + [key]
                # Post list of this category
                post_ids = value.get("帖子列表", [])
                # Build the node sources from the feature list
                node_sources = []
                if "特征列表" in value:
                    for feature in value["特征列表"]:
                        source = {
                            "点的名称": feature.get("所属点", ""),
                            "点的描述": feature.get("点描述", ""),
                            "帖子ID": feature.get("帖子id", "")
                        }
                        node_sources.append(source)
                node_info = {
                    "节点ID": build_node_id(dimension_name, "分类", key),
                    "节点名称": key,
                    "节点类型": "分类",
                    "节点层级": dimension_name,
                    "所属分类": parent_categories.copy(),
                    "帖子数": len(post_ids),
                    "节点来源": node_sources
                }
                nodes.append(node_info)
                # Recurse into child categories
                traverse_node(value, current_path)

    traverse_node(pattern_data[dimension_key], [])
    return nodes

# ========== Tag node extraction ==========

def extract_tag_nodes_from_pattern(
    pattern_data: Dict,
    dimension_key: str,
    dimension_name: str
) -> List[Dict]:
    """
    Extract tag nodes from the pattern aggregation result.

    Args:
        pattern_data: pattern aggregation data
        dimension_key: dimension key
        dimension_name: dimension name

    Returns:
        List of tag nodes
    """
    nodes = []
    tag_map = {}  # Merge tags that share the same name
    if dimension_key not in pattern_data:
        return nodes

    def traverse_node(node: Dict, parent_categories: List[str]):
        """Recursively traverse the category tree."""
        # Handle the feature list (tags) of the current category
        if "特征列表" in node:
            for feature in node["特征列表"]:
                tag_name = feature.get("特征名称", "")
                if not tag_name:
                    continue
                source = {
                    "点的名称": feature.get("所属点", ""),
                    "点的描述": feature.get("点描述", ""),
                    "帖子ID": feature.get("帖子id", "")
                }
                tag_id = build_node_id(dimension_name, "标签", tag_name)
                if tag_id not in tag_map:
                    tag_map[tag_id] = {
                        "节点ID": tag_id,
                        "节点名称": tag_name,
                        "节点类型": "标签",
                        "节点层级": dimension_name,
                        "所属分类": parent_categories.copy(),
                        "帖子数": 0,
                        "节点来源": [],
                        "_post_ids": set()
                    }
                tag_map[tag_id]["节点来源"].append(source)
                if source["帖子ID"]:
                    tag_map[tag_id]["_post_ids"].add(source["帖子ID"])
        # Recurse into child categories
        for key, value in node.items():
            if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
                continue
            if isinstance(value, dict):
                current_path = parent_categories + [key]
                traverse_node(value, current_path)

    traverse_node(pattern_data[dimension_key], [])
    # Convert to a list and compute the post count per tag
    for tag_id, tag_info in tag_map.items():
        tag_info["帖子数"] = len(tag_info["_post_ids"])
        del tag_info["_post_ids"]
        nodes.append(tag_info)
    return nodes

# ========== Tag-category edge extraction ==========

def extract_tag_category_edges_from_pattern(
    pattern_data: Dict,
    dimension_key: str,
    dimension_name: str
) -> List[Dict]:
    """
    Extract tag-category edges (属于/包含) from the pattern aggregation result.

    Args:
        pattern_data: pattern aggregation data
        dimension_key: dimension key
        dimension_name: dimension name

    Returns:
        Edge list
    """
    edges = []
    seen_edges = set()  # Avoid duplicate edges
    if dimension_key not in pattern_data:
        return edges

    def traverse_node(node: Dict, parent_categories: List[str]):
        """Recursively traverse the category tree."""
        current_category = parent_categories[-1] if parent_categories else None
        # Handle the feature list (tags) of the current category
        if "特征列表" in node and current_category:
            for feature in node["特征列表"]:
                tag_name = feature.get("特征名称", "")
                if not tag_name:
                    continue
                tag_id = build_node_id(dimension_name, "标签", tag_name)
                category_id = build_node_id(dimension_name, "分类", current_category)
                # 属于 edge: tag -> category
                edge_key_belong = (tag_id, category_id, "属于")
                if edge_key_belong not in seen_edges:
                    seen_edges.add(edge_key_belong)
                    edges.append({
                        "源节点ID": tag_id,
                        "目标节点ID": category_id,
                        "边类型": "属于",
                        "边详情": {}
                    })
                # 包含 edge: category -> tag
                edge_key_contain = (category_id, tag_id, "包含")
                if edge_key_contain not in seen_edges:
                    seen_edges.add(edge_key_contain)
                    edges.append({
                        "源节点ID": category_id,
                        "目标节点ID": tag_id,
                        "边类型": "包含",
                        "边详情": {}
                    })
        # Recurse into child categories
        for key, value in node.items():
            if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
                continue
            if isinstance(value, dict):
                current_path = parent_categories + [key]
                traverse_node(value, current_path)

    traverse_node(pattern_data[dimension_key], [])
    return edges

# ========== Tag-tag co-occurrence edge extraction ==========
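# Each historical post file is expected to contain a "三点解构" object, roughly
# (a sketch inferred from the parsing below, not a spec):
#   {"三点解构": {
#       "灵感点": {"全新内容"/"共性差异"/"共性内容": [{"提取的特征": [{"特征名称": ...}]}]},
#       "目的点": {"purposes": [{"提取的特征": [...]}]},
#       "关键点": {"key_points": [{"提取的特征": [...]}]}}}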

def extract_tags_from_post(post_data: Dict) -> Dict[str, List[str]]:
    """
    Extract all tags (feature names) from a single post's deconstruction result.

    Args:
        post_data: post deconstruction data

    Returns:
        Tags grouped by dimension: {"灵感点": [...], "目的点": [...], "关键点": [...]}
    """
    tags_by_dimension = {
        "灵感点": [],
        "目的点": [],
        "关键点": []
    }
    if "三点解构" not in post_data:
        return tags_by_dimension
    three_points = post_data["三点解构"]
    # Features of the 灵感点 dimension
    if "灵感点" in three_points:
        inspiration = three_points["灵感点"]
        for section in ["全新内容", "共性差异", "共性内容"]:
            if section in inspiration and isinstance(inspiration[section], list):
                for item in inspiration[section]:
                    if "提取的特征" in item and isinstance(item["提取的特征"], list):
                        for feature in item["提取的特征"]:
                            tag_name = feature.get("特征名称", "")
                            if tag_name:
                                tags_by_dimension["灵感点"].append(tag_name)
    # Features of the 目的点 dimension
    if "目的点" in three_points:
        purpose = three_points["目的点"]
        if "purposes" in purpose and isinstance(purpose["purposes"], list):
            for item in purpose["purposes"]:
                if "提取的特征" in item and isinstance(item["提取的特征"], list):
                    for feature in item["提取的特征"]:
                        tag_name = feature.get("特征名称", "")
                        if tag_name:
                            tags_by_dimension["目的点"].append(tag_name)
    # Features of the 关键点 dimension
    if "关键点" in three_points:
        key_points = three_points["关键点"]
        if "key_points" in key_points and isinstance(key_points["key_points"], list):
            for item in key_points["key_points"]:
                if "提取的特征" in item and isinstance(item["提取的特征"], list):
                    for feature in item["提取的特征"]:
                        tag_name = feature.get("特征名称", "")
                        if tag_name:
                            tags_by_dimension["关键点"].append(tag_name)
    return tags_by_dimension

def extract_tag_cooccurrence_edges(historical_posts_dir: Path, exclude_post_ids: Optional[Set[str]] = None) -> List[Dict]:
    """
    Extract tag-tag co-occurrence edges from the historical post deconstruction results.

    Args:
        historical_posts_dir: directory with historical post deconstruction results
        exclude_post_ids: post IDs to exclude

    Returns:
        List of tag co-occurrence edges
    """
    if exclude_post_ids is None:
        exclude_post_ids = set()
    # Co-occurrence info per tag pair
    # key: (tag1_id, tag2_id, dimension), value: {"共同帖子ID": set()}
    cooccurrence_map = {}
    if not historical_posts_dir.exists():
        print(f"警告: 历史帖子目录不存在: {historical_posts_dir}")
        return []
    json_files = list(historical_posts_dir.glob("*.json"))
    print(f"找到 {len(json_files)} 个历史帖子文件")
    for file_path in json_files:
        # Extract the post ID from the file name
        post_id = extract_post_id_from_filename(file_path.name)
        if not post_id:
            continue
        # Skip excluded posts
        if post_id in exclude_post_ids:
            continue
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                post_data = json.load(f)
            # Collect all tags of this post
            tags_by_dimension = extract_tags_from_post(post_data)
            # Pair up tags within each dimension to build co-occurrence relations
            for dimension, tags in tags_by_dimension.items():
                unique_tags = list(set(tags))  # Deduplicate
                for i in range(len(unique_tags)):
                    for j in range(i + 1, len(unique_tags)):
                        tag1 = unique_tags[i]
                        tag2 = unique_tags[j]
                        # Build node IDs
                        tag1_id = build_node_id(dimension, "标签", tag1)
                        tag2_id = build_node_id(dimension, "标签", tag2)
                        # Canonical (lexicographic) order so each pair is counted once
                        if tag1_id > tag2_id:
                            tag1_id, tag2_id = tag2_id, tag1_id
                        key = (tag1_id, tag2_id, dimension)
                        if key not in cooccurrence_map:
                            cooccurrence_map[key] = {"共同帖子ID": set()}
                        cooccurrence_map[key]["共同帖子ID"].add(post_id)
        except Exception as e:
            print(f" 警告: 处理文件 {file_path.name} 时出错: {e}")
    # Convert the map into an edge list
    edges = []
    for (tag1_id, tag2_id, dimension), info in cooccurrence_map.items():
        common_post_ids = list(info["共同帖子ID"])
        edge = {
            "源节点ID": tag1_id,
            "目标节点ID": tag2_id,
            "边类型": "标签共现",
            "边详情": {
                "共同帖子数": len(common_post_ids),
                "共同帖子ID": common_post_ids
            }
        }
        edges.append(edge)
    return edges

# ========== Category-category edge extraction ==========
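# dimension_associations_analysis.json is expected to look roughly like this
# (a sketch inferred from the parsing below; direction keys are whatever strings
# contain "→"):
#   {"单维度关联分析": {"灵感点维度": {"<方向键,含'→'>": {"<源分类路径>": {
#       "与目的点的关联": [{"目标分类": "...", "Jaccard相似度": ..., "重叠系数": ...,
#                           "共同帖子数": ..., "共同帖子ID": [...]}]}}}, ...}}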

def extract_category_edges_from_associations(associations_data: Dict) -> List[Dict]:
    """
    Extract category-category (co-occurrence) edges from dimension_associations_analysis.json.

    Args:
        associations_data: association analysis data

    Returns:
        Edge list
    """
    edges = []
    if "单维度关联分析" not in associations_data:
        return edges
    single_dim = associations_data["单维度关联分析"]
    # Dimension mapping
    dimension_map = {
        "灵感点维度": "灵感点",
        "目的点维度": "目的点",
        "关键点维度": "关键点"
    }
    for dim_key, dim_data in single_dim.items():
        if dim_key not in dimension_map:
            continue
        source_dimension = dimension_map[dim_key]
        # Iterate over all association directions under this dimension
        for direction_key, direction_data in dim_data.items():
            if direction_key == "说明":
                continue
            if "→" not in direction_key:
                continue
            # Iterate over each source category
            for source_path, source_info in direction_data.items():
                source_name = get_last_segment(source_path)
                source_node_id = build_node_id(source_dimension, "分类", source_name)
                # Determine the target dimension from field names of the form "与<维度>的关联"
                for field_name, associations in source_info.items():
                    if not field_name.startswith("与") or not field_name.endswith("的关联"):
                        continue
                    target_dimension = field_name[1:-3]
                    if not isinstance(associations, list):
                        continue
                    for assoc in associations:
                        target_path = assoc.get("目标分类", "")
                        if not target_path:
                            continue
                        target_name = get_last_segment(target_path)
                        target_node_id = build_node_id(target_dimension, "分类", target_name)
                        edge = {
                            "源节点ID": source_node_id,
                            "目标节点ID": target_node_id,
                            "边类型": "分类共现(跨点)",
                            "边详情": {
                                "Jaccard相似度": assoc.get("Jaccard相似度", 0),
                                "重叠系数": assoc.get("重叠系数", 0),
                                "共同帖子数": assoc.get("共同帖子数", 0),
                                "共同帖子ID": assoc.get("共同帖子ID", [])
                            }
                        }
                        edges.append(edge)
    return edges

# ========== Intra-dimension category co-occurrence edge extraction ==========
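# intra_dimension_associations_analysis.json is expected to look roughly like this
# (a sketch inferred from the parsing below, not a spec):
#   {"叶子分类组合聚类": {"灵感点": {"<聚类键>": {
#       "叶子分类组合": ["分类A", "分类B", ...],
#       "点数": ..., "点详情列表": [{"点名称": "..."}]}}, ...}}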

def extract_intra_category_edges(intra_associations_data: Dict) -> List[Dict]:
    """
    Extract intra-dimension category co-occurrence edges from intra_dimension_associations_analysis.json.

    Args:
        intra_associations_data: intra-dimension association analysis data

    Returns:
        Edge list
    """
    edges = []
    seen_edges = set()  # Avoid duplicate edges
    if "叶子分类组合聚类" not in intra_associations_data:
        return edges
    clusters_by_dim = intra_associations_data["叶子分类组合聚类"]
    for dimension, clusters in clusters_by_dim.items():
        if dimension not in ("灵感点", "目的点", "关键点"):
            continue
        for cluster_key, cluster_data in clusters.items():
            leaf_categories = cluster_data.get("叶子分类组合", [])
            point_count = cluster_data.get("点数", 0)
            point_details = cluster_data.get("点详情列表", [])
            # Names of the points in this cluster
            point_names = [p.get("点名称", "") for p in point_details if p.get("点名称")]
            # Generate a co-occurrence edge for every pair of leaf categories
            for i in range(len(leaf_categories)):
                for j in range(i + 1, len(leaf_categories)):
                    cat1 = leaf_categories[i]
                    cat2 = leaf_categories[j]
                    # Build node IDs
                    cat1_id = build_node_id(dimension, "分类", cat1)
                    cat2_id = build_node_id(dimension, "分类", cat2)
                    # Canonical (lexicographic) order so each pair is counted once
                    if cat1_id > cat2_id:
                        cat1_id, cat2_id = cat2_id, cat1_id
                    edge_key = (cat1_id, cat2_id, dimension)
                    if edge_key in seen_edges:
                        # Edge already exists: accumulate the point count and point names
                        for edge in edges:
                            if (edge["源节点ID"] == cat1_id and
                                    edge["目标节点ID"] == cat2_id and
                                    edge["边类型"] == "分类共现(点内)"):
                                edge["边详情"]["点数"] += point_count
                                edge["边详情"]["关联点名称"].extend(point_names)
                                break
                    else:
                        seen_edges.add(edge_key)
                        edge = {
                            "源节点ID": cat1_id,
                            "目标节点ID": cat2_id,
                            "边类型": "分类共现(点内)",
                            "边详情": {
                                "点数": point_count,
                                "关联点名称": point_names.copy()
                            }
                        }
                        edges.append(edge)
    return edges

# ========== Main ==========

def main():
    # Path configuration
    config = PathConfig()
    config.ensure_dirs()
    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print(f"过滤模式: {config.filter_mode}")
    print()

    # Input file paths
    pattern_file = config.pattern_cluster_file
    associations_file = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
    intra_associations_file = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
    current_posts_dir = config.current_posts_dir
    # Output file paths
    nodes_output_file = config.intermediate_dir / "节点列表.json"
    edges_output_file = config.intermediate_dir / "边关系.json"

    print(f"输入文件:")
    print(f" pattern聚合文件: {pattern_file}")
    print(f" 跨点关联分析文件: {associations_file}")
    print(f" 点内关联分析文件: {intra_associations_file}")
    print(f" 当前帖子目录: {current_posts_dir}")
    print(f"\n输出文件:")
    print(f" 节点列表: {nodes_output_file}")
    print(f" 边关系: {edges_output_file}")
    print()

    # Read the pattern aggregation result
    print("正在读取pattern聚合结果...")
    with open(pattern_file, "r", encoding="utf-8") as f:
        pattern_data = json.load(f)
    # Read the cross-dimension association analysis result
    print("正在读取跨点关联分析结果...")
    with open(associations_file, "r", encoding="utf-8") as f:
        associations_data = json.load(f)
    # Read the intra-dimension association analysis result
    print("正在读取点内关联分析结果...")
    with open(intra_associations_file, "r", encoding="utf-8") as f:
        intra_associations_data = json.load(f)

    # ===== Extract nodes =====
    print("\n" + "="*60)
    print("正在提取节点...")
    all_nodes = []
    # Dimension mapping
    dimension_mapping = {
        "灵感点列表": "灵感点",
        "目的点": "目的点",
        "关键点列表": "关键点"
    }
    # Extract category nodes
    print("\n提取分类节点:")
    for dim_key, dim_name in dimension_mapping.items():
        category_nodes = extract_category_nodes_from_pattern(pattern_data, dim_key, dim_name)
        all_nodes.extend(category_nodes)
        print(f" {dim_name}: {len(category_nodes)} 个分类节点")
    # Extract tag nodes
    print("\n提取标签节点:")
    for dim_key, dim_name in dimension_mapping.items():
        tag_nodes = extract_tag_nodes_from_pattern(pattern_data, dim_key, dim_name)
        all_nodes.extend(tag_nodes)
        print(f" {dim_name}: {len(tag_nodes)} 个标签节点")
    print(f"\n总计: {len(all_nodes)} 个节点")
    # Count nodes by type
    category_count = sum(1 for n in all_nodes if n["节点类型"] == "分类")
    tag_count = sum(1 for n in all_nodes if n["节点类型"] == "标签")
    print(f" 分类节点: {category_count}")
    print(f" 标签节点: {tag_count}")

    # ===== Extract edges =====
    print("\n" + "="*60)
    print("正在提取边...")
    all_edges = []
    # Category-category edges (cross-dimension co-occurrence)
    print("\n提取分类-分类边(跨点共现):")
    category_edges = extract_category_edges_from_associations(associations_data)
    all_edges.extend(category_edges)
    print(f" 分类共现(跨点)边: {len(category_edges)} 条")
    # Category-category edges (intra-dimension co-occurrence)
    print("\n提取分类-分类边(点内共现):")
    intra_category_edges = extract_intra_category_edges(intra_associations_data)
    all_edges.extend(intra_category_edges)
    print(f" 分类共现(点内)边: {len(intra_category_edges)} 条")
    # Tag-category edges (属于/包含)
    print("\n提取标签-分类边(属于/包含):")
    belong_count = 0
    contain_count = 0
    for dim_key, dim_name in dimension_mapping.items():
        tag_category_edges = extract_tag_category_edges_from_pattern(pattern_data, dim_key, dim_name)
        all_edges.extend(tag_category_edges)
        dim_belong = sum(1 for e in tag_category_edges if e["边类型"] == "属于")
        dim_contain = sum(1 for e in tag_category_edges if e["边类型"] == "包含")
        belong_count += dim_belong
        contain_count += dim_contain
        print(f" {dim_name}: {dim_belong} 条属于边, {dim_contain} 条包含边")
    # Tag-tag co-occurrence edges are extracted after the filtering step below,
    # because they depend on the final set of excluded post IDs.
    print(f"\n边统计(标签共现待提取):")
    print(f" 分类共现(跨点)边: {len(category_edges)}")
    print(f" 分类共现(点内)边: {len(intra_category_edges)}")
    print(f" 属于边: {belong_count}")
    print(f" 包含边: {contain_count}")

    # ===== Apply filtering =====
    exclude_post_ids = set()
    filter_mode = config.filter_mode
    if filter_mode == "exclude_current_posts":
        print("\n" + "="*60)
        print("应用过滤规则: 排除当前帖子ID")
        exclude_post_ids = get_current_post_ids(current_posts_dir)
        if exclude_post_ids:
            # Filter nodes
            nodes_before = len(all_nodes)
            all_nodes = filter_nodes_by_post_ids(all_nodes, exclude_post_ids)
            nodes_after = len(all_nodes)
            print(f"\n节点过滤: {nodes_before} -> {nodes_after} (移除 {nodes_before - nodes_after} 个)")
            # Filter edges
            edges_before = len(all_edges)
            all_edges = filter_edges_by_post_ids(all_edges, exclude_post_ids)
            edges_after = len(all_edges)
            print(f"边过滤: {edges_before} -> {edges_after} (移除 {edges_before - edges_after} 条)")
    elif filter_mode == "none":
        print("\n过滤模式: none,不应用任何过滤")
    else:
        print(f"\n警告: 未知的过滤模式 '{filter_mode}',不应用过滤")

    # ===== Extract tag-tag co-occurrence edges =====
    print("\n" + "="*60)
    print("提取标签-标签共现边...")
    historical_posts_dir = config.historical_posts_dir
    print(f"历史帖子目录: {historical_posts_dir}")
    tag_cooccurrence_edges = extract_tag_cooccurrence_edges(historical_posts_dir, exclude_post_ids)
    all_edges.extend(tag_cooccurrence_edges)
    print(f" 标签-标签共现边: {len(tag_cooccurrence_edges)} 条")
    # Updated totals
    print(f"\n总计: {len(all_edges)} 条边")
    print(f" 分类共现(跨点)边: {len(category_edges)}")
    print(f" 分类共现(点内)边: {len(intra_category_edges)}")
    print(f" 标签共现边: {len(tag_cooccurrence_edges)}")
    print(f" 属于边: {belong_count}")
    print(f" 包含边: {contain_count}")

    # ===== Fetch post details =====
    print("\n" + "="*60)
    print("获取帖子详情...")
    # Collect all post IDs that need details (from both nodes and edges)
    post_ids_from_nodes = collect_all_post_ids_from_nodes(all_nodes)
    post_ids_from_edges = collect_all_post_ids_from_edges(all_edges)
    all_post_ids = post_ids_from_nodes | post_ids_from_edges
    print(f"节点中的帖子: {len(post_ids_from_nodes)} 个")
    print(f"边中的帖子: {len(post_ids_from_edges)} 个")
    print(f"合计(去重): {len(all_post_ids)} 个")
    # Fetch post details in batch
    post_details = fetch_post_details(all_post_ids)

    # ===== Save results =====
    print("\n" + "="*60)
    # Output file path
    post_details_output_file = config.intermediate_dir / "帖子详情映射.json"
    # Save the node list
    nodes_output = {
        "说明": {
            "描述": "分类和标签节点列表",
            "数据来源": ["过去帖子_pattern聚合结果.json"],
            "过滤模式": filter_mode,
            "过滤帖子数": len(exclude_post_ids) if exclude_post_ids else 0
        },
        "节点列表": all_nodes
    }
    print(f"正在保存节点列表到: {nodes_output_file}")
    with open(nodes_output_file, "w", encoding="utf-8") as f:
        json.dump(nodes_output, f, ensure_ascii=False, indent=2)
    # Build an edge index keyed by node ID: node -> edge type -> {target node: full edge}
    edges_by_node = {}  # key: node ID, value: {edge type: {target node ID: full edge}}
    for edge in all_edges:
        source_id = edge["源节点ID"]
        target_id = edge["目标节点ID"]
        edge_type = edge["边类型"]
        # Source node -> target node
        if source_id not in edges_by_node:
            edges_by_node[source_id] = {}
        if edge_type not in edges_by_node[source_id]:
            edges_by_node[source_id][edge_type] = {}
        edges_by_node[source_id][edge_type][target_id] = edge
    # Save the edge relations
    edges_output = {
        "说明": {
            "描述": "分类和标签之间的边关系",
            "数据来源": ["过去帖子_pattern聚合结果.json", "dimension_associations_analysis.json", "过去帖子_what解构结果目录"],
            "过滤模式": filter_mode,
            "过滤帖子数": len(exclude_post_ids) if exclude_post_ids else 0
        },
        "边列表": all_edges,
        "节点边索引": edges_by_node
    }
    print(f"正在保存边关系到: {edges_output_file}")
    with open(edges_output_file, "w", encoding="utf-8") as f:
        json.dump(edges_output, f, ensure_ascii=False, indent=2)
    # Save the post-detail mapping
    post_details_output = {
        "说明": {
            "描述": "帖子ID到帖子详情的映射",
            "帖子数": len(post_details)
        },
        "帖子详情": post_details
    }
    print(f"正在保存帖子详情映射到: {post_details_output_file}")
    with open(post_details_output_file, "w", encoding="utf-8") as f:
        json.dump(post_details_output, f, ensure_ascii=False, indent=2)

    print("\n完成!")
    print(f"\n输出文件:")
    print(f" 节点列表: {len(all_nodes)} 个节点")
    print(f" 边关系: {len(all_edges)} 条边")
    print(f" 帖子详情映射: {len(post_details)} 个帖子")


if __name__ == "__main__":
    main()