#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract feature names and their source information from a directory of
deconstructed historical posts.

Only the new data layout is supported (``inspiration_final_result``,
``purpose_final_result``, ``keypoint_final``).
"""

import json
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Set, Tuple
import re
import sys

# Put the project root on sys.path so the ``script`` package can be imported
# when this file is executed directly.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.detail import get_xiaohongshu_detail
from script.data_processing.path_config import PathConfig

# The three feature categories, in the order they appear in every result dict.
_CATEGORIES = ("灵感点", "目的点", "关键点")

_JSON_SUFFIX = ".json"


def extract_post_id_from_filename(filename: str) -> str:
    """Return the post ID encoded in a result file name.

    Example: ``68a6b96f000000001d006058.json`` -> ``68a6b96f000000001d006058``.

    Only a trailing ``.json`` extension is stripped; a ``.json`` fragment
    elsewhere in the name is left untouched. A name without the extension is
    returned unchanged.
    """
    if filename.endswith(_JSON_SUFFIX):
        return filename[: -len(_JSON_SUFFIX)]
    return filename


def get_post_detail(post_id: str) -> Optional[Dict]:
    """Fetch the detail record of one post.

    Any exception raised by ``get_xiaohongshu_detail`` is reported on stdout
    and turned into ``None`` so that one failing post does not abort the run.
    """
    try:
        detail = get_xiaohongshu_detail(post_id)
        return detail
    except Exception as e:
        print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
        return None


def _named_items(section: Dict, list_key: str, name_key: str) -> Iterator[Tuple[str, Dict]]:
    """Yield ``(feature_name, item)`` for every item in ``section[list_key]``
    that has a non-empty value under ``name_key``."""
    for item in section.get(list_key, []):
        feature_name = item.get(name_key, "")
        if feature_name:
            yield feature_name, item


def _build_source(
    item: Dict,
    feature_name: str,
    post_id: str,
    id_key: str,
    point_type: Optional[str] = None,
) -> Dict:
    """Build the source record shared by all categories.

    ``point_type`` overrides the item's own ``类型`` field when given.
    """
    return {
        "点的名称": feature_name,
        "点的描述": item.get("描述", ""),
        "帖子id": post_id,
        "点ID": item.get(id_key, ""),
        "类型": item.get("类型", "") if point_type is None else point_type,
    }


def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
    """Extract every feature found in one deconstruction-result JSON file.

    Args:
        file_path: Path of the JSON file; its name (minus ``.json``) is used
            as the post ID.

    Returns:
        A dict with the keys ``灵感点``, ``目的点`` and ``关键点``, each mapping
        a feature name to the list of source records found for it. If reading
        or parsing fails part-way, the error is printed and whatever was
        collected up to that point is returned.
    """
    result: Dict[str, Dict[str, List[Dict]]] = {category: {} for category in _CATEGORIES}
    post_id = extract_post_id_from_filename(file_path.name)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Inspiration points.
        if "inspiration_final_result" in data:
            inspiration_data = data["inspiration_final_result"]
            for name, item in _named_items(inspiration_data, "最终灵感点列表", "灵感点"):
                source = _build_source(item, name, post_id, "id")
                result["灵感点"].setdefault(name, []).append(source)

        # Purpose points: an "intent" list plus a "substance" list, both
        # stored under the same category.
        if "purpose_final_result" in data:
            purpose_data = data["purpose_final_result"]

            for name, item in _named_items(purpose_data, "最终意图列表", "目的点"):
                source = _build_source(item, name, post_id, "意图ID", point_type="意图")
                result["目的点"].setdefault(name, []).append(source)

            for name, item in _named_items(purpose_data, "最终实质列表", "目的点"):
                source = _build_source(item, name, post_id, "实质ID", point_type="实质")
                source["关联意图ID"] = item.get("关联意图ID", "")
                result["目的点"].setdefault(name, []).append(source)

        # Key points.
        if "keypoint_final" in data:
            keypoint_data = data["keypoint_final"]
            for name, item in _named_items(keypoint_data, "最终关键点列表", "关键点"):
                source = _build_source(item, name, post_id, "关键点ID")
                source["支撑的ID"] = item.get("支撑的ID", [])
                result["关键点"].setdefault(name, []).append(source)

    except Exception as e:
        # Deliberately best-effort: one unreadable or malformed file must not
        # stop the batch, so report it and keep the partial result.
        print(f"处理文件 {file_path.name} 时出错: {e}")

    return result


def merge_results(all_results: List[Dict]) -> Dict:
    """Merge per-file results into one dict.

    Returns:
        ``{category: {feature_name: {"来源": [source, ...]}}}`` where the
        sources of a feature are concatenated in the order of ``all_results``.
    """
    merged: Dict[str, Dict[str, Dict[str, List[Dict]]]] = {
        category: {} for category in _CATEGORIES
    }

    for result in all_results:
        for category in _CATEGORIES:
            for feature_name, sources in result[category].items():
                entry = merged[category].setdefault(feature_name, {"来源": []})
                entry["来源"].extend(sources)

    return merged


def convert_to_array_format(
    merged_dict: Dict,
    fetch_details: bool = True,
    exclude_post_ids: Optional[Set[str]] = None
) -> Dict:
    """Convert the merged dict into per-category arrays, optionally enriched
    with post details.

    Args:
        merged_dict: Output of ``merge_results``.
        fetch_details: When true, fetch the detail of every referenced post
            and attach it to each source under ``帖子详情``.
        exclude_post_ids: Post IDs whose sources must be dropped from the
            output. ``None`` or an empty collection disables filtering.

    Returns:
        ``{category: [{"特征名称": name, "特征来源": [source, ...]}, ...]}``.
        Features left without any source after filtering are omitted.

    Notes:
        Exclusion is decided by post ID alone, so it applies whether or not
        details are fetched, and a source whose detail could not be fetched is
        kept (without ``帖子详情``) rather than being mistaken for an excluded
        post. Excluded posts are never fetched.
    """
    result: Dict[str, List[Dict]] = {category: [] for category in _CATEGORIES}
    excluded = set(exclude_post_ids) if exclude_post_ids else set()

    # Every post ID referenced by any source.
    post_ids = {
        source["帖子id"]
        for category in _CATEGORIES
        for data in merged_dict[category].values()
        for source in data["来源"]
    }

    # Filter before fetching so no request is spent on a post that would be
    # discarded anyway.
    if excluded:
        print(f"\n正在应用帖子ID过滤,排除 {len(excluded)} 个当前帖子...")
        kept_ids = post_ids.difference(excluded)
        filtered_count = len(post_ids) - len(kept_ids)
        if filtered_count > 0:
            print(f" ⚠️ 过滤掉 {filtered_count} 个当前帖子")
        print(f"保留 {len(kept_ids)} 个帖子")
        post_ids = kept_ids

    post_details: Dict[str, Dict] = {}
    if fetch_details:
        print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
        for i, post_id in enumerate(post_ids, 1):
            print(f"[{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
            detail = get_post_detail(post_id)
            if detail:
                post_details[post_id] = detail
        print(f"成功获取 {len(post_details)} 个帖子详情")

    for category in _CATEGORIES:
        for feature_name, data in merged_dict[category].items():
            enhanced_sources = []
            for source in data["来源"]:
                post_id = source["帖子id"]
                if post_id in excluded:
                    continue

                # Shallow copy so the caller's merged dict is not modified.
                enhanced_source = source.copy()
                if post_id in post_details:
                    enhanced_source["帖子详情"] = post_details[post_id]
                enhanced_sources.append(enhanced_source)

            if enhanced_sources:
                result[category].append({
                    "特征名称": feature_name,
                    "特征来源": enhanced_sources
                })

    return result


def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
    """Return the post IDs of all ``*.json`` files in ``current_posts_dir``.

    A missing or empty directory is reported on stdout and yields an empty
    set.
    """
    if not current_posts_dir.exists():
        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
        return set()

    json_files = list(current_posts_dir.glob("*.json"))
    if not json_files:
        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
        return set()

    print("\n正在获取当前帖子ID...")
    print(f"找到 {len(json_files)} 个当前帖子")

    post_ids = set()
    for file_path in json_files:
        post_id = extract_post_id_from_filename(file_path.name)
        if post_id:
            post_ids.add(post_id)

    print(f"提取到 {len(post_ids)} 个帖子ID")
    return post_ids


def main():
    """Scan the historical-posts directory, build the feature -> source
    mapping and write it to the configured output file as JSON."""
    config = PathConfig()
    config.ensure_dirs()

    input_dir = config.historical_posts_dir
    current_posts_dir = config.current_posts_dir
    output_file = config.feature_source_mapping_file

    print(f"账号: {config.account_name}")
    print(f"过滤模式: {config.filter_mode}")
    print(f"过去帖子目录: {input_dir}")
    print(f"当前帖子目录: {current_posts_dir}")
    print(f"输出文件: {output_file}")
    print()

    print(f"\n正在扫描目录: {input_dir}")
    json_files = list(input_dir.glob("*.json"))
    print(f"找到 {len(json_files)} 个JSON文件")

    all_results = []
    for i, file_path in enumerate(json_files, 1):
        print(f"处理文件 [{i}/{len(json_files)}]: {file_path.name}")
        result = process_single_file(file_path)
        all_results.append(result)

    print("\n正在合并结果...")
    merged_result = merge_results(all_results)

    # Decide which posts to leave out of the mapping.
    exclude_post_ids = None
    if config.filter_mode == "exclude_current_posts":
        print("\n应用过滤规则: 排除当前帖子ID")
        exclude_post_ids = get_current_post_ids(current_posts_dir)
    elif config.filter_mode == "none":
        print("\n过滤模式: none,不应用任何过滤")
    else:
        # An unrecognised mode (e.g. a typo in the config) still means "no
        # filtering", but say so instead of skipping the filter silently.
        print(f"\n警告: 未知的过滤模式 {config.filter_mode!r},不应用任何过滤")

    print("正在转换为数组格式...")
    final_result = convert_to_array_format(
        merged_result,
        fetch_details=True,
        exclude_post_ids=exclude_post_ids
    )

    print("\n提取统计:")
    for category in _CATEGORIES:
        feature_count = len(final_result[category])
        source_count = sum(len(item["特征来源"]) for item in final_result[category])
        print(f" {category}: {feature_count} 个特征, {source_count} 个来源")

    print(f"\n正在保存结果到: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_result, f, ensure_ascii=False, indent=4)

    print("完成!")


if __name__ == "__main__":
    main()