#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
从过去帖子解构结果目录中提取特征名称及其来源信息
仅支持新版数据结构（inspiration_final_result, purpose_final_result, keypoint_final）
"""

import json
from pathlib import Path
from typing import Dict, List, Optional, Set
import re
import sys

# Add the project root directory (three `.parent` steps up from this file)
# to the front of sys.path; the project-local `script.*` imports that follow
# presumably rely on it.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.detail import get_xiaohongshu_detail
from script.data_processing.path_config import PathConfig


def extract_post_id_from_filename(filename: str) -> str:
    """Return the post ID encoded in a result file name.

    The post ID is the file name with its trailing ``.json`` extension
    removed, e.g. ``68a6b96f000000001d006058.json`` ->
    ``68a6b96f000000001d006058``.

    Only the trailing extension is stripped: a ``.json`` that appears
    elsewhere in the name is left untouched, and a name without the
    extension is returned unchanged.

    Args:
        filename: Bare file name (no directory part).

    Returns:
        The file name without its trailing ``.json`` suffix.
    """
    suffix = ".json"
    # Slice instead of str.replace so only the final extension is removed.
    if filename.endswith(suffix):
        return filename[:-len(suffix)]
    return filename


def get_post_detail(post_id: str) -> Optional[Dict]:
    """Fetch the detail record for one post, tolerating failures.

    Delegates to ``get_xiaohongshu_detail``. Any exception raised by that
    call is reported on stdout and converted into a ``None`` result instead
    of propagating.

    Args:
        post_id: ID of the post to look up.

    Returns:
        Whatever ``get_xiaohongshu_detail`` returns, or ``None`` if it raised.
    """
    try:
        return get_xiaohongshu_detail(post_id)
    except Exception as err:
        print(f"  警告: 获取帖子 {post_id} 详情失败: {err}")
    return None


def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
    """
    Extract every feature recorded in one post-deconstruction JSON file.

    Three optional top-level sections of the file are read:
    ``inspiration_final_result``, ``purpose_final_result`` (intent list and
    substance list) and ``keypoint_final``. Items whose feature name is
    missing or empty are skipped.

    Any exception raised while reading or walking the file is printed and
    swallowed; whatever was collected up to that point is still returned.

    Args:
        file_path: Path of the JSON file; its name supplies the post ID.

    Returns:
        A dict with the keys "灵感点", "目的点" and "关键点". Each maps a
        feature name to the list of source entries found for it in this file.
    """
    extracted = {"灵感点": {}, "目的点": {}, "关键点": {}}
    post_id = extract_post_id_from_filename(file_path.name)

    def collect(bucket, items, name_key, id_key, fixed_type=None, extra=None):
        # Append one source entry per named item to `bucket`.
        # `fixed_type` overrides the item's own "类型" value when given.
        # `extra` is (key, default_factory) for one additional copied field;
        # a factory is used so every entry gets its own default object.
        for item in items:
            name = item.get(name_key, "")
            if not name:
                continue
            entry = {
                "点的名称": name,
                "点的描述": item.get("描述", ""),
                "帖子id": post_id,
                "点ID": item.get(id_key, ""),
                "类型": item.get("类型", "") if fixed_type is None else fixed_type,
            }
            if extra is not None:
                extra_key, make_default = extra
                entry[extra_key] = item.get(extra_key, make_default())
            bucket.setdefault(name, []).append(entry)

    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        # Inspiration points
        if "inspiration_final_result" in payload:
            inspiration = payload["inspiration_final_result"]
            collect(
                extracted["灵感点"],
                inspiration.get("最终灵感点列表", []),
                "灵感点",
                "id",
            )

        # Purpose points: intents first, then substances
        if "purpose_final_result" in payload:
            purpose = payload["purpose_final_result"]
            collect(
                extracted["目的点"],
                purpose.get("最终意图列表", []),
                "目的点",
                "意图ID",
                fixed_type="意图",
            )
            collect(
                extracted["目的点"],
                purpose.get("最终实质列表", []),
                "目的点",
                "实质ID",
                fixed_type="实质",
                extra=("关联意图ID", str),
            )

        # Key points
        if "keypoint_final" in payload:
            keypoints = payload["keypoint_final"]
            collect(
                extracted["关键点"],
                keypoints.get("最终关键点列表", []),
                "关键点",
                "关键点ID",
                extra=("支撑的ID", list),
            )

    except Exception as err:
        print(f"处理文件 {file_path.name} 时出错: {err}")

    return extracted


def merge_results(all_results: List[Dict]) -> Dict:
    """Combine per-file extraction results into one mapping.

    Args:
        all_results: Per-file dicts, each holding the keys "灵感点", "目的点"
            and "关键点" that map feature names to lists of source entries.

    Returns:
        A dict with the same three category keys. Each category maps a
        feature name to ``{"来源": [...]}``, where the list concatenates that
        feature's sources from every input in input order.
    """
    categories = ("灵感点", "目的点", "关键点")
    combined = {category: {} for category in categories}

    for single in all_results:
        for category in categories:
            bucket = combined[category]
            for name, sources in single[category].items():
                bucket.setdefault(name, {"来源": []})["来源"].extend(sources)

    return combined


def convert_to_array_format(
    merged_dict: Dict,
    fetch_details: bool = True,
    exclude_post_ids: Optional[Set[str]] = None
) -> Dict:
    """Convert the merged feature dict to list form, attaching post details.

    Sources are filtered by post ID directly, so:

    * excluded posts are dropped whether or not details are fetched;
    * a post whose detail lookup failed is kept (without "帖子详情") even
      when filtering is active, instead of being mistaken for an excluded
      post;
    * no detail lookup is made for posts that are going to be excluded.

    Args:
        merged_dict: Output of ``merge_results``: for each of "灵感点",
            "目的点" and "关键点", a mapping of feature name to
            ``{"来源": [source, ...]}``.
        fetch_details: When true, look up each remaining post through
            ``get_post_detail`` and store the result under "帖子详情" on
            every source of that post for which the lookup succeeded.
        exclude_post_ids: Post IDs whose sources must be left out. ``None``
            or an empty collection disables filtering.

    Returns:
        A dict with the same three category keys, each holding a list of
        ``{"特征名称": name, "特征来源": [source, ...]}``. Sources are shallow
        copies of the inputs. Features left with no sources are omitted.
    """
    categories = ["灵感点", "目的点", "关键点"]
    excluded = frozenset(exclude_post_ids) if exclude_post_ids else frozenset()

    # Post IDs are only needed when fetching or filtering; skipping the
    # lookup otherwise keeps sources without a "帖子id" key working as before.
    uses_post_ids = fetch_details or bool(excluded)

    referenced_ids = set()
    if uses_post_ids:
        referenced_ids = {
            source["帖子id"]
            for category in categories
            for data in merged_dict[category].values()
            for source in data["来源"]
        }
    wanted_ids = referenced_ids - excluded

    # Report the filter before fetching, so the counts describe the posts
    # that will actually be looked up.
    if excluded:
        print(f"\n正在应用帖子ID过滤，排除 {len(excluded)} 个当前帖子...")
        filtered_count = len(referenced_ids) - len(wanted_ids)
        if filtered_count > 0:
            print(f"  ⚠️  过滤掉 {filtered_count} 个当前帖子")
        print(f"保留 {len(wanted_ids)} 个帖子")

    post_details = {}
    if fetch_details:
        print(f"\n正在获取 {len(wanted_ids)} 个帖子的详情...")
        for i, post_id in enumerate(wanted_ids, 1):
            print(f"[{i}/{len(wanted_ids)}] 获取帖子 {post_id} 的详情...")
            detail = get_post_detail(post_id)
            if detail:
                post_details[post_id] = detail

        print(f"成功获取 {len(post_details)} 个帖子详情")

    result = {category: [] for category in categories}
    for category in categories:
        for feature_name, data in merged_dict[category].items():
            enhanced_sources = []
            for source in data["来源"]:
                enhanced_source = source.copy()
                if uses_post_ids:
                    post_id = source["帖子id"]
                    if post_id in excluded:
                        continue
                    if post_id in post_details:
                        enhanced_source["帖子详情"] = post_details[post_id]
                enhanced_sources.append(enhanced_source)

            if enhanced_sources:
                result[category].append({
                    "特征名称": feature_name,
                    "特征来源": enhanced_sources
                })

    return result


def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
    """Collect the post IDs of every ``*.json`` file in a directory.

    Args:
        current_posts_dir: Directory holding one JSON file per current post.

    Returns:
        The set of non-empty IDs derived from the file names via
        ``extract_post_id_from_filename``. An empty set is returned, after
        printing a warning, when the directory does not exist or contains no
        ``*.json`` files.
    """
    if not current_posts_dir.exists():
        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
        return set()

    candidates = list(current_posts_dir.glob("*.json"))
    if not candidates:
        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
        return set()

    print("\n正在获取当前帖子ID...")
    print(f"找到 {len(candidates)} 个当前帖子")

    extracted = (extract_post_id_from_filename(path.name) for path in candidates)
    post_ids = {post_id for post_id in extracted if post_id}

    print(f"提取到 {len(post_ids)} 个帖子ID")
    return post_ids


def main():
    """Build the feature-to-source mapping file from historical posts.

    Steps:
      1. Read directories, output path, account name and filter mode from
         ``PathConfig``.
      2. Run ``process_single_file`` on every ``*.json`` file in the
         historical posts directory and merge the results.
      3. When the filter mode is "exclude_current_posts", collect the IDs of
         the current posts so their sources are excluded.
      4. Convert to list form (fetching post details), print per-category
         counts, and write the result as JSON to the configured output file.
    """
    config = PathConfig()
    config.ensure_dirs()

    history_dir = config.historical_posts_dir
    current_dir = config.current_posts_dir
    mapping_path = config.feature_source_mapping_file

    print(f"账号: {config.account_name}")
    print(f"过滤模式: {config.filter_mode}")
    print(f"过去帖子目录: {history_dir}")
    print(f"当前帖子目录: {current_dir}")
    print(f"输出文件: {mapping_path}")
    print()

    print(f"\n正在扫描目录: {history_dir}")

    history_files = list(history_dir.glob("*.json"))
    total = len(history_files)
    print(f"找到 {total} 个JSON文件")

    per_file = []
    for index, path in enumerate(history_files, 1):
        print(f"处理文件 [{index}/{total}]: {path.name}")
        per_file.append(process_single_file(path))

    print("\n正在合并结果...")
    merged = merge_results(per_file)

    # Decide which post IDs (if any) to exclude; unknown modes filter nothing.
    if config.filter_mode == "exclude_current_posts":
        print("\n应用过滤规则: 排除当前帖子ID")
        excluded = get_current_post_ids(current_dir)
    else:
        excluded = None
        if config.filter_mode == "none":
            print("\n过滤模式: none，不应用任何过滤")

    print("正在转换为数组格式...")
    final = convert_to_array_format(
        merged,
        fetch_details=True,
        exclude_post_ids=excluded,
    )

    print("\n提取统计:")
    for category in ("灵感点", "目的点", "关键点"):
        features = final[category]
        source_total = sum(len(feature["特征来源"]) for feature in features)
        print(f"  {category}: {len(features)} 个特征, {source_total} 个来源")

    print(f"\n正在保存结果到: {mapping_path}")
    with open(mapping_path, "w", encoding="utf-8") as out:
        json.dump(final, out, ensure_ascii=False, indent=4)

    print("完成!")


# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()