#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
从过去帖子_what解构结果目录中提取特征名称及其来源信息
"""

import json
from pathlib import Path
from typing import Dict, List, Optional, Set
import re
import sys

# Add the project root (the third ancestor directory of this file) to the
# front of sys.path before the project-local imports that follow.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.detail import get_xiaohongshu_detail
from script.data_processing.path_config import PathConfig


def extract_post_id_from_filename(filename: str) -> str:
    """Return the post ID that prefixes a file name.

    The ID is everything before the first underscore. An empty string is
    returned when the name has no underscore or begins with one.
    """
    prefix, separator, _ = filename.partition("_")
    if separator and prefix:
        return prefix
    return ""


def get_post_detail(post_id: str) -> Optional[Dict]:
    """
    Fetch the detail record of one post, best effort.

    Args:
        post_id: ID of the post to look up.

    Returns:
        Whatever get_xiaohongshu_detail returns for the post, or None when
        that call raises (a warning is printed in that case).
    """
    try:
        return get_xiaohongshu_detail(post_id)
    except Exception as exc:
        # Any failure is reported and swallowed so callers can keep going.
        print(f"  警告: 获取帖子 {post_id} 详情失败: {exc}")
    return None


def extract_features_from_point(point_data: Dict, post_id: str, point_name: str, point_description: str) -> List[Dict]:
    """
    Extract the feature records attached to one point.

    Only the "提取的特征" list of the point is inspected. Entries of that list
    that are not dicts, or that lack a "特征名称" key, are skipped instead of
    raising, so one malformed entry does not discard the rest of the point.

    Args:
        point_data: Data of a single point; anything other than a dict
            yields an empty result.
        post_id: ID of the post the point belongs to.
        point_name: Name of the point.
        point_description: Description of the point.

    Returns:
        One dict per usable feature, with the keys "特征名称", "点的名称",
        "点的描述" and "帖子id". Empty when there is nothing to extract.
    """
    if not isinstance(point_data, dict):
        return []

    raw_features = point_data.get("提取的特征")
    if not isinstance(raw_features, list):
        return []

    return [
        {
            "特征名称": feature["特征名称"],
            "点的名称": point_name,
            "点的描述": point_description,
            "帖子id": post_id,
        }
        for feature in raw_features
        # Guard against None / str / number entries: `in` on those either
        # raises or performs a substring test rather than a key lookup.
        if isinstance(feature, dict) and "特征名称" in feature
    ]


# Layout of the "三点解构" object: for each output category, the keys inside
# that category's section whose values are lists of points. The category
# name is also the key under which each point stores its own name.
_POINT_SECTIONS = (
    ("灵感点", ("全新内容", "共性差异", "共性内容")),
    ("目的点", ("purposes",)),
    ("关键点", ("key_points",)),
)


def _collect_point_features(items: List[Dict], post_id: str, name_key: str, bucket: Dict[str, List[Dict]]) -> None:
    """Append the source record of every feature found in *items* to *bucket*, keyed by feature name."""
    for item in items:
        point_name = item.get(name_key, "")
        point_desc = item.get("描述", "")
        for feature in extract_features_from_point(item, post_id, point_name, point_desc):
            bucket.setdefault(feature["特征名称"], []).append({
                "点的名称": feature["点的名称"],
                "点的描述": feature["点的描述"],
                "帖子id": feature["帖子id"],
            })


def process_single_file(file_path: Path) -> Dict[str, Dict[str, List[Dict]]]:
    """
    Extract all feature sources from one deconstruction JSON file.

    The post ID is taken from the file name. The file's "三点解构" object is
    walked section by section (see _POINT_SECTIONS); every feature found is
    recorded under its category and feature name.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A dict with the keys "灵感点", "目的点" and "关键点", each mapping a
        feature name to a list of source records ("点的名称", "点的描述",
        "帖子id"). If reading or processing fails, the error is printed and
        whatever was collected up to that point is returned.
    """
    result = {
        "灵感点": {},
        "目的点": {},
        "关键点": {}
    }

    post_id = extract_post_id_from_filename(file_path.name)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        if "三点解构" not in data:
            return result

        three_points = data["三点解构"]

        for category, list_keys in _POINT_SECTIONS:
            if category not in three_points:
                continue
            section = three_points[category]
            for list_key in list_keys:
                # Only list-valued entries hold points; anything else is ignored.
                if list_key in section and isinstance(section[list_key], list):
                    _collect_point_features(section[list_key], post_id, category, result[category])

    except Exception as exc:
        # Best effort: report the problem and keep the partial result.
        print(f"处理文件 {file_path.name} 时出错: {exc}")

    return result


def merge_results(all_results: List[Dict]) -> Dict:
    """
    Combine the per-file extraction results into a single mapping.

    Args:
        all_results: Per-file results, each holding the three category keys.

    Returns:
        A dict with the keys "灵感点", "目的点" and "关键点"; each maps a
        feature name to {"来源": [...]}, the concatenation of that feature's
        sources across all inputs, in input order.
    """
    categories = ("灵感点", "目的点", "关键点")
    combined = {name: {} for name in categories}

    for single in all_results:
        for name in categories:
            bucket = combined[name]
            for feature_name, sources in single[name].items():
                entry = bucket.setdefault(feature_name, {"来源": []})
                entry["来源"].extend(sources)

    return combined


def convert_to_array_format(
    merged_dict: Dict,
    fetch_details: bool = True,
    time_filter: Optional[str] = None,
    exclude_post_ids: Optional[Set[str]] = None
) -> Dict:
    """
    Convert the merged feature mapping to list form, optionally adding post details.

    Filtering rules (at most one is applied; ID exclusion takes precedence):
      * exclude_post_ids: sources whose post ID is in this collection are
        dropped. This works with or without fetch_details, and a source
        whose detail lookup failed is kept as long as its ID is not excluded.
      * time_filter: only used when fetch_details is True and
        exclude_post_ids is empty. Sources are kept only if their post's
        details were fetched and its 'publish_time' compares less than the
        threshold.

    Args:
        merged_dict: Mapping produced by merge_results (category ->
            feature name -> {"来源": [...]}).
        fetch_details: Whether to look up details for every referenced post
            and attach them to the sources under "帖子详情".
        time_filter: Publish-time threshold; compared with plain `<` against
            each post's 'publish_time'. Expected format
            "YYYY-MM-DD HH:MM:SS".
        exclude_post_ids: Post IDs whose sources must be left out.

    Returns:
        A dict with the keys "灵感点", "目的点" and "关键点"; each value is a
        list of {"特征名称": ..., "特征来源": [...]} entries. Features left
        with no sources after filtering are omitted.
    """
    categories = ("灵感点", "目的点", "关键点")
    result = {category: [] for category in categories}

    post_details = {}
    # Set once the time filter has really run; only then does "no details
    # available" mean "drop the source".
    time_filter_active = False

    if fetch_details:
        # Every distinct post referenced by any source.
        post_ids = {
            source["帖子id"]
            for category in categories
            for data in merged_dict[category].values()
            for source in data["来源"]
        }

        print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
        for i, post_id in enumerate(post_ids, 1):
            print(f"[{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
            detail = get_post_detail(post_id)
            if detail:
                post_details[post_id] = detail

        print(f"成功获取 {len(post_details)} 个帖子详情")

        if exclude_post_ids:
            print(f"\n正在应用帖子ID过滤，排除 {len(exclude_post_ids)} 个当前帖子...")
            before_count = len(post_details)
            post_details = {pid: detail for pid, detail in post_details.items() if pid not in exclude_post_ids}
            filtered_count = before_count - len(post_details)
            if filtered_count > 0:
                print(f"  ⚠️  过滤掉 {filtered_count} 个当前帖子")
            print(f"保留 {len(post_details)} 个帖子")

        elif time_filter:
            print(f"\n正在应用时间过滤 (< {time_filter})，避免使用晚于当前帖子的数据...")
            filtered_count = 0
            kept_details = {}
            for post_id, detail in post_details.items():
                # NOTE: a detail without 'publish_time' compares as '' and is
                # therefore kept.
                publish_time = detail.get('publish_time', '')
                if publish_time < time_filter:
                    kept_details[post_id] = detail
                else:
                    filtered_count += 1
                    print(f"  ⚠️  过滤掉帖子 {post_id} (发布时间: {publish_time}，晚于阈值)")

            print(f"过滤掉 {filtered_count} 个帖子（穿越），保留 {len(kept_details)} 个帖子")
            post_details = kept_details
            time_filter_active = True

    for category in categories:
        for feature_name, data in merged_dict[category].items():
            enhanced_sources = []
            for source in data["来源"]:
                post_id = source["帖子id"]

                # ID exclusion is decided by the exclude set itself, not by
                # whether details happen to be available.
                if exclude_post_ids and post_id in exclude_post_ids:
                    continue
                # Under the time filter a post without surviving details is
                # either too late or of unknown publish time: leave it out.
                if time_filter_active and post_id not in post_details:
                    continue

                enhanced_source = source.copy()
                if post_id in post_details:
                    enhanced_source["帖子详情"] = post_details[post_id]
                enhanced_sources.append(enhanced_source)

            # Keep only features that still have at least one source.
            if enhanced_sources:
                result[category].append({
                    "特征名称": feature_name,
                    "特征来源": enhanced_sources
                })

    return result


def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
    """
    Collect the post IDs encoded in the JSON file names of a directory.

    Args:
        current_posts_dir: Directory holding the current posts' JSON files.

    Returns:
        The set of non-empty IDs extracted from the "*.json" file names.
        Empty (with a warning printed) when the directory does not exist or
        contains no JSON files.
    """
    if not current_posts_dir.exists():
        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
        return set()

    json_files = list(current_posts_dir.glob("*.json"))
    if not json_files:
        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
        return set()

    print("\n正在获取当前帖子ID...")
    print(f"找到 {len(json_files)} 个当前帖子")

    # File names that yield no ID (empty string) are left out.
    candidate_ids = (extract_post_id_from_filename(path.name) for path in json_files)
    post_ids = {candidate for candidate in candidate_ids if candidate}

    print(f"提取到 {len(post_ids)} 个帖子ID")
    return post_ids


def get_earliest_publish_time(current_posts_dir: Path) -> Optional[str]:
    """
    Find the smallest 'publish_time' among the posts of a directory.

    Post IDs are taken from the "*.json" file names; each post's details are
    looked up and their 'publish_time' values compared with `<`.

    Args:
        current_posts_dir: Directory holding the current posts' JSON files.

    Returns:
        The smallest publish time found (expected format
        "YYYY-MM-DD HH:MM:SS"), or None when the directory is missing, has
        no JSON files, or no publish time could be obtained.
    """
    if not current_posts_dir.exists():
        print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
        return None

    json_files = list(current_posts_dir.glob("*.json"))
    if not json_files:
        print(f"警告: 当前帖子目录为空: {current_posts_dir}")
        return None

    print("\n正在获取当前帖子的发布时间...")
    print(f"找到 {len(json_files)} 个当前帖子")

    earliest = None
    for json_path in json_files:
        post_id = extract_post_id_from_filename(json_path.name)
        if not post_id:
            continue

        try:
            detail = get_post_detail(post_id)
            if not detail or 'publish_time' not in detail:
                continue
            candidate = detail['publish_time']
            if earliest is None or candidate < earliest:
                earliest = candidate
                print(f"  更新最早时间: {candidate} (帖子: {post_id})")
        except Exception as exc:
            # A bad record for one post must not stop the scan.
            print(f"  警告: 获取帖子 {post_id} 发布时间失败: {exc}")

    if earliest:
        print(f"\n当前帖子最早发布时间: {earliest}")
    else:
        print("\n警告: 未能获取到任何当前帖子的发布时间")

    return earliest


def main():
    """Run the pipeline: scan historical posts, merge, filter, and save the feature-source mapping."""
    config = PathConfig()
    config.ensure_dirs()

    input_dir = config.historical_posts_dir
    current_posts_dir = config.current_posts_dir
    output_file = config.feature_source_mapping_file

    print(f"账号: {config.account_name}")
    print(f"过滤模式: {config.filter_mode}")
    print(f"过去帖子目录: {input_dir}")
    print(f"当前帖子目录: {current_posts_dir}")
    print(f"输出文件: {output_file}")
    print()

    print(f"\n正在扫描目录: {input_dir}")
    json_files = list(input_dir.glob("*.json"))
    total_files = len(json_files)
    print(f"找到 {total_files} 个JSON文件")

    # Extract the features of every historical post file.
    all_results = []
    for index, json_path in enumerate(json_files, start=1):
        print(f"处理文件 [{index}/{total_files}]: {json_path.name}")
        all_results.append(process_single_file(json_path))

    print("\n正在合并结果...")
    merged_result = merge_results(all_results)

    # Translate the configured filter mode into the arguments of the
    # conversion step; unknown modes fall back to no filtering.
    filter_mode = config.filter_mode
    time_filter = None
    exclude_post_ids = None

    if filter_mode == "time_based":
        print("\n应用过滤规则: 基于发布时间")
        time_filter = get_earliest_publish_time(current_posts_dir)
    elif filter_mode == "exclude_current_posts":
        print("\n应用过滤规则: 排除当前帖子ID")
        exclude_post_ids = get_current_post_ids(current_posts_dir)
    elif filter_mode != "none":
        print(f"\n警告: 未知的过滤模式 '{filter_mode}'，不应用过滤")
    else:
        print("\n过滤模式: none，不应用任何过滤")

    print("正在转换为数组格式...")
    final_result = convert_to_array_format(
        merged_result,
        fetch_details=True,
        time_filter=time_filter,
        exclude_post_ids=exclude_post_ids,
    )

    # Per-category summary of what ended up in the output.
    print("\n提取统计:")
    for category in ("灵感点", "目的点", "关键点"):
        entries = final_result[category]
        feature_count = len(entries)
        source_count = sum(len(entry["特征来源"]) for entry in entries)
        print(f"  {category}: {feature_count} 个特征, {source_count} 个来源")

    print(f"\n正在保存结果到: {output_file}")
    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(final_result, out, ensure_ascii=False, indent=4)

    print("完成!")


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()