# Source repository: yangxiaohui/how
"""
Build a point-to-note mapping index.

Extracts each note's inspiration points, purpose points and key points from
the "what" deconstruction results, and builds a mapping from every individual
point to the details of the notes it appears in.

Usage:
    python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果
"""
import os
import json
import argparse
from typing import Dict, List, Any
from glob import glob
from script.detail import get_xiaohongshu_detail


def extract_points_from_what_file(what_file: str) -> Dict[str, Any]:
    """Extract every point from a single "what" deconstruction file.

    The file is parsed as JSON and the object under "三点解构" is read.
    Sections or lists that are missing *or explicitly null* are treated as
    empty instead of raising, so a partially filled file still yields a
    well-formed result. Entries whose point name is empty are skipped.

    Args:
        what_file: Path to the "what" deconstruction JSON file. The note id
            is the part of the file name before "_with_history_".

    Returns:
        A dict with the keys "note_id", "灵感点列表" (inspiration points),
        "目的点列表" (purpose points) and "关键点列表" (key points).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The note id is encoded in the file name, before the history marker.
    filename = os.path.basename(what_file)
    note_id = filename.split("_with_history_")[0]

    with open(what_file, 'r', encoding='utf-8') as f:
        # `or {}` guards against a file whose whole content is `null`.
        data = json.load(f) or {}

    # NOTE: dict.get(key, default) returns None when the key is present with
    # a JSON null value, so every section is coalesced with `or` instead.
    three_points = data.get("三点解构") or {}

    # Inspiration points, grouped by the field they came from.
    inspiration_points = []
    inspiration_data = three_points.get("灵感点") or {}

    for field in ["全新内容", "共性差异", "共性内容"]:
        for item in inspiration_data.get(field) or []:
            point = item.get("灵感点", "")
            if point:
                inspiration_points.append({
                    "灵感点": point,
                    "来源字段": field,
                    "维度": item.get("维度", ""),
                    "描述": item.get("描述", "")
                })

    # Purpose points: one optional main purpose followed by secondary ones.
    purpose_points = []
    purpose_data = three_points.get("目的点") or {}

    main_purpose = purpose_data.get("main_purpose") or {}
    tagged_purposes = [("主目的", main_purpose)] if main_purpose else []
    tagged_purposes.extend(
        ("次要目的", sec_purpose)
        for sec_purpose in purpose_data.get("secondary_purposes") or []
    )
    for purpose_type, purpose in tagged_purposes:
        point = purpose.get("目的点", "")
        if point:
            purpose_points.append({
                "目的点": point,
                "类型": purpose_type,
                "维度": purpose.get("维度", ""),
                "描述": purpose.get("描述", "")
            })

    # Key points.
    key_points = []
    keypoint_data = three_points.get("关键点") or {}

    for kp in keypoint_data.get("key_points") or []:
        point = kp.get("关键点", "")
        if point:
            key_points.append({
                "关键点": point,
                "维度大类": kp.get("维度大类", ""),
                "维度细分": kp.get("维度细分", ""),
                "描述": kp.get("描述", "")
            })

    return {
        "note_id": note_id,
        "灵感点列表": inspiration_points,
        "目的点列表": purpose_points,
        "关键点列表": key_points
    }


def build_point_to_note_index(what_dir: str, fetch_details: bool = True) -> Dict[str, Any]:
    """Build the point-to-note and note-to-point mapping index.

    Every file in ``what_dir`` matching ``*_with_history_*.json`` is parsed
    with ``extract_points_from_what_file``. Files are processed in sorted
    path order so the result is reproducible: ``glob`` itself returns paths
    in arbitrary order, and both the metadata stored for a point (taken from
    the first file mentioning it) and the order of its note-id list depend
    on processing order.

    Args:
        what_dir: Directory containing the "what" deconstruction results.
        fetch_details: When true, call ``get_xiaohongshu_detail`` once per
            note and attach the result to every index entry. A failed fetch
            is reported and recorded as
            ``{"channel_content_id": note_id, "error": ...}`` rather than
            aborting the whole build.

    Returns:
        ``{"点到帖子映射": {"灵感点": ..., "目的点": ..., "关键点": ...},
        "帖子到点映射": ...}``.
    """
    # Sorted for deterministic output (glob order is arbitrary).
    what_files = sorted(glob(os.path.join(what_dir, "*_with_history_*.json")))

    print(f"{'=' * 80}")
    print("开始构建点到帖子的映射索引")
    print(f"{'=' * 80}")
    print(f"解构文件数量: {len(what_files)}\n")

    # point name -> {point metadata..., "帖子ID列表": [note_id, ...]}
    inspiration_index = {}
    purpose_index = {}
    keypoint_index = {}

    # note_id -> {"灵感点列表": [...], "目的点列表": [...], "关键点列表": [...]}
    note_to_points = {}

    # (list key in the extracted data, point-name key,
    #  metadata keys copied on first sight, target index)
    index_specs = [
        ("灵感点列表", "灵感点", ("维度", "描述"), inspiration_index),
        ("目的点列表", "目的点", ("类型", "维度", "描述"), purpose_index),
        ("关键点列表", "关键点", ("维度大类", "维度细分", "描述"), keypoint_index),
    ]

    for what_file in what_files:
        points_data = extract_points_from_what_file(what_file)
        note_id = points_data["note_id"]

        # NOTE: if several files share a note id, the last one processed
        # replaces this entry, while the point indexes below accumulate.
        note_to_points[note_id] = {
            "灵感点列表": points_data["灵感点列表"],
            "目的点列表": points_data["目的点列表"],
            "关键点列表": points_data["关键点列表"]
        }

        for list_key, name_key, meta_keys, index in index_specs:
            for point in points_data[list_key]:
                point_name = point[name_key]
                entry = index.get(point_name)
                if entry is None:
                    # First occurrence: keep this file's metadata for the point.
                    entry = {name_key: point_name}
                    for meta_key in meta_keys:
                        entry[meta_key] = point[meta_key]
                    entry["帖子ID列表"] = []
                    index[point_name] = entry
                # A note is listed at most once per point.
                if note_id not in entry["帖子ID列表"]:
                    entry["帖子ID列表"].append(note_id)

    print(f"✓ 灵感点: {len(inspiration_index)} 个")
    print(f"✓ 目的点: {len(purpose_index)} 个")
    print(f"✓ 关键点: {len(keypoint_index)} 个")
    print(f"✓ 帖子: {len(note_to_points)} 个\n")

    if fetch_details:
        all_note_ids = list(note_to_points)

        print(f"{'=' * 80}")
        print("开始获取帖子详情...")
        print(f"{'=' * 80}\n")

        # Fetch each note once and reuse the result for every index entry.
        note_details_cache = {}
        for i, note_id in enumerate(all_note_ids, 1):
            try:
                print(f"[{i}/{len(all_note_ids)}] 获取详情: {note_id}")
                detail = get_xiaohongshu_detail(note_id)
                note_details_cache[note_id] = detail
            except Exception as e:
                # Deliberately best-effort: one failing note must not abort
                # the build, so the error is recorded in its place.
                print(f"  ⚠️  获取失败: {e}")
                note_details_cache[note_id] = {
                    "channel_content_id": note_id,
                    "error": str(e)
                }

        print("\n✓ 帖子详情获取完成\n")

        # Attach the details to every point entry of every index.
        for index in (inspiration_index, purpose_index, keypoint_index):
            for point_info in index.values():
                point_info["帖子详情列表"] = [
                    note_details_cache.get(nid, {"channel_content_id": nid})
                    for nid in point_info.get("帖子ID列表", [])
                ]

        # Attach the details to the note-to-points mapping as well.
        for note_id, note_points in note_to_points.items():
            note_points["帖子详情"] = note_details_cache.get(
                note_id,
                {"channel_content_id": note_id}
            )

    return {
        "点到帖子映射": {
            "灵感点": inspiration_index,
            "目的点": purpose_index,
            "关键点": keypoint_index
        },
        "帖子到点映射": note_to_points
    }


def save_index(index_data: Dict[str, Any], output_file: str):
    """Write the index to ``output_file`` as pretty-printed UTF-8 JSON.

    The parent directory of ``output_file`` is created first when the path
    has a directory component. Non-ASCII characters are written as-is
    (``ensure_ascii=False``) with a two-space indent.

    Args:
        index_data: The index data to serialize.
        output_file: Destination file path.
    """
    parent_dir = os.path.dirname(output_file)
    # A bare file name has no directory part; nothing to create in that case.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(
            index_data,
            sink,
            ensure_ascii=False,
            indent=2,
        )

    print(f"✓ 索引已保存: {output_file}")


def print_statistics(index_data: Dict[str, Any]):
    """Print summary statistics for a built index.

    For each point type under "点到帖子映射" this prints the number of
    points, the number of distinct notes referenced, and the average number
    of notes per point. The average is computed from the total number of
    point-to-note links (the sum of every point's note-id list length), so
    notes shared by several points count once per point.

    For "帖子到点映射" it prints the note count and, when there is at least
    one note, the average number of each kind of point per note.

    Args:
        index_data: The complete index, as returned by
            ``build_point_to_note_index``. Missing sections are treated as
            empty.
    """
    print(f"\n{'=' * 80}")
    print("索引统计信息")
    print(f"{'=' * 80}\n")

    point_to_note = index_data.get("点到帖子映射", {})
    note_to_point = index_data.get("帖子到点映射", {})

    print("点到帖子映射:")
    for point_type, points in point_to_note.items():
        total_points = len(points)
        all_note_ids = set()
        total_links = 0
        for point_info in points.values():
            note_ids = point_info.get("帖子ID列表", [])
            all_note_ids.update(note_ids)
            total_links += len(note_ids)
        total_notes = len(all_note_ids)
        # Per-point average uses link count, not the distinct-note count:
        # two points sharing the same two notes average 2.0, not 1.0.
        avg_notes = total_links / total_points if total_points > 0 else 0

        print(f"  {point_type}:")
        print(f"    点的数量: {total_points}")
        print(f"    关联帖子数: {total_notes}")
        print(f"    平均每个点关联帖子数: {avg_notes:.1f}")

    print("\n帖子到点映射:")
    print(f"  帖子数量: {len(note_to_point)}")

    # Totals used for the per-note averages below.
    total_insp = sum(len(v.get("灵感点列表", [])) for v in note_to_point.values())
    total_purp = sum(len(v.get("目的点列表", [])) for v in note_to_point.values())
    total_kp = sum(len(v.get("关键点列表", [])) for v in note_to_point.values())

    note_count = len(note_to_point)
    if note_count > 0:
        print(f"  平均每个帖子的灵感点数: {total_insp / note_count:.1f}")
        print(f"  平均每个帖子的目的点数: {total_purp / note_count:.1f}")
        print(f"  平均每个帖子的关键点数: {total_kp / note_count:.1f}")


def main():
    """Command-line entry point.

    Parses ``--what-dir`` (required), ``--output`` and ``--no-details``,
    builds the index, saves it, and prints statistics. When ``--output`` is
    omitted the index is written to ``点到帖子映射.json`` in the parent
    directory of the what-dir.

    If ``--what-dir`` is not an existing directory, an error message is
    printed and the function returns without building anything. A regular
    file is rejected too: it would otherwise yield an empty index that is
    still written to disk.
    """
    parser = argparse.ArgumentParser(
        description="构建点到帖子的映射索引",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  # 基本使用
  python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果

  # 只构建索引，不获取帖子详情
  python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果 --no-details

  # 自定义输出文件
  python build_point_to_note_index.py --what-dir data/阿里多多酱/out/人设_1110/what解构结果 --output custom.json
        """
    )

    parser.add_argument(
        "--what-dir",
        required=True,
        help="what解构结果目录路径"
    )

    parser.add_argument(
        "--output",
        default=None,
        help="输出文件路径（默认: {what_dir}/../点到帖子映射.json）"
    )

    parser.add_argument(
        "--no-details",
        action="store_true",
        help="不获取帖子详情（只构建索引结构）"
    )

    args = parser.parse_args()

    what_dir = args.what_dir
    fetch_details = not args.no_details

    # isdir (not exists): a path to a regular file is not a usable what-dir.
    if not os.path.isdir(what_dir):
        print(f"❌ 错误: 找不到what解构目录: {what_dir}")
        return

    index_data = build_point_to_note_index(what_dir, fetch_details=fetch_details)

    if args.output:
        output_file = args.output
    else:
        # Strip trailing separators so dirname() yields the parent directory.
        # Uses the platform separators rather than a hard-coded '/', so a
        # trailing backslash on Windows is handled as well.
        separators = os.sep + (os.altsep or "")
        parent_dir = os.path.dirname(what_dir.rstrip(separators))
        output_file = os.path.join(parent_dir, "点到帖子映射.json")

    save_index(index_data, output_file)

    print_statistics(index_data)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()