# yangxiaohui / how
"""
构建人设分类的反向索引

将人设数据和what解构数据转换为以分类名称为键的反向索引结构，包含：
- 灵感分类（来自人设.json）
- 目的分类（来自what解构）
- 关键点分类（来自what解构）

使用方式:
    python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110
"""
import os
import json
import argparse
from typing import Dict, List, Any
from glob import glob
from script.detail import get_xiaohongshu_detail


def build_inspiration_index(persona_data: Dict[str, Any]) -> Dict[str, Any]:
    """Build the reverse index for inspiration categories.

    Walks ``persona_data["灵感点列表"]`` (perspectives), each perspective's
    ``"模式列表"`` (level-1 categories) and each level-1 category's
    ``"二级细分"`` (level-2 categories), and records one entry per category
    name. A level-1 entry collects the note IDs of all its level-2
    categories. When a name is seen more than once, the metadata of the
    first occurrence is kept and only the note IDs are merged.

    Note IDs are kept in first-seen order and never round-tripped through a
    ``set``, so the produced lists are identical from run to run.

    Args:
        persona_data: Persona data containing the inspiration list.

    Returns:
        Mapping of category name to an entry with the keys "分类层级",
        "分类名称", "分类定义", "分类路径" and "帖子ID列表".
    """
    index: Dict[str, Any] = {}

    # Walk every perspective.
    for perspective in persona_data.get("灵感点列表", []):
        perspective_node = {
            "视角名称": perspective.get("视角名称", ""),
            "视角描述": perspective.get("视角描述", ""),
        }

        # Walk the level-1 categories of this perspective.
        for category_l1 in perspective.get("模式列表", []):
            category_l1_name = category_l1.get("分类名称", "")
            # Level-1 definitions are read from "核心定义", level-2 ones
            # from "分类定义"; both are stored under "分类定义" in the index.
            category_l1_def = category_l1.get("核心定义", "")
            category_l1_node = {
                "分类名称": category_l1_name,
                "分类定义": category_l1_def,
            }

            # Note IDs of all level-2 children, in first-seen order
            # (duplicates are dropped when the entry is merged).
            category_l1_note_ids: List[Any] = []

            # Walk the level-2 categories of this level-1 category.
            for category_l2 in category_l1.get("二级细分", []):
                category_l2_name = category_l2.get("分类名称", "")
                category_l2_def = category_l2.get("分类定义", "")
                # "or []" also covers an explicit JSON null.
                note_ids = category_l2.get("帖子ID列表") or []

                # Order-preserving de-duplication.
                unique_note_ids = list(dict.fromkeys(note_ids))
                category_l1_note_ids.extend(unique_note_ids)

                _merge_inspiration_category(
                    index,
                    name=category_l2_name,
                    level="二级分类",
                    definition=category_l2_def,
                    # Fresh dicts per path so entries never share objects.
                    path=[
                        dict(perspective_node),
                        dict(category_l1_node),
                        {
                            "分类名称": category_l2_name,
                            "分类定义": category_l2_def,
                        },
                    ],
                    note_ids=unique_note_ids,
                )

            _merge_inspiration_category(
                index,
                name=category_l1_name,
                level="一级分类",
                definition=category_l1_def,
                path=[dict(perspective_node), dict(category_l1_node)],
                note_ids=category_l1_note_ids,
            )

    return index


def _merge_inspiration_category(
    index: Dict[str, Any],
    name: str,
    level: str,
    definition: str,
    path: List[Dict[str, str]],
    note_ids: List[Any],
) -> None:
    """Create the entry for ``name`` or merge ``note_ids`` into an existing one."""
    entry = index.get(name)
    if entry is None:
        index[name] = {
            "分类层级": level,
            "分类名称": name,
            "分类定义": definition,
            "分类路径": path,
            "帖子ID列表": list(dict.fromkeys(note_ids)),
        }
        return

    # Existing entry: keep its metadata, append only IDs not seen before.
    entry["帖子ID列表"] = list(
        dict.fromkeys(entry["帖子ID列表"] + list(note_ids))
    )


def build_purpose_index(what_dir: str) -> Dict[str, Any]:
    """Build the reverse index for purpose points.

    Reads every ``*_with_history_*.json`` file in ``what_dir``, takes the
    note ID from the part of the file name before ``_with_history_``, and
    registers the note under its main purpose and each secondary purpose
    found at ``data["三点解构"]["目的点"]``.

    Files are processed in sorted path order because ``glob`` returns them
    in arbitrary order; without sorting, the order of the note IDs and the
    file that supplies a purpose's "分类类型"/"维度"/"描述" (the first one
    seen) would depend on the filesystem.

    Args:
        what_dir: Directory containing the "what" deconstruction results.

    Returns:
        Mapping of purpose name to an entry with the keys "分类类型",
        "目的点", "维度", "描述" and "帖子ID列表".
    """
    index: Dict[str, Any] = {}

    what_files = sorted(glob(os.path.join(what_dir, "*_with_history_*.json")))

    for what_file in what_files:
        # The note ID is the file-name prefix before "_with_history_".
        note_id = os.path.basename(what_file).split("_with_history_")[0]

        with open(what_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # "or {}" / "or []" also cover sections that are an explicit null.
        purpose_data = (data.get("三点解构") or {}).get("目的点") or {}

        # Main purpose (an empty or missing dict registers nothing).
        main_purpose = purpose_data.get("main_purpose") or {}
        _register_purpose(index, main_purpose, "主目的", note_id)

        # Secondary purposes.
        for sec_purpose in purpose_data.get("secondary_purposes") or []:
            _register_purpose(index, sec_purpose, "次要目的", note_id)

    return index


def _register_purpose(
    index: Dict[str, Any],
    purpose: Dict[str, Any],
    purpose_type: str,
    note_id: str,
) -> None:
    """Add ``note_id`` to the index entry of ``purpose``, creating it if needed."""
    purpose_name = purpose.get("目的点", "")
    if not purpose_name:
        return

    if purpose_name not in index:
        index[purpose_name] = {
            "分类类型": purpose_type,
            "目的点": purpose_name,
            "维度": purpose.get("维度", ""),
            "描述": purpose.get("描述", ""),
            "帖子ID列表": []
        }

    note_ids = index[purpose_name]["帖子ID列表"]
    if note_id not in note_ids:
        note_ids.append(note_id)


def build_keypoint_index(what_dir: str) -> Dict[str, Any]:
    """Build the reverse index for key points.

    Reads every ``*_with_history_*.json`` file in ``what_dir``, takes the
    note ID from the part of the file name before ``_with_history_``, and
    registers the note under each key point listed at
    ``data["三点解构"]["关键点"]["key_points"]``.

    Files are processed in sorted path order because ``glob`` returns them
    in arbitrary order; without sorting, the order of the note IDs and the
    file that supplies a key point's metadata (the first one seen) would
    depend on the filesystem.

    Args:
        what_dir: Directory containing the "what" deconstruction results.

    Returns:
        Mapping of key-point name to an entry with the keys "关键点",
        "维度大类", "维度细分", "描述" and "帖子ID列表".
    """
    index: Dict[str, Any] = {}

    what_files = sorted(glob(os.path.join(what_dir, "*_with_history_*.json")))

    for what_file in what_files:
        # The note ID is the file-name prefix before "_with_history_".
        note_id = os.path.basename(what_file).split("_with_history_")[0]

        with open(what_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # "or {}" / "or []" also cover sections that are an explicit null.
        keypoint_data = (data.get("三点解构") or {}).get("关键点") or {}

        for kp in keypoint_data.get("key_points") or []:
            kp_name = kp.get("关键点", "")
            if not kp_name:
                # Unnamed key points cannot be indexed.
                continue

            if kp_name not in index:
                index[kp_name] = {
                    "关键点": kp_name,
                    "维度大类": kp.get("维度大类", ""),
                    "维度细分": kp.get("维度细分", ""),
                    "描述": kp.get("描述", ""),
                    "帖子ID列表": []
                }

            note_ids = index[kp_name]["帖子ID列表"]
            if note_id not in note_ids:
                note_ids.append(note_id)

    return index


def fetch_note_details(category_data: Dict[str, Any]) -> Dict[str, Any]:
    """Attach note details to every category of an index.

    Collects the distinct note IDs referenced by all categories, requests
    each one once through ``get_xiaohongshu_detail``, and stores the
    results on every category under "帖子详情列表", parallel to its
    "帖子ID列表". A failed request is reported on stdout and replaced by a
    placeholder dict carrying the note ID and the error text.

    Args:
        category_data: Mapping of category name to category entry; entries
            are modified in place.

    Returns:
        The same ``category_data`` object, with details attached.
    """
    rule = "=" * 80

    # Distinct note IDs across all categories.
    distinct_ids = set()
    for entry in category_data.values():
        distinct_ids.update(entry.get("帖子ID列表", []))
    pending = list(distinct_ids)
    total = len(pending)

    print(f"\n{rule}")
    print("开始获取帖子详情...")
    print(f"{rule}\n")
    print(f"共有 {total} 个唯一帖子\n")

    # In-memory cache: note ID -> detail (or error placeholder).
    details_by_id = {}
    for position, note_id in enumerate(pending, start=1):
        try:
            print(f"[{position}/{total}] 获取详情: {note_id}")
            details_by_id[note_id] = get_xiaohongshu_detail(note_id)
        except Exception as err:
            # Best effort: one failing note must not abort the whole run.
            print(f"  ⚠️  获取失败: {err}")
            details_by_id[note_id] = {
                "channel_content_id": note_id,
                "error": str(err),
            }

    print("\n✓ 帖子详情获取完成\n")

    # Fill the details into each category, in the order of its ID list.
    for entry in category_data.values():
        detail_list = []
        for note_id in entry.get("帖子ID列表", []):
            if note_id in details_by_id:
                detail_list.append(details_by_id[note_id])
            else:
                detail_list.append({"channel_content_id": note_id})
        entry["帖子详情列表"] = detail_list

    return category_data


def save_index(index_data: Dict[str, Any], output_file: str):
    """Write the index to a JSON file.

    Creates the parent directory of ``output_file`` when the path has one,
    writes ``index_data`` as UTF-8 JSON (non-ASCII characters kept as-is,
    2-space indent) and prints a confirmation line.

    Args:
        index_data: Index data to serialize.
        output_file: Destination file path.
    """
    parent_dir, _ = os.path.split(output_file)
    if parent_dir:
        # A bare file name has no directory part; nothing to create then.
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, mode='w', encoding='utf-8') as handle:
        json.dump(index_data, handle, indent=2, ensure_ascii=False)

    print(f"✓ 索引已保存: {output_file}")


def print_statistics(index_data: Dict[str, Any]):
    """Print summary statistics for every index.

    For each index type the output shows the number of categories, the
    number of distinct note IDs across those categories, and that distinct
    count divided by the number of categories (0 when there are none).

    Args:
        index_data: Complete index data, keyed by index type.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("索引统计信息")
    print(f"{rule}\n")

    for index_type, categories in index_data.items():
        category_count = len(categories)

        # Distinct notes referenced anywhere in this index.
        distinct_notes = {
            note_id
            for entry in categories.values()
            for note_id in entry.get("帖子ID列表", [])
        }
        note_count = len(distinct_notes)

        if category_count > 0:
            notes_per_category = note_count / category_count
        else:
            notes_per_category = 0

        print(f"{index_type}:")
        print(f"  分类数量: {category_count}")
        print(f"  帖子总数: {note_count}")
        print(f"  平均每分类帖子数: {notes_per_category:.1f}\n")


def main() -> int:
    """Command-line entry point: build, optionally enrich, and save the index.

    Parses the command line, checks that the persona directory contains the
    persona file "人设.json" and the "what解构结果" directory, builds the
    inspiration, purpose and key-point indexes, attaches note details unless
    ``--no-details`` is given, writes the combined index to the output file
    and prints statistics.

    Returns:
        0 on success, 1 when a required input is missing, so the process
        exit status reflects failure (the ``__main__`` guard passes this
        value to ``SystemExit``).
    """
    parser = argparse.ArgumentParser(
        description="构建人设分类的反向索引（灵感+目的+关键点）",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  # 基本使用
  python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110

  # 只构建索引，不获取帖子详情
  python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110 --no-details

  # 自定义输出文件
  python build_category_index.py --persona-dir data/阿里多多酱/out/人设_1110 --output custom_index.json
        """
    )

    parser.add_argument(
        "--persona-dir",
        required=True,
        help="人设目录路径（包含人设.json和what解构结果/的目录）"
    )

    parser.add_argument(
        "--output",
        default=None,
        help="输出文件路径（默认: {persona_dir}/分类索引_完整.json）"
    )

    parser.add_argument(
        "--no-details",
        action="store_true",
        help="不获取帖子详情（只构建索引结构）"
    )

    args = parser.parse_args()

    persona_dir = args.persona_dir
    fetch_details = not args.no_details

    # Check the required inputs. isfile/isdir (rather than exists) also
    # reject a directory named 人设.json or a regular file named
    # what解构结果, which would otherwise fail later or yield empty indexes.
    persona_file = os.path.join(persona_dir, "人设.json")
    what_dir = os.path.join(persona_dir, "what解构结果")

    if not os.path.isfile(persona_file):
        print(f"❌ 错误: 找不到人设文件: {persona_file}")
        return 1

    if not os.path.isdir(what_dir):
        print(f"❌ 错误: 找不到what解构目录: {what_dir}")
        return 1

    heavy_rule = "=" * 80
    light_rule = "─" * 80

    print(heavy_rule)
    print("构建人设分类反向索引（灵感+目的+关键点）")
    print(heavy_rule)
    print(f"人设文件: {persona_file}")
    print(f"解构目录: {what_dir}")
    print(f"获取详情: {'是' if fetch_details else '否'}\n")

    # Load the persona data.
    with open(persona_file, 'r', encoding='utf-8') as f:
        persona_data = json.load(f)

    # Build the three indexes.
    print(light_rule)
    print("1. 构建灵感分类索引...")
    print(f"{light_rule}\n")
    inspiration_index = build_inspiration_index(persona_data)
    print(f"✓ 灵感分类: {len(inspiration_index)} 个分类\n")

    print(light_rule)
    print("2. 构建目的分类索引...")
    print(f"{light_rule}\n")
    purpose_index = build_purpose_index(what_dir)
    print(f"✓ 目的分类: {len(purpose_index)} 个分类\n")

    print(light_rule)
    print("3. 构建关键点分类索引...")
    print(f"{light_rule}\n")
    keypoint_index = build_keypoint_index(what_dir)
    print(f"✓ 关键点分类: {len(keypoint_index)} 个分类\n")

    # Combine into the complete index.
    full_index = {
        "灵感分类": inspiration_index,
        "目的分类": purpose_index,
        "关键点分类": keypoint_index
    }

    # Attach note details.
    # NOTE(review): each fetch_note_details call builds its own cache, so a
    # note referenced by more than one index is requested once per index.
    if fetch_details:
        full_index["灵感分类"] = fetch_note_details(inspiration_index)
        full_index["目的分类"] = fetch_note_details(purpose_index)
        full_index["关键点分类"] = fetch_note_details(keypoint_index)

    # Resolve the output path (an empty --output falls back to the default).
    output_file = args.output or os.path.join(persona_dir, "分类索引_完整.json")

    # Save the index.
    save_index(full_index, output_file)

    # Print statistics.
    print_statistics(full_index)

    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())