# Source: liulidong / knowledge_search
"""
Analyze the distribution of content types (video vs. image-text) in search results.
"""

import json
from collections import Counter
from typing import Dict, Any, List


def _search_note_types(original_feature: Dict[str, Any]) -> List[List[str]]:
    """Return one list of note types for every executed search under a feature entry."""
    per_search = []

    # `or []` / `or {}` instead of .get defaults: a key that is present with a
    # JSON null value yields None, which the .get default does not replace.
    for association in original_feature.get('找到的关联') or []:
        for feature in association.get('特征列表') or []:
            search_result = feature.get('search_result')
            if not search_result:
                continue

            notes = (search_result.get('data') or {}).get('data') or []
            per_search.append([
                (note.get('note_card') or {}).get('type') or 'unknown'
                for note in notes
            ])

    return per_search


def analyze_content_types(stage6_path: str) -> Dict[str, Any]:
    """Analyze and print the content types found in Stage6 search results.

    The file at ``stage6_path`` is read as UTF-8 JSON and iterated as a
    sequence of feature entries. Every entry must carry the key
    ``原始特征名称``; notes are looked up under
    ``找到的关联 -> 特征列表 -> search_result -> data -> data`` and their type is
    read from ``note_card.type`` (missing or null types count as ``'unknown'``).
    Null values anywhere along that path are treated as empty.

    Args:
        stage6_path: Path of the Stage6 JSON result file.

    Returns:
        A summary dict with the keys ``total_searches``, ``total_notes``,
        ``content_types`` (type -> count) and ``feature_content_types``
        (feature name -> {type -> count}). The same figures are printed
        to stdout as a report.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a feature entry lacks ``原始特征名称``.
    """
    with open(stage6_path, 'r', encoding='utf-8') as f:
        stage6_data = json.load(f)

    print("=" * 80)
    print("Stage6 搜索结果内容类型分析")
    print("=" * 80)

    content_type_counter = Counter()
    feature_content_types: Dict[str, Counter] = {}  # feature name -> type distribution
    total_searches = 0

    for original_feature in stage6_data:
        feature_name = original_feature['原始特征名称']
        per_search = _search_note_types(original_feature)
        total_searches += len(per_search)

        feature_types = Counter()
        for note_types in per_search:
            feature_types.update(note_types)

        if feature_types:
            content_type_counter.update(feature_types)
            # Merge rather than assign so that several entries sharing one
            # feature name do not overwrite each other's counts.
            feature_content_types.setdefault(feature_name, Counter()).update(feature_types)

    # Every note contributes exactly one count, so the sum is the note total.
    total_notes = sum(content_type_counter.values())

    # Overall statistics
    print("\n📊 总体统计:")
    print(f"  已执行搜索: {total_searches} 次")
    print(f"  总帖子数: {total_notes} 个")

    print("\n📋 内容类型分布:")
    for content_type, count in content_type_counter.most_common():
        percentage = count / total_notes * 100
        print(f"  {content_type}: {count} 个 ({percentage:.1f}%)")

    # Per-feature distribution
    print("\n📊 各原始特征的内容类型分布:")
    for feature_name, types in feature_content_types.items():
        total_feature_notes = sum(types.values())
        print(f"\n  【{feature_name}】 共 {total_feature_notes} 个帖子")

        for content_type, count in types.most_common():
            percentage = count / total_feature_notes * 100
            print(f"    {content_type}: {count} 个 ({percentage:.1f}%)")

    # Video share
    video_count = content_type_counter.get('video', 0)
    normal_count = content_type_counter.get('normal', 0)  # image-text notes

    print("\n🎯 关键发现:")
    if total_notes == 0:
        # Without any notes, claiming "everything is image-text" would be wrong.
        print("  ⚠️  未找到任何帖子，无法判断内容类型")
    elif video_count > 0:
        video_ratio = video_count / total_notes * 100
        print(f"  ⚠️  发现 {video_count} 个视频帖子 (占比 {video_ratio:.1f}%)")
        print(f"  ✓ 图文帖子: {normal_count} 个 (占比 {normal_count/total_notes*100:.1f}%)")
        print("\n  问题原因分析:")
        print("    - 小红书 API 的 content_type='图文' 参数可能未被严格遵守")
        print("    - 或者 API 返回混合类型的内容")
        print("    - 建议在客户端侧添加内容类型过滤")
    elif normal_count == total_notes:
        print("  ✓ 未发现视频内容，全部为图文")
    else:
        # No video, but some notes are neither video nor image-text.
        other_count = total_notes - normal_count
        print(f"  ✓ 未发现视频内容 (图文 {normal_count} 个，其他类型 {other_count} 个)")

    print("\n" + "=" * 80)

    return {
        'total_searches': total_searches,
        'total_notes': total_notes,
        'content_types': dict(content_type_counter),
        'feature_content_types': {
            name: dict(types) for name, types in feature_content_types.items()
        },
    }


if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default
    # location of the Stage6 result file.
    cli_args = sys.argv[1:]
    stage6_path = cli_args[0] if cli_args else 'output_v2/stage6_with_evaluations.json'

    analyze_content_types(stage6_path)