""" 分析搜索结果中的内容类型分布(视频 vs 图文) """ import json from collections import Counter from typing import Dict, Any, List def analyze_content_types(stage6_path: str): """分析 Stage6 搜索结果中的内容类型""" # 加载数据 with open(stage6_path, 'r', encoding='utf-8') as f: stage6_data = json.load(f) print("=" * 80) print("Stage6 搜索结果内容类型分析") print("=" * 80) # 收集所有搜索结果的内容类型 content_type_counter = Counter() feature_content_types = {} # 原始特征 -> 内容类型分布 total_searches = 0 total_notes = 0 for original_feature in stage6_data: feature_name = original_feature['原始特征名称'] feature_types = Counter() for association in original_feature.get('找到的关联', []): for feature in association.get('特征列表', []): search_result = feature.get('search_result') if search_result: total_searches += 1 # 提取帖子数据 notes = search_result.get('data', {}).get('data', []) total_notes += len(notes) for note in notes: note_card = note.get('note_card', {}) note_type = note_card.get('type', 'unknown') content_type_counter[note_type] += 1 feature_types[note_type] += 1 if feature_types: feature_content_types[feature_name] = feature_types # 打印总体统计 print(f"\n📊 总体统计:") print(f" 已执行搜索: {total_searches} 次") print(f" 总帖子数: {total_notes} 个") print(f"\n📋 内容类型分布:") for content_type, count in content_type_counter.most_common(): percentage = count / total_notes * 100 print(f" {content_type}: {count} 个 ({percentage:.1f}%)") # 打印各特征的内容类型分布 print(f"\n📊 各原始特征的内容类型分布:") for feature_name, types in feature_content_types.items(): total_feature_notes = sum(types.values()) print(f"\n 【{feature_name}】 共 {total_feature_notes} 个帖子") for content_type, count in types.most_common(): percentage = count / total_feature_notes * 100 print(f" {content_type}: {count} 个 ({percentage:.1f}%)") # 分析视频占比 video_count = content_type_counter.get('video', 0) normal_count = content_type_counter.get('normal', 0) # 图文类型 print(f"\n🎯 关键发现:") if video_count > 0: video_ratio = video_count / total_notes * 100 print(f" ⚠️ 发现 {video_count} 个视频帖子 (占比 {video_ratio:.1f}%)") print(f" ✓ 图文帖子: {normal_count} 个 (占比 {normal_count/total_notes*100:.1f}%)") print(f"\n 问题原因分析:") print(f" - 小红书 API 的 content_type='图文' 参数可能未被严格遵守") print(f" - 或者 API 返回混合类型的内容") print(f" - 建议在客户端侧添加内容类型过滤") else: print(f" ✓ 未发现视频内容,全部为图文") print("\n" + "=" * 80) if __name__ == '__main__': import sys stage6_path = 'output_v2/stage6_with_evaluations.json' if len(sys.argv) > 1: stage6_path = sys.argv[1] analyze_content_types(stage6_path)