| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
- """
- 分析搜索结果中的内容类型分布(视频 vs 图文)
- """
- import json
- from collections import Counter
- from typing import Dict, Any, List
def _as_dict(value):
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return *value* when it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _collect_content_types(stage6_data: List[Dict[str, Any]]) -> tuple:
    """Tally note types over every search result found in *stage6_data*.

    Returns a 4-tuple ``(overall, per_feature, total_searches, total_notes)``:
    ``overall`` is a Counter of note type -> count, ``per_feature`` maps each
    original feature name to its own Counter, and the two integers count the
    non-empty search results and the notes they contain.
    """
    overall = Counter()
    per_feature = {}  # original feature name -> Counter of note types
    total_searches = 0
    total_notes = 0

    for original_feature in stage6_data:
        feature_name = original_feature['原始特征名称']
        feature_types = Counter()

        # dict.get(key, default) only falls back when the key is absent; an
        # explicit JSON null comes back as None, so every level is
        # normalised before it is iterated or indexed.
        for association in _as_list(original_feature.get('找到的关联')):
            for feature in _as_list(_as_dict(association).get('特征列表')):
                search_result = _as_dict(feature).get('search_result')
                if not search_result or not isinstance(search_result, dict):
                    continue
                total_searches += 1

                # Notes live under search_result['data']['data'].
                payload = _as_dict(search_result.get('data'))
                notes = _as_list(payload.get('data'))
                total_notes += len(notes)

                for note in notes:
                    note_card = _as_dict(_as_dict(note).get('note_card'))
                    # A missing, null or empty type is reported as 'unknown'
                    # so the tallies always add up to total_notes.
                    note_type = note_card.get('type') or 'unknown'
                    overall[note_type] += 1
                    feature_types[note_type] += 1

        if feature_types:
            # Merge rather than assign: entries that share a feature name
            # would otherwise overwrite each other's counts.
            per_feature.setdefault(feature_name, Counter()).update(feature_types)

    return overall, per_feature, total_searches, total_notes


def analyze_content_types(stage6_path: str) -> None:
    """Print the content-type distribution (video vs. image-text) of Stage6 search results.

    Args:
        stage6_path: Path to a UTF-8 JSON file holding a list of original
            feature entries. Each entry must have a '原始特征名称' key and may
            have '找到的关联' -> '特征列表' -> 'search_result' -> 'data' ->
            'data' -> note objects whose 'note_card' carries a 'type'.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks the '原始特征名称' key.

    Null or missing intermediate fields are treated as empty, and notes
    without a usable type are counted as 'unknown'.
    """
    # Load the Stage6 data.
    with open(stage6_path, 'r', encoding='utf-8') as f:
        stage6_data = json.load(f)

    print("=" * 80)
    print("Stage6 搜索结果内容类型分析")
    print("=" * 80)

    (content_type_counter, feature_content_types,
     total_searches, total_notes) = _collect_content_types(stage6_data)

    # Overall statistics.
    print("\n📊 总体统计:")
    print(f" 已执行搜索: {total_searches} 次")
    print(f" 总帖子数: {total_notes} 个")

    # The counter is empty whenever total_notes is 0, so the divisions
    # below are never reached with a zero denominator.
    print("\n📋 内容类型分布:")
    for content_type, count in content_type_counter.most_common():
        percentage = count / total_notes * 100
        print(f" {content_type}: {count} 个 ({percentage:.1f}%)")

    # Per-feature distribution; only features with at least one note appear.
    print("\n📊 各原始特征的内容类型分布:")
    for feature_name, types in feature_content_types.items():
        total_feature_notes = sum(types.values())
        print(f"\n 【{feature_name}】 共 {total_feature_notes} 个帖子")
        for content_type, count in types.most_common():
            percentage = count / total_feature_notes * 100
            print(f" {content_type}: {count} 个 ({percentage:.1f}%)")

    # Video share: 'video' marks video notes, 'normal' marks image-text notes.
    video_count = content_type_counter.get('video', 0)
    normal_count = content_type_counter.get('normal', 0)

    print("\n🎯 关键发现:")
    if video_count > 0:
        video_ratio = video_count / total_notes * 100
        print(f" ⚠️ 发现 {video_count} 个视频帖子 (占比 {video_ratio:.1f}%)")
        print(f" ✓ 图文帖子: {normal_count} 个 (占比 {normal_count/total_notes*100:.1f}%)")
        print("\n 问题原因分析:")
        print(" - 小红书 API 的 content_type='图文' 参数可能未被严格遵守")
        print(" - 或者 API 返回混合类型的内容")
        print(" - 建议在客户端侧添加内容类型过滤")
    else:
        print(" ✓ 未发现视频内容,全部为图文")

    print("\n" + "=" * 80)
if __name__ == '__main__':
    import sys

    # The first command-line argument, when given, overrides the default
    # location of the Stage6 results file.
    cli_args = sys.argv[1:]
    stage6_path = cli_args[0] if cli_args else 'output_v2/stage6_with_evaluations.json'
    analyze_content_types(stage6_path)
|