analyze_content_types.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. """
  2. 分析搜索结果中的内容类型分布(视频 vs 图文)
  3. """
  4. import json
  5. from collections import Counter
  6. from typing import Dict, Any, List
  7. def analyze_content_types(stage6_path: str):
  8. """分析 Stage6 搜索结果中的内容类型"""
  9. # 加载数据
  10. with open(stage6_path, 'r', encoding='utf-8') as f:
  11. stage6_data = json.load(f)
  12. print("=" * 80)
  13. print("Stage6 搜索结果内容类型分析")
  14. print("=" * 80)
  15. # 收集所有搜索结果的内容类型
  16. content_type_counter = Counter()
  17. feature_content_types = {} # 原始特征 -> 内容类型分布
  18. total_searches = 0
  19. total_notes = 0
  20. for original_feature in stage6_data:
  21. feature_name = original_feature['原始特征名称']
  22. feature_types = Counter()
  23. for association in original_feature.get('找到的关联', []):
  24. for feature in association.get('特征列表', []):
  25. search_result = feature.get('search_result')
  26. if search_result:
  27. total_searches += 1
  28. # 提取帖子数据
  29. notes = search_result.get('data', {}).get('data', [])
  30. total_notes += len(notes)
  31. for note in notes:
  32. note_card = note.get('note_card', {})
  33. note_type = note_card.get('type', 'unknown')
  34. content_type_counter[note_type] += 1
  35. feature_types[note_type] += 1
  36. if feature_types:
  37. feature_content_types[feature_name] = feature_types
  38. # 打印总体统计
  39. print(f"\n📊 总体统计:")
  40. print(f" 已执行搜索: {total_searches} 次")
  41. print(f" 总帖子数: {total_notes} 个")
  42. print(f"\n📋 内容类型分布:")
  43. for content_type, count in content_type_counter.most_common():
  44. percentage = count / total_notes * 100
  45. print(f" {content_type}: {count} 个 ({percentage:.1f}%)")
  46. # 打印各特征的内容类型分布
  47. print(f"\n📊 各原始特征的内容类型分布:")
  48. for feature_name, types in feature_content_types.items():
  49. total_feature_notes = sum(types.values())
  50. print(f"\n 【{feature_name}】 共 {total_feature_notes} 个帖子")
  51. for content_type, count in types.most_common():
  52. percentage = count / total_feature_notes * 100
  53. print(f" {content_type}: {count} 个 ({percentage:.1f}%)")
  54. # 分析视频占比
  55. video_count = content_type_counter.get('video', 0)
  56. normal_count = content_type_counter.get('normal', 0) # 图文类型
  57. print(f"\n🎯 关键发现:")
  58. if video_count > 0:
  59. video_ratio = video_count / total_notes * 100
  60. print(f" ⚠️ 发现 {video_count} 个视频帖子 (占比 {video_ratio:.1f}%)")
  61. print(f" ✓ 图文帖子: {normal_count} 个 (占比 {normal_count/total_notes*100:.1f}%)")
  62. print(f"\n 问题原因分析:")
  63. print(f" - 小红书 API 的 content_type='图文' 参数可能未被严格遵守")
  64. print(f" - 或者 API 返回混合类型的内容")
  65. print(f" - 建议在客户端侧添加内容类型过滤")
  66. else:
  67. print(f" ✓ 未发现视频内容,全部为图文")
  68. print("\n" + "=" * 80)
  69. if __name__ == '__main__':
  70. import sys
  71. stage6_path = 'output_v2/stage6_with_evaluations.json'
  72. if len(sys.argv) > 1:
  73. stage6_path = sys.argv[1]
  74. analyze_content_types(stage6_path)