run_similarity_analysis.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 快速运行相似度分析脚本
  5. """
  6. import os
  7. import json
  8. import logging
  9. from src.analyzers.similarity_analyzer import SimilarityAnalyzer
  10. # 配置日志
  11. logging.basicConfig(
  12. level=logging.INFO,
  13. format='%(asctime)s - %(levelname)s - %(message)s'
  14. )
  15. def main():
  16. """主函数"""
  17. # 读取解构分析结果
  18. deconstruction_path = "output_v2/deep_analysis_results.json"
  19. if not os.path.exists(deconstruction_path):
  20. print(f"❌ 解构分析结果不存在: {deconstruction_path}")
  21. print(" 请先运行: python3 main.py --enable-stage5 --enable-stage6")
  22. return
  23. print(f"📖 加载解构分析结果: {deconstruction_path}")
  24. with open(deconstruction_path, 'r', encoding='utf-8') as f:
  25. deconstruction_results = json.load(f)
  26. print(f"✓ 加载了 {len(deconstruction_results.get('results', []))} 个解构结果")
  27. # 创建分析器
  28. print("\n🚀 初始化相似度分析器...")
  29. analyzer = SimilarityAnalyzer(
  30. weight_embedding=0.5, # 向量模型权重
  31. weight_semantic=0.5, # LLM模型权重
  32. max_workers=5, # 并发数
  33. min_similarity=0.0, # 最小相似度阈值(0.0保留所有)
  34. target_features=None, # None = 处理所有特征
  35. evaluation_results_path='output_v2/evaluated_results.json',
  36. update_evaluation_scores=True # 自动计算综合得分P
  37. )
  38. # 运行分析
  39. print("\n" + "=" * 60)
  40. print("开始相似度分析...")
  41. print("=" * 60)
  42. similarity_results = analyzer.run(deconstruction_results)
  43. print("\n" + "=" * 60)
  44. print("✅ 相似度分析完成!")
  45. print("=" * 60)
  46. # 打印统计信息
  47. meta = similarity_results['metadata']
  48. stats = meta['overall_statistics']
  49. print(f"\n📊 统计结果:")
  50. print(f" - 处理帖子数: {stats['total_notes']}")
  51. print(f" - 提取特征总数: {stats['total_features_extracted']}")
  52. print(f" - 平均特征/帖子: {stats['avg_features_per_note']}")
  53. print(f" - 平均最高相似度: {stats['avg_max_similarity']}")
  54. print(f" - 包含高相似度特征的帖子: {stats['notes_with_high_similarity']}")
  55. print(f" - 总耗时: {meta['processing_time_seconds']}秒")
  56. print(f"\n📁 输出文件:")
  57. print(f" - output_v2/similarity_analysis_results.json (相似度分析结果)")
  58. print(f" - output_v2/evaluated_results.json (已更新综合得分P)")
  59. print(f"\n🎨 现在可以运行可视化:")
  60. print(f" python3 src/visualizers/deconstruction_visualizer.py")
  61. if __name__ == '__main__':
  62. main()