# run_similarity_analysis.py
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """相似度分析独立运行脚本"""
  4. import sys
  5. import os
  6. # 将项目根目录添加到Python路径
  7. project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
  8. sys.path.insert(0, project_root)
  9. import json
  10. import logging
  11. import argparse
  12. from src.analyzers.similarity_analyzer import SimilarityAnalyzer
  13. def main():
  14. parser = argparse.ArgumentParser(
  15. description='解构特征相似度分析(独立运行)',
  16. formatter_class=argparse.RawDescriptionHelpFormatter,
  17. epilog="""
  18. 使用示例:
  19. # 基础用法 - 处理"墨镜"特征
  20. python3 scripts/run_similarity_analysis.py --feature "墨镜"
  21. # 处理多个特征
  22. python3 scripts/run_similarity_analysis.py --feature "墨镜" "耳环"
  23. # 自定义权重配置
  24. python3 scripts/run_similarity_analysis.py --feature "墨镜" --weight-embedding 0.7 --weight-semantic 0.3
  25. # 过滤低相似度特征
  26. python3 scripts/run_similarity_analysis.py --feature "墨镜" --min-similarity 0.3
  27. # 使用配置文件
  28. python3 scripts/run_similarity_analysis.py --config stage8_config.json
  29. # 自定义输入输出路径
  30. python3 scripts/run_similarity_analysis.py --input output_v2/deep_analysis_custom.json --output output_v2/similarity_custom.json
  31. """
  32. )
  33. # 输入输出
  34. parser.add_argument(
  35. '--input',
  36. default='output_v2/deep_analysis_results.json',
  37. help='解构分析结果文件路径(默认: output_v2/deep_analysis_results.json)'
  38. )
  39. parser.add_argument(
  40. '--output',
  41. default='output_v2/similarity_analysis_results.json',
  42. help='输出文件路径(默认: output_v2/similarity_analysis_results.json)'
  43. )
  44. # 特征过滤
  45. parser.add_argument(
  46. '--feature',
  47. nargs='+',
  48. default=None,
  49. help='指定要处理的原始特征名称(可指定多个),如: --feature "墨镜" "耳环"'
  50. )
  51. # 相似度配置
  52. parser.add_argument(
  53. '--weight-embedding',
  54. type=float,
  55. default=0.5,
  56. help='向量模型权重(默认: 0.5)'
  57. )
  58. parser.add_argument(
  59. '--weight-semantic',
  60. type=float,
  61. default=0.5,
  62. help='LLM 模型权重(默认: 0.5)'
  63. )
  64. parser.add_argument(
  65. '--min-similarity',
  66. type=float,
  67. default=0.0,
  68. help='最小相似度阈值,低于此值的特征会被过滤(默认: 0.0,保留所有)'
  69. )
  70. # 并发配置
  71. parser.add_argument(
  72. '--max-workers',
  73. type=int,
  74. default=5,
  75. help='最大并发数(默认: 5)'
  76. )
  77. # 综合得分P计算配置
  78. parser.add_argument(
  79. '--evaluation-path',
  80. default='output_v2/evaluated_results.json',
  81. help='评估结果数据文件路径,用于计算综合得分P(默认: output_v2/evaluated_results.json)'
  82. )
  83. parser.add_argument(
  84. '--no-update-evaluation',
  85. action='store_true',
  86. help='不计算和更新综合得分P(默认会计算)'
  87. )
  88. # 配置文件
  89. parser.add_argument(
  90. '--config',
  91. help='从配置文件读取参数(JSON 格式)'
  92. )
  93. # 日志级别
  94. parser.add_argument(
  95. '--log-level',
  96. default='INFO',
  97. choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
  98. help='日志级别(默认: INFO)'
  99. )
  100. args = parser.parse_args()
  101. # 配置日志
  102. logging.basicConfig(
  103. level=getattr(logging, args.log_level),
  104. format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  105. )
  106. logger = logging.getLogger(__name__)
  107. # 如果提供了配置文件,从文件读取参数
  108. if args.config:
  109. logger.info(f"从配置文件读取参数: {args.config}")
  110. try:
  111. with open(args.config, 'r', encoding='utf-8') as f:
  112. config = json.load(f)
  113. # 配置文件中的参数会覆盖命令行默认值,但不会覆盖用户显式指定的命令行参数
  114. args.input = config.get('input', args.input)
  115. args.output = config.get('output', args.output)
  116. args.feature = config.get('feature', args.feature)
  117. args.weight_embedding = config.get('weight_embedding', args.weight_embedding)
  118. args.weight_semantic = config.get('weight_semantic', args.weight_semantic)
  119. args.min_similarity = config.get('min_similarity', args.min_similarity)
  120. args.max_workers = config.get('max_workers', args.max_workers)
  121. args.evaluation_path = config.get('evaluation_path', args.evaluation_path)
  122. if 'no_update_evaluation' in config:
  123. args.no_update_evaluation = config.get('no_update_evaluation', args.no_update_evaluation)
  124. except Exception as e:
  125. logger.error(f"读取配置文件失败: {e}")
  126. return 1
  127. # 验证输入文件
  128. if not os.path.exists(args.input):
  129. logger.error(f"输入文件不存在: {args.input}")
  130. return 1
  131. # 读取解构分析结果
  132. logger.info(f"读取解构分析结果: {args.input}")
  133. try:
  134. with open(args.input, 'r', encoding='utf-8') as f:
  135. deconstruction_results = json.load(f)
  136. except Exception as e:
  137. logger.error(f"读取解构分析结果失败: {e}")
  138. return 1
  139. # 打印配置信息
  140. logger.info("\n" + "=" * 60)
  141. logger.info("相似度分析配置:")
  142. logger.info("=" * 60)
  143. logger.info(f"输入文件: {args.input}")
  144. logger.info(f"输出文件: {args.output}")
  145. if args.feature:
  146. logger.info(f"目标特征: {', '.join(args.feature)}")
  147. else:
  148. logger.info(f"目标特征: 全部")
  149. logger.info(f"向量模型权重: {args.weight_embedding}")
  150. logger.info(f"LLM 模型权重: {args.weight_semantic}")
  151. logger.info(f"最小相似度阈值: {args.min_similarity}")
  152. logger.info(f"最大并发数: {args.max_workers}")
  153. logger.info(f"评估结果文件路径: {args.evaluation_path}")
  154. logger.info(f"计算综合得分P: {'否' if args.no_update_evaluation else '是'}")
  155. logger.info("=" * 60 + "\n")
  156. # 创建分析器
  157. try:
  158. analyzer = SimilarityAnalyzer(
  159. weight_embedding=args.weight_embedding,
  160. weight_semantic=args.weight_semantic,
  161. max_workers=args.max_workers,
  162. min_similarity=args.min_similarity,
  163. target_features=args.feature,
  164. evaluation_results_path=args.evaluation_path,
  165. update_evaluation_scores=not args.no_update_evaluation
  166. )
  167. except Exception as e:
  168. logger.error(f"创建分析器失败: {e}")
  169. return 1
  170. # 运行分析
  171. try:
  172. similarity_results = analyzer.run(deconstruction_results, output_path=args.output)
  173. # 打印摘要
  174. logger.info("\n" + "=" * 60)
  175. logger.info("相似度分析完成")
  176. logger.info("=" * 60)
  177. metadata = similarity_results['metadata']
  178. overall_stats = metadata['overall_statistics']
  179. logger.info(f"处理帖子数: {overall_stats['total_notes']}")
  180. logger.info(f"提取特征总数: {overall_stats['total_features_extracted']}")
  181. logger.info(f"平均特征数/帖子: {overall_stats['avg_features_per_note']}")
  182. logger.info(f"平均最高相似度: {overall_stats['avg_max_similarity']}")
  183. logger.info(f"包含高相似度特征的帖子: {overall_stats['notes_with_high_similarity']}")
  184. logger.info(f"总耗时: {metadata['processing_time_seconds']}秒")
  185. logger.info(f"结果已保存: {args.output}")
  186. logger.info("=" * 60 + "\n")
  187. # 打印 Top 5 高相似度特征示例
  188. if similarity_results['results']:
  189. logger.info("Top 5 高相似度特征示例:")
  190. all_features = []
  191. for result in similarity_results['results']:
  192. for feat in result['deconstructed_features'][:5]: # 每个帖子取前5个
  193. all_features.append({
  194. 'note_id': result['note_id'],
  195. 'feature_name': feat['feature_name'],
  196. 'dimension': feat['dimension'],
  197. 'similarity': feat['similarity_score']
  198. })
  199. # 按相似度排序,取 Top 5
  200. all_features.sort(key=lambda x: x['similarity'], reverse=True)
  201. for i, feat in enumerate(all_features[:5], 1):
  202. logger.info(f" {i}. [{feat['note_id'][:12]}...] "
  203. f"{feat['feature_name']} ({feat['dimension']}) "
  204. f"- 相似度: {feat['similarity']:.3f}")
  205. return 0
  206. except Exception as e:
  207. logger.error(f"相似度分析失败: {e}", exc_info=True)
  208. return 1
  209. if __name__ == '__main__':
  210. exit(main())