run_stage8.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """Stage 8 独立运行脚本"""
  4. import os
  5. import json
  6. import logging
  7. import argparse
  8. from stage8_similarity_analyzer import Stage8SimilarityAnalyzer
  9. def main():
  10. parser = argparse.ArgumentParser(
  11. description='Stage 8 解构特征相似度分析(独立运行)',
  12. formatter_class=argparse.RawDescriptionHelpFormatter,
  13. epilog="""
  14. 使用示例:
  15. # 基础用法 - 处理"墨镜"特征
  16. python3 run_stage8.py --feature "墨镜"
  17. # 处理多个特征
  18. python3 run_stage8.py --feature "墨镜" "耳环"
  19. # 自定义权重配置
  20. python3 run_stage8.py --feature "墨镜" --weight-embedding 0.7 --weight-semantic 0.3
  21. # 过滤低相似度特征
  22. python3 run_stage8.py --feature "墨镜" --min-similarity 0.3
  23. # 使用配置文件
  24. python3 run_stage8.py --config stage8_config.json
  25. # 自定义输入输出路径
  26. python3 run_stage8.py --input output_v2/stage7_custom.json --output output_v2/stage8_custom.json
  27. """
  28. )
  29. # 输入输出
  30. parser.add_argument(
  31. '--input',
  32. default='output_v2/stage7_with_deconstruction.json',
  33. help='Stage 7 结果文件路径(默认: output_v2/stage7_with_deconstruction.json)'
  34. )
  35. parser.add_argument(
  36. '--output',
  37. default='output_v2/stage8_similarity_scores.json',
  38. help='输出文件路径(默认: output_v2/stage8_similarity_scores.json)'
  39. )
  40. # 特征过滤
  41. parser.add_argument(
  42. '--feature',
  43. nargs='+',
  44. default=None,
  45. help='指定要处理的原始特征名称(可指定多个),如: --feature "墨镜" "耳环"'
  46. )
  47. # 相似度配置
  48. parser.add_argument(
  49. '--weight-embedding',
  50. type=float,
  51. default=0.5,
  52. help='向量模型权重(默认: 0.5)'
  53. )
  54. parser.add_argument(
  55. '--weight-semantic',
  56. type=float,
  57. default=0.5,
  58. help='LLM 模型权重(默认: 0.5)'
  59. )
  60. parser.add_argument(
  61. '--min-similarity',
  62. type=float,
  63. default=0.0,
  64. help='最小相似度阈值,低于此值的特征会被过滤(默认: 0.0,保留所有)'
  65. )
  66. # 并发配置
  67. parser.add_argument(
  68. '--max-workers',
  69. type=int,
  70. default=5,
  71. help='最大并发数(默认: 5)'
  72. )
  73. # 配置文件
  74. parser.add_argument(
  75. '--config',
  76. help='从配置文件读取参数(JSON 格式)'
  77. )
  78. # 日志级别
  79. parser.add_argument(
  80. '--log-level',
  81. default='INFO',
  82. choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
  83. help='日志级别(默认: INFO)'
  84. )
  85. args = parser.parse_args()
  86. # 配置日志
  87. logging.basicConfig(
  88. level=getattr(logging, args.log_level),
  89. format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  90. )
  91. logger = logging.getLogger(__name__)
  92. # 如果提供了配置文件,从文件读取参数
  93. if args.config:
  94. logger.info(f"从配置文件读取参数: {args.config}")
  95. try:
  96. with open(args.config, 'r', encoding='utf-8') as f:
  97. config = json.load(f)
  98. # 配置文件中的参数会覆盖命令行默认值,但不会覆盖用户显式指定的命令行参数
  99. args.input = config.get('input', args.input)
  100. args.output = config.get('output', args.output)
  101. args.feature = config.get('feature', args.feature)
  102. args.weight_embedding = config.get('weight_embedding', args.weight_embedding)
  103. args.weight_semantic = config.get('weight_semantic', args.weight_semantic)
  104. args.min_similarity = config.get('min_similarity', args.min_similarity)
  105. args.max_workers = config.get('max_workers', args.max_workers)
  106. except Exception as e:
  107. logger.error(f"读取配置文件失败: {e}")
  108. return 1
  109. # 验证输入文件
  110. if not os.path.exists(args.input):
  111. logger.error(f"输入文件不存在: {args.input}")
  112. return 1
  113. # 读取 Stage 7 结果
  114. logger.info(f"读取 Stage 7 结果: {args.input}")
  115. try:
  116. with open(args.input, 'r', encoding='utf-8') as f:
  117. stage7_results = json.load(f)
  118. except Exception as e:
  119. logger.error(f"读取 Stage 7 结果失败: {e}")
  120. return 1
  121. # 打印配置信息
  122. logger.info("\n" + "=" * 60)
  123. logger.info("Stage 8 配置:")
  124. logger.info("=" * 60)
  125. logger.info(f"输入文件: {args.input}")
  126. logger.info(f"输出文件: {args.output}")
  127. if args.feature:
  128. logger.info(f"目标特征: {', '.join(args.feature)}")
  129. else:
  130. logger.info(f"目标特征: 全部")
  131. logger.info(f"向量模型权重: {args.weight_embedding}")
  132. logger.info(f"LLM 模型权重: {args.weight_semantic}")
  133. logger.info(f"最小相似度阈值: {args.min_similarity}")
  134. logger.info(f"最大并发数: {args.max_workers}")
  135. logger.info("=" * 60 + "\n")
  136. # 创建分析器
  137. try:
  138. analyzer = Stage8SimilarityAnalyzer(
  139. weight_embedding=args.weight_embedding,
  140. weight_semantic=args.weight_semantic,
  141. max_workers=args.max_workers,
  142. min_similarity=args.min_similarity,
  143. target_features=args.feature
  144. )
  145. except Exception as e:
  146. logger.error(f"创建分析器失败: {e}")
  147. return 1
  148. # 运行分析
  149. try:
  150. stage8_results = analyzer.run(stage7_results, output_path=args.output)
  151. # 打印摘要
  152. logger.info("\n" + "=" * 60)
  153. logger.info("Stage 8 执行完成")
  154. logger.info("=" * 60)
  155. metadata = stage8_results['metadata']
  156. overall_stats = metadata['overall_statistics']
  157. logger.info(f"处理帖子数: {overall_stats['total_notes']}")
  158. logger.info(f"提取特征总数: {overall_stats['total_features_extracted']}")
  159. logger.info(f"平均特征数/帖子: {overall_stats['avg_features_per_note']}")
  160. logger.info(f"平均最高相似度: {overall_stats['avg_max_similarity']}")
  161. logger.info(f"包含高相似度特征的帖子: {overall_stats['notes_with_high_similarity']}")
  162. logger.info(f"总耗时: {metadata['processing_time_seconds']}秒")
  163. logger.info(f"结果已保存: {args.output}")
  164. logger.info("=" * 60 + "\n")
  165. # 打印 Top 5 高相似度特征示例
  166. if stage8_results['results']:
  167. logger.info("Top 5 高相似度特征示例:")
  168. all_features = []
  169. for result in stage8_results['results']:
  170. for feat in result['deconstructed_features'][:5]: # 每个帖子取前5个
  171. all_features.append({
  172. 'note_id': result['note_id'],
  173. 'feature_name': feat['feature_name'],
  174. 'dimension': feat['dimension'],
  175. 'similarity': feat['similarity_score']
  176. })
  177. # 按相似度排序,取 Top 5
  178. all_features.sort(key=lambda x: x['similarity'], reverse=True)
  179. for i, feat in enumerate(all_features[:5], 1):
  180. logger.info(f" {i}. [{feat['note_id'][:12]}...] "
  181. f"{feat['feature_name']} ({feat['dimension']}) "
  182. f"- 相似度: {feat['similarity']:.3f}")
  183. return 0
  184. except Exception as e:
  185. logger.error(f"Stage 8 执行失败: {e}", exc_info=True)
  186. return 1
  187. if __name__ == '__main__':
  188. exit(main())