#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Standalone runner for Stage 8 (deconstructed-feature similarity analysis).

Reads the Stage 7 result file, runs ``Stage8SimilarityAnalyzer`` over it,
writes the Stage 8 scores, and logs a human-readable summary.
Exit status: 0 on success, 1 on any configuration/IO/analysis failure.
"""

import argparse
import json
import logging
import os
import sys

from stage8_similarity_analyzer import Stage8SimilarityAnalyzer

# JSON config-file keys; each maps 1:1 onto an argparse attribute of the
# same name, so the merge in _apply_config_file() can be table-driven.
_CONFIG_KEYS = (
    'input',
    'output',
    'feature',
    'weight_embedding',
    'weight_semantic',
    'min_similarity',
    'max_workers',
    'stage6_path',
    'no_update_stage6',
)


def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser. All user-facing help text is intentionally Chinese."""
    parser = argparse.ArgumentParser(
        description='Stage 8 解构特征相似度分析(独立运行)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  # 基础用法 - 处理"墨镜"特征
  python3 run_stage8.py --feature "墨镜"

  # 处理多个特征
  python3 run_stage8.py --feature "墨镜" "耳环"

  # 自定义权重配置
  python3 run_stage8.py --feature "墨镜" --weight-embedding 0.7 --weight-semantic 0.3

  # 过滤低相似度特征
  python3 run_stage8.py --feature "墨镜" --min-similarity 0.3

  # 使用配置文件
  python3 run_stage8.py --config stage8_config.json

  # 自定义输入输出路径
  python3 run_stage8.py --input output_v2/stage7_custom.json --output output_v2/stage8_custom.json
""",
    )

    # Input / output paths
    parser.add_argument(
        '--input',
        default='output_v2/stage7_with_deconstruction.json',
        help='Stage 7 结果文件路径(默认: output_v2/stage7_with_deconstruction.json)',
    )
    parser.add_argument(
        '--output',
        default='output_v2/stage8_similarity_scores.json',
        help='输出文件路径(默认: output_v2/stage8_similarity_scores.json)',
    )

    # Feature filtering
    parser.add_argument(
        '--feature',
        nargs='+',
        default=None,
        help='指定要处理的原始特征名称(可指定多个),如: --feature "墨镜" "耳环"',
    )

    # Similarity configuration
    parser.add_argument(
        '--weight-embedding',
        type=float,
        default=0.5,
        help='向量模型权重(默认: 0.5)',
    )
    parser.add_argument(
        '--weight-semantic',
        type=float,
        default=0.5,
        help='LLM 模型权重(默认: 0.5)',
    )
    parser.add_argument(
        '--min-similarity',
        type=float,
        default=0.0,
        help='最小相似度阈值,低于此值的特征会被过滤(默认: 0.0,保留所有)',
    )

    # Concurrency
    parser.add_argument(
        '--max-workers',
        type=int,
        default=5,
        help='最大并发数(默认: 5)',
    )

    # Composite score P configuration
    parser.add_argument(
        '--stage6-path',
        default='output_v2/stage6_with_evaluations.json',
        help='Stage 6 数据文件路径,用于计算综合得分P(默认: output_v2/stage6_with_evaluations.json)',
    )
    parser.add_argument(
        '--no-update-stage6',
        action='store_true',
        help='不计算和更新综合得分P(默认会计算)',
    )

    # Optional JSON config file
    parser.add_argument(
        '--config',
        help='从配置文件读取参数(JSON 格式)',
    )

    # Log level
    parser.add_argument(
        '--log-level',
        default='INFO',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        help='日志级别(默认: INFO)',
    )
    return parser


def _apply_config_file(args: argparse.Namespace, logger: logging.Logger) -> bool:
    """Merge values from the JSON file named by ``args.config`` into ``args``.

    NOTE(review): any key present in the config file overrides the CLI value,
    even one the user passed explicitly — argparse does not expose which args
    were explicitly given, so CLI-wins precedence would need sentinel defaults.

    Returns True on success, False if the file could not be read/parsed.
    """
    logger.info(f"从配置文件读取参数: {args.config}")
    try:
        with open(args.config, 'r', encoding='utf-8') as f:
            config = json.load(f)
        for key in _CONFIG_KEYS:
            if key in config:
                setattr(args, key, config[key])
    except Exception as e:
        logger.error(f"读取配置文件失败: {e}")
        return False
    return True


def _log_run_config(args: argparse.Namespace, logger: logging.Logger) -> None:
    """Echo the effective run configuration before the analysis starts."""
    logger.info("\n" + "=" * 60)
    logger.info("Stage 8 配置:")
    logger.info("=" * 60)
    logger.info(f"输入文件: {args.input}")
    logger.info(f"输出文件: {args.output}")
    if args.feature:
        logger.info(f"目标特征: {', '.join(args.feature)}")
    else:
        logger.info("目标特征: 全部")
    logger.info(f"向量模型权重: {args.weight_embedding}")
    logger.info(f"LLM 模型权重: {args.weight_semantic}")
    logger.info(f"最小相似度阈值: {args.min_similarity}")
    logger.info(f"最大并发数: {args.max_workers}")
    logger.info(f"Stage 6 文件路径: {args.stage6_path}")
    logger.info(f"计算综合得分P: {'否' if args.no_update_stage6 else '是'}")
    logger.info("=" * 60 + "\n")


def _log_summary(stage8_results: dict, output_path: str, logger: logging.Logger) -> None:
    """Log the run statistics and a Top-5 high-similarity feature sample."""
    logger.info("\n" + "=" * 60)
    logger.info("Stage 8 执行完成")
    logger.info("=" * 60)
    metadata = stage8_results['metadata']
    overall_stats = metadata['overall_statistics']
    logger.info(f"处理帖子数: {overall_stats['total_notes']}")
    logger.info(f"提取特征总数: {overall_stats['total_features_extracted']}")
    logger.info(f"平均特征数/帖子: {overall_stats['avg_features_per_note']}")
    logger.info(f"平均最高相似度: {overall_stats['avg_max_similarity']}")
    logger.info(f"包含高相似度特征的帖子: {overall_stats['notes_with_high_similarity']}")
    logger.info(f"总耗时: {metadata['processing_time_seconds']}秒")
    logger.info(f"结果已保存: {output_path}")
    logger.info("=" * 60 + "\n")

    if not stage8_results['results']:
        return

    logger.info("Top 5 高相似度特征示例:")
    all_features = [
        {
            'note_id': result['note_id'],
            'feature_name': feat['feature_name'],
            'dimension': feat['dimension'],
            'similarity': feat['similarity_score'],
        }
        for result in stage8_results['results']
        # Sample at most the first 5 features of each note before ranking.
        for feat in result['deconstructed_features'][:5]
    ]
    all_features.sort(key=lambda x: x['similarity'], reverse=True)
    for i, feat in enumerate(all_features[:5], 1):
        logger.info(f"  {i}. [{feat['note_id'][:12]}...] "
                    f"{feat['feature_name']} ({feat['dimension']}) "
                    f"- 相似度: {feat['similarity']:.3f}")


def main() -> int:
    """Entry point. Returns a process exit code (0 success, 1 failure)."""
    args = _build_parser().parse_args()

    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )
    logger = logging.getLogger(__name__)

    # Optional JSON config file: keys present in it override CLI values.
    if args.config and not _apply_config_file(args, logger):
        return 1

    if not os.path.exists(args.input):
        logger.error(f"输入文件不存在: {args.input}")
        return 1

    logger.info(f"读取 Stage 7 结果: {args.input}")
    try:
        with open(args.input, 'r', encoding='utf-8') as f:
            stage7_results = json.load(f)
    except Exception as e:
        logger.error(f"读取 Stage 7 结果失败: {e}")
        return 1

    _log_run_config(args, logger)

    try:
        analyzer = Stage8SimilarityAnalyzer(
            weight_embedding=args.weight_embedding,
            weight_semantic=args.weight_semantic,
            max_workers=args.max_workers,
            min_similarity=args.min_similarity,
            target_features=args.feature,
            stage6_path=args.stage6_path,
            update_stage6=not args.no_update_stage6,
        )
    except Exception as e:
        logger.error(f"创建分析器失败: {e}")
        return 1

    try:
        stage8_results = analyzer.run(stage7_results, output_path=args.output)
        _log_summary(stage8_results, args.output, logger)
        return 0
    except Exception as e:
        logger.error(f"Stage 8 执行失败: {e}", exc_info=True)
        return 1


if __name__ == '__main__':
    sys.exit(main())