| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """Stage 8 独立运行脚本"""
- import os
- import json
- import logging
- import argparse
- from stage8_similarity_analyzer import Stage8SimilarityAnalyzer
# Keys a --config JSON file may supply; each one matches an argparse dest.
_CONFIG_KEYS = (
    'input',
    'output',
    'feature',
    'weight_embedding',
    'weight_semantic',
    'min_similarity',
    'max_workers',
)


def _build_parser():
    """Build the command-line parser for the standalone Stage 8 runner."""
    parser = argparse.ArgumentParser(
        description='Stage 8 解构特征相似度分析(独立运行)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
# 基础用法 - 处理"墨镜"特征
python3 run_stage8.py --feature "墨镜"
# 处理多个特征
python3 run_stage8.py --feature "墨镜" "耳环"
# 自定义权重配置
python3 run_stage8.py --feature "墨镜" --weight-embedding 0.7 --weight-semantic 0.3
# 过滤低相似度特征
python3 run_stage8.py --feature "墨镜" --min-similarity 0.3
# 使用配置文件
python3 run_stage8.py --config stage8_config.json
# 自定义输入输出路径
python3 run_stage8.py --input output_v2/stage7_custom.json --output output_v2/stage8_custom.json
"""
    )

    # Input / output paths.
    parser.add_argument(
        '--input',
        default='output_v2/stage7_with_deconstruction.json',
        help='Stage 7 结果文件路径(默认: output_v2/stage7_with_deconstruction.json)'
    )
    parser.add_argument(
        '--output',
        default='output_v2/stage8_similarity_scores.json',
        help='输出文件路径(默认: output_v2/stage8_similarity_scores.json)'
    )

    # Feature filter.
    parser.add_argument(
        '--feature',
        nargs='+',
        default=None,
        help='指定要处理的原始特征名称(可指定多个),如: --feature "墨镜" "耳环"'
    )

    # Similarity settings.
    parser.add_argument(
        '--weight-embedding',
        type=float,
        default=0.5,
        help='向量模型权重(默认: 0.5)'
    )
    parser.add_argument(
        '--weight-semantic',
        type=float,
        default=0.5,
        help='LLM 模型权重(默认: 0.5)'
    )
    parser.add_argument(
        '--min-similarity',
        type=float,
        default=0.0,
        help='最小相似度阈值,低于此值的特征会被过滤(默认: 0.0,保留所有)'
    )

    # Concurrency.
    parser.add_argument(
        '--max-workers',
        type=int,
        default=5,
        help='最大并发数(默认: 5)'
    )

    # Optional JSON configuration file.
    parser.add_argument(
        '--config',
        help='从配置文件读取参数(JSON 格式)'
    )

    # Log verbosity.
    parser.add_argument(
        '--log-level',
        default='INFO',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        help='日志级别(默认: INFO)'
    )
    return parser


def _load_config_defaults(config_path, logger):
    """Read a JSON config file and return the recognised settings.

    Returns a dict holding only the keys listed in ``_CONFIG_KEYS`` that are
    present in the file, or ``None`` (after logging an error) when the file
    cannot be read, is not valid JSON, or is not a JSON object.
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        if not isinstance(config, dict):
            raise ValueError('配置文件顶层必须是 JSON 对象')
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        logger.error(f"读取配置文件失败: {e}")
        return None

    overrides = {key: config[key] for key in _CONFIG_KEYS if key in config}
    # --feature uses nargs='+', so the rest of the script expects a list;
    # a bare string would otherwise be joined character by character.
    if isinstance(overrides.get('feature'), str):
        overrides['feature'] = [overrides['feature']]
    return overrides


def _log_configuration(logger, args):
    """Log the effective settings that will be used for this run."""
    logger.info("\n" + "=" * 60)
    logger.info("Stage 8 配置:")
    logger.info("=" * 60)
    logger.info(f"输入文件: {args.input}")
    logger.info(f"输出文件: {args.output}")
    if args.feature:
        logger.info(f"目标特征: {', '.join(args.feature)}")
    else:
        logger.info("目标特征: 全部")
    logger.info(f"向量模型权重: {args.weight_embedding}")
    logger.info(f"LLM 模型权重: {args.weight_semantic}")
    logger.info(f"最小相似度阈值: {args.min_similarity}")
    logger.info(f"最大并发数: {args.max_workers}")
    logger.info("=" * 60 + "\n")


def _log_summary(logger, stage8_results, output_path):
    """Log the run statistics and up to five of the highest-scoring features.

    Reads ``metadata.overall_statistics``, ``metadata.processing_time_seconds``
    and ``results`` from ``stage8_results``; a missing key raises ``KeyError``
    to the caller.
    """
    logger.info("\n" + "=" * 60)
    logger.info("Stage 8 执行完成")
    logger.info("=" * 60)
    metadata = stage8_results['metadata']
    overall_stats = metadata['overall_statistics']
    logger.info(f"处理帖子数: {overall_stats['total_notes']}")
    logger.info(f"提取特征总数: {overall_stats['total_features_extracted']}")
    logger.info(f"平均特征数/帖子: {overall_stats['avg_features_per_note']}")
    logger.info(f"平均最高相似度: {overall_stats['avg_max_similarity']}")
    logger.info(f"包含高相似度特征的帖子: {overall_stats['notes_with_high_similarity']}")
    logger.info(f"总耗时: {metadata['processing_time_seconds']}秒")
    logger.info(f"结果已保存: {output_path}")
    logger.info("=" * 60 + "\n")

    if not stage8_results['results']:
        return

    logger.info("Top 5 高相似度特征示例:")
    # Only the first five features of each note are considered.
    # NOTE(review): presumably the analyzer returns each note's features
    # already ordered by similarity -- confirm.
    candidates = [
        {
            'note_id': result['note_id'],
            'feature_name': feat['feature_name'],
            'dimension': feat['dimension'],
            'similarity': feat['similarity_score'],
        }
        for result in stage8_results['results']
        for feat in result['deconstructed_features'][:5]
    ]
    candidates.sort(key=lambda x: x['similarity'], reverse=True)
    for i, feat in enumerate(candidates[:5], 1):
        logger.info(f" {i}. [{feat['note_id'][:12]}...] "
                    f"{feat['feature_name']} ({feat['dimension']}) "
                    f"- 相似度: {feat['similarity']:.3f}")


def main():
    """Run Stage 8 similarity analysis from the command line.

    Settings are resolved with this precedence, highest first: options given
    explicitly on the command line, values from the ``--config`` JSON file,
    then the built-in defaults.

    Returns:
        0 when the analysis completes, 1 when the config file or input file
        cannot be read, the analyzer cannot be created, or the run fails.
    """
    parser = _build_parser()
    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)

    if args.config:
        logger.info(f"从配置文件读取参数: {args.config}")
        overrides = _load_config_defaults(args.config, logger)
        if overrides is None:
            return 1
        # Install the config values as parser defaults and parse again, so
        # they replace the built-in defaults but never an explicit option.
        parser.set_defaults(**overrides)
        args = parser.parse_args()

    if not os.path.exists(args.input):
        logger.error(f"输入文件不存在: {args.input}")
        return 1

    logger.info(f"读取 Stage 7 结果: {args.input}")
    try:
        with open(args.input, 'r', encoding='utf-8') as f:
            stage7_results = json.load(f)
    except (OSError, ValueError) as e:
        logger.error(f"读取 Stage 7 结果失败: {e}")
        return 1

    _log_configuration(logger, args)

    # Broad catch is deliberate: this is the CLI boundary and the analyzer's
    # exception types are not visible from this script.
    try:
        analyzer = Stage8SimilarityAnalyzer(
            weight_embedding=args.weight_embedding,
            weight_semantic=args.weight_semantic,
            max_workers=args.max_workers,
            min_similarity=args.min_similarity,
            target_features=args.feature
        )
    except Exception as e:
        logger.error(f"创建分析器失败: {e}")
        return 1

    try:
        # The analyzer receives the output path; presumably it writes the
        # results file itself -- this script never writes it.
        stage8_results = analyzer.run(stage7_results, output_path=args.output)
        _log_summary(logger, stage8_results, args.output)
        return 0
    except Exception as e:
        logger.error(f"Stage 8 执行失败: {e}", exc_info=True)
        return 1
if __name__ == '__main__':
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # site module for interactive use and is absent under `python -S`.
    raise SystemExit(main())
|