| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Stage 7 独立运行脚本
- 从 Stage 6 结果开始,进行深度解构分析
- 支持指定 feature 和数量限制
- """
- import os
- import json
- import logging
- import argparse
- import webbrowser
- from pathlib import Path
- from stage7_analyzer import Stage7DeconstructionAnalyzer
- from stage8_similarity_analyzer import Stage8SimilarityAnalyzer
- import visualize_stage78_with_deconstruction
# Logging setup: every record goes both to a UTF-8 log file in the current
# working directory and to the console (StreamHandler's default stream).
_LOG_FILE = 'stage7_standalone.log'
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(
    handlers=[
        logging.FileHandler(_LOG_FILE, encoding='utf-8'),
        logging.StreamHandler(),
    ],
    format=_LOG_FORMAT,
    datefmt=_LOG_DATE_FORMAT,
    level=logging.INFO,
)

# Module-level logger shared by everything in this script.
logger = logging.getLogger(__name__)
# Usage examples printed verbatim after the option list in ``--help``
# (the parser uses RawDescriptionHelpFormatter, so whitespace is kept as-is).
_EPILOG = '''
基础用法示例:
# 只处理"墨镜"特征的前10个高分帖子
python3 run_stage7.py --feature "墨镜" --max-notes 10
# 处理"墨镜"和"耳环"两个特征,每个最多5个
python3 run_stage7.py --feature "墨镜" "耳环" --max-notes 5
# 按数据原始顺序处理前50个(不排序)
python3 run_stage7.py --sort-by none --max-notes 50
# 处理所有特征,按时间排序,前20个
python3 run_stage7.py --sort-by time --max-notes 20
# 只处理"墨镜",按互动量排序,跳过前3个
python3 run_stage7.py --feature "墨镜" --sort-by engagement --skip 3
# 降低分数阈值,处理更多帖子
python3 run_stage7.py --feature "墨镜" --min-score 6.0 --max-notes 30
流水线执行示例(推荐):
# 完整流水线: Stage 7 → Stage 8 → 可视化 → 自动打开浏览器
python3 run_stage7.py --feature "墨镜" --max-notes 10 --run-stage8 --visualize
# Stage 7 → Stage 8(不生成可视化)
python3 run_stage7.py --feature "墨镜" --max-notes 10 --run-stage8
# Stage 7 → 可视化(跳过 Stage 8)
python3 run_stage7.py --feature "墨镜" --max-notes 10 --visualize
# 完整流水线,不自动打开浏览器
python3 run_stage7.py --feature "墨镜" --run-stage8 --visualize --no-open
# 自定义 Stage 8 相似度权重
python3 run_stage7.py --feature "墨镜" --run-stage8 --visualize \\
--stage8-weight-embedding 0.7 --stage8-weight-semantic 0.3
# 过滤低相似度特征
python3 run_stage7.py --feature "墨镜" --run-stage8 --visualize \\
--stage8-min-similarity 0.3
配置文件示例:
# 使用配置文件(支持所有参数)
python3 run_stage7.py --config pipeline_config.json
# 配置文件示例内容(pipeline_config.json):
{
"feature": ["墨镜"],
"max_notes": 10,
"timeout": 600,
"run_stage8": true,
"visualize": true,
"stage8_weight_embedding": 0.5,
"stage8_weight_semantic": 0.5
}
'''


def _build_parser():
    """Build the command-line parser for the Stage 7 pipeline script."""
    parser = argparse.ArgumentParser(
        description='Stage 7 深度解构分析(独立运行,支持流水线执行)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_EPILOG,
    )

    # Input / output files.
    parser.add_argument(
        '--input',
        default='output_v2/stage6_with_evaluations.json',
        help='Stage 6 结果文件路径(默认: output_v2/stage6_with_evaluations.json)'
    )
    parser.add_argument(
        '--output',
        default='output_v2/stage7_with_deconstruction.json',
        help='Stage 7 输出文件路径(默认: output_v2/stage7_with_deconstruction.json)'
    )

    # Feature filter.
    parser.add_argument(
        '--feature',
        nargs='+',
        default=None,
        help='指定要处理的原始特征名称(可指定多个),如: --feature "墨镜" "耳环"。不指定则处理所有特征'
    )

    # Note selection.
    parser.add_argument(
        '--min-score',
        type=float,
        default=0.8,
        help='最低分数阈值,只处理 >= 此分数的帖子(默认: 0.8)'
    )
    parser.add_argument(
        '--skip',
        type=int,
        default=0,
        help='跳过前 N 个帖子(默认: 0)'
    )
    parser.add_argument(
        '--max-notes',
        type=int,
        default=None,
        help='最多处理多少个帖子(默认: None 不限制)'
    )
    parser.add_argument(
        '--sort-by',
        choices=['none', 'score', 'time', 'engagement'],
        default='score',
        help='排序方式: none(不排序,保持数据原始顺序), score(评分), time(时间), engagement(互动量)(默认: score)'
    )

    # Deconstruction API.
    parser.add_argument(
        '--api-url',
        default='http://192.168.245.150:7000/what/analysis/single',
        help='解构 API 地址(默认: http://192.168.245.150:7000/what/analysis/single)'
    )
    parser.add_argument(
        '--timeout',
        type=int,
        default=800,
        # Fixed: the help text used to advertise 600 s while the real
        # default is 800 s.
        help='API 超时时间(秒)(默认: 800)'
    )
    parser.add_argument(
        '--max-retries',
        type=int,
        default=3,
        help='API 最大重试次数(默认: 3)'
    )

    # Concurrency.
    parser.add_argument(
        '--max-workers',
        type=int,
        default=5,
        help='并发处理数(默认: 5)'
    )

    # Optional JSON config file.
    parser.add_argument(
        '--config',
        default=None,
        help='从 JSON 配置文件加载参数'
    )

    # Pipeline control.
    parser.add_argument(
        '--run-stage8',
        action='store_true',
        help='Stage 7 完成后自动运行 Stage 8'
    )
    parser.add_argument(
        '--visualize',
        action='store_true',
        help='生成可视化结果'
    )
    # NOTE: with store_true and default=True this flag is always True on the
    # command line; it can only be switched off via --no-open or a config
    # file entry. Kept unchanged for backward compatibility.
    parser.add_argument(
        '--open-browser',
        action='store_true',
        default=True,
        help='自动在浏览器中打开可视化结果(默认: True)'
    )
    parser.add_argument(
        '--no-open',
        action='store_true',
        help='禁用自动打开浏览器'
    )

    # Stage 8 output and similarity settings.
    parser.add_argument(
        '--stage8-output',
        default='output_v2/stage8_similarity_scores.json',
        help='Stage 8 输出文件路径(默认: output_v2/stage8_similarity_scores.json)'
    )
    parser.add_argument(
        '--stage8-weight-embedding',
        type=float,
        default=0.5,
        help='Stage 8 向量模型权重(默认: 0.5)'
    )
    parser.add_argument(
        '--stage8-weight-semantic',
        type=float,
        default=0.5,
        help='Stage 8 LLM 模型权重(默认: 0.5)'
    )
    parser.add_argument(
        '--stage8-min-similarity',
        type=float,
        default=0.0,
        help='Stage 8 最小相似度阈值(默认: 0.0)'
    )
    parser.add_argument(
        '--stage8-max-workers',
        type=int,
        default=5,
        help='Stage 8 最大并发数(默认: 5)'
    )

    # Visualization output.
    parser.add_argument(
        '--viz-output',
        default=None,
        help='可视化输出目录(默认: visualization/)'
    )
    return parser


def _apply_config_overrides(args, config_path):
    """Overlay values from a JSON config file onto the parsed arguments.

    Values in the file take precedence over the command line. Keys may be
    written with dashes or underscores. Keys that do not correspond to any
    known option are reported with a warning and skipped, so that typos do
    not go unnoticed.

    Raises:
        ValueError: if the file's top-level JSON value is not an object.
    """
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    if not isinstance(config, dict):
        raise ValueError(f"配置文件必须是 JSON 对象: {config_path}")

    for key, value in config.items():
        attr = key.replace('-', '_')
        if not hasattr(args, attr):
            logger.warning(f"配置文件中的未知参数已忽略: {key}")
            continue
        setattr(args, attr, value)


def _log_run_config(args):
    """Log the effective Stage 7 run configuration."""
    logger.info("=" * 60)
    logger.info("运行配置:")
    logger.info(f" 输入文件: {args.input}")
    logger.info(f" 输出文件: {args.output}")
    if args.feature:
        logger.info(f" 指定特征: {', '.join(args.feature)}")
    else:
        logger.info(" 指定特征: 全部")
    logger.info(f" API 地址: {args.api_url}")
    logger.info(f" 最低分数阈值: {args.min_score}")
    logger.info(f" 跳过前 N 个: {args.skip}")
    logger.info(f" 最多处理数: {args.max_notes if args.max_notes else '不限制'}")
    logger.info(f" 排序方式: {args.sort_by}")
    logger.info(f" 并发数: {args.max_workers}")
    logger.info(f" API 超时: {args.timeout}秒")
    logger.info(f" 最大重试: {args.max_retries}次")
    logger.info("=" * 60)


def _log_stage7_summary(stage7_results, output_path):
    """Log the counters found in the Stage 7 result metadata."""
    meta = stage7_results['metadata']
    logger.info("\n" + "=" * 60)
    logger.info("Stage 7 执行完成!")
    logger.info(f" 总匹配帖子数: {meta['total_matched_notes']}")
    logger.info(f" 实际处理数: {meta['processed_notes']}")
    logger.info(f" 成功: {meta['success_count']}")
    logger.info(f" 失败: {meta['failed_count']}")
    logger.info(f" 总耗时: {meta['processing_time_seconds']}秒")
    logger.info(f" 结果已保存: {output_path}")
    logger.info("=" * 60)


def _log_stage8_summary(stage8_results, output_path):
    """Log Stage 8 overall statistics and up to five top-similarity features."""
    metadata = stage8_results['metadata']
    overall_stats = metadata['overall_statistics']
    logger.info("\n" + "=" * 60)
    logger.info("Stage 8 执行完成!")
    logger.info(f" 处理帖子数: {overall_stats['total_notes']}")
    logger.info(f" 提取特征总数: {overall_stats['total_features_extracted']}")
    logger.info(f" 平均特征数/帖子: {overall_stats['avg_features_per_note']:.2f}")
    logger.info(f" 平均最高相似度: {overall_stats['avg_max_similarity']:.3f}")
    logger.info(f" 包含高相似度特征的帖子: {overall_stats['notes_with_high_similarity']}")
    logger.info(f" 总耗时: {metadata['processing_time_seconds']:.2f}秒")
    logger.info(f" 结果已保存: {output_path}")
    logger.info("=" * 60)

    if not stage8_results['results']:
        return

    logger.info("\nTop 5 高相似度特征示例:")
    # Only the first five features of each note are considered.
    # NOTE(review): this presumes each note's list is already ordered by
    # similarity, best first - confirm against Stage8SimilarityAnalyzer.
    candidates = [
        {
            'note_id': result['note_id'],
            'feature_name': feat['feature_name'],
            'dimension': feat['dimension'],
            'similarity': feat['similarity_score'],
        }
        for result in stage8_results['results']
        for feat in result['deconstructed_features'][:5]
    ]
    candidates.sort(key=lambda item: item['similarity'], reverse=True)
    for i, feat in enumerate(candidates[:5], 1):
        logger.info(f" {i}. [{feat['note_id'][:12]}...] "
                    f"{feat['feature_name']} ({feat['dimension']}) "
                    f"- 相似度: {feat['similarity']:.3f}")


def _run_stage8(args, stage7_results):
    """Run the Stage 8 similarity analysis on the Stage 7 results.

    Failures are logged and swallowed on purpose so that the remaining
    pipeline steps can still run.

    Returns:
        The Stage 8 result object, or None if the analyzer itself failed.
        If only the summary logging fails, the results are still returned.
    """
    logger.info("\n" + "=" * 60)
    logger.info("开始执行 Stage 8 相似度分析...")
    logger.info("=" * 60)

    stage8_results = None
    try:
        stage8_analyzer = Stage8SimilarityAnalyzer(
            weight_embedding=args.stage8_weight_embedding,
            weight_semantic=args.stage8_weight_semantic,
            max_workers=args.stage8_max_workers,
            min_similarity=args.stage8_min_similarity,
            target_features=args.feature
        )
        stage8_results = stage8_analyzer.run(
            stage7_results=stage7_results,
            output_path=args.stage8_output
        )
        _log_stage8_summary(stage8_results, args.stage8_output)
    except Exception as e:
        logger.error(f"Stage 8 执行失败: {e}", exc_info=True)
        logger.warning("继续执行后续步骤...")
    return stage8_results


def _open_in_browser(viz_path):
    """Open the generated visualization file in the default web browser."""
    logger.info("\n正在打开浏览器...")
    try:
        # Path.as_uri() percent-encodes non-ASCII and special characters,
        # producing a file:// URL that is safe to hand to the browser.
        file_url = Path(viz_path).resolve().as_uri()
        webbrowser.open(file_url)
        logger.info("浏览器已打开")
    except Exception as e:
        logger.warning(f"无法自动打开浏览器: {e}")
        logger.info(f"请手动打开: {os.path.abspath(viz_path)}")


def _run_visualization(args, stage8_results):
    """Generate the visualization and optionally open it in a browser.

    The visualization module is driven through its own ``main()``, which
    is given its options by temporarily replacing ``sys.argv``. Failures
    are logged and the step is skipped.
    """
    import sys

    logger.info("\n" + "=" * 60)
    logger.info("开始生成可视化结果...")
    logger.info("=" * 60)
    try:
        viz_args = [
            '--stage6', args.input,
            '--stage7', args.output
        ]
        # Only pass Stage 8 data when that stage actually produced results.
        if stage8_results and args.stage8_output:
            viz_args.extend(['--stage8', args.stage8_output])
        if args.viz_output:
            viz_args.extend(['--output-dir', args.viz_output])

        original_argv = sys.argv
        try:
            sys.argv = ['visualize_stage78_with_deconstruction.py'] + viz_args
            viz_path = visualize_stage78_with_deconstruction.main()
        finally:
            # Always restore the real argv, even if the visualizer raises.
            sys.argv = original_argv

        if viz_path:
            logger.info("\n" + "=" * 60)
            logger.info("可视化生成完成!")
            logger.info(f" 可视化文件: {viz_path}")
            logger.info("=" * 60)
            if args.open_browser and not args.no_open:
                _open_in_browser(viz_path)
        else:
            logger.warning("可视化生成返回了空路径")
    except Exception as e:
        logger.error(f"可视化生成失败: {e}", exc_info=True)
        logger.warning("跳过可视化步骤")


def main():
    """Run Stage 7 and, on request, Stage 8 and the visualization.

    Steps:
      1. Parse command-line options; values from ``--config`` (a JSON file)
         override what was given on the command line.
      2. Load the Stage 6 results from ``--input``. If the file does not
         exist, an error is logged and the function returns early.
      3. Run ``Stage7DeconstructionAnalyzer`` and log a summary.
      4. With ``--run-stage8``, run ``Stage8SimilarityAnalyzer``; a failure
         there is logged and does not abort the pipeline.
      5. With ``--visualize``, generate the visualization and open it in a
         browser unless ``--no-open`` is set.

    Returns:
        None.

    Raises:
        Exception: any error from the Stage 7 run (or from code not covered
            by the Stage 8 / visualization handlers) is logged and re-raised.
    """
    args = _build_parser().parse_args()

    if args.config:
        logger.info(f"从配置文件加载参数: {args.config}")
        # Values from the config file override command-line arguments.
        _apply_config_overrides(args, args.config)

    if not os.path.exists(args.input):
        logger.error(f"输入文件不存在: {args.input}")
        return

    logger.info(f"加载 Stage 6 结果: {args.input}")
    with open(args.input, 'r', encoding='utf-8') as f:
        stage6_results = json.load(f)

    _log_run_config(args)

    analyzer = Stage7DeconstructionAnalyzer(
        api_url=args.api_url,
        max_workers=args.max_workers,
        max_notes=args.max_notes,
        min_score=args.min_score,
        skip_count=args.skip,
        sort_by=args.sort_by,
        timeout=args.timeout,
        max_retries=args.max_retries,
        # Fall back to the default directory when --output has no directory part.
        output_dir=os.path.dirname(args.output) or 'output_v2',
        target_features=args.feature
    )

    try:
        stage7_results = analyzer.run(
            stage6_results=stage6_results,
            output_path=args.output
        )
        _log_stage7_summary(stage7_results, args.output)

        stage8_results = None
        if args.run_stage8:
            stage8_results = _run_stage8(args, stage7_results)

        if args.visualize:
            _run_visualization(args, stage8_results)

        logger.info("\n" + "=" * 60)
        logger.info("流水线执行完成!")
        logger.info("=" * 60)
    except Exception as e:
        logger.error(f"执行失败: {e}", exc_info=True)
        raise


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|