- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Stage 8 相似度分析器
- 计算 Stage 7 解构特征与原始特征的相似度评分
- """
- import os
- import json
- import time
- import logging
- import asyncio
- from datetime import datetime
- from typing import Dict, List, Optional, Tuple
- from lib.hybrid_similarity import compare_phrases_cartesian
- try:
- from tqdm import tqdm
- TQDM_AVAILABLE = True
- except ImportError:
- TQDM_AVAILABLE = False
- logger = logging.getLogger(__name__)
- def extract_deconstructed_features(api_response: Dict) -> List[Dict]:
- """
- 从三点解构中提取所有特征
- Args:
- api_response: Stage 7 的 api_response 对象
- Returns:
- 特征列表,每个特征包含:
- - feature_name: 特征名称
- - dimension: 维度 (灵感点-全新内容/灵感点-共性差异/灵感点-共性内容/目的点/关键点)
- - dimension_detail: 维度细分 (实质/形式/意图等)
- - weight: 权重
- - source_index: 在该维度中的索引
- - source_*: 溯源信息 (候选编号、目的点描述、关键点描述等)
- """
- features = []
- # Check the API response status
- if api_response.get('status') != 'success':
- logger.warning(" API response status is not 'success'; cannot extract features")
- return features
- result = api_response.get('result', {})
- # Check for the 'data' field
- if 'data' not in result:
- logger.warning(" API response has no 'data' field")
- return features
- data = result['data']
- three_point = data.get('三点解构', {})
- if not three_point:
- logger.warning(" Three-point deconstruction data is empty")
- return features
- # 1. Extract inspiration points (3 sub-categories)
- inspiration = three_point.get('灵感点', {})
- for category in ['全新内容', '共性差异', '共性内容']:
- items = inspiration.get(category, [])
- for idx, item in enumerate(items):
- extracted_features = item.get('提取的特征', [])
- for feat in extracted_features:
- feature_name = feat.get('特征名称', '')
- if not feature_name:
- continue
- features.append({
- 'feature_name': feature_name,
- 'dimension': f'灵感点-{category}',
- 'dimension_detail': feat.get('维度分类', ''), # NB: the source field name differs per section
- 'weight': feat.get('权重', 0),
- 'source_index': idx,
- 'source_candidate_number': item.get('候选编号', 0),
- 'source_inspiration': item.get('灵感点', '')
- })
- # 2. Extract purpose points
- purpose = three_point.get('目的点', {})
- purposes_list = purpose.get('purposes', [])
- for idx, item in enumerate(purposes_list):
- extracted_features = item.get('提取的特征', [])
- for feat in extracted_features:
- feature_name = feat.get('特征名称', '')
- if not feature_name:
- continue
- features.append({
- 'feature_name': feature_name,
- 'dimension': '目的点',
- 'dimension_detail': feat.get('特征分类', ''), # NB: the source field name differs per section
- 'weight': feat.get('权重', 0),
- 'source_index': idx,
- 'source_purpose': item.get('目的点', ''),
- 'source_purpose_dimension': item.get('维度', {})
- })
- # 3. Extract key points
- key_points_data = three_point.get('关键点', {})
- key_points_list = key_points_data.get('key_points', [])
- for idx, item in enumerate(key_points_list):
- extracted_features = item.get('提取的特征', [])
- for feat in extracted_features:
- feature_name = feat.get('特征名称', '')
- if not feature_name:
- continue
- features.append({
- 'feature_name': feature_name,
- 'dimension': '关键点',
- 'dimension_detail': feat.get('维度', ''), # NB: the source field name differs per section
- 'weight': feat.get('权重', 0),
- 'source_index': idx,
- 'source_candidate_number': item.get('候选编号', 0),
- 'source_key_point': item.get('关键点', ''),
- 'source_key_point_dimension': item.get('维度', '')
- })
- logger.info(f" 提取特征数量: {len(features)}")
- if features:
- # 统计各维度数量
- dimension_counts = {}
- for feat in features:
- dim = feat['dimension']
- dimension_counts[dim] = dimension_counts.get(dim, 0) + 1
- logger.info(f" 维度分布: {dimension_counts}")
- return features
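- # Illustrative sketch (an assumption, not part of the pipeline): a minimal
- # api_response of the shape the extractor above walks. The nesting mirrors the
- # keys accessed in extract_deconstructed_features; real Stage 7 payloads carry
- # many more fields.
- def _example_extract_usage() -> List[Dict]:
- """Build a tiny fake Stage 7 api_response and run the extractor over it."""
- sample_response = {
- 'status': 'success',
- 'result': {'data': {'三点解构': {
- '灵感点': {'全新内容': [{'候选编号': 1, '灵感点': 'example', '提取的特征': [
- {'特征名称': 'round frame', '维度分类': 'form', '权重': 0.8}]}],
- '共性差异': [], '共性内容': []},
- '目的点': {'purposes': []},
- '关键点': {'key_points': []}}}}
- }
- return extract_deconstructed_features(sample_response) # -> one feature dict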
- async def calculate_similarity_for_note(
- note_result: Dict,
- original_feature: str,
- weight_embedding: float = 0.5,
- weight_semantic: float = 0.5,
- min_similarity: float = 0.0
- ) -> Dict:
- """
- 计算单个帖子的所有特征与原始特征的相似度
- Args:
- note_result: Stage 7 的单个 result 对象
- original_feature: 原始特征名称
- weight_embedding: 向量模型权重
- weight_semantic: LLM 模型权重
- min_similarity: 最小相似度阈值,低于此值的特征会被过滤
- Returns:
- 包含相似度信息的结果对象
- """
- note_id = note_result.get('note_id', '')
- logger.info(f" [{note_id}] 开始计算相似度...")
- # 1. 提取解构特征
- deconstructed_features = extract_deconstructed_features(
- note_result['api_response']
- )
- if not deconstructed_features:
- logger.warning(f" [{note_id}] 没有提取到特征")
- return {
- 'note_id': note_id,
- 'original_feature': original_feature,
- 'evaluation_score': note_result.get('evaluation_score', 0),
- 'search_word': note_result.get('search_word', ''),
- 'note_data': note_result.get('note_data', {}),
- 'deconstructed_features': [],
- 'similarity_statistics': {
- 'total_features': 0,
- 'max_similarity': 0,
- 'min_similarity': 0,
- 'avg_similarity': 0,
- 'high_similarity_count': 0,
- 'medium_similarity_count': 0,
- 'low_similarity_count': 0
- }
- }
- # 2. Build the list of feature names
- feature_names = [f['feature_name'] for f in deconstructed_features]
- logger.info(f" [{note_id}] Calling similarity API (1×{len(feature_names)} Cartesian product)...")
- # 3. Batch-compute similarity (1×N Cartesian product)
- try:
- start_time = time.time()
- # NOTE: weight_embedding / weight_semantic are not forwarded here;
- # compare_phrases_cartesian is assumed to apply its own configured weights.
- similarity_results = await compare_phrases_cartesian(
- phrases_a=[original_feature],
- phrases_b=feature_names,
- max_concurrent=50
- )
- elapsed = time.time() - start_time
- logger.info(f" [{note_id}] Similarity computation finished ({elapsed:.1f}s)")
- # 4. Map the results back onto the feature objects
- for i, feat in enumerate(deconstructed_features):
- feat['similarity_score'] = similarity_results[0][i]['相似度']
- feat['similarity_explanation'] = similarity_results[0][i]['说明']
- # 5. Filter out low-similarity features
- if min_similarity > 0:
- original_count = len(deconstructed_features)
- deconstructed_features = [
- f for f in deconstructed_features
- if f['similarity_score'] >= min_similarity
- ]
- filtered_count = original_count - len(deconstructed_features)
- if filtered_count > 0:
- logger.info(f" [{note_id}] Filtered out {filtered_count} low-similarity features (< {min_similarity})")
- # 6. Compute statistics
- if deconstructed_features:
- scores = [f['similarity_score'] for f in deconstructed_features]
- statistics = {
- 'total_features': len(scores),
- 'max_similarity': round(max(scores), 3),
- 'min_similarity': round(min(scores), 3),
- 'avg_similarity': round(sum(scores) / len(scores), 3),
- 'high_similarity_count': sum(1 for s in scores if s >= 0.7),
- 'medium_similarity_count': sum(1 for s in scores if 0.5 <= s < 0.7),
- 'low_similarity_count': sum(1 for s in scores if s < 0.5)
- }
- # 7. Sort by similarity, descending
- deconstructed_features.sort(key=lambda x: x['similarity_score'], reverse=True)
- logger.info(f" [{note_id}] Stats: max={statistics['max_similarity']}, "
- f"avg={statistics['avg_similarity']}, "
- f"high-similarity count={statistics['high_similarity_count']}")
- else:
- statistics = {
- 'total_features': 0,
- 'max_similarity': 0,
- 'min_similarity': 0,
- 'avg_similarity': 0,
- 'high_similarity_count': 0,
- 'medium_similarity_count': 0,
- 'low_similarity_count': 0
- }
- return {
- 'note_id': note_id,
- 'original_feature': original_feature,
- 'evaluation_score': note_result.get('evaluation_score', 0),
- 'search_word': note_result.get('search_word', ''),
- 'note_data': note_result.get('note_data', {}),
- 'deconstructed_features': deconstructed_features,
- 'similarity_statistics': statistics,
- 'processing_time_seconds': round(elapsed, 2)
- }
- except Exception as e:
- logger.error(f" [{note_id}] Similarity computation failed: {e}")
- return {
- 'note_id': note_id,
- 'original_feature': original_feature,
- 'evaluation_score': note_result.get('evaluation_score', 0),
- 'search_word': note_result.get('search_word', ''),
- 'note_data': note_result.get('note_data', {}),
- 'deconstructed_features': [],
- # Keep the same statistics keys as the success path so downstream
- # aggregation (e.g. _save_intermediate_results) never hits a KeyError.
- 'similarity_statistics': {
- 'total_features': 0, 'max_similarity': 0, 'min_similarity': 0,
- 'avg_similarity': 0, 'high_similarity_count': 0,
- 'medium_similarity_count': 0, 'low_similarity_count': 0,
- 'error': str(e)
- }
- }
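- # Helper sketch (illustrative only, not called by the pipeline): the
- # high/medium/low tiering that the statistics block above counts with its
- # 0.7 and 0.5 cutoffs, expressed as a standalone function.
- def bucket_similarity(score: float) -> str:
- """Classify a similarity score into the tiers used by the statistics above."""
- if score >= 0.7:
- return 'high'
- if score >= 0.5:
- return 'medium'
- return 'low'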
- class Stage8SimilarityAnalyzer:
- """Stage 8: 解构特征与原始特征的相似度分析"""
- def __init__(
- self,
- weight_embedding: float = 0.5,
- weight_semantic: float = 0.5,
- max_workers: int = 5,
- min_similarity: float = 0.0,
- output_dir: str = "output_v2",
- target_features: Optional[List[str]] = None,
- stage6_path: str = 'output_v2/stage6_with_evaluations.json',
- update_stage6: bool = True
- ):
- """
- 初始化 Stage 8 分析器
- Args:
- weight_embedding: 向量模型权重(默认 0.5)
- weight_semantic: LLM 模型权重(默认 0.5)
- max_workers: 最大并发数(默认 5)
- min_similarity: 最小相似度阈值(默认 0.0,保留所有特征)
- output_dir: 输出目录
- target_features: 指定要处理的原始特征列表(None = 处理所有特征)
- stage6_path: Stage 6 数据文件路径(用于计算综合得分)
- update_stage6: 是否计算并更新 Stage 6 的综合得分(默认 True)
- """
- self.weight_embedding = weight_embedding
- self.weight_semantic = weight_semantic
- self.max_workers = max_workers
- self.min_similarity = min_similarity
- self.output_dir = output_dir
- self.target_features = target_features
- self.stage6_path = stage6_path
- self.update_stage6 = update_stage6
- # Validate the weights
- total_weight = weight_embedding + weight_semantic
- if abs(total_weight - 1.0) > 0.001:
- raise ValueError(f"Weights must sum to 1.0; got: {total_weight}")
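- # For example, weight_embedding=0.6 with weight_semantic=0.4 passes this
- # check, while 0.6 with 0.5 raises ValueError (sum 1.1, outside 1.0 ± 0.001).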
- def _save_intermediate_results(
- self,
- results: List[Dict],
- output_path: str,
- processed_count: int,
- total_count: int,
- start_time: float
- ):
- """保存中间结果"""
- base_dir = os.path.dirname(output_path) or self.output_dir
- base_name = os.path.basename(output_path)
- name_without_ext = os.path.splitext(base_name)[0]
- intermediate_path = os.path.join(
- base_dir,
- f"{name_without_ext}_partial_{processed_count}of{total_count}.json"
- )
- # Statistics
- total_features = sum(r['similarity_statistics']['total_features'] for r in results)
- avg_max_sim = sum(r['similarity_statistics'].get('max_similarity', 0) for r in results) / len(results)
- intermediate_result = {
- 'metadata': {
- 'stage': 'stage8_partial',
- 'description': f'Partial results ({processed_count}/{total_count})',
- 'processed_notes': len(results),
- 'total_features_extracted': total_features,
- 'avg_max_similarity': round(avg_max_sim, 3),
- 'saved_at': datetime.now().isoformat(),
- 'processing_time_seconds': round(time.time() - start_time, 2)
- },
- 'results': results
- }
- os.makedirs(base_dir, exist_ok=True)
- with open(intermediate_path, 'w', encoding='utf-8') as f:
- json.dump(intermediate_result, f, ensure_ascii=False, indent=2)
- logger.info(f" 已保存中间结果: {intermediate_path}")
- async def run_async(
- self,
- stage7_results: Dict,
- output_path: Optional[str] = None
- ) -> Dict:
- """
- 执行 Stage 8 相似度分析(异步版本)
- Args:
- stage7_results: Stage 7 结果
- output_path: 输出路径(可选)
- Returns:
- Stage 8 结果
- """
- logger.info("\n" + "=" * 60)
- logger.info("Stage 8: 解构特征与原始特征的相似度分析")
- logger.info("=" * 60)
- # 打印配置
- logger.info("配置参数:")
- logger.info(f" 向量模型权重: {self.weight_embedding}")
- logger.info(f" LLM 模型权重: {self.weight_semantic}")
- logger.info(f" 最大并发数: {self.max_workers}")
- logger.info(f" 最小相似度阈值: {self.min_similarity}")
- if self.target_features:
- logger.info(f" 目标特征: {', '.join(self.target_features)}")
- else:
- logger.info(f" 目标特征: 全部")
- # Default output path
- if output_path is None:
- output_path = os.path.join(self.output_dir, "stage8_similarity_scores.json")
- # Pull the Stage 7 result list
- results_list = stage7_results.get('results', [])
- # Filter down to the target features
- if self.target_features:
- results_list = [
- r for r in results_list
- if r.get('original_feature') in self.target_features
- ]
- total_notes = len(results_list)
- logger.info(f" Notes to process: {total_notes}")
- if total_notes == 0:
- logger.warning(" Nothing to process")
- return {
- 'metadata': {
- 'stage': 'stage8',
- 'processed_notes': 0
- },
- 'results': []
- }
- # Build the task list
- start_time = time.time()
- results = []
- # Use a Semaphore to bound concurrency
- semaphore = asyncio.Semaphore(self.max_workers)
- async def bounded_task(result):
- async with semaphore:
- return await calculate_similarity_for_note(
- result,
- result.get('original_feature', ''),
- self.weight_embedding,
- self.weight_semantic,
- self.min_similarity
- )
- tasks = [bounded_task(result) for result in results_list]
- # Run with a progress bar
- if TQDM_AVAILABLE:
- logger.info(" Showing progress bar...")
- processed_count = 0
- save_interval = 10
- for coro in tqdm(
- asyncio.as_completed(tasks),
- total=len(tasks),
- desc=" 相似度计算进度",
- unit="帖子",
- ncols=100
- ):
- result = await coro
- results.append(result)
- processed_count += 1
- # Incremental save
- if processed_count % save_interval == 0:
- self._save_intermediate_results(
- results,
- output_path,
- processed_count,
- total_notes,
- start_time
- )
- else:
- # Plain execution without a progress bar
- results = await asyncio.gather(*tasks)
- logger.info(f" Done: {len(results)}/{total_notes}")
- processing_time = time.time() - start_time
- # Compute overall statistics
- total_features = sum(r['similarity_statistics']['total_features'] for r in results)
- all_max_similarities = [r['similarity_statistics']['max_similarity'] for r in results if r['similarity_statistics']['total_features'] > 0]
- overall_stats = {
- 'total_notes': total_notes,
- 'total_features_extracted': total_features,
- 'avg_features_per_note': round(total_features / total_notes, 1) if total_notes > 0 else 0,
- 'avg_max_similarity': round(sum(all_max_similarities) / len(all_max_similarities), 3) if all_max_similarities else 0,
- 'notes_with_high_similarity': sum(1 for r in results if r['similarity_statistics'].get('high_similarity_count', 0) > 0)
- }
- logger.info(f"\n 总耗时: {processing_time:.1f}秒")
- logger.info(f" 总特征数: {total_features}")
- logger.info(f" 平均特征数/帖子: {overall_stats['avg_features_per_note']}")
- logger.info(f" 平均最高相似度: {overall_stats['avg_max_similarity']}")
- logger.info(f" 包含高相似度特征的帖子: {overall_stats['notes_with_high_similarity']}")
- # 构建最终结果
- final_result = {
- 'metadata': {
- 'stage': 'stage8',
- 'description': 'Similarity scores between deconstructed and original features',
- 'source_file': stage7_results.get('metadata', {}).get('created_at', ''), # provenance: Stage 7 creation timestamp
- 'target_features': self.target_features if self.target_features else 'all',
- 'similarity_config': {
- 'algorithm': 'hybrid_similarity',
- 'weight_embedding': self.weight_embedding,
- 'weight_semantic': self.weight_semantic,
- 'min_similarity_threshold': self.min_similarity
- },
- 'overall_statistics': overall_stats,
- 'created_at': datetime.now().isoformat(),
- 'processing_time_seconds': round(processing_time, 2)
- },
- 'results': results
- }
- # Save the results
- os.makedirs(os.path.dirname(output_path) or self.output_dir, exist_ok=True)
- with open(output_path, 'w', encoding='utf-8') as f:
- json.dump(final_result, f, ensure_ascii=False, indent=2)
- logger.info(f" Results saved: {output_path}")
- # Compute and write back the comprehensive score P
- if self.update_stage6:
- logger.info("\n" + "=" * 60)
- logger.info("Computing comprehensive score P and updating Stage 6 data...")
- logger.info("=" * 60)
- self._calculate_and_update_comprehensive_scores(results)
- return final_result
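- # Consumption sketch (illustrative standalone helper): reading the JSON file
- # written by run_async above and printing its headline statistics. The keys
- # follow the final_result structure assembled in that method.
- def summarize_stage8_output(path: str = "output_v2/stage8_similarity_scores.json") -> None:
- """Print the overall statistics from a saved Stage 8 result file."""
- with open(path, 'r', encoding='utf-8') as f:
- data = json.load(f)
- stats = data['metadata']['overall_statistics']
- print(f"notes={stats['total_notes']}, "
- f"features={stats['total_features_extracted']}, "
- f"avg_max_similarity={stats['avg_max_similarity']}")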
- def _calculate_and_update_comprehensive_scores(self, stage8_results: List[Dict]):
- """
- 计算综合得分P并更新Stage 6数据
- Args:
- stage8_results: Stage 8 的结果列表
- """
- try:
- # 1. Load the Stage 6 data
- logger.info(f" Loading Stage 6 data: {self.stage6_path}")
- if not os.path.exists(self.stage6_path):
- logger.error(f" Stage 6 file not found: {self.stage6_path}")
- return
- with open(self.stage6_path, 'r', encoding='utf-8') as f:
- stage6_data = json.load(f)
- # 2. Build the Stage 8 mapping (note_id → max_similarity)
- logger.info(" Building similarity map...")
- similarity_map = {}
- for result in stage8_results:
- note_id = result['note_id']
- max_similarity = result['similarity_statistics'].get('max_similarity', 0)
- similarity_map[note_id] = max_similarity
- logger.info(f" Similarity map entries: {len(similarity_map)}")
- # 3. Walk every original feature and search word in Stage 6 and compute P
- # Stage 6 data is a list; each element is one original feature
- updated_count = 0
- total_searches = 0
- logger.info(f" Iterating over {len(stage6_data)} original features...")
- for feature_item in stage6_data:
- original_feature = feature_item.get('原始特征名称', '')
- logger.info(f"\n Processing original feature: {original_feature}")
- # Walk each group
- for group in feature_item.get('组合评估结果_分组', []):
- source_word = group.get('source_word', '')
- # Walk all search words in this group
- for search_item in group.get('top10_searches', []):
- search_word = search_item.get('search_word', '')
- total_searches += 1
- logger.info(f" Processing search word: {search_word} (source: {source_word})")
- # Compute the comprehensive score for this search word
- p_score, p_detail = self._calculate_single_query_score(
- search_item,
- similarity_map
- )
- # Update the search word data
- if p_score is not None:
- search_item['comprehensive_score'] = round(p_score, 3)
- search_item['comprehensive_score_detail'] = p_detail
- updated_count += 1
- logger.info(f" Comprehensive score P = {p_score:.3f} (M={p_detail['M']}, N={p_detail['N']})")
- else:
- logger.warning(" Could not compute the comprehensive score (data may be missing)")
- # 4. Save the updated Stage 6 data
- logger.info("\n Saving updated Stage 6 data...")
- logger.info(f" Updated {updated_count}/{total_searches} search words")
- with open(self.stage6_path, 'w', encoding='utf-8') as f:
- json.dump(stage6_data, f, ensure_ascii=False, indent=2)
- logger.info(f" Update complete: {self.stage6_path}")
- except Exception as e:
- logger.error(f" Failed to compute comprehensive scores: {e}", exc_info=True)
- def _calculate_single_query_score(
- self,
- query: Dict,
- similarity_map: Dict[str, float]
- ) -> Tuple[Optional[float], Optional[Dict]]:
- """
- 计算单个查询的综合得分P
- Args:
- query: Stage 6 中的单个查询对象
- similarity_map: note_id → max_similarity 的映射
- Returns:
- (P值, 详细计算信息) 或 (None, None)
- """
- # 获取总帖子数 N
- evaluation_with_filter = query.get('evaluation_with_filter', {})
- N = evaluation_with_filter.get('total_notes', 0)
- if N == 0:
- logger.warning(f" 查询总帖子数为0,无法计算P值")
- return None, None
- # 获取笔记评估数据和原始笔记数据
- notes_evaluation = evaluation_with_filter.get('notes_evaluation', [])
- search_result = query.get('search_result', {})
- notes_data = search_result.get('data', {}).get('data', [])
- if not notes_evaluation or not notes_data:
- logger.warning(f" 缺少评估数据或笔记数据")
- return 0.0, {
- 'N': N,
- 'M': 0,
- 'total_contribution': 0.0,
- 'complete_matches': []
- }
- # Collect the complete matches (evaluation score >= 0.8)
- complete_matches_data = []
- for note_eval in notes_evaluation:
- score = note_eval.get('综合得分', 0)
- if score >= 0.8:
- note_index = note_eval.get('note_index', -1)
- if 0 <= note_index < len(notes_data):
- # Pull the note_id from the raw data
- note_id = notes_data[note_index].get('id', '')
- note_card = notes_data[note_index].get('note_card', {})
- note_title = note_card.get('display_title', '')
- complete_matches_data.append({
- 'note_id': note_id,
- 'note_title': note_title,
- 'evaluation_score': score,
- 'note_index': note_index
- })
- M = len(complete_matches_data)
- logger.info(f" 完全匹配数: M = {M}/{N}")
- if M == 0:
- # 没有完全匹配,P = 0
- return 0.0, {
- 'N': N,
- 'M': 0,
- 'total_contribution': 0.0,
- 'complete_matches': []
- }
- # Each complete match contributes a×b
- contributions = []
- total_contribution = 0.0
- for match in complete_matches_data:
- note_id = match['note_id']
- evaluation_score = match['evaluation_score'] # the a value
- # Look up the b value in similarity_map
- max_similarity = similarity_map.get(note_id, 0) # the b value
- # Contribution of this match
- contribution = evaluation_score * max_similarity
- total_contribution += contribution
- # Keep the per-match breakdown
- contributions.append({
- 'note_id': note_id,
- 'note_title': match['note_title'],
- 'evaluation_score': round(evaluation_score, 3),
- 'max_similarity': round(max_similarity, 3),
- 'contribution': round(contribution, 3)
- })
- # Comprehensive score P = Σ(a×b) / N
- P = total_contribution / N
- # Sort contributions in descending order
- contributions.sort(key=lambda x: x['contribution'], reverse=True)
- # Assemble the detail object
- detail = {
- 'N': N,
- 'M': M,
- 'total_contribution': round(total_contribution, 3),
- 'complete_matches': contributions
- }
- return P, detail
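- # Worked example of the formula above (numbers are illustrative): with N = 10
- # notes and M = 2 complete matches scoring a1 = 0.9 and a2 = 0.85, whose best
- # feature similarities are b1 = 0.8 and b2 = 0.5:
- #   P = (0.9 * 0.8 + 0.85 * 0.5) / 10 = (0.72 + 0.425) / 10 = 0.1145
- # The caller then stores round(P, 3). P therefore rewards queries whose
- # strongly matched notes also deconstruct into features close to the original
- # feature, normalized by the size of the result set.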
- def run(
- self,
- stage7_results: Dict,
- output_path: Optional[str] = None
- ) -> Dict:
- """
- 执行 Stage 8 相似度分析(同步版本)
- Args:
- stage7_results: Stage 7 结果
- output_path: 输出路径(可选)
- Returns:
- Stage 8 结果
- """
- return asyncio.run(self.run_async(stage7_results, output_path))
- def test_stage8_analyzer():
- """测试 Stage 8 分析器"""
- # 读取 Stage 7 结果
- stage7_path = "output_v2/stage7_with_deconstruction.json"
- if not os.path.exists(stage7_path):
- print(f"Stage 7 结果不存在: {stage7_path}")
- return
- with open(stage7_path, 'r', encoding='utf-8') as f:
- stage7_results = json.load(f)
- # 创建分析器
- analyzer = Stage8SimilarityAnalyzer(
- weight_embedding=0.5,
- weight_semantic=0.5,
- max_workers=3,
- min_similarity=0.3,
- target_features=["墨镜"]
- )
- # Run the analysis
- stage8_results = analyzer.run(stage7_results)
- print(f"\nProcessed {stage8_results['metadata']['overall_statistics']['total_notes']} notes")
- print(f"Extracted {stage8_results['metadata']['overall_statistics']['total_features_extracted']} features")
- print(f"Avg max similarity: {stage8_results['metadata']['overall_statistics']['avg_max_similarity']}")
- if __name__ == '__main__':
- logging.basicConfig(
- level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
- )
- test_stage8_analyzer()