#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Data models for similarity analysis results.

Defines the dataclasses that carry per-feature similarity scores,
per-note score aggregates, overall statistics, and the top-level
serializable collection, together with JSON (de)serialization helpers.
"""

import json
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any, Optional
from datetime import datetime


@dataclass
class SimilarFeature:
    """A deconstructed feature annotated with its similarity to the original feature."""

    feature_name: str               # feature name
    dimension: str                  # dimension
    dimension_detail: str           # dimension sub-category
    weight: float                   # weight
    source_index: int               # source index
    source_info: Dict[str, Any]     # provenance information

    # Similarity information
    similarity_score: float         # similarity score
    similarity_explanation: str     # explanation of the similarity


@dataclass
class SimilarityStatistics:
    """Aggregate similarity statistics for one note's feature list."""

    total_features: int = 0             # total number of features
    max_similarity: float = 0.0         # highest similarity
    min_similarity: float = 0.0         # lowest similarity
    avg_similarity: float = 0.0         # average similarity
    high_similarity_count: int = 0      # count of high similarity (>=0.7)
    medium_similarity_count: int = 0    # count of medium similarity (0.5-0.7)
    low_similarity_count: int = 0       # count of low similarity (<0.5)
    error: Optional[str] = None         # error message, if any

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary."""
        return asdict(self)


@dataclass
class ComprehensiveScoreDetail:
    """Breakdown of the comprehensive score P."""

    N: int                                   # total number of posts
    M: int                                   # number of fully matched posts
    total_contribution: float                # total contribution Σ(a×b)
    complete_matches: List[Dict[str, Any]]   # full-match list (with each contribution)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary."""
        return asdict(self)


@dataclass
class SimilarityScore:
    """Similarity scoring result for a single note."""

    note_id: str              # note ID
    original_feature: str     # original feature
    evaluation_score: float   # Stage 4 evaluation score
    search_word: str          # search word

    # Note payload
    note_data: Dict[str, Any] = field(default_factory=dict)

    # Similar features (sorted by similarity, descending)
    deconstructed_features: List[SimilarFeature] = field(default_factory=list)

    # Aggregate statistics
    similarity_statistics: SimilarityStatistics = field(default_factory=SimilarityStatistics)

    # Comprehensive score P (optional, only when computed)
    comprehensive_score: Optional[float] = None
    comprehensive_score_detail: Optional[ComprehensiveScoreDetail] = None

    # Processing time
    processing_time_seconds: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary (omits the comprehensive score when absent)."""
        result = {
            'note_id': self.note_id,
            'original_feature': self.original_feature,
            'evaluation_score': self.evaluation_score,
            'search_word': self.search_word,
            'note_data': self.note_data,
            'deconstructed_features': [asdict(f) for f in self.deconstructed_features],
            'similarity_statistics': self.similarity_statistics.to_dict(),
            'processing_time_seconds': round(self.processing_time_seconds, 2)
        }

        # Include the comprehensive score only when it was computed
        if self.comprehensive_score is not None:
            result['comprehensive_score'] = round(self.comprehensive_score, 3)
        if self.comprehensive_score_detail is not None:
            result['comprehensive_score_detail'] = self.comprehensive_score_detail.to_dict()

        return result


@dataclass
class OverallSimilarityStatistics:
    """Overall similarity statistics across all notes."""

    total_notes: int                    # total number of notes
    total_features_extracted: int       # total number of extracted features
    avg_features_per_note: float        # average features per note
    avg_max_similarity: float           # average of per-note max similarity
    notes_with_high_similarity: int     # notes containing high-similarity features

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary."""
        return asdict(self)


@dataclass
class PostSimilarityScores:
    """Collection of similarity scores for one post, with config and metadata."""

    post_id: str                                # post ID
    similarity_scores: List[SimilarityScore]    # list of similarity scores

    # Similarity configuration
    algorithm: str = "hybrid_similarity"        # algorithm name
    weight_embedding: float = 0.5               # embedding weight
    weight_semantic: float = 0.5                # semantic weight
    min_similarity_threshold: float = 0.0       # minimum similarity threshold

    # Target features
    target_features: Optional[List[str]] = None  # list of target features

    # Overall statistics
    overall_statistics: Optional[OverallSimilarityStatistics] = None

    # Timing information
    source_file: str = ""                       # source file
    created_at: str = ""                        # creation timestamp
    processing_time_seconds: float = 0.0        # processing time (seconds)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to the on-disk JSON structure ({'metadata': ..., 'results': ...}).

        NOTE: an empty/None ``target_features`` is serialized as the display
        string '全部' ("all"); ``from_json_file`` normalizes it back to None.
        """
        return {
            'metadata': {
                'stage': 'similarity',
                'description': '解构特征与原始特征的相似度评分',
                'post_id': self.post_id,
                'source_file': self.source_file,
                'target_features': self.target_features if self.target_features else '全部',
                'similarity_config': {
                    'algorithm': self.algorithm,
                    'weight_embedding': self.weight_embedding,
                    'weight_semantic': self.weight_semantic,
                    'min_similarity_threshold': self.min_similarity_threshold
                },
                'overall_statistics': self.overall_statistics.to_dict() if self.overall_statistics else None,
                'created_at': self.created_at or datetime.now().isoformat(),
                'processing_time_seconds': round(self.processing_time_seconds, 2)
            },
            'results': [s.to_dict() for s in self.similarity_scores]
        }

    @classmethod
    def from_json_file(cls, file_path: str) -> 'PostSimilarityScores':
        """Load a ``PostSimilarityScores`` from a JSON file written by ``to_dict``.

        Args:
            file_path: Path to the JSON file.

        Returns:
            A fully reconstructed ``PostSimilarityScores`` instance.

        Raises:
            KeyError: if required metadata/result keys are missing.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        metadata = data['metadata']
        results_data = data['results']

        # Rebuild the SimilarityScore objects
        scores = []
        for r in results_data:
            # Rebuild the feature list
            features = [
                SimilarFeature(**f)
                for f in r.get('deconstructed_features', [])
            ]

            # Rebuild the statistics
            stats = SimilarityStatistics(**r['similarity_statistics'])

            # Rebuild the comprehensive score detail.
            # Use .get() so a missing key OR an explicit JSON null both
            # map to None instead of crashing on **None.
            comprehensive_detail = None
            detail_data = r.get('comprehensive_score_detail')
            if detail_data:
                comprehensive_detail = ComprehensiveScoreDetail(**detail_data)

            score = SimilarityScore(
                note_id=r['note_id'],
                original_feature=r['original_feature'],
                evaluation_score=r['evaluation_score'],
                search_word=r['search_word'],
                note_data=r['note_data'],
                deconstructed_features=features,
                similarity_statistics=stats,
                comprehensive_score=r.get('comprehensive_score'),
                comprehensive_score_detail=comprehensive_detail,
                processing_time_seconds=r['processing_time_seconds']
            )
            scores.append(score)

        # Rebuild the overall statistics
        overall_stats = None
        if metadata.get('overall_statistics'):
            overall_stats = OverallSimilarityStatistics(**metadata['overall_statistics'])

        similarity_config = metadata['similarity_config']

        # BUGFIX: to_dict() stores the placeholder string '全部' when no
        # target features were set; previously that string was loaded back
        # verbatim, corrupting the Optional[List[str]] field. Normalize any
        # non-list value (placeholder string, null, etc.) back to None.
        raw_targets = metadata.get('target_features')
        target_features = raw_targets if isinstance(raw_targets, list) else None

        return cls(
            post_id=metadata['post_id'],
            similarity_scores=scores,
            algorithm=similarity_config['algorithm'],
            weight_embedding=similarity_config['weight_embedding'],
            weight_semantic=similarity_config['weight_semantic'],
            min_similarity_threshold=similarity_config['min_similarity_threshold'],
            target_features=target_features,
            overall_statistics=overall_stats,
            source_file=metadata['source_file'],
            created_at=metadata['created_at'],
            processing_time_seconds=metadata['processing_time_seconds']
        )