#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Data models for similarity analysis.
"""
import json
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any, Optional
from datetime import datetime


@dataclass
class SimilarFeature:
    """A deconstructed feature with its similarity to the original feature."""
    feature_name: str            # Feature name
    dimension: str               # Dimension
    dimension_detail: str        # Dimension sub-category
    weight: float                # Weight
    source_index: int            # Source index
    source_info: Dict[str, Any]  # Provenance information

    # Similarity information
    similarity_score: float      # Similarity score
    similarity_explanation: str  # Explanation of the similarity
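

# Illustrative only (not part of the original pipeline): a minimal sketch of
# constructing a SimilarFeature by hand. All field values and the source_info
# keys below are assumptions made for demonstration.
def _example_similar_feature() -> SimilarFeature:
    return SimilarFeature(
        feature_name="portable espresso maker",
        dimension="product",
        dimension_detail="category",
        weight=0.8,
        source_index=0,
        source_info={"note_id": "demo-note", "chunk": "title"},
        similarity_score=0.72,
        similarity_explanation="Close match on product-category wording.",
    )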


@dataclass
class SimilarityStatistics:
    """Similarity statistics for a single note."""
    total_features: int = 0           # Total number of features
    max_similarity: float = 0.0       # Highest similarity
    min_similarity: float = 0.0       # Lowest similarity
    avg_similarity: float = 0.0       # Average similarity
    high_similarity_count: int = 0    # Features with high similarity (>= 0.7)
    medium_similarity_count: int = 0  # Features with medium similarity (0.5 <= s < 0.7)
    low_similarity_count: int = 0     # Features with low similarity (< 0.5)
    error: Optional[str] = None       # Error message, if any

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary."""
        return asdict(self)
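

# A minimal sketch (assumed, not taken from the original module) of how
# SimilarityStatistics could be derived from a list of SimilarFeature objects,
# using the thresholds documented above (>= 0.7 high, 0.5-0.7 medium, < 0.5 low).
# The helper name build_similarity_statistics is hypothetical.
def build_similarity_statistics(features: List[SimilarFeature]) -> SimilarityStatistics:
    if not features:
        return SimilarityStatistics()
    scores = [f.similarity_score for f in features]
    return SimilarityStatistics(
        total_features=len(features),
        max_similarity=max(scores),
        min_similarity=min(scores),
        avg_similarity=sum(scores) / len(scores),
        high_similarity_count=sum(1 for s in scores if s >= 0.7),
        medium_similarity_count=sum(1 for s in scores if 0.5 <= s < 0.7),
        low_similarity_count=sum(1 for s in scores if s < 0.5),
    )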


@dataclass
class ComprehensiveScoreDetail:
    """Breakdown of the comprehensive score P."""
    N: int                                  # Total number of notes
    M: int                                  # Number of completely matched notes
    total_contribution: float               # Total contribution Σ(a×b)
    complete_matches: List[Dict[str, Any]]  # Complete matches, each with its contribution

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary."""
        return asdict(self)
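

# A hedged sketch of assembling a ComprehensiveScoreDetail. The original code does
# not show how total_contribution is computed; here we assume each entry in
# complete_matches already carries a numeric 'contribution' value (a hypothetical
# key name) whose sum is the Σ(a×b) total. The helper name is also hypothetical.
def build_comprehensive_detail(total_notes: int,
                               complete_matches: List[Dict[str, Any]]) -> ComprehensiveScoreDetail:
    return ComprehensiveScoreDetail(
        N=total_notes,
        M=len(complete_matches),
        total_contribution=sum(m.get('contribution', 0.0) for m in complete_matches),
        complete_matches=complete_matches,
    )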


@dataclass
class SimilarityScore:
    """Similarity scoring result for a single note."""
    note_id: str             # Note ID
    original_feature: str    # Original feature
    evaluation_score: float  # Stage 4 evaluation score
    search_word: str         # Search word

    # Note payload
    note_data: Dict[str, Any] = field(default_factory=dict)

    # Similar features, sorted by similarity in descending order
    deconstructed_features: List[SimilarFeature] = field(default_factory=list)

    # Statistics
    similarity_statistics: SimilarityStatistics = field(default_factory=SimilarityStatistics)

    # Comprehensive score P (optional, present only if it was computed)
    comprehensive_score: Optional[float] = None
    comprehensive_score_detail: Optional[ComprehensiveScoreDetail] = None

    # Processing time
    processing_time_seconds: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary."""
        result = {
            'note_id': self.note_id,
            'original_feature': self.original_feature,
            'evaluation_score': self.evaluation_score,
            'search_word': self.search_word,
            'note_data': self.note_data,
            'deconstructed_features': [asdict(f) for f in self.deconstructed_features],
            'similarity_statistics': self.similarity_statistics.to_dict(),
            'processing_time_seconds': round(self.processing_time_seconds, 2)
        }
        # Include the comprehensive score only when it has been computed
        if self.comprehensive_score is not None:
            result['comprehensive_score'] = round(self.comprehensive_score, 3)
        if self.comprehensive_score_detail is not None:
            result['comprehensive_score_detail'] = self.comprehensive_score_detail.to_dict()
        return result
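

# Usage sketch: building a SimilarityScore and serializing it with to_dict().
# The concrete values are illustrative, and _example_similar_feature /
# build_similarity_statistics are the hypothetical helpers sketched above.
def _example_similarity_score() -> Dict[str, Any]:
    features = [_example_similar_feature()]
    score = SimilarityScore(
        note_id="demo-note",
        original_feature="portable espresso maker",
        evaluation_score=4.5,
        search_word="espresso maker",
        note_data={"title": "Demo note"},
        deconstructed_features=features,
        similarity_statistics=build_similarity_statistics(features),
    )
    return score.to_dict()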


@dataclass
class OverallSimilarityStatistics:
    """Aggregate similarity statistics across all notes."""
    total_notes: int                 # Total number of notes
    total_features_extracted: int    # Total number of extracted features
    avg_features_per_note: float     # Average number of features per note
    avg_max_similarity: float        # Average of the per-note maximum similarity
    notes_with_high_similarity: int  # Number of notes containing high-similarity features

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary."""
        return asdict(self)
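

# A sketch (assumed, not from the original module) of aggregating
# OverallSimilarityStatistics from a list of SimilarityScore objects, based on the
# field comments above; the 0.7 "high similarity" cutoff mirrors
# SimilarityStatistics.high_similarity_count.
def build_overall_statistics(scores: List[SimilarityScore]) -> OverallSimilarityStatistics:
    total_features = sum(len(s.deconstructed_features) for s in scores)
    max_sims = [s.similarity_statistics.max_similarity for s in scores]
    return OverallSimilarityStatistics(
        total_notes=len(scores),
        total_features_extracted=total_features,
        avg_features_per_note=total_features / len(scores) if scores else 0.0,
        avg_max_similarity=sum(max_sims) / len(max_sims) if max_sims else 0.0,
        notes_with_high_similarity=sum(
            1 for s in scores if s.similarity_statistics.high_similarity_count > 0
        ),
    )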


@dataclass
class PostSimilarityScores:
    """Collection of similarity scores for a single post."""
    post_id: str                              # Post ID
    similarity_scores: List[SimilarityScore]  # List of per-note similarity scores

    # Similarity configuration
    algorithm: str = "hybrid_similarity"   # Algorithm name
    weight_embedding: float = 0.5          # Embedding weight
    weight_semantic: float = 0.5           # Semantic weight
    min_similarity_threshold: float = 0.0  # Minimum similarity threshold

    # Target features
    target_features: Optional[List[str]] = None  # Target feature list (None means all)

    # Overall statistics
    overall_statistics: Optional[OverallSimilarityStatistics] = None

    # Provenance and timing
    source_file: str = ""                 # Source file
    created_at: str = ""                  # Creation timestamp
    processing_time_seconds: float = 0.0  # Processing time in seconds

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary."""
        return {
            'metadata': {
                'stage': 'similarity',
                'description': 'Similarity scores between deconstructed features and the original feature',
                'post_id': self.post_id,
                'source_file': self.source_file,
                'target_features': self.target_features if self.target_features else 'all',
                'similarity_config': {
                    'algorithm': self.algorithm,
                    'weight_embedding': self.weight_embedding,
                    'weight_semantic': self.weight_semantic,
                    'min_similarity_threshold': self.min_similarity_threshold
                },
                'overall_statistics': self.overall_statistics.to_dict() if self.overall_statistics else None,
                'created_at': self.created_at or datetime.now().isoformat(),
                'processing_time_seconds': round(self.processing_time_seconds, 2)
            },
            'results': [s.to_dict() for s in self.similarity_scores]
        }

    @classmethod
    def from_json_file(cls, file_path: str) -> 'PostSimilarityScores':
        """Load from a JSON file produced by to_dict()."""
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        metadata = data['metadata']
        results_data = data['results']

        # Rebuild the SimilarityScore objects
        scores = []
        for r in results_data:
            # Rebuild the feature list
            features = [
                SimilarFeature(**f) for f in r.get('deconstructed_features', [])
            ]

            # Rebuild the statistics
            stats = SimilarityStatistics(**r['similarity_statistics'])

            # Rebuild the comprehensive-score breakdown, if present
            comprehensive_detail = None
            if 'comprehensive_score_detail' in r:
                comprehensive_detail = ComprehensiveScoreDetail(**r['comprehensive_score_detail'])

            score = SimilarityScore(
                note_id=r['note_id'],
                original_feature=r['original_feature'],
                evaluation_score=r['evaluation_score'],
                search_word=r['search_word'],
                note_data=r['note_data'],
                deconstructed_features=features,
                similarity_statistics=stats,
                comprehensive_score=r.get('comprehensive_score'),
                comprehensive_score_detail=comprehensive_detail,
                processing_time_seconds=r['processing_time_seconds']
            )
            scores.append(score)

        # Rebuild the overall statistics
        overall_stats = None
        if metadata.get('overall_statistics'):
            overall_stats = OverallSimilarityStatistics(**metadata['overall_statistics'])

        # to_dict() serializes target_features=None as the string 'all';
        # normalize it back to None so the field stays Optional[List[str]].
        target_features = metadata.get('target_features')
        if not isinstance(target_features, list):
            target_features = None

        similarity_config = metadata['similarity_config']

        return cls(
            post_id=metadata['post_id'],
            similarity_scores=scores,
            algorithm=similarity_config['algorithm'],
            weight_embedding=similarity_config['weight_embedding'],
            weight_semantic=similarity_config['weight_semantic'],
            min_similarity_threshold=similarity_config['min_similarity_threshold'],
            target_features=target_features,
            overall_statistics=overall_stats,
            source_file=metadata['source_file'],
            created_at=metadata['created_at'],
            processing_time_seconds=metadata['processing_time_seconds']
        )
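

# Round-trip usage sketch: serialize a PostSimilarityScores with to_dict() and load
# it back with from_json_file(). The function name and file path are illustrative.
def _example_round_trip(post: PostSimilarityScores,
                        path: str = "similarity_demo.json") -> PostSimilarityScores:
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(post.to_dict(), f, ensure_ascii=False, indent=2)
    return PostSimilarityScores.from_json_file(path)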