similarity.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Data models for similarity analysis
"""
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any, Optional
from datetime import datetime


@dataclass
class SimilarFeature:
    """A deconstructed feature with its similarity to the original feature"""
    feature_name: str            # Feature name
    dimension: str               # Dimension
    dimension_detail: str        # Dimension sub-category
    weight: float                # Weight
    source_index: int            # Source index
    source_info: Dict[str, Any]  # Provenance information

    # Similarity information
    similarity_score: float      # Similarity score
    similarity_explanation: str  # Explanation of the similarity


@dataclass
class SimilarityStatistics:
    """Similarity statistics"""
    total_features: int = 0           # Total number of features
    max_similarity: float = 0.0       # Highest similarity
    min_similarity: float = 0.0       # Lowest similarity
    avg_similarity: float = 0.0       # Average similarity
    high_similarity_count: int = 0    # Number of high-similarity features (>= 0.7)
    medium_similarity_count: int = 0  # Number of medium-similarity features (0.5-0.7)
    low_similarity_count: int = 0     # Number of low-similarity features (< 0.5)
    error: Optional[str] = None       # Error message (if any)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary"""
        return asdict(self)
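

# Illustrative helper (not part of the original module): shows how a
# SimilarityStatistics instance could be derived from a list of SimilarFeature
# objects, using the thresholds documented above (>= 0.7 high, 0.5-0.7 medium,
# < 0.5 low). The pipeline may compute these values elsewhere; this is a sketch.
def _build_similarity_statistics(features: List[SimilarFeature]) -> SimilarityStatistics:
    if not features:
        return SimilarityStatistics()
    scores = [f.similarity_score for f in features]
    return SimilarityStatistics(
        total_features=len(features),
        max_similarity=max(scores),
        min_similarity=min(scores),
        avg_similarity=sum(scores) / len(scores),
        high_similarity_count=sum(1 for s in scores if s >= 0.7),
        medium_similarity_count=sum(1 for s in scores if 0.5 <= s < 0.7),
        low_similarity_count=sum(1 for s in scores if s < 0.5),
    )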


@dataclass
class ComprehensiveScoreDetail:
    """Details behind the comprehensive score P"""
    N: int                                  # Total number of notes
    M: int                                  # Number of fully matched notes
    total_contribution: float               # Total contribution Σ(a×b)
    complete_matches: List[Dict[str, Any]]  # Complete matches (each with its own contribution)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary"""
        return asdict(self)
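

# Illustrative helper (an assumption, not part of the original module): builds a
# ComprehensiveScoreDetail from a list of complete-match dicts. The field comments
# above only say that each entry carries its own contribution; the key name
# 'contribution' used here is hypothetical.
def _build_comprehensive_score_detail(
    total_notes: int,
    complete_matches: List[Dict[str, Any]],
) -> ComprehensiveScoreDetail:
    total_contribution = sum(float(m.get('contribution', 0.0)) for m in complete_matches)
    return ComprehensiveScoreDetail(
        N=total_notes,
        M=len(complete_matches),
        total_contribution=total_contribution,  # plays the role of Σ(a×b)
        complete_matches=complete_matches,
    )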


@dataclass
class SimilarityScore:
    """Similarity score for a single note"""
    note_id: str             # Note ID
    original_feature: str    # Original feature
    evaluation_score: float  # Stage 4 evaluation score
    search_word: str         # Search word

    # Note data
    note_data: Dict[str, Any] = field(default_factory=dict)  # Note information

    # Similar features (sorted by similarity, descending)
    deconstructed_features: List[SimilarFeature] = field(default_factory=list)

    # Statistics
    similarity_statistics: SimilarityStatistics = field(default_factory=SimilarityStatistics)

    # Comprehensive score P (optional, only present if it was computed)
    comprehensive_score: Optional[float] = None
    comprehensive_score_detail: Optional[ComprehensiveScoreDetail] = None

    # Processing time
    processing_time_seconds: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary"""
        result = {
            'note_id': self.note_id,
            'original_feature': self.original_feature,
            'evaluation_score': self.evaluation_score,
            'search_word': self.search_word,
            'note_data': self.note_data,
            'deconstructed_features': [asdict(f) for f in self.deconstructed_features],
            'similarity_statistics': self.similarity_statistics.to_dict(),
            'processing_time_seconds': round(self.processing_time_seconds, 2)
        }
        # Add the comprehensive score (if present)
        if self.comprehensive_score is not None:
            result['comprehensive_score'] = round(self.comprehensive_score, 3)
        if self.comprehensive_score_detail is not None:
            result['comprehensive_score_detail'] = self.comprehensive_score_detail.to_dict()
        return result
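

# Example (illustrative placeholder values, not executed at import time):
# constructing a SimilarityScore and serializing it with to_dict().
def _example_similarity_score() -> Dict[str, Any]:
    feature = SimilarFeature(
        feature_name='example feature',
        dimension='content',
        dimension_detail='topic',
        weight=1.0,
        source_index=0,
        source_info={'stage': 'deconstruction'},
        similarity_score=0.82,
        similarity_explanation='closely related to the original feature',
    )
    score = SimilarityScore(
        note_id='note_001',
        original_feature='original feature text',
        evaluation_score=4.5,
        search_word='example search word',
        deconstructed_features=[feature],
        similarity_statistics=SimilarityStatistics(
            total_features=1,
            max_similarity=0.82,
            min_similarity=0.82,
            avg_similarity=0.82,
            high_similarity_count=1,
        ),
    )
    return score.to_dict()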


@dataclass
class OverallSimilarityStatistics:
    """Overall similarity statistics"""
    total_notes: int                 # Total number of notes
    total_features_extracted: int    # Total number of extracted features
    avg_features_per_note: float     # Average number of features per note
    avg_max_similarity: float        # Average of the per-note maximum similarity
    notes_with_high_similarity: int  # Number of notes containing high-similarity features

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary"""
        return asdict(self)
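

# Illustrative helper (an assumption, not part of the original module): aggregates
# OverallSimilarityStatistics from per-note SimilarityScore objects, following the
# field comments above. The real pipeline may derive these numbers differently.
def _build_overall_statistics(scores: List[SimilarityScore]) -> OverallSimilarityStatistics:
    total_notes = len(scores)
    total_features = sum(s.similarity_statistics.total_features for s in scores)
    max_similarities = [s.similarity_statistics.max_similarity for s in scores]
    return OverallSimilarityStatistics(
        total_notes=total_notes,
        total_features_extracted=total_features,
        avg_features_per_note=total_features / total_notes if total_notes else 0.0,
        avg_max_similarity=sum(max_similarities) / total_notes if total_notes else 0.0,
        notes_with_high_similarity=sum(
            1 for s in scores if s.similarity_statistics.high_similarity_count > 0
        ),
    )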


@dataclass
class PostSimilarityScores:
    """Collection of similarity scores for one post"""
    post_id: str                              # Post ID
    similarity_scores: List[SimilarityScore]  # List of similarity scores

    # Similarity configuration
    algorithm: str = "hybrid_similarity"   # Algorithm name
    weight_embedding: float = 0.5          # Embedding weight
    weight_semantic: float = 0.5           # Semantic weight
    min_similarity_threshold: float = 0.0  # Minimum similarity threshold

    # Target features
    target_features: Optional[List[str]] = None  # List of target features

    # Overall statistics
    overall_statistics: Optional[OverallSimilarityStatistics] = None

    # Timing information
    source_file: str = ""                 # Source file
    created_at: str = ""                  # Creation time
    processing_time_seconds: float = 0.0  # Processing time (seconds)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary"""
        return {
            'metadata': {
                'stage': 'similarity',
                # Description: "similarity scores between deconstructed and original features"
                'description': '解构特征与原始特征的相似度评分',
                'post_id': self.post_id,
                'source_file': self.source_file,
                # '全部' means "all": used when no specific target features were set
                'target_features': self.target_features if self.target_features else '全部',
                'similarity_config': {
                    'algorithm': self.algorithm,
                    'weight_embedding': self.weight_embedding,
                    'weight_semantic': self.weight_semantic,
                    'min_similarity_threshold': self.min_similarity_threshold
                },
                'overall_statistics': self.overall_statistics.to_dict() if self.overall_statistics else None,
                'created_at': self.created_at or datetime.now().isoformat(),
                'processing_time_seconds': round(self.processing_time_seconds, 2)
            },
            'results': [s.to_dict() for s in self.similarity_scores]
        }
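
    # Illustrative helper (not in the original module): writes the to_dict()
    # output to a JSON file, as a counterpart to from_json_file() below. The
    # method name and behaviour are an assumption, not the pipeline's own API.
    def to_json_file(self, file_path: str) -> None:
        import json
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, ensure_ascii=False, indent=2)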

    @classmethod
    def from_json_file(cls, file_path: str) -> 'PostSimilarityScores':
        """Load from a JSON file"""
        import json

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        metadata = data['metadata']
        results_data = data['results']

        # Rebuild the SimilarityScore objects
        scores = []
        for r in results_data:
            # Rebuild the feature list
            features = [
                SimilarFeature(**f) for f in r.get('deconstructed_features', [])
            ]

            # Rebuild the statistics
            stats = SimilarityStatistics(**r['similarity_statistics'])

            # Rebuild the comprehensive score detail
            comprehensive_detail = None
            if 'comprehensive_score_detail' in r:
                comprehensive_detail = ComprehensiveScoreDetail(**r['comprehensive_score_detail'])

            score = SimilarityScore(
                note_id=r['note_id'],
                original_feature=r['original_feature'],
                evaluation_score=r['evaluation_score'],
                search_word=r['search_word'],
                note_data=r['note_data'],
                deconstructed_features=features,
                similarity_statistics=stats,
                comprehensive_score=r.get('comprehensive_score'),
                comprehensive_score_detail=comprehensive_detail,
                processing_time_seconds=r['processing_time_seconds']
            )
            scores.append(score)

        # Rebuild the overall statistics
        overall_stats = None
        if metadata.get('overall_statistics'):
            overall_stats = OverallSimilarityStatistics(**metadata['overall_statistics'])

        # to_dict() writes the string '全部' ("all") when no target features were
        # set; map that back to None so the round trip stays consistent.
        target_features = metadata.get('target_features')
        if not isinstance(target_features, list):
            target_features = None

        similarity_config = metadata['similarity_config']

        return cls(
            post_id=metadata['post_id'],
            similarity_scores=scores,
            algorithm=similarity_config['algorithm'],
            weight_embedding=similarity_config['weight_embedding'],
            weight_semantic=similarity_config['weight_semantic'],
            min_similarity_threshold=similarity_config['min_similarity_threshold'],
            target_features=target_features,
            overall_statistics=overall_stats,
            source_file=metadata['source_file'],
            created_at=metadata['created_at'],
            processing_time_seconds=metadata['processing_time_seconds']
        )
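

# Example usage (illustrative, not executed at import time): writes a
# PostSimilarityScores object to JSON and reads it back via to_dict() /
# from_json_file(). The output path is a placeholder.
def _example_round_trip(scores: PostSimilarityScores) -> PostSimilarityScores:
    import json
    output_path = 'similarity_scores_example.json'  # hypothetical path
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(scores.to_dict(), f, ensure_ascii=False, indent=2)
    return PostSimilarityScores.from_json_file(output_path)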