#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Stage 8 similarity analyzer.

Scores the similarity between the features deconstructed in Stage 7
and the original feature.
"""
import os
import json
import time
import logging
import asyncio
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple

from lib.hybrid_similarity import compare_phrases_cartesian
from lib.config import get_cache_dir

try:
    from tqdm import tqdm
    TQDM_AVAILABLE = True
except ImportError:
    TQDM_AVAILABLE = False

logger = logging.getLogger(__name__)
def extract_deconstructed_features(api_response: Dict) -> List[Dict]:
    """
    Extract all features from the three-point deconstruction.

    Args:
        api_response: The api_response object from Stage 7.

    Returns:
        A list of features; each entry contains:
        - feature_name: feature name
        - dimension: dimension (灵感点-全新内容 / 灵感点-共性差异 / 灵感点-共性内容 / 目的点 / 关键点)
        - dimension_detail: dimension subcategory (substance / form / intent, etc.)
        - weight: weight
        - source_index: index within that dimension
        - source_*: provenance info (candidate number, purpose description, key-point description, etc.)
    """
    features = []

    # Check the API response status
    if api_response.get('status') != 'success':
        logger.warning("API response status is not 'success'; cannot extract features")
        return features

    result = api_response.get('result', {})

    # Check for the 'data' field
    if 'data' not in result:
        logger.warning("API response has no 'data' field")
        return features

    data = result['data']
    three_point = data.get('三点解构', {})
    if not three_point:
        logger.warning("Three-point deconstruction data is empty")
        return features

    # 1. Inspiration points (three subcategories)
    inspiration = three_point.get('灵感点', {})
    for category in ['全新内容', '共性差异', '共性内容']:
        items = inspiration.get(category, [])
        for idx, item in enumerate(items):
            extracted_features = item.get('提取的特征', [])
            for feat in extracted_features:
                feature_name = feat.get('特征名称', '')
                if not feature_name:
                    continue
                features.append({
                    'feature_name': feature_name,
                    'dimension': f'灵感点-{category}',
                    'dimension_detail': feat.get('维度分类', ''),  # note: field name differs per dimension
                    'weight': feat.get('权重', 0),
                    'source_index': idx,
                    'source_candidate_number': item.get('候选编号', 0),
                    'source_inspiration': item.get('灵感点', '')
                })

    # 2. Purpose points
    purpose = three_point.get('目的点', {})
    purposes_list = purpose.get('purposes', [])
    for idx, item in enumerate(purposes_list):
        extracted_features = item.get('提取的特征', [])
        for feat in extracted_features:
            feature_name = feat.get('特征名称', '')
            if not feature_name:
                continue
            features.append({
                'feature_name': feature_name,
                'dimension': '目的点',
                'dimension_detail': feat.get('特征分类', ''),  # note: field name differs per dimension
                'weight': feat.get('权重', 0),
                'source_index': idx,
                'source_purpose': item.get('目的点', ''),
                'source_purpose_dimension': item.get('维度', {})
            })

    # 3. Key points
    key_points_data = three_point.get('关键点', {})
    key_points_list = key_points_data.get('key_points', [])
    for idx, item in enumerate(key_points_list):
        extracted_features = item.get('提取的特征', [])
        for feat in extracted_features:
            feature_name = feat.get('特征名称', '')
            if not feature_name:
                continue
            features.append({
                'feature_name': feature_name,
                'dimension': '关键点',
                'dimension_detail': feat.get('维度', ''),  # note: field name differs per dimension
                'weight': feat.get('权重', 0),
                'source_index': idx,
                'source_candidate_number': item.get('候选编号', 0),
                'source_key_point': item.get('关键点', ''),
                'source_key_point_dimension': item.get('维度', '')
            })

    logger.info(f"Extracted {len(features)} features")
    if features:
        # Tally counts per dimension
        dimension_counts = {}
        for feat in features:
            dim = feat['dimension']
            dimension_counts[dim] = dimension_counts.get(dim, 0) + 1
        logger.info(f"Dimension distribution: {dimension_counts}")

    return features
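
# Hedged example: a minimal Stage 7 api_response that the extractor above
# accepts. Field names mirror the .get() calls in extract_deconstructed_features;
# all values are hypothetical.
_EXAMPLE_API_RESPONSE = {
    'status': 'success',
    'result': {'data': {'三点解构': {
        '灵感点': {
            '全新内容': [{
                '候选编号': 1,
                '灵感点': '镜面反光设计',
                '提取的特征': [{'特征名称': '镜面反光', '维度分类': '形式', '权重': 0.8}]
            }],
            '共性差异': [],
            '共性内容': []
        },
        '目的点': {'purposes': []},
        '关键点': {'key_points': []}
    }}}
}
# extract_deconstructed_features(_EXAMPLE_API_RESPONSE) yields one feature with
# dimension '灵感点-全新内容', dimension_detail '形式', and weight 0.8.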
async def calculate_similarity_for_note(
    note_result: Dict,
    original_feature: str,
    weight_embedding: float = 0.5,
    weight_semantic: float = 0.5,
    min_similarity: float = 0.0
) -> Dict:
    """
    Compute the similarity of all of one note's features against the original feature.

    Args:
        note_result: A single result object from Stage 7.
        original_feature: Name of the original feature.
        weight_embedding: Weight of the embedding model.
        weight_semantic: Weight of the LLM model.
        min_similarity: Minimum similarity threshold; features scoring below it are filtered out.

    Returns:
        A result object enriched with similarity information.
    """
    note_id = note_result.get('note_id', '')
    logger.info(f"[{note_id}] Computing similarity...")

    # 1. Extract the deconstructed features
    deconstructed_features = extract_deconstructed_features(
        note_result['api_response']
    )
    if not deconstructed_features:
        logger.warning(f"[{note_id}] No features extracted")
        return {
            'note_id': note_id,
            'original_feature': original_feature,
            'evaluation_score': note_result.get('evaluation_score', 0),
            'search_word': note_result.get('search_word', ''),
            'note_data': note_result.get('note_data', {}),
            'deconstructed_features': [],
            'similarity_statistics': {
                'total_features': 0,
                'max_similarity': 0,
                'min_similarity': 0,
                'avg_similarity': 0,
                'high_similarity_count': 0,
                'medium_similarity_count': 0,
                'low_similarity_count': 0
            }
        }

    # 2. Build the list of feature names
    feature_names = [f['feature_name'] for f in deconstructed_features]
    logger.info(f"[{note_id}] Calling similarity API (1×{len(feature_names)} Cartesian product)...")

    # 3. Batch similarity computation (1×N Cartesian product)
    # NOTE: weight_embedding and weight_semantic are accepted by this function
    # but are not currently forwarded to compare_phrases_cartesian.
    try:
        start_time = time.time()
        similarity_results = await compare_phrases_cartesian(
            phrases_a=[original_feature],
            phrases_b=feature_names,
            max_concurrent=50
        )
        elapsed = time.time() - start_time
        logger.info(f"[{note_id}] Similarity computation finished ({elapsed:.1f}s)")

        # 4. Map the results back onto the feature objects
        for i, feat in enumerate(deconstructed_features):
            feat['similarity_score'] = similarity_results[0][i]['相似度']
            feat['similarity_explanation'] = similarity_results[0][i]['说明']

        # 5. Filter out low-similarity features
        if min_similarity > 0:
            original_count = len(deconstructed_features)
            deconstructed_features = [
                f for f in deconstructed_features
                if f['similarity_score'] >= min_similarity
            ]
            filtered_count = original_count - len(deconstructed_features)
            if filtered_count > 0:
                logger.info(f"[{note_id}] Filtered out {filtered_count} low-similarity features (< {min_similarity})")

        # 6. Compute statistics
        if deconstructed_features:
            scores = [f['similarity_score'] for f in deconstructed_features]
            statistics = {
                'total_features': len(scores),
                'max_similarity': round(max(scores), 3),
                'min_similarity': round(min(scores), 3),
                'avg_similarity': round(sum(scores) / len(scores), 3),
                'high_similarity_count': sum(1 for s in scores if s >= 0.7),
                'medium_similarity_count': sum(1 for s in scores if 0.5 <= s < 0.7),
                'low_similarity_count': sum(1 for s in scores if s < 0.5)
            }

            # 7. Sort by similarity, descending
            deconstructed_features.sort(key=lambda x: x['similarity_score'], reverse=True)

            logger.info(f"[{note_id}] Stats: max={statistics['max_similarity']}, "
                        f"avg={statistics['avg_similarity']}, "
                        f"high-similarity count={statistics['high_similarity_count']}")
        else:
            statistics = {
                'total_features': 0,
                'max_similarity': 0,
                'min_similarity': 0,
                'avg_similarity': 0,
                'high_similarity_count': 0,
                'medium_similarity_count': 0,
                'low_similarity_count': 0
            }

        return {
            'note_id': note_id,
            'original_feature': original_feature,
            'evaluation_score': note_result.get('evaluation_score', 0),
            'search_word': note_result.get('search_word', ''),
            'note_data': note_result.get('note_data', {}),
            'deconstructed_features': deconstructed_features,
            'similarity_statistics': statistics,
            'processing_time_seconds': round(elapsed, 2)
        }
    except Exception as e:
        logger.error(f"[{note_id}] Similarity computation failed: {e}")
        return {
            'note_id': note_id,
            'original_feature': original_feature,
            'evaluation_score': note_result.get('evaluation_score', 0),
            'search_word': note_result.get('search_word', ''),
            'note_data': note_result.get('note_data', {}),
            'deconstructed_features': [],
            'similarity_statistics': {
                'total_features': 0,
                'max_similarity': 0,  # included so downstream aggregation never KeyErrors
                'error': str(e)
            }
        }
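
# Hedged sketch: the return shape of compare_phrases_cartesian that the mapping
# step above assumes (an A×B matrix of dicts, indexed as
# similarity_results[0][i]['相似度'] / ['说明']). The values shown are hypothetical:
#
#   similarity_results = [
#       [  # row for phrases_a[0], i.e. the original feature
#           {'相似度': 0.82, '说明': 'semantically closely related'},  # vs feature_names[0]
#           {'相似度': 0.41, '说明': 'partially related'},             # vs feature_names[1]
#       ]
#   ]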
class Stage8SimilarityAnalyzer:
    """Stage 8: similarity analysis between deconstructed features and the original feature."""

    def __init__(
        self,
        weight_embedding: float = 0.5,
        weight_semantic: float = 0.5,
        max_workers: int = 5,
        min_similarity: float = 0.0,
        output_dir: str = "output_v2",
        target_features: Optional[List[str]] = None,
        stage6_path: str = 'output_v2/stage6_with_evaluations.json',
        update_stage6: bool = True
    ):
        """
        Initialize the Stage 8 analyzer.

        Args:
            weight_embedding: Weight of the embedding model (default 0.5).
            weight_semantic: Weight of the LLM model (default 0.5).
            max_workers: Maximum concurrency (default 5).
            min_similarity: Minimum similarity threshold (default 0.0, i.e. keep all features).
            output_dir: Output directory.
            target_features: Original features to process (None = process all).
            stage6_path: Path to the Stage 6 data file (used for the comprehensive score).
            update_stage6: Whether to compute and write back the Stage 6 comprehensive scores (default True).
        """
        self.weight_embedding = weight_embedding
        self.weight_semantic = weight_semantic
        self.max_workers = max_workers
        self.min_similarity = min_similarity
        self.output_dir = output_dir
        self.target_features = target_features
        self.stage6_path = stage6_path
        self.update_stage6 = update_stage6

        # Validate the weights
        total_weight = weight_embedding + weight_semantic
        if abs(total_weight - 1.0) > 0.001:
            raise ValueError(f"Weights must sum to 1.0; got {total_weight}")
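
        # Example: weight_embedding=0.6 with weight_semantic=0.4 passes the
        # check (sum 1.0); weight_embedding=0.6 with weight_semantic=0.5
        # raises ValueError (sum 1.1).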
    def _save_intermediate_results(
        self,
        results: List[Dict],
        output_path: str,
        processed_count: int,
        total_count: int,
        start_time: float
    ):
        """Save intermediate (partial) results."""
        base_dir = os.path.dirname(output_path) or self.output_dir
        base_name = os.path.basename(output_path)
        name_without_ext = os.path.splitext(base_name)[0]
        intermediate_path = os.path.join(
            base_dir,
            f"{name_without_ext}_partial_{processed_count}of{total_count}.json"
        )

        # Statistics (.get guards against error results that carry no max_similarity)
        total_features = sum(r['similarity_statistics']['total_features'] for r in results)
        avg_max_sim = sum(r['similarity_statistics'].get('max_similarity', 0) for r in results) / len(results)

        intermediate_result = {
            'metadata': {
                'stage': 'stage8_partial',
                'description': f'Partial results ({processed_count}/{total_count})',
                'processed_notes': len(results),
                'total_features_extracted': total_features,
                'avg_max_similarity': round(avg_max_sim, 3),
                'saved_at': datetime.now().isoformat(),
                'processing_time_seconds': round(time.time() - start_time, 2)
            },
            'results': results
        }

        os.makedirs(base_dir, exist_ok=True)
        with open(intermediate_path, 'w', encoding='utf-8') as f:
            json.dump(intermediate_result, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved intermediate results: {intermediate_path}")
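
    # Example of the partial filename produced above (hypothetical counts): for
    # output_path='output_v2/stage8_similarity_scores.json', processed_count=10,
    # total_count=50, the file written is
    # 'output_v2/stage8_similarity_scores_partial_10of50.json'.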
    async def run_async(
        self,
        stage7_results: Dict,
        output_path: Optional[str] = None
    ) -> Dict:
        """
        Run the Stage 8 similarity analysis (async version).

        Args:
            stage7_results: Stage 7 results.
            output_path: Output path (optional).

        Returns:
            Stage 8 results.
        """
        logger.info("\n" + "=" * 60)
        logger.info("Stage 8: similarity analysis of deconstructed vs. original features")
        logger.info("=" * 60)

        # Log the configuration
        logger.info("Configuration:")
        logger.info(f"  Embedding model weight: {self.weight_embedding}")
        logger.info(f"  LLM model weight: {self.weight_semantic}")
        logger.info(f"  Max concurrency: {self.max_workers}")
        logger.info(f"  Min similarity threshold: {self.min_similarity}")
        if self.target_features:
            logger.info(f"  Target features: {', '.join(self.target_features)}")
        else:
            logger.info("  Target features: all")

        # Default output path
        if output_path is None:
            output_path = os.path.join(self.output_dir, "stage8_similarity_scores.json")

        # Pull the Stage 7 results
        results_list = stage7_results.get('results', [])

        # Filter to the target features
        if self.target_features:
            results_list = [
                r for r in results_list
                if r.get('original_feature') in self.target_features
            ]

        total_notes = len(results_list)
        logger.info(f"Notes to process: {total_notes}")
        if total_notes == 0:
            logger.warning("No notes to process")
            return {
                'metadata': {
                    'stage': 'stage8',
                    'processed_notes': 0
                },
                'results': []
            }

        # Build the task list
        start_time = time.time()
        results = []

        # Bound concurrency with a semaphore
        semaphore = asyncio.Semaphore(self.max_workers)

        async def bounded_task(result):
            async with semaphore:
                return await calculate_similarity_for_note(
                    result,
                    result.get('original_feature', ''),
                    self.weight_embedding,
                    self.weight_semantic,
                    self.min_similarity
                )

        tasks = [bounded_task(result) for result in results_list]

        # Run with a progress bar when available
        if TQDM_AVAILABLE:
            logger.info("Showing progress bar...")
            processed_count = 0
            save_interval = 10
            for coro in tqdm(
                asyncio.as_completed(tasks),
                total=len(tasks),
                desc="Similarity progress",
                unit="note",
                ncols=100
            ):
                result = await coro
                results.append(result)
                processed_count += 1

                # Incremental save
                if processed_count % save_interval == 0:
                    self._save_intermediate_results(
                        results,
                        output_path,
                        processed_count,
                        total_notes,
                        start_time
                    )
        else:
            # Plain execution
            results = await asyncio.gather(*tasks)
            logger.info(f"Done: {len(results)}/{total_notes}")

        processing_time = time.time() - start_time

        # Overall statistics
        total_features = sum(r['similarity_statistics']['total_features'] for r in results)
        all_max_similarities = [
            r['similarity_statistics']['max_similarity']
            for r in results
            if r['similarity_statistics']['total_features'] > 0
        ]
        overall_stats = {
            'total_notes': total_notes,
            'total_features_extracted': total_features,
            'avg_features_per_note': round(total_features / total_notes, 1) if total_notes > 0 else 0,
            'avg_max_similarity': round(sum(all_max_similarities) / len(all_max_similarities), 3) if all_max_similarities else 0,
            'notes_with_high_similarity': sum(1 for r in results if r['similarity_statistics'].get('high_similarity_count', 0) > 0)
        }

        logger.info(f"\nTotal time: {processing_time:.1f}s")
        logger.info(f"Total features: {total_features}")
        logger.info(f"Avg features per note: {overall_stats['avg_features_per_note']}")
        logger.info(f"Avg max similarity: {overall_stats['avg_max_similarity']}")
        logger.info(f"Notes with high-similarity features: {overall_stats['notes_with_high_similarity']}")

        # Build the final result
        final_result = {
            'metadata': {
                'stage': 'stage8',
                'description': 'Similarity scores between deconstructed and original features',
                # note: this records the Stage 7 creation timestamp, not a file path
                'source_file': stage7_results.get('metadata', {}).get('created_at', ''),
                'target_features': self.target_features if self.target_features else 'all',
                'similarity_config': {
                    'algorithm': 'hybrid_similarity',
                    'weight_embedding': self.weight_embedding,
                    'weight_semantic': self.weight_semantic,
                    'min_similarity_threshold': self.min_similarity
                },
                'overall_statistics': overall_stats,
                'created_at': datetime.now().isoformat(),
                'processing_time_seconds': round(processing_time, 2)
            },
            'results': results
        }

        # Save the results
        os.makedirs(os.path.dirname(output_path) or self.output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(final_result, f, ensure_ascii=False, indent=2)
        logger.info(f"Results saved: {output_path}")

        # Compute and write back the comprehensive score P
        if self.update_stage6:
            logger.info("\n" + "=" * 60)
            logger.info("Computing comprehensive score P and updating Stage 6 data...")
            logger.info("=" * 60)
            self._calculate_and_update_comprehensive_scores(results)

        return final_result
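
    # Hedged usage sketch for driving run_async from an async context (the
    # input path matches test_stage8_analyzer below; everything else is
    # illustrative):
    #
    #   async def main():
    #       with open('output_v2/stage7_with_deconstruction.json', encoding='utf-8') as f:
    #           stage7 = json.load(f)
    #       analyzer = Stage8SimilarityAnalyzer(max_workers=5)
    #       await analyzer.run_async(stage7)
    #
    #   asyncio.run(main())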
    def _calculate_and_update_comprehensive_scores(self, stage8_results: List[Dict]):
        """
        Compute the comprehensive score P and update the Stage 6 data.

        Args:
            stage8_results: The list of Stage 8 results.
        """
        try:
            # 1. Load the Stage 6 data
            logger.info(f"Loading Stage 6 data: {self.stage6_path}")
            if not os.path.exists(self.stage6_path):
                logger.error(f"Stage 6 file does not exist: {self.stage6_path}")
                return
            with open(self.stage6_path, 'r', encoding='utf-8') as f:
                stage6_data = json.load(f)

            # 2. Build the Stage 8 mapping (note_id → max_similarity)
            logger.info("Building similarity map...")
            similarity_map = {}
            for result in stage8_results:
                note_id = result['note_id']
                # .get guards against error results without a max_similarity
                max_similarity = result['similarity_statistics'].get('max_similarity', 0)
                similarity_map[note_id] = max_similarity
            logger.info(f"Similarity map entries: {len(similarity_map)}")

            # 3. Walk every original feature and search word in Stage 6 and compute P.
            # The Stage 6 data is a list; each element is one original feature.
            updated_count = 0
            total_searches = 0
            logger.info(f"Iterating over {len(stage6_data)} original features...")
            for feature_item in stage6_data:
                original_feature = feature_item.get('原始特征名称', '')
                logger.info(f"\nProcessing original feature: {original_feature}")

                # Iterate over each group
                for group in feature_item.get('组合评估结果_分组', []):
                    source_word = group.get('source_word', '')

                    # Iterate over all search words in this group
                    for search_item in group.get('top10_searches', []):
                        search_word = search_item.get('search_word', '')
                        total_searches += 1
                        logger.info(f"  Processing search word: {search_word} (source: {source_word})")

                        # Compute the comprehensive score for this search word
                        p_score, p_detail = self._calculate_single_query_score(
                            search_item,
                            similarity_map
                        )

                        # Update the search word's data
                        if p_score is not None:
                            search_item['comprehensive_score'] = round(p_score, 3)
                            search_item['comprehensive_score_detail'] = p_detail
                            updated_count += 1
                            logger.info(f"  Comprehensive score P = {p_score:.3f} (M={p_detail['M']}, N={p_detail['N']})")
                        else:
                            logger.warning("  Could not compute comprehensive score (data may be missing)")

            # 4. Save the updated Stage 6 data
            logger.info("\nSaving updated Stage 6 data...")
            logger.info(f"Updated {updated_count}/{total_searches} search words")
            with open(self.stage6_path, 'w', encoding='utf-8') as f:
                json.dump(stage6_data, f, ensure_ascii=False, indent=2)
            logger.info(f"Update complete: {self.stage6_path}")
        except Exception as e:
            logger.error(f"Computing comprehensive scores failed: {e}", exc_info=True)
    def _calculate_single_query_score(
        self,
        query: Dict,
        similarity_map: Dict[str, float]
    ) -> Tuple[Optional[float], Optional[Dict]]:
        """
        Compute the comprehensive score P for a single query.

        Args:
            query: A single query object from Stage 6.
            similarity_map: Mapping of note_id → max_similarity.

        Returns:
            (P value, detailed calculation info), or (None, None).
        """
        # Total number of notes, N
        evaluation_with_filter = query.get('evaluation_with_filter', {})
        N = evaluation_with_filter.get('total_notes', 0)
        if N == 0:
            logger.warning("  Query has zero notes; cannot compute P")
            return None, None

        # Note evaluations and the raw note data
        notes_evaluation = evaluation_with_filter.get('notes_evaluation', [])
        search_result = query.get('search_result', {})
        notes_data = search_result.get('data', {}).get('data', [])
        if not notes_evaluation or not notes_data:
            logger.warning("  Missing evaluation data or note data")
            return 0.0, {
                'N': N,
                'M': 0,
                'total_contribution': 0.0,
                'complete_matches': []
            }

        # Collect the complete matches (综合得分 >= 0.8)
        complete_matches_data = []
        for note_eval in notes_evaluation:
            score = note_eval.get('综合得分', 0)
            if score >= 0.8:
                note_index = note_eval.get('note_index', -1)
                if 0 <= note_index < len(notes_data):
                    # Pull the note_id from the raw data
                    note_id = notes_data[note_index].get('id', '')
                    note_card = notes_data[note_index].get('note_card', {})
                    note_title = note_card.get('display_title', '')
                    complete_matches_data.append({
                        'note_id': note_id,
                        'note_title': note_title,
                        'evaluation_score': score,
                        'note_index': note_index
                    })

        M = len(complete_matches_data)
        logger.info(f"  Complete matches: M = {M}/{N}")
        if M == 0:
            # No complete matches, so P = 0
            return 0.0, {
                'N': N,
                'M': 0,
                'total_contribution': 0.0,
                'complete_matches': []
            }

        # Contribution of each complete match: a×b
        contributions = []
        total_contribution = 0.0
        for match in complete_matches_data:
            note_id = match['note_id']
            evaluation_score = match['evaluation_score']  # the a value

            # Look up the b value in similarity_map
            max_similarity = similarity_map.get(note_id, 0)  # the b value

            # Contribution
            contribution = evaluation_score * max_similarity
            total_contribution += contribution

            # Keep the details
            contributions.append({
                'note_id': note_id,
                'note_title': match['note_title'],
                'evaluation_score': round(evaluation_score, 3),
                'max_similarity': round(max_similarity, 3),
                'contribution': round(contribution, 3)
            })

        # Comprehensive score P = Σ(a×b) / N
        P = total_contribution / N

        # Sort contributions descending
        contributions.sort(key=lambda x: x['contribution'], reverse=True)

        # Detailed info
        detail = {
            'N': N,
            'M': M,
            'total_contribution': round(total_contribution, 3),
            'complete_matches': contributions
        }
        return P, detail
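
    # Worked example for P = Σ(a×b) / N (hypothetical numbers): with N=10 notes
    # and M=2 complete matches,
    #   match 1: a=0.90, b=0.80 → a×b = 0.72
    #   match 2: a=0.85, b=0.60 → a×b = 0.51
    # total_contribution = 1.23 and P = 1.23 / 10 = 0.123 (the caller then
    # stores round(p_score, 3)).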
    def run(
        self,
        stage7_results: Dict,
        output_path: Optional[str] = None
    ) -> Dict:
        """
        Run the Stage 8 similarity analysis (synchronous wrapper).

        Args:
            stage7_results: Stage 7 results.
            output_path: Output path (optional).

        Returns:
            Stage 8 results.
        """
        return asyncio.run(self.run_async(stage7_results, output_path))
def test_stage8_analyzer():
    """Smoke-test the Stage 8 analyzer."""
    # Load the Stage 7 results
    stage7_path = "output_v2/stage7_with_deconstruction.json"
    if not os.path.exists(stage7_path):
        print(f"Stage 7 results not found: {stage7_path}")
        return
    with open(stage7_path, 'r', encoding='utf-8') as f:
        stage7_results = json.load(f)

    # Build the analyzer
    analyzer = Stage8SimilarityAnalyzer(
        weight_embedding=0.5,
        weight_semantic=0.5,
        max_workers=3,
        min_similarity=0.3,
        target_features=["墨镜"]
    )

    # Run the analysis
    stage8_results = analyzer.run(stage7_results)
    print(f"\nProcessed {stage8_results['metadata']['overall_statistics']['total_notes']} notes")
    print(f"Extracted {stage8_results['metadata']['overall_statistics']['total_features_extracted']} features")
    print(f"Avg max similarity: {stage8_results['metadata']['overall_statistics']['avg_max_similarity']}")


if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    test_stage8_analyzer()