# analyze_associations.py
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 分析 dimension_associations_analysis.json 中的关联关系
  5. """
import json
import statistics
import sys
from collections import defaultdict, Counter
from typing import Any, Dict, List, Optional
  9. def load_data(file_path: str) -> Dict:
  10. """加载JSON数据"""
  11. with open(file_path, 'r', encoding='utf-8') as f:
  12. return json.load(f)
  13. def analyze_basic_info(data: Dict) -> None:
  14. """分析基本信息"""
  15. print("=" * 80)
  16. print("📊 基本信息分析")
  17. print("=" * 80)
  18. info = data.get("分析说明", {})
  19. print(f"\n分析类型: {', '.join(info.get('分析类型', []))}")
  20. print(f"最小共同帖子数: {info.get('最小共同帖子数', 0)}")
  21. print(f"\n维度统计:")
  22. print(f" 灵感点: {info.get('灵感点分类数(全部)', 0)} 个分类 (非一级: {info.get('灵感点非一级分类数', 0)})")
  23. print(f" 目的点: {info.get('目的点分类数(全部)', 0)} 个分类 (非一级: {info.get('目的点非一级分类数', 0)})")
  24. print(f" 关键点: {info.get('关键点分类数(全部)', 0)} 个分类 (非一级: {info.get('关键点非一级分类数', 0)})")
  25. def analyze_single_dimension(data: Dict) -> None:
  26. """分析单维度关联"""
  27. print("\n" + "=" * 80)
  28. print("🔗 单维度关联分析")
  29. print("=" * 80)
  30. single_dim = data.get("单维度关联分析", {})
  31. for dimension_name, dimension_data in single_dim.items():
  32. print(f"\n【{dimension_name}】")
  33. print(f"说明: {dimension_data.get('说明', '')}")
  34. # 统计每种关联方向
  35. for direction, associations in dimension_data.items():
  36. if direction == "说明":
  37. continue
  38. print(f"\n {direction}:")
  39. # 统计总体情况
  40. total_sources = len(associations)
  41. total_associations = 0
  42. high_similarity = [] # 高相似度关联
  43. high_overlap = [] # 高重叠系数关联
  44. for source_name, source_data in associations.items():
  45. assoc_list = source_data.get("与目的点的关联", []) or \
  46. source_data.get("与关键点的关联", []) or \
  47. source_data.get("与灵感点的关联", [])
  48. total_associations += len(assoc_list)
  49. # 找出高相似度和高重叠系数的关联
  50. for assoc in assoc_list:
  51. jaccard = assoc.get("Jaccard相似度", 0)
  52. overlap = assoc.get("重叠系数", 0)
  53. if jaccard >= 0.5:
  54. high_similarity.append({
  55. "源": source_name,
  56. "目标": assoc.get("目标分类", ""),
  57. "Jaccard": jaccard,
  58. "共同帖子数": assoc.get("共同帖子数", 0)
  59. })
  60. if overlap >= 0.8:
  61. high_overlap.append({
  62. "源": source_name,
  63. "目标": assoc.get("目标分类", ""),
  64. "重叠系数": overlap,
  65. "共同帖子数": assoc.get("共同帖子数", 0)
  66. })
  67. print(f" 总源分类数: {total_sources}")
  68. print(f" 总关联数: {total_associations}")
  69. print(f" 平均每个源分类的关联数: {total_associations/total_sources:.2f}" if total_sources > 0 else " 平均每个源分类的关联数: 0")
  70. if high_similarity:
  71. print(f"\n 🔥 高相似度关联 (Jaccard >= 0.5): {len(high_similarity)} 个")
  72. for item in sorted(high_similarity, key=lambda x: x["Jaccard"], reverse=True)[:5]:
  73. print(f" • {item['源']} → {item['目标']}")
  74. print(f" Jaccard: {item['Jaccard']:.4f}, 共同帖子: {item['共同帖子数']}")
  75. if high_overlap:
  76. print(f"\n 🎯 高重叠系数关联 (重叠 >= 0.8): {len(high_overlap)} 个")
  77. for item in sorted(high_overlap, key=lambda x: x["重叠系数"], reverse=True)[:5]:
  78. print(f" • {item['源']} → {item['目标']}")
  79. print(f" 重叠系数: {item['重叠系数']:.4f}, 共同帖子: {item['共同帖子数']}")
  80. def analyze_triple_dimension(data: Dict) -> None:
  81. """分析三维正交关联"""
  82. print("\n" + "=" * 80)
  83. print("🎲 三维正交关联分析")
  84. print("=" * 80)
  85. triple_dim = data.get("三维正交关联分析", {})
  86. if not triple_dim:
  87. print("未找到三维正交关联数据")
  88. return
  89. # 按灵感点分类组织
  90. total_inspiration_classes = len(triple_dim)
  91. total_orthogonal_combinations = 0
  92. all_combinations = []
  93. print(f"\n灵感点分类数: {total_inspiration_classes}")
  94. for inspiration_class, inspiration_data in triple_dim.items():
  95. orthogonal_list = inspiration_data.get("正交关联", [])
  96. total_orthogonal_combinations += len(orthogonal_list)
  97. for combo in orthogonal_list:
  98. all_combinations.append({
  99. "灵感点": inspiration_class,
  100. "目的点": combo.get("目的点分类", ""),
  101. "关键点": combo.get("关键点分类", ""),
  102. "三维共同帖子数": combo.get("三维共同帖子数", 0),
  103. "三维交集占灵感点比例": combo.get("三维交集占灵感点比例", 0),
  104. "三维交集占目的点比例": combo.get("三维交集占目的点比例", 0),
  105. "三维交集占关键点比例": combo.get("三维交集占关键点比例", 0),
  106. "共同帖子ID": combo.get("三维共同帖子ID", [])
  107. })
  108. print(f"总正交组合数: {total_orthogonal_combinations}")
  109. print(f"平均每个灵感点的正交组合数: {total_orthogonal_combinations/total_inspiration_classes:.2f}" if total_inspiration_classes > 0 else "平均每个灵感点的正交组合数: 0")
  110. if all_combinations:
  111. post_counts = [c["三维共同帖子数"] for c in all_combinations]
  112. print(f"\n正交组合帖子数统计:")
  113. print(f" 平均值: {sum(post_counts)/len(post_counts):.2f}")
  114. print(f" 最大值: {max(post_counts)}")
  115. print(f" 最小值: {min(post_counts)}")
  116. # 高频组合
  117. high_post_combinations = [c for c in all_combinations if c["三维共同帖子数"] >= 2]
  118. if high_post_combinations:
  119. print(f"\n🌟 高频三维正交组合 (三维共同帖子数 >= 2): {len(high_post_combinations)} 个")
  120. for combo in sorted(high_post_combinations, key=lambda x: x["三维共同帖子数"], reverse=True)[:10]:
  121. print(f"\n 三维共同帖子数: {combo['三维共同帖子数']}")
  122. print(f" 灵感点: {combo['灵感点']}")
  123. print(f" 目的点: {combo['目的点']}")
  124. print(f" 关键点: {combo['关键点']}")
  125. print(f" 交集占比 - 灵感:{combo['三维交集占灵感点比例']:.2f} 目的:{combo['三维交集占目的点比例']:.2f} 关键:{combo['三维交集占关键点比例']:.2f}")
  126. # 高交集占比组合
  127. high_ratio_combinations = [c for c in all_combinations if
  128. c["三维交集占灵感点比例"] >= 0.5 and
  129. c["三维交集占目的点比例"] >= 0.5 and
  130. c["三维交集占关键点比例"] >= 0.5]
  131. if high_ratio_combinations:
  132. print(f"\n🔥 高交集占比正交组合 (三维度占比均 >= 0.5): {len(high_ratio_combinations)} 个")
  133. for combo in sorted(high_ratio_combinations, key=lambda x: x["三维共同帖子数"], reverse=True)[:5]:
  134. print(f"\n 三维共同帖子数: {combo['三维共同帖子数']}")
  135. print(f" 灵感点: {combo['灵感点']}")
  136. print(f" 目的点: {combo['目的点']}")
  137. print(f" 关键点: {combo['关键点']}")
  138. print(f" 交集占比 - 灵感:{combo['三维交集占灵感点比例']:.2f} 目的:{combo['三维交集占目的点比例']:.2f} 关键:{combo['三维交集占关键点比例']:.2f}")
  139. def analyze_association_strength(data: Dict) -> None:
  140. """分析关联强度分布"""
  141. print("\n" + "=" * 80)
  142. print("📈 关联强度分布分析")
  143. print("=" * 80)
  144. single_dim = data.get("单维度关联分析", {})
  145. all_jaccard = []
  146. all_overlap = []
  147. all_coverage_source = []
  148. all_coverage_target = []
  149. for dimension_name, dimension_data in single_dim.items():
  150. for direction, associations in dimension_data.items():
  151. if direction == "说明":
  152. continue
  153. for source_name, source_data in associations.items():
  154. assoc_list = source_data.get("与目的点的关联", []) or \
  155. source_data.get("与关键点的关联", []) or \
  156. source_data.get("与灵感点的关联", [])
  157. for assoc in assoc_list:
  158. all_jaccard.append(assoc.get("Jaccard相似度", 0))
  159. all_overlap.append(assoc.get("重叠系数", 0))
  160. # 根据direction确定覆盖率字段
  161. if "灵感点→" in direction:
  162. all_coverage_source.append(assoc.get("灵感点覆盖率", 0))
  163. elif "目的点→" in direction:
  164. all_coverage_source.append(assoc.get("目的点覆盖率", 0))
  165. elif "关键点→" in direction:
  166. all_coverage_source.append(assoc.get("关键点覆盖率", 0))
  167. all_coverage_target.append(assoc.get("目标维度覆盖率", 0))
  168. if all_jaccard:
  169. print(f"\nJaccard相似度分布:")
  170. print(f" 平均值: {sum(all_jaccard)/len(all_jaccard):.4f}")
  171. print(f" 中位数: {sorted(all_jaccard)[len(all_jaccard)//2]:.4f}")
  172. print(f" 最大值: {max(all_jaccard):.4f}")
  173. print(f" 最小值: {min(all_jaccard):.4f}")
  174. # 分段统计
  175. ranges = [(0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.0)]
  176. for low, high in ranges:
  177. count = sum(1 for j in all_jaccard if low <= j < high)
  178. pct = count / len(all_jaccard) * 100
  179. print(f" [{low:.1f}, {high:.1f}): {count} ({pct:.1f}%)")
  180. if all_overlap:
  181. print(f"\n重叠系数分布:")
  182. print(f" 平均值: {sum(all_overlap)/len(all_overlap):.4f}")
  183. print(f" 中位数: {sorted(all_overlap)[len(all_overlap)//2]:.4f}")
  184. print(f" 最大值: {max(all_overlap):.4f}")
  185. print(f" 最小值: {min(all_overlap):.4f}")
  186. # 统计完全重叠(1.0)的数量
  187. perfect_overlap = sum(1 for o in all_overlap if o == 1.0)
  188. print(f" 完全重叠(1.0): {perfect_overlap} ({perfect_overlap/len(all_overlap)*100:.1f}%)")
  189. def main():
  190. file_path = "/Users/liulidong/project/pattern相关文件/optimization/dimension_associations_analysis.json"
  191. print("🔍 加载数据...")
  192. data = load_data(file_path)
  193. # 执行各项分析
  194. analyze_basic_info(data)
  195. analyze_single_dimension(data)
  196. analyze_triple_dimension(data)
  197. analyze_association_strength(data)
  198. print("\n" + "=" * 80)
  199. print("✅ 分析完成!")
  200. print("=" * 80)
  201. if __name__ == "__main__":
  202. main()