pattern_data_process.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. #!/usr/bin/env python3
  2. """
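Slim down pattern data: the pattern lists of every depth variant listed in
TOP_KEYS are merged into one list and deduplicated by their item names.
Each pattern keeps only s (support), l (length) and i (the deduplicated item
names joined as "A+B+C"); patterns with support below 0.1 are dropped and the
500 highest-support entries are written as single-line JSON.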
  5. """
  6. import json
  7. from pathlib import Path
  8. INPUT_FILE = (
  9. Path(__file__).resolve().parent / "input/家有大志/原始数据/pattern/processed_edge_data.json"
  10. )
  11. OUTPUT_DIR = Path(__file__).resolve().parent / "input/家有大志/pattern"
  12. OUTPUT_FILE = OUTPUT_DIR / "processed_edge_data.json"
  13. TOP_KEYS = [
  14. "depth_max_with_name",
  15. "depth_mixed",
  16. "depth_max_concrete",
  17. "depth2_medium",
  18. "depth1_abstract",
  19. "depth_max_minus_1",
  20. "depth_max_minus_2",
  21. "depth_3",
  22. "depth_4",
  23. ]
  24. SUB_KEYS = ["two_x", "one_x", "zero_x"]
  25. def slim_pattern(p):
  26. """提取 name 列表,去重保序,返回 (support, length, items_key)。"""
  27. names = [item["name"] for item in (p.get("items") or [])]
  28. # 内部去重,保序
  29. seen = set()
  30. unique = []
  31. for n in names:
  32. if n not in seen:
  33. seen.add(n)
  34. unique.append(n)
  35. support = round(float(p["support"]), 4)
  36. length = p["length"]
  37. return support, length, unique
  38. def to_short_entry(support, length, names):
  39. """短格式:无 id,s/l/i,i 为 'A+B+C'。"""
  40. return {"s": support, "l": length, "i": "+".join(names)}
  41. def merge_and_dedupe(patterns):
  42. """按 items 的 name 集合去重(不区分顺序),留 support 最大;再按 s*l 降序;输出短格式。"""
  43. key_to_best = {}
  44. for p in patterns:
  45. support, length, unique = slim_pattern(p)
  46. key = tuple(sorted(unique)) # 同名字集合、顺序不同算同一条
  47. if key not in key_to_best or support > key_to_best[key][0]:
  48. key_to_best[key] = (support, length)
  49. out = [to_short_entry(s, l, list(k)) for k, (s, l) in key_to_best.items() if s >= 0.1]
  50. out.sort(key=lambda x: x["s"] * x["l"], reverse=True)
  51. return out
  52. def main():
  53. with open(INPUT_FILE, "r", encoding="utf-8") as f:
  54. data = json.load(f)
  55. all_patterns = []
  56. for top in TOP_KEYS:
  57. if top not in data:
  58. continue
  59. block = data[top]
  60. for sub in SUB_KEYS:
  61. all_patterns.extend(block.get(sub) or [])
  62. result = merge_and_dedupe(all_patterns)
  63. result.sort(key=lambda x: x["s"], reverse=True)
  64. result = result[:500]
  65. OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
  66. with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
  67. json.dump(result, f, ensure_ascii=False, indent=None, separators=(",", ":"))
  68. f.write("\n")
  69. print(f"已输出: {OUTPUT_FILE} (共 {len(result)} 条 pattern)")
  70. if __name__ == "__main__":
  71. main()