pattern_data_process.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
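#!/usr/bin/env python3
"""
Slim down pattern data: merge the five depth variants into one list and de-duplicate by items.

Each pattern keeps only s/l/i, where i is the de-duplicated names joined as "A+B+C"; output is single-line JSON.
"""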
import json
from pathlib import Path
  8. INPUT_FILE = (
  9. Path(__file__).resolve().parent / "input/家有大志/原始数据/pattern/processed_edge_data.json"
  10. )
  11. OUTPUT_DIR = Path(__file__).resolve().parent / "input/家有大志/pattern"
  12. OUTPUT_FILE = OUTPUT_DIR / "processed_edge_data.json"
  13. TOP_KEYS = [
  14. "depth_max_with_name",
  15. "depth_mixed",
  16. "depth_max_concrete",
  17. "depth2_medium",
  18. "depth1_abstract",
  19. ]
  20. SUB_KEYS = ["two_x", "one_x", "zero_x"]
  21. def slim_pattern(p):
  22. """提取 name 列表,去重保序,返回 (support, length, items_key)。"""
  23. names = [item["name"] for item in (p.get("items") or [])]
  24. # 内部去重,保序
  25. seen = set()
  26. unique = []
  27. for n in names:
  28. if n not in seen:
  29. seen.add(n)
  30. unique.append(n)
  31. support = round(float(p["support"]), 4)
  32. length = p["length"]
  33. return support, length, unique
  34. def to_short_entry(support, length, names):
  35. """短格式:无 id,s/l/i,i 为 'A+B+C'。"""
  36. return {"s": support, "l": length, "i": "+".join(names)}
  37. def merge_and_dedupe(patterns):
  38. """按 items 的 name 集合去重(不区分顺序),留 support 最大;再按 s*l 降序;输出短格式。"""
  39. key_to_best = {}
  40. for p in patterns:
  41. support, length, unique = slim_pattern(p)
  42. key = tuple(sorted(unique)) # 同名字集合、顺序不同算同一条
  43. if key not in key_to_best or support > key_to_best[key][0]:
  44. key_to_best[key] = (support, length)
  45. out = [to_short_entry(s, l, list(k)) for k, (s, l) in key_to_best.items() if s >= 0.1]
  46. out.sort(key=lambda x: x["s"] * x["l"], reverse=True)
  47. return out
  48. def main():
  49. with open(INPUT_FILE, "r", encoding="utf-8") as f:
  50. data = json.load(f)
  51. all_patterns = []
  52. for top in TOP_KEYS:
  53. if top not in data:
  54. continue
  55. block = data[top]
  56. for sub in SUB_KEYS:
  57. all_patterns.extend(block.get(sub) or [])
  58. result = merge_and_dedupe(all_patterns)
  59. result.sort(key=lambda x: x["s"], reverse=True)
  60. result = result[:500]
  61. OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
  62. with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
  63. json.dump(result, f, ensure_ascii=False, indent=None, separators=(",", ":"))
  64. f.write("\n")
  65. print(f"已输出: {OUTPUT_FILE} (共 {len(result)} 条 pattern)")
  66. if __name__ == "__main__":
  67. main()