#!/usr/bin/env python3
"""
Slim down pattern data: the five depth variants are merged into a single list
and de-duplicated by their items.

Each pattern keeps only the keys s/l/i, where ``i`` is the de-duplicated item
names joined as "A+B+C". The result is written as single-line JSON.
"""
import json
from pathlib import Path

INPUT_FILE = (
    Path(__file__).resolve().parent
    / "input/家有大志/原始数据/pattern/processed_edge_data.json"
)
OUTPUT_DIR = Path(__file__).resolve().parent / "input/家有大志/pattern"
OUTPUT_FILE = OUTPUT_DIR / "processed_edge_data.json"

TOP_KEYS = [
    "depth_max_with_name",
    "depth_mixed",
    "depth_max_concrete",
    "depth2_medium",
    "depth1_abstract",
]
SUB_KEYS = ["two_x", "one_x", "zero_x"]

# Patterns whose (rounded) support is below this value are dropped.
MIN_SUPPORT = 0.1
# Upper bound on the number of patterns written to the output file.
MAX_PATTERNS = 500


def slim_pattern(p):
    """Reduce one raw pattern to ``(support, length, unique_names)``.

    Args:
        p: Mapping with "support", "length" and an optional "items" list whose
            elements each carry a "name" key. A missing or ``None`` "items"
            is treated as an empty list.

    Returns:
        A tuple ``(support, length, names)`` where ``support`` is
        ``p["support"]`` converted to float and rounded to 4 decimals,
        ``length`` is ``p["length"]`` unchanged, and ``names`` is the list of
        item names with duplicates removed, first occurrence order kept.

    Raises:
        KeyError: If "support", "length" or an item's "name" is missing.
    """
    names = [item["name"] for item in (p.get("items") or [])]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique = list(dict.fromkeys(names))
    support = round(float(p["support"]), 4)
    length = p["length"]
    return support, length, unique


def to_short_entry(support, length, names):
    """Build the short output record: no id, only s/l/i with i as 'A+B+C'."""
    return {"s": support, "l": length, "i": "+".join(names)}


def merge_and_dedupe(patterns, min_support=MIN_SUPPORT):
    """De-duplicate patterns by their set of item names and emit short records.

    Patterns with the same names in a different order count as the same
    pattern; the one with the largest support wins (the first one seen wins
    on ties). Survivors below ``min_support`` are dropped and the rest are
    sorted by ``s * l`` in descending order.

    Args:
        patterns: Iterable of raw pattern mappings accepted by
            ``slim_pattern``.
        min_support: Minimum rounded support a pattern needs to be kept.

    Returns:
        List of ``{"s", "l", "i"}`` dicts. The names inside ``i`` appear in
        sorted order because the sorted name tuple is used as the merge key.
    """
    best_by_key = {}
    for pattern in patterns:
        support, length, unique = slim_pattern(pattern)
        # Sorting makes the key independent of the item order.
        key = tuple(sorted(unique))
        current = best_by_key.get(key)
        if current is None or support > current[0]:
            best_by_key[key] = (support, length)

    entries = [
        to_short_entry(support, length, list(key))
        for key, (support, length) in best_by_key.items()
        if support >= min_support
    ]
    entries.sort(key=lambda entry: entry["s"] * entry["l"], reverse=True)
    return entries


def collect_patterns(data):
    """Gather the raw patterns of every TOP_KEYS/SUB_KEYS bucket in order.

    Buckets that are missing or ``None`` (at either level) contribute
    nothing, so a null top-level block no longer raises.
    """
    collected = []
    for top in TOP_KEYS:
        block = data.get(top) or {}
        for sub in SUB_KEYS:
            collected.extend(block.get(sub) or [])
    return collected


def main():
    """Read INPUT_FILE, merge and trim its patterns, and write OUTPUT_FILE."""
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    result = merge_and_dedupe(collect_patterns(data))
    # Stable sort: entries with equal support keep their s*l ordering.
    result.sort(key=lambda entry: entry["s"], reverse=True)
    result = result[:MAX_PATTERNS]

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=None, separators=(",", ":"))
        f.write("\n")
    print(f"已输出: {OUTPUT_FILE} (共 {len(result)} 条 pattern)")


if __name__ == "__main__":
    main()