| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- #!/usr/bin/env python3
- """
- pattern 数据精简:5 种 depth 合并为一份,按 items 去重;
- pattern 仅保留 s/l/i,i 为去重后字符串 "A+B+C",单行 JSON。
- """
- import json
- from pathlib import Path
# Raw pattern dump, located relative to this script's own directory.
INPUT_FILE = Path(__file__).resolve().parent.joinpath(
    "input/家有大志/原始数据/pattern/processed_edge_data.json"
)

# Directory and file that receive the slimmed-down result.
OUTPUT_DIR = Path(__file__).resolve().parent.joinpath("input/家有大志/pattern")
OUTPUT_FILE = OUTPUT_DIR.joinpath("processed_edge_data.json")

# Top-level keys of the input JSON whose pattern lists are merged together.
# Keys absent from the input are skipped by main().
TOP_KEYS = [
    "depth_max_with_name",
    "depth_mixed",
    "depth_max_concrete",
    "depth2_medium",
    "depth1_abstract",
]

# Keys read from each top-level block; each is expected to hold a list of
# patterns (a missing or empty value is treated as an empty list).
SUB_KEYS = [
    "two_x",
    "one_x",
    "zero_x",
]
def slim_pattern(p):
    """Reduce one raw pattern to the three values kept downstream.

    Reads the ``name`` of every entry in ``p["items"]`` (a missing or empty
    ``items`` value yields no names), drops repeated names while keeping the
    order of first appearance, and returns ``(support, length, names)``:

    * ``support`` -- ``p["support"]`` converted to ``float`` and rounded to
      4 decimal places;
    * ``length`` -- ``p["length"]`` passed through unchanged;
    * ``names`` -- the de-duplicated list of names.
    """
    raw_items = p.get("items") or []
    # Dict keys keep insertion order, so this removes repeats without
    # disturbing the order in which names first occur.
    ordered_names = list(dict.fromkeys(entry["name"] for entry in raw_items))
    return round(float(p["support"]), 4), p["length"], ordered_names
def to_short_entry(support, length, names):
    """Build the compact output record for one pattern.

    The record has no id; it holds only ``s`` (support), ``l`` (length) and
    ``i``, the names joined with ``+`` (for example ``"A+B+C"``).
    """
    joined_names = "+".join(names)
    return {"s": support, "l": length, "i": joined_names}
def merge_and_dedupe(patterns, *, min_support=0.1):
    """Collapse patterns that share the same set of names into short entries.

    Two patterns count as duplicates when their de-duplicated names are the
    same regardless of order. Among duplicates the one with the highest
    support wins; on a tie the one seen first is kept.

    Args:
        patterns: Iterable of raw pattern dicts accepted by ``slim_pattern``.
        min_support: Entries whose (rounded) support is below this value are
            dropped. Defaults to 0.1, the threshold this function previously
            had hard-coded.

    Returns:
        A list of short-format dicts (see ``to_short_entry``) sorted by
        ``s * l`` in descending order. Because the grouping key is the sorted
        name tuple, the names inside ``i`` appear in sorted order.
    """
    best_by_names = {}
    for pattern in patterns:
        support, length, names = slim_pattern(pattern)
        # Sorting makes the key order-insensitive: same names, any order,
        # map to the same slot.
        names_key = tuple(sorted(names))
        current = best_by_names.get(names_key)
        if current is None or support > current[0]:
            best_by_names[names_key] = (support, length)

    entries = [
        to_short_entry(support, length, list(names_key))
        for names_key, (support, length) in best_by_names.items()
        if support >= min_support
    ]
    entries.sort(key=lambda entry: entry["s"] * entry["l"], reverse=True)
    return entries
def main():
    """Merge every pattern list from the input file and write the slim output.

    Loads INPUT_FILE, gathers the lists found under each SUB_KEYS entry of
    each TOP_KEYS block present in the data, de-duplicates them with
    ``merge_and_dedupe``, keeps the 500 entries with the highest ``s``, and
    writes them to OUTPUT_FILE as single-line compact JSON followed by a
    newline. A one-line summary is printed at the end.
    """
    with open(INPUT_FILE, "r", encoding="utf-8") as src:
        data = json.load(src)

    # Flatten all available depth blocks into one list; blocks missing from
    # the input are skipped and empty/missing sub-lists contribute nothing.
    collected = [
        pattern
        for depth_key in TOP_KEYS
        if depth_key in data
        for bucket_key in SUB_KEYS
        for pattern in (data[depth_key].get(bucket_key) or [])
    ]

    # The sort is stable, so entries with equal "s" keep the relative order
    # that merge_and_dedupe produced.
    ranked = sorted(
        merge_and_dedupe(collected),
        key=lambda entry: entry["s"],
        reverse=True,
    )
    top_entries = ranked[:500]

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as dst:
        json.dump(
            top_entries,
            dst,
            ensure_ascii=False,
            indent=None,
            separators=(",", ":"),
        )
        dst.write("\n")

    print(f"已输出: {OUTPUT_FILE} (共 {len(top_entries)} 条 pattern)")


if __name__ == "__main__":
    main()
|