"""Build the processed pattern.json for one account.

Reads the ``full`` / ``substance_form_only`` / ``point_type_only`` sections of
the raw pattern.json, merges the three sections' ``depth_max`` lists into
``depth_max_concrete`` and their ``depth_3`` lists into ``depth_4``, and
splits each merged list by ``combination_type`` into the ``two_x`` /
``one_x`` / ``zero_x`` buckets.

The output format follows processed_edge_data.json (it uses ``type_key``, and
``point`` inside ``items`` is always an empty string).
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Any, Dict, List

# Directory two levels above this file. It is the base for the ``input/...``
# paths below and is also placed on ``sys.path``.
_OVR = Path(__file__).resolve().parent.parent
if str(_OVR) not in sys.path:
    sys.path.insert(0, str(_OVR))

# Top-level sections of the raw pattern.json that are merged together.
SECTION_KEYS = ("full", "substance_form_only", "point_type_only")
# Output buckets; every bucket is always present in the result, even if empty.
BUCKET_KEYS = ("two_x", "one_x", "zero_x")


def _load_json(path: Path) -> Any:
    """Return the parsed content of the UTF-8 JSON file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _normalize_item_row(row: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce one raw item to the ``name``/``point``/``dimension``/``type`` keys.

    Missing or falsy values become ``""``, except ``type`` which falls back
    to ``"分类"``.
    """
    # Consistent with the 创业邦 processed_edge_data: ``point`` is always
    # empty and ``path`` is not kept.
    return {
        "name": row.get("name") or "",
        "point": "",
        "dimension": row.get("dimension") or "",
        "type": row.get("type") or "分类",
    }


def _coerce_pattern_id(raw_id: Any) -> Any:
    """Return *raw_id* as an ``int`` when it is an all-digit string.

    Any other value is returned unchanged.
    """
    if isinstance(raw_id, str) and raw_id.isdigit():
        try:
            return int(raw_id)
        except ValueError:
            # ``str.isdigit`` also accepts characters such as "²" that
            # ``int`` rejects; keep such ids as strings instead of crashing.
            return raw_id
    return raw_id


def _normalize_pattern_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one raw pattern entry into the processed output shape.

    - ``id``: all-digit strings are converted to ``int``; anything else is
      passed through.
    - ``type_key``: the entry's ``combination_type`` (``""`` when falsy).
    - ``support``: coerced to ``float``; 0.0 when it cannot be converted.
    - ``matched_posts``: ``[]`` when missing or ``None``.
    - ``items``: each dict row normalized by ``_normalize_item_row``;
      non-dict rows are dropped.
    - ``absolute_support`` / ``length`` / ``post_count``: copied as-is
      (``None`` when missing).
    """
    items_in = entry.get("items") or []
    if not isinstance(items_in, (list, tuple)):
        # A scalar here would otherwise raise TypeError on iteration.
        items_in = []
    items = [_normalize_item_row(row) for row in items_in if isinstance(row, dict)]

    try:
        support = float(entry.get("support", 0.0))
    except (TypeError, ValueError, OverflowError):
        support = 0.0

    matched = entry.get("matched_posts")
    if matched is None:
        matched = []

    return {
        "id": _coerce_pattern_id(entry.get("id")),
        "type_key": entry.get("combination_type") or "",
        "support": support,
        "absolute_support": entry.get("absolute_support"),
        "length": entry.get("length"),
        "post_count": entry.get("post_count"),
        "matched_posts": matched,
        "items": items,
    }


def _combination_type_bucket(combination_type: str) -> str:
    """Map a combination type to ``two_x`` / ``one_x`` / ``zero_x``.

    Rules:
    - count the '×' symbols first: two or more -> ``two_x``, one -> ``one_x``;
    - only when there is no '×', apply the same rule to the '+' count;
    - otherwise (including empty or non-string input) -> ``zero_x``.
    """
    if not isinstance(combination_type, str) or not combination_type:
        return "zero_x"
    # Order matters: '+' is consulted only when '×' does not occur at all.
    for symbol in ("×", "+"):
        occurrences = combination_type.count(symbol)
        if occurrences >= 2:
            return "two_x"
        if occurrences == 1:
            return "one_x"
    return "zero_x"


def _entries_to_buckets(entries: List[Any]) -> Dict[str, List[Dict[str, Any]]]:
    """Normalize *entries* and group them by combination-type bucket.

    Non-dict entries are skipped. The result always contains every key of
    ``BUCKET_KEYS``; input order is preserved within each bucket.
    """
    out: Dict[str, List[Dict[str, Any]]] = {k: [] for k in BUCKET_KEYS}
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        combination_type = entry.get("combination_type") or ""
        bucket = _combination_type_bucket(combination_type)
        out[bucket].append(_normalize_pattern_entry(entry))
    return out


def _collect_depth_list(raw: Dict[str, Any], depth_key: str) -> List[Any]:
    """Concatenate ``raw[section][depth_key]`` over all ``SECTION_KEYS``.

    Sections that are not dicts, and values that are not lists, are ignored.
    Sections are visited in ``SECTION_KEYS`` order.
    """
    merged: List[Any] = []
    for sec_name in SECTION_KEYS:
        sec = raw.get(sec_name)
        if not isinstance(sec, dict):
            continue
        part = sec.get(depth_key)
        if isinstance(part, list):
            merged.extend(part)
    return merged


def process_pattern_for_account(account_name: str) -> Dict[str, Any]:
    """Load the raw pattern.json of *account_name* and return the processed data.

    The input is read from
    ``<_OVR>/input/<account_name>/原始数据/pattern/pattern.json``.

    Returns:
        A dict with ``depth_max_concrete`` (merged ``depth_max`` lists) and
        ``depth_4`` (merged ``depth_3`` lists), each bucketed by
        ``_entries_to_buckets``.

    Raises:
        ValueError: if the top level of the JSON file is not an object.
    """
    base = _OVR
    in_path = base / "input" / account_name / "原始数据" / "pattern" / "pattern.json"
    raw = _load_json(in_path)
    if not isinstance(raw, dict):
        raise ValueError(f"顶层应为对象: {in_path}")

    depth_max_entries = _collect_depth_list(raw, "depth_max")
    depth_3_entries = _collect_depth_list(raw, "depth_3")

    return {
        "depth_max_concrete": _entries_to_buckets(depth_max_entries),
        "depth_4": _entries_to_buckets(depth_3_entries),
    }


def main(account_name: str) -> Path:
    """Process *account_name* and write the result; return the output path.

    The result is written as indented UTF-8 JSON to
    ``<_OVR>/input/<account_name>/处理后数据/pattern/pattern.json``.
    """
    # Build the data first so that a missing or invalid input file does not
    # leave an empty output directory behind.
    data = process_pattern_for_account(account_name)

    out_dir = _OVR / "input" / account_name / "处理后数据" / "pattern"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "pattern.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"[{account_name}] 已写入: {out_path}")
    return out_path


if __name__ == "__main__":
    acc = sys.argv[1] if len(sys.argv) >= 2 else "空间点阵设计研究室"
    main(acc)