"""Check (or rewrite) config JSON files into the canonical byte form. Grown from the M1 byte-equal feasibility spike (2026-06-09). The V2 plan (`tech_documents/工程落地/06_V2阶段开发计划.md`, V2-M1) locks the converter to a single canonical formatter so Excel->JSON output is byte-equal to the runtime JSON and `policy_bundle_hash` (`policy_json.py`) stays stable. Canonical form = `json.dumps(data, indent=2, ensure_ascii=False)` + trailing "\n", preserving key insertion order (no sort). Spike findings this encodes: - `douyin_rule_packs.v1.json` (the only file hashed by policy_json) already round-trips to exact bytes under this formatter. - `douyin_walk_strategy.v1.json` is hand-mixed (leaf objects collapsed to single lines); it is NOT hashed and no test pins its bytes, so `--write` re-normalizes it once to the canonical form with zero runtime impact. Usage: python scripts/check_config_json_canonical.py # --check (default) python scripts/check_config_json_canonical.py --write # rewrite in place python scripts/check_config_json_canonical.py --check path/to/other.json Exit code: 0 = all canonical (or written), 1 = drift found in --check mode. """ from __future__ import annotations import argparse import json import sys from pathlib import Path from typing import Any ROOT = Path(__file__).resolve().parents[1] # Default config JSONs the V2 converter governs. DEFAULT_TARGETS = [ Path("product_documents/规则包/douyin_rule_packs.v1.json"), Path("product_documents/抖音游走策略/douyin_walk_strategy.v1.json"), ] def canonical_dumps(data: Any) -> str: """The single canonical formatter the M1 converter must emit. indent=2, ensure_ascii=False, insertion-order keys, trailing newline. """ return json.dumps(data, indent=2, ensure_ascii=False) + "\n" def _first_diff(a: bytes, b: bytes) -> dict[str, Any] | None: n = min(len(a), len(b)) for i in range(n): if a[i] != b[i]: lo = max(0, i - 40) return { "byte_offset": i, "expected": a[lo : i + 40].decode("utf-8", "replace"), "actual": b[lo : i + 40].decode("utf-8", "replace"), } if len(a) != len(b): return {"byte_offset": n, "note": f"length differs: canonical={len(b)} file={len(a)}"} return None def _check_one(path: Path) -> dict[str, Any]: raw = path.read_bytes() canonical = canonical_dumps(json.loads(raw.decode("utf-8"))).encode("utf-8") ok = raw == canonical finding: dict[str, Any] = { "config_path": str(path.relative_to(ROOT)), "canonical": ok, "file_bytes": len(raw), "canonical_bytes": len(canonical), } if not ok: finding["first_diff"] = _first_diff(raw, canonical) return finding def main() -> int: args = _parse_args() targets = [p if p.is_absolute() else ROOT / p for p in (args.paths or DEFAULT_TARGETS)] findings = [] rewritten = [] for path in targets: if not path.exists(): findings.append({"config_path": str(path), "canonical": False, "error": "not_found"}) continue if args.write: canonical = canonical_dumps(json.loads(path.read_text(encoding="utf-8"))) before = path.read_bytes() path.write_text(canonical, encoding="utf-8") changed = before != canonical.encode("utf-8") rewritten.append({"config_path": str(path.relative_to(ROOT)), "changed": changed}) else: findings.append(_check_one(path)) if args.write: result = {"mode": "write", "rewritten": rewritten} print(json.dumps(result, ensure_ascii=False, indent=2)) return 0 status = "fail" if any(not f.get("canonical") for f in findings) else "pass" print(json.dumps({"mode": "check", "status": status, "findings": findings}, ensure_ascii=False, indent=2)) return 1 if status == "fail" else 0 def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("paths", nargs="*", type=Path, help="JSON files (default: the two config JSONs)") parser.add_argument("--write", action="store_true", help="rewrite files in canonical form instead of checking") return parser.parse_args() if __name__ == "__main__": sys.exit(main())