| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- """Check (or rewrite) config JSON files into the canonical byte form.
- Grown from the M1 byte-equal feasibility spike (2026-06-09). The V2 plan
- (`tech_documents/工程落地/06_V2阶段开发计划.md`, V2-M1) locks the converter to a
- single canonical formatter so Excel->JSON output is byte-equal to the runtime
- JSON and `policy_bundle_hash` (`policy_json.py`) stays stable.
- Canonical form = `json.dumps(data, indent=2, ensure_ascii=False)` + trailing "\n",
- preserving key insertion order (no sort).
- Spike findings this encodes:
- - `douyin_rule_packs.v1.json` (the only file hashed by policy_json) already
- round-trips to exact bytes under this formatter.
- - `douyin_walk_strategy.v1.json` is hand-mixed (leaf objects collapsed to single
- lines); it is NOT hashed and no test pins its bytes, so `--write` re-normalizes
- it once to the canonical form with zero runtime impact.
- Usage:
- python scripts/check_config_json_canonical.py # --check (default)
- python scripts/check_config_json_canonical.py --write # rewrite in place
- python scripts/check_config_json_canonical.py --check path/to/other.json
- Exit code: 0 = all canonical (or written), 1 = drift found in --check mode.
- """
- from __future__ import annotations
- import argparse
- import json
- import sys
- from pathlib import Path
- from typing import Any
# Repository root: the parent of the directory that holds this script.
ROOT = Path(__file__).resolve().parents[1]

# Config JSONs the V2 converter governs; used when no paths are passed on the
# command line. Relative entries are resolved against ROOT by main().
DEFAULT_TARGETS = [
    Path(relative)
    for relative in (
        "product_documents/规则包/douyin_rule_packs.v1.json",
        "product_documents/抖音游走策略/douyin_walk_strategy.v1.json",
    )
]
def canonical_dumps(data: Any) -> str:
    """Serialize *data* with the one canonical formatter the M1 converter emits.

    The text uses a two-space indent, keeps non-ASCII characters unescaped,
    leaves object keys in insertion order (no sorting) and ends with exactly
    one trailing newline.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    return f"{serialized}\n"
def _first_diff(a: bytes, b: bytes) -> dict[str, Any] | None:
    """Describe the first position at which two byte strings differ.

    Args:
        a: Bytes currently in the file (the "actual" side).
        b: Canonical bytes the file should contain (the "expected" side).

    Returns:
        ``None`` when the inputs are identical. Otherwise a dict holding
        ``byte_offset`` and either ``expected``/``actual`` context snippets
        (a byte differs within the common length) or a ``note`` (one input is
        a strict prefix of the other).
    """
    context = 40  # bytes of context shown on each side of the mismatch
    for offset, (file_byte, canonical_byte) in enumerate(zip(a, b)):
        if file_byte != canonical_byte:
            window = slice(max(0, offset - context), offset + context)
            return {
                "byte_offset": offset,
                # "replace" because the window may cut through a multi-byte
                # UTF-8 sequence at either edge.
                "expected": b[window].decode("utf-8", "replace"),
                "actual": a[window].decode("utf-8", "replace"),
            }
    if len(a) != len(b):
        return {
            "byte_offset": min(len(a), len(b)),
            "note": f"length differs: canonical={len(b)} file={len(a)}",
        }
    return None
def _check_one(path: Path) -> dict[str, Any]:
    """Compare one JSON file's bytes with its canonical serialization.

    Args:
        path: Path of an existing JSON file (main() passes absolute paths).

    Returns:
        A finding dict. It always carries ``config_path`` (relative to ROOT
        when the file lives under it, otherwise the path as given),
        ``canonical`` and ``file_bytes``. A parseable file also gets
        ``canonical_bytes`` and, on drift, ``first_diff``. A file that is not
        valid UTF-8 JSON gets ``error`` and ``detail`` instead.
    """
    try:
        shown = str(path.relative_to(ROOT))
    except ValueError:
        # Absolute paths outside the repository cannot be made ROOT-relative.
        shown = str(path)

    raw = path.read_bytes()
    try:
        data = json.loads(raw.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        # Report the broken file as a finding so the remaining targets are
        # still checked and the run ends with a normal "fail" status.
        return {
            "config_path": shown,
            "canonical": False,
            "file_bytes": len(raw),
            "error": "invalid_json",
            "detail": str(exc),
        }

    canonical = canonical_dumps(data).encode("utf-8")
    ok = raw == canonical
    finding: dict[str, Any] = {
        "config_path": shown,
        "canonical": ok,
        "file_bytes": len(raw),
        "canonical_bytes": len(canonical),
    }
    if not ok:
        finding["first_diff"] = _first_diff(raw, canonical)
    return finding
def main() -> int:
    """Run the canonical-form check (default) or the rewrite (``--write``).

    Relative target paths are resolved against ROOT; absolute paths are used
    as given. A JSON report is printed to stdout in both modes.

    Returns:
        Process exit code. Check mode: 0 when every target is canonical, 1
        when any target drifts or is missing. Write mode: 0 when every target
        was processed, 1 when a target could not be found.
    """
    args = _parse_args()
    targets = [p if p.is_absolute() else ROOT / p for p in (args.paths or DEFAULT_TARGETS)]

    findings: list[dict[str, Any]] = []
    rewritten: list[dict[str, Any]] = []
    for path in targets:
        if not path.exists():
            findings.append({"config_path": str(path), "canonical": False, "error": "not_found"})
            continue
        if not args.write:
            findings.append(_check_one(path))
            continue

        # Read once so "changed" is judged against exactly the bytes parsed.
        before = path.read_bytes()
        canonical = canonical_dumps(json.loads(before.decode("utf-8"))).encode("utf-8")
        changed = before != canonical
        if changed:
            # write_bytes, not write_text: text mode translates "\n" to the
            # platform line separator, which would break the canonical bytes.
            path.write_bytes(canonical)
        try:
            shown = str(path.relative_to(ROOT))
        except ValueError:
            # Absolute paths outside the repository stay as given.
            shown = str(path)
        rewritten.append({"config_path": shown, "changed": changed})

    if args.write:
        result: dict[str, Any] = {"mode": "write", "rewritten": rewritten}
        if findings:
            # In write mode the only findings are missing targets; surface
            # them instead of silently reporting success.
            result["errors"] = findings
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 1 if findings else 0

    status = "fail" if any(not f.get("canonical") for f in findings) else "pass"
    print(json.dumps({"mode": "check", "status": status, "findings": findings}, ensure_ascii=False, indent=2))
    return 1 if status == "fail" else 0
def _parse_args() -> argparse.Namespace:
    """Parse the command line (``sys.argv``) for this script.

    Returns:
        Namespace with ``paths`` (list of ``Path``; empty when none given),
        ``write`` (bool) and ``check`` (bool). ``--check`` only spells out
        the default mode; it is accepted so the documented
        ``--check path/to/other.json`` invocation works.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("paths", nargs="*", type=Path, help="JSON files (default: the two config JSONs)")
    # Checking and rewriting are mutually exclusive; with neither flag the
    # tool checks.
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument("--check", action="store_true", help="report drift without modifying files (default)")
    mode.add_argument("--write", action="store_true", help="rewrite files in canonical form instead of checking")
    return parser.parse_args()
# Script entry point: terminate with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|