check_config_json_canonical.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
"""Check (or rewrite) config JSON files into the canonical byte form.

Grown from the M1 byte-equal feasibility spike (2026-06-09). The V2 plan
(`tech_documents/工程落地/06_V2阶段开发计划.md`, V2-M1) locks the converter to a
single canonical formatter so Excel->JSON output is byte-equal to the runtime
JSON and `policy_bundle_hash` (`policy_json.py`) stays stable.

Canonical form = `json.dumps(data, indent=2, ensure_ascii=False)` + trailing "\n",
preserving key insertion order (no sort).

Spike findings this encodes:
- `douyin_rule_packs.v1.json` (the only file hashed by policy_json) already
  round-trips to exact bytes under this formatter.
- `douyin_walk_strategy.v1.json` is hand-mixed (leaf objects collapsed to single
  lines); it is NOT hashed and no test pins its bytes, so `--write` re-normalizes
  it once to the canonical form with zero runtime impact.

Usage:
    python scripts/check_config_json_canonical.py            # --check (default)
    python scripts/check_config_json_canonical.py --write    # rewrite in place
    python scripts/check_config_json_canonical.py --check path/to/other.json

Exit code: 0 = all canonical (or written), 1 = drift found in --check mode.
"""
  20. from __future__ import annotations
  21. import argparse
  22. import json
  23. import sys
  24. from pathlib import Path
  25. from typing import Any
  26. ROOT = Path(__file__).resolve().parents[1]
  27. # Default config JSONs the V2 converter governs.
  28. DEFAULT_TARGETS = [
  29. Path("product_documents/规则包/douyin_rule_packs.v1.json"),
  30. Path("product_documents/抖音游走策略/douyin_walk_strategy.v1.json"),
  31. ]
  32. def canonical_dumps(data: Any) -> str:
  33. """The single canonical formatter the M1 converter must emit.
  34. indent=2, ensure_ascii=False, insertion-order keys, trailing newline.
  35. """
  36. return json.dumps(data, indent=2, ensure_ascii=False) + "\n"
  37. def _first_diff(a: bytes, b: bytes) -> dict[str, Any] | None:
  38. n = min(len(a), len(b))
  39. for i in range(n):
  40. if a[i] != b[i]:
  41. lo = max(0, i - 40)
  42. return {
  43. "byte_offset": i,
  44. "expected": a[lo : i + 40].decode("utf-8", "replace"),
  45. "actual": b[lo : i + 40].decode("utf-8", "replace"),
  46. }
  47. if len(a) != len(b):
  48. return {"byte_offset": n, "note": f"length differs: canonical={len(b)} file={len(a)}"}
  49. return None
  50. def _check_one(path: Path) -> dict[str, Any]:
  51. raw = path.read_bytes()
  52. canonical = canonical_dumps(json.loads(raw.decode("utf-8"))).encode("utf-8")
  53. ok = raw == canonical
  54. finding: dict[str, Any] = {
  55. "config_path": str(path.relative_to(ROOT)),
  56. "canonical": ok,
  57. "file_bytes": len(raw),
  58. "canonical_bytes": len(canonical),
  59. }
  60. if not ok:
  61. finding["first_diff"] = _first_diff(raw, canonical)
  62. return finding
  63. def main() -> int:
  64. args = _parse_args()
  65. targets = [p if p.is_absolute() else ROOT / p for p in (args.paths or DEFAULT_TARGETS)]
  66. findings = []
  67. rewritten = []
  68. for path in targets:
  69. if not path.exists():
  70. findings.append({"config_path": str(path), "canonical": False, "error": "not_found"})
  71. continue
  72. if args.write:
  73. canonical = canonical_dumps(json.loads(path.read_text(encoding="utf-8")))
  74. before = path.read_bytes()
  75. path.write_text(canonical, encoding="utf-8")
  76. changed = before != canonical.encode("utf-8")
  77. rewritten.append({"config_path": str(path.relative_to(ROOT)), "changed": changed})
  78. else:
  79. findings.append(_check_one(path))
  80. if args.write:
  81. result = {"mode": "write", "rewritten": rewritten}
  82. print(json.dumps(result, ensure_ascii=False, indent=2))
  83. return 0
  84. status = "fail" if any(not f.get("canonical") for f in findings) else "pass"
  85. print(json.dumps({"mode": "check", "status": status, "findings": findings}, ensure_ascii=False, indent=2))
  86. return 1 if status == "fail" else 0
  87. def _parse_args() -> argparse.Namespace:
  88. parser = argparse.ArgumentParser(description=__doc__)
  89. parser.add_argument("paths", nargs="*", type=Path, help="JSON files (default: the two config JSONs)")
  90. parser.add_argument("--write", action="store_true", help="rewrite files in canonical form instead of checking")
  91. return parser.parse_args()
  92. if __name__ == "__main__":
  93. sys.exit(main())