tree_data_process.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. #!/usr/bin/env python3
  2. """
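Slim down persona-tree JSON files: remove the configured fields and write the
result to the target directory.

Optional further size reductions:
  --minify      Emit single-line JSON without indentation or extra whitespace
                (can shrink files by roughly 40%+; recommended).
  --round N     Round floating-point values to N decimal places. Rounding is
                off unless this option is given (e.g. N=4 reduces size further).
  --short-keys  Use short key names (t/n/w/r/c/lc/ch) for the smallest output;
                readers must restore the long names via KEY_MAP (see expand_keys).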
  8. """
  9. import argparse
  10. import json
  11. from pathlib import Path
  12. # 需要移除的字段
  13. FIELDS_TO_REMOVE = {"_post_ids", "_child_categories_relation", "_child_categories_relation_detail"}
  14. # 短键名映射(仅当 --short-keys 时使用)
  15. KEY_MAP = {
  16. "_type": "t",
  17. "_post_count": "n",
  18. "_persona_weight_score": "w",
  19. "_ratio": "r",
  20. "_is_constant": "c",
  21. "_is_local_constant": "lc",
  22. "children": "ch",
  23. }
  24. KEY_MAP_INV = {v: k for k, v in KEY_MAP.items()}
  25. INPUT_DIR = Path(__file__).resolve().parent / "input/家有大志/原始数据/tree"
  26. OUTPUT_DIR = Path(__file__).resolve().parent / "input/家有大志/tree"
  27. def strip_fields(obj):
  28. """递归移除指定字段。"""
  29. if isinstance(obj, dict):
  30. for key in list(obj.keys()):
  31. if key in FIELDS_TO_REMOVE:
  32. del obj[key]
  33. else:
  34. strip_fields(obj[key])
  35. elif isinstance(obj, list):
  36. for item in obj:
  37. strip_fields(item)
  38. return obj
  39. def round_floats(obj, ndigits: int):
  40. """递归将浮点数四舍五入到 ndigits 位。"""
  41. if isinstance(obj, dict):
  42. for k, v in obj.items():
  43. obj[k] = round_floats(v, ndigits)
  44. elif isinstance(obj, list):
  45. for i, v in enumerate(obj):
  46. obj[i] = round_floats(v, ndigits)
  47. elif isinstance(obj, float):
  48. return round(obj, ndigits)
  49. return obj
  50. def abbreviate_keys(obj):
  51. """递归将已知长键名替换为短键名(仅处理 KEY_MAP 中的键)。"""
  52. if isinstance(obj, dict):
  53. new_obj = {}
  54. for k, v in obj.items():
  55. new_key = KEY_MAP.get(k, k)
  56. new_obj[new_key] = abbreviate_keys(v)
  57. return new_obj
  58. if isinstance(obj, list):
  59. return [abbreviate_keys(x) for x in obj]
  60. return obj
  61. def expand_keys(obj):
  62. """递归将短键名还原为长键名(读取 --short-keys 输出的文件时使用)。"""
  63. if isinstance(obj, dict):
  64. new_obj = {}
  65. for k, v in obj.items():
  66. new_key = KEY_MAP_INV.get(k, k)
  67. new_obj[new_key] = expand_keys(v)
  68. return new_obj
  69. if isinstance(obj, list):
  70. return [expand_keys(x) for x in obj]
  71. return obj
  72. def process_tree_json(
  73. in_path: Path,
  74. out_path: Path,
  75. *,
  76. minify: bool = False,
  77. round_ndigits: int | None = None,
  78. short_keys: bool = False,
  79. ) -> None:
  80. """读取一个树 JSON,精简后写入 out_path。"""
  81. with open(in_path, "r", encoding="utf-8") as f:
  82. data = json.load(f)
  83. strip_fields(data)
  84. if round_ndigits is not None:
  85. round_floats(data, round_ndigits)
  86. if short_keys:
  87. data = abbreviate_keys(data)
  88. out_path.parent.mkdir(parents=True, exist_ok=True)
  89. with open(out_path, "w", encoding="utf-8") as f:
  90. json.dump(
  91. data,
  92. f,
  93. ensure_ascii=False,
  94. indent=None if minify else 2,
  95. separators=(",", ":") if minify else (", ", ": "),
  96. )
  97. if minify:
  98. f.write("\n")
  99. def main():
  100. parser = argparse.ArgumentParser(description="人设树 JSON 精简")
  101. parser.add_argument("--minify", action="store_true", help="单行 JSON,减小体积")
  102. parser.add_argument("--round", type=int, default=None, metavar="N", help="数值保留 N 位小数")
  103. parser.add_argument("--short-keys", action="store_true", help="使用短键名(读取时需还原)")
  104. args = parser.parse_args()
  105. INPUT_DIR.mkdir(parents=True, exist_ok=True)
  106. OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
  107. for in_file in sorted(INPUT_DIR.glob("*.json")):
  108. out_file = OUTPUT_DIR / in_file.name
  109. process_tree_json(
  110. in_file,
  111. out_file,
  112. minify=args.minify,
  113. round_ndigits=args.round,
  114. short_keys=args.short_keys,
  115. )
  116. size = out_file.stat().st_size
  117. print(f"已处理: {in_file.name} -> {out_file} ({size:,} B)")
  118. if __name__ == "__main__":
  119. main()