fix_json_quotes.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. """
  2. 修复 case JSON 文件中的引号错误
  3. 常见问题:
  4. 1. 中文引号("")误用为英文引号
  5. 2. 字符串值中包含未转义的英文双引号(LLM 生成时常见)
  6. """
  7. import json
  8. import re
  9. from pathlib import Path
  10. from typing import Any, Dict, Optional, Tuple
  11. def fix_chinese_quotes(text: str) -> str:
  12. """将中文引号替换为英文引号"""
  13. return text.replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'")
  14. def fix_unescaped_quotes_in_values(text: str) -> str:
  15. """
  16. 修复 JSON 字符串值中未转义的双引号。
  17. 策略:找到 JSON 键值对中的字符串值,将值内部的未转义引号替换为中文引号。
  18. """
  19. # 匹配 "key": "value" 中的 value 部分(value 可能包含未转义的引号)
  20. # 使用逐字符解析来精确处理
  21. result = []
  22. i = 0
  23. in_string = False
  24. is_value = False # 是否在值的字符串中(而非键)
  25. colon_seen = False
  26. while i < len(text):
  27. c = text[i]
  28. if not in_string:
  29. if c == '"':
  30. in_string = True
  31. # 判断这是键还是值
  32. is_value = colon_seen
  33. if colon_seen:
  34. colon_seen = False
  35. result.append(c)
  36. elif c == ':':
  37. colon_seen = True
  38. result.append(c)
  39. elif c in ('{', '[', ',', '\n', ' ', '\t', '\r'):
  40. if c in ('{', '[', ','):
  41. colon_seen = False
  42. result.append(c)
  43. else:
  44. result.append(c)
  45. else:
  46. if c == '\\':
  47. # 转义序列,直接保留
  48. result.append(c)
  49. i += 1
  50. if i < len(text):
  51. result.append(text[i])
  52. elif c == '"':
  53. # 检查是否是字符串结束
  54. # 向后看:如果后面跟着 ,/}/]/: 或空白+这些字符,则是结束引号
  55. j = i + 1
  56. while j < len(text) and text[j] in (' ', '\t', '\r', '\n'):
  57. j += 1
  58. next_char = text[j] if j < len(text) else ''
  59. if next_char in (',', '}', ']', ':') or j >= len(text):
  60. # 这是字符串结束引号
  61. in_string = False
  62. result.append(c)
  63. else:
  64. # 这是值内部的未转义引号,替换为中文引号
  65. if is_value:
  66. result.append('“') # "
  67. else:
  68. result.append(c)
  69. else:
  70. result.append(c)
  71. i += 1
  72. return ''.join(result)
  73. def _fix_by_escaping_inner_quotes(text: str) -> str:
  74. """
  75. 尝试通过正则找到常见的字符串字段,并将值内部的双引号转义。
  76. 主要处理中文文本字段,如 title/description/body/effects 等。
  77. """
  78. import re
  79. # 匹配 "key": "value" 结构,允许 value 中包含未转义的引号
  80. # 这里用一个宽松的模式:从字段开头到行尾/逗号/右括号前
  81. def repl(match):
  82. prefix = match.group(1) # "key": "
  83. value = match.group(2) # value 内容
  84. suffix = match.group(3) # ", 或 "} 或 "]
  85. # 将 value 内部未转义的双引号转义
  86. fixed_value = value.replace('\\"', '__ESCAPED_QUOTE__')
  87. fixed_value = fixed_value.replace('"', '\\"')
  88. fixed_value = fixed_value.replace('__ESCAPED_QUOTE__', '\\"')
  89. return prefix + fixed_value + suffix
  90. # 处理常见的字符串字段
  91. patterns = [
  92. r'("(?:title|description|body|effects|visual_notes|execution_process|core_parameters|why|explanation|步骤描述|ability_description|ability_name|name|cluster_name)"\s*:\s*")([\s\S]*?)("\s*[},\]])',
  93. ]
  94. fixed = text
  95. for pattern in patterns:
  96. fixed = re.sub(pattern, repl, fixed)
  97. return fixed
  98. def _fix_brute_force_escape(text: str) -> str:
  99. """
  100. 暴力修复:逐字符扫描,在字符串值内部遇到未转义双引号时,尽量转义。
  101. 比 `fix_unescaped_quotes_in_values` 更激进。
  102. """
  103. result = []
  104. i = 0
  105. in_string = False
  106. escaped = False
  107. while i < len(text):
  108. c = text[i]
  109. if escaped:
  110. result.append(c)
  111. escaped = False
  112. elif c == '\\':
  113. result.append(c)
  114. escaped = True
  115. elif c == '"':
  116. if not in_string:
  117. in_string = True
  118. result.append(c)
  119. else:
  120. # 向后看,判断是否可能是字符串结束
  121. j = i + 1
  122. while j < len(text) and text[j] in ' \t\r\n':
  123. j += 1
  124. next_char = text[j] if j < len(text) else ''
  125. if next_char in ',}]' or j >= len(text):
  126. in_string = False
  127. result.append(c)
  128. else:
  129. # 很可能是值内部的引号,转义它
  130. result.append('\\"')
  131. else:
  132. result.append(c)
  133. i += 1
  134. return ''.join(result)
  135. def try_fix_and_parse(content: str) -> Tuple[bool, Any, str]:
  136. """
  137. 尝试多种修复策略解析 JSON
  138. Returns:
  139. (success, data, fix_description)
  140. """
  141. # 策略 1:直接解析
  142. try:
  143. return True, json.loads(content), "valid"
  144. except json.JSONDecodeError:
  145. pass
  146. # 策略 2:替换中文引号
  147. fixed = fix_chinese_quotes(content)
  148. try:
  149. return True, json.loads(fixed), "fixed_chinese_quotes"
  150. except json.JSONDecodeError:
  151. pass
  152. # 策略 2.5:优先尝试 json_repair(更强的通用修复)
  153. try:
  154. import json_repair
  155. data = json_repair.repair_json(content, return_objects=True)
  156. if data:
  157. return True, data, "fixed_by_json_repair"
  158. except ImportError:
  159. pass
  160. except Exception:
  161. pass
  162. # 策略 3:修复值中未转义的引号
  163. fixed2 = fix_unescaped_quotes_in_values(fixed)
  164. try:
  165. return True, json.loads(fixed2), "fixed_unescaped_quotes"
  166. except json.JSONDecodeError:
  167. pass
  168. # 策略 3.5:逐行定位错误并修复
  169. fixed3 = _fix_by_escaping_inner_quotes(fixed)
  170. try:
  171. return True, json.loads(fixed3), "fixed_escaped_inner_quotes"
  172. except json.JSONDecodeError:
  173. pass
  174. # 策略 3.6:暴力替换——把所有看起来像值内部的双引号转义
  175. fixed4 = _fix_brute_force_escape(content)
  176. try:
  177. return True, json.loads(fixed4), "fixed_brute_force"
  178. except json.JSONDecodeError:
  179. pass
  180. return False, None, "unfixable"
  181. def fix_json_file(file_path: Path, backup: bool = True) -> Dict[str, Any]:
  182. """修复 JSON 文件"""
  183. result = {"success": False, "message": "", "fixed": False, "file": str(file_path)}
  184. try:
  185. with open(file_path, 'r', encoding='utf-8') as f:
  186. content = f.read()
  187. except Exception as e:
  188. result["message"] = f"File read error: {e}"
  189. return result
  190. success, data, fix_desc = try_fix_and_parse(content)
  191. if success:
  192. if fix_desc != "valid":
  193. if backup:
  194. backup_path = file_path.with_suffix('.json.bak')
  195. backup_path.write_text(content, encoding='utf-8')
  196. result["backup"] = str(backup_path)
  197. with open(file_path, 'w', encoding='utf-8') as f:
  198. json.dump(data, f, ensure_ascii=False, indent=2)
  199. result["fixed"] = True
  200. result["message"] = fix_desc
  201. else:
  202. result["message"] = "Already valid JSON"
  203. result["success"] = True
  204. else:
  205. result["message"] = "unfixable"
  206. return result
  207. def fix_directory(dir_path: Path, pattern: str = "case_*.json") -> Dict[str, Any]:
  208. """修复目录下所有匹配的 JSON 文件"""
  209. results = []
  210. total = fixed = failed = 0
  211. for file_path in sorted(dir_path.glob(pattern)):
  212. total += 1
  213. result = fix_json_file(file_path, backup=True)
  214. results.append(result)
  215. if result["success"]:
  216. if result["fixed"]:
  217. fixed += 1
  218. print(f"[FIXED] {file_path.name}: {result['message']}")
  219. else:
  220. print(f"[OK] {file_path.name}")
  221. else:
  222. failed += 1
  223. print(f"[FAILED] {file_path.name}: {result['message']}")
  224. return {"total": total, "fixed": fixed, "failed": failed, "results": results}
  225. if __name__ == "__main__":
  226. import sys
  227. if len(sys.argv) < 2:
  228. print("Usage: python fix_json_quotes.py <directory>")
  229. sys.exit(1)
  230. dir_path = Path(sys.argv[1])
  231. print(f"Fixing JSON files in: {dir_path}")
  232. print("=" * 60)
  233. summary = fix_directory(dir_path)
  234. print("=" * 60)
  235. print(f"Total: {summary['total']}, Fixed: {summary['fixed']}, Failed: {summary['failed']}")