# fix_json_quotes.py (5.9 KB)
  1. """
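Repair quoting errors in case JSON files.

Common problems:
1. Chinese (curly) quotation marks (“ ”) used where ASCII double quotes are required.
2. Unescaped ASCII double quotes inside string values (common in LLM-generated output).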
  6. """
  7. import json
  8. import re
  9. from pathlib import Path
  10. from typing import Any, Dict, Optional, Tuple
  11. def fix_chinese_quotes(text: str) -> str:
  12. """将中文引号替换为英文引号"""
  13. return text.replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'")
  14. def fix_unescaped_quotes_in_values(text: str) -> str:
  15. """
  16. 修复 JSON 字符串值中未转义的双引号。
  17. 策略:找到 JSON 键值对中的字符串值,将值内部的未转义引号替换为中文引号。
  18. """
  19. # 匹配 "key": "value" 中的 value 部分(value 可能包含未转义的引号)
  20. # 使用逐字符解析来精确处理
  21. result = []
  22. i = 0
  23. in_string = False
  24. is_value = False # 是否在值的字符串中(而非键)
  25. colon_seen = False
  26. while i < len(text):
  27. c = text[i]
  28. if not in_string:
  29. if c == '"':
  30. in_string = True
  31. # 判断这是键还是值
  32. is_value = colon_seen
  33. if colon_seen:
  34. colon_seen = False
  35. result.append(c)
  36. elif c == ':':
  37. colon_seen = True
  38. result.append(c)
  39. elif c in ('{', '[', ',', '\n', ' ', '\t', '\r'):
  40. if c in ('{', '[', ','):
  41. colon_seen = False
  42. result.append(c)
  43. else:
  44. result.append(c)
  45. else:
  46. if c == '\\':
  47. # 转义序列,直接保留
  48. result.append(c)
  49. i += 1
  50. if i < len(text):
  51. result.append(text[i])
  52. elif c == '"':
  53. # 检查是否是字符串结束
  54. # 向后看:如果后面跟着 ,/}/]/: 或空白+这些字符,则是结束引号
  55. j = i + 1
  56. while j < len(text) and text[j] in (' ', '\t', '\r', '\n'):
  57. j += 1
  58. next_char = text[j] if j < len(text) else ''
  59. if next_char in (',', '}', ']', ':') or j >= len(text):
  60. # 这是字符串结束引号
  61. in_string = False
  62. result.append(c)
  63. else:
  64. # 这是值内部的未转义引号,替换为中文引号
  65. if is_value:
  66. result.append('“') # "
  67. else:
  68. result.append(c)
  69. else:
  70. result.append(c)
  71. i += 1
  72. return ''.join(result)
  73. def try_fix_and_parse(content: str) -> Tuple[bool, Any, str]:
  74. """
  75. 尝试多种修复策略解析 JSON
  76. Returns:
  77. (success, data, fix_description)
  78. """
  79. # 策略 1:直接解析
  80. try:
  81. return True, json.loads(content), "valid"
  82. except json.JSONDecodeError:
  83. pass
  84. # 策略 2:替换中文引号
  85. fixed = fix_chinese_quotes(content)
  86. try:
  87. return True, json.loads(fixed), "fixed_chinese_quotes"
  88. except json.JSONDecodeError:
  89. pass
  90. # 策略 3:修复值中未转义的引号
  91. fixed2 = fix_unescaped_quotes_in_values(fixed)
  92. try:
  93. return True, json.loads(fixed2), "fixed_unescaped_quotes"
  94. except json.JSONDecodeError:
  95. pass
  96. # 策略 4:尝试 json_repair 库(如果可用)
  97. try:
  98. import json_repair
  99. data = json_repair.repair_json(content, return_objects=True)
  100. if data:
  101. return True, data, "fixed_by_json_repair"
  102. except ImportError:
  103. pass
  104. except Exception:
  105. pass
  106. return False, None, "unfixable"
  107. def fix_json_file(file_path: Path, backup: bool = True) -> Dict[str, Any]:
  108. """修复 JSON 文件"""
  109. result = {"success": False, "message": "", "fixed": False, "file": str(file_path)}
  110. try:
  111. with open(file_path, 'r', encoding='utf-8') as f:
  112. content = f.read()
  113. except Exception as e:
  114. result["message"] = f"File read error: {e}"
  115. return result
  116. success, data, fix_desc = try_fix_and_parse(content)
  117. if success:
  118. if fix_desc != "valid":
  119. if backup:
  120. backup_path = file_path.with_suffix('.json.bak')
  121. backup_path.write_text(content, encoding='utf-8')
  122. result["backup"] = str(backup_path)
  123. with open(file_path, 'w', encoding='utf-8') as f:
  124. json.dump(data, f, ensure_ascii=False, indent=2)
  125. result["fixed"] = True
  126. result["message"] = fix_desc
  127. else:
  128. result["message"] = "Already valid JSON"
  129. result["success"] = True
  130. else:
  131. result["message"] = "unfixable"
  132. return result
  133. def fix_directory(dir_path: Path, pattern: str = "case_*.json") -> Dict[str, Any]:
  134. """修复目录下所有匹配的 JSON 文件"""
  135. results = []
  136. total = fixed = failed = 0
  137. for file_path in sorted(dir_path.glob(pattern)):
  138. total += 1
  139. result = fix_json_file(file_path, backup=True)
  140. results.append(result)
  141. if result["success"]:
  142. if result["fixed"]:
  143. fixed += 1
  144. print(f"[FIXED] {file_path.name}: {result['message']}")
  145. else:
  146. print(f"[OK] {file_path.name}")
  147. else:
  148. failed += 1
  149. print(f"[FAILED] {file_path.name}: {result['message']}")
  150. return {"total": total, "fixed": fixed, "failed": failed, "results": results}
  151. if __name__ == "__main__":
  152. import sys
  153. if len(sys.argv) < 2:
  154. print("Usage: python fix_json_quotes.py <directory>")
  155. sys.exit(1)
  156. dir_path = Path(sys.argv[1])
  157. print(f"Fixing JSON files in: {dir_path}")
  158. print("=" * 60)
  159. summary = fix_directory(dir_path)
  160. print("=" * 60)
  161. print(f"Total: {summary['total']}, Fixed: {summary['fixed']}, Failed: {summary['failed']}")