""" 修复 case JSON 文件中的引号错误 常见问题: 1. 中文引号("")误用为英文引号 2. 字符串值中包含未转义的英文双引号(LLM 生成时常见) """ import json import re from pathlib import Path from typing import Any, Dict, Optional, Tuple def fix_chinese_quotes(text: str) -> str: """将中文引号替换为英文引号""" return text.replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'") def fix_unescaped_quotes_in_values(text: str) -> str: """ 修复 JSON 字符串值中未转义的双引号。 策略:找到 JSON 键值对中的字符串值,将值内部的未转义引号替换为中文引号。 """ # 匹配 "key": "value" 中的 value 部分(value 可能包含未转义的引号) # 使用逐字符解析来精确处理 result = [] i = 0 in_string = False is_value = False # 是否在值的字符串中(而非键) colon_seen = False while i < len(text): c = text[i] if not in_string: if c == '"': in_string = True # 判断这是键还是值 is_value = colon_seen if colon_seen: colon_seen = False result.append(c) elif c == ':': colon_seen = True result.append(c) elif c in ('{', '[', ',', '\n', ' ', '\t', '\r'): if c in ('{', '[', ','): colon_seen = False result.append(c) else: result.append(c) else: if c == '\\': # 转义序列,直接保留 result.append(c) i += 1 if i < len(text): result.append(text[i]) elif c == '"': # 检查是否是字符串结束 # 向后看:如果后面跟着 ,/}/]/: 或空白+这些字符,则是结束引号 j = i + 1 while j < len(text) and text[j] in (' ', '\t', '\r', '\n'): j += 1 next_char = text[j] if j < len(text) else '' if next_char in (',', '}', ']', ':') or j >= len(text): # 这是字符串结束引号 in_string = False result.append(c) else: # 这是值内部的未转义引号,替换为中文引号 if is_value: result.append('“') # " else: result.append(c) else: result.append(c) i += 1 return ''.join(result) def _fix_by_escaping_inner_quotes(text: str) -> str: """ 尝试通过正则找到常见的字符串字段,并将值内部的双引号转义。 主要处理中文文本字段,如 title/description/body/effects 等。 """ import re # 匹配 "key": "value" 结构,允许 value 中包含未转义的引号 # 这里用一个宽松的模式:从字段开头到行尾/逗号/右括号前 def repl(match): prefix = match.group(1) # "key": " value = match.group(2) # value 内容 suffix = match.group(3) # ", 或 "} 或 "] # 将 value 内部未转义的双引号转义 fixed_value = value.replace('\\"', '__ESCAPED_QUOTE__') fixed_value = fixed_value.replace('"', '\\"') fixed_value = fixed_value.replace('__ESCAPED_QUOTE__', '\\"') return prefix + fixed_value + suffix # 处理常见的字符串字段 patterns = [ r'("(?:title|description|body|effects|visual_notes|execution_process|core_parameters|why|explanation|步骤描述|ability_description|ability_name|name|cluster_name)"\s*:\s*")([\s\S]*?)("\s*[},\]])', ] fixed = text for pattern in patterns: fixed = re.sub(pattern, repl, fixed) return fixed def _fix_brute_force_escape(text: str) -> str: """ 暴力修复:逐字符扫描,在字符串值内部遇到未转义双引号时,尽量转义。 比 `fix_unescaped_quotes_in_values` 更激进。 """ result = [] i = 0 in_string = False escaped = False while i < len(text): c = text[i] if escaped: result.append(c) escaped = False elif c == '\\': result.append(c) escaped = True elif c == '"': if not in_string: in_string = True result.append(c) else: # 向后看,判断是否可能是字符串结束 j = i + 1 while j < len(text) and text[j] in ' \t\r\n': j += 1 next_char = text[j] if j < len(text) else '' if next_char in ',}]' or j >= len(text): in_string = False result.append(c) else: # 很可能是值内部的引号,转义它 result.append('\\"') else: result.append(c) i += 1 return ''.join(result) def try_fix_and_parse(content: str) -> Tuple[bool, Any, str]: """ 尝试多种修复策略解析 JSON Returns: (success, data, fix_description) """ # 策略 1:直接解析 try: return True, json.loads(content), "valid" except json.JSONDecodeError: pass # 策略 2:替换中文引号 fixed = fix_chinese_quotes(content) try: return True, json.loads(fixed), "fixed_chinese_quotes" except json.JSONDecodeError: pass # 策略 2.5:优先尝试 json_repair(更强的通用修复) try: import json_repair data = json_repair.repair_json(content, return_objects=True) if data: return True, data, "fixed_by_json_repair" except ImportError: pass except Exception: pass # 策略 3:修复值中未转义的引号 fixed2 = fix_unescaped_quotes_in_values(fixed) try: return True, json.loads(fixed2), "fixed_unescaped_quotes" except json.JSONDecodeError: pass # 策略 3.5:逐行定位错误并修复 fixed3 = _fix_by_escaping_inner_quotes(fixed) try: return True, json.loads(fixed3), "fixed_escaped_inner_quotes" except json.JSONDecodeError: pass # 策略 3.6:暴力替换——把所有看起来像值内部的双引号转义 fixed4 = _fix_brute_force_escape(content) try: return True, json.loads(fixed4), "fixed_brute_force" except json.JSONDecodeError: pass return False, None, "unfixable" def fix_json_file(file_path: Path, backup: bool = True) -> Dict[str, Any]: """修复 JSON 文件""" result = {"success": False, "message": "", "fixed": False, "file": str(file_path)} try: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() except Exception as e: result["message"] = f"File read error: {e}" return result success, data, fix_desc = try_fix_and_parse(content) if success: if fix_desc != "valid": if backup: backup_path = file_path.with_suffix('.json.bak') backup_path.write_text(content, encoding='utf-8') result["backup"] = str(backup_path) with open(file_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) result["fixed"] = True result["message"] = fix_desc else: result["message"] = "Already valid JSON" result["success"] = True else: result["message"] = "unfixable" return result def fix_directory(dir_path: Path, pattern: str = "case_*.json") -> Dict[str, Any]: """修复目录下所有匹配的 JSON 文件""" results = [] total = fixed = failed = 0 for file_path in sorted(dir_path.glob(pattern)): total += 1 result = fix_json_file(file_path, backup=True) results.append(result) if result["success"]: if result["fixed"]: fixed += 1 print(f"[FIXED] {file_path.name}: {result['message']}") else: print(f"[OK] {file_path.name}") else: failed += 1 print(f"[FAILED] {file_path.name}: {result['message']}") return {"total": total, "fixed": fixed, "failed": failed, "results": results} if __name__ == "__main__": import sys if len(sys.argv) < 2: print("Usage: python fix_json_quotes.py ") sys.exit(1) dir_path = Path(sys.argv[1]) print(f"Fixing JSON files in: {dir_path}") print("=" * 60) summary = fix_directory(dir_path) print("=" * 60) print(f"Total: {summary['total']}, Fixed: {summary['fixed']}, Failed: {summary['failed']}")