howard
/
Agent


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
							"""
修复 case JSON 文件中的引号错误

常见问题：
1. 中文引号（""）误用为英文引号
2. 字符串值中包含未转义的英文双引号（LLM 生成时常见）
"""

import json
import re
from pathlib import Path
from typing import Any, Dict, Optional, Tuple


def fix_chinese_quotes(text: str) -> str:
    """Return ``text`` with curly (Chinese-style) quotes replaced by ASCII quotes.

    Curly double quotes become ``"`` and curly single quotes become ``'``;
    every other character is left untouched.
    """
    ascii_equivalents = str.maketrans({
        '“': '"',
        '”': '"',
        '‘': "'",
        '’': "'",
    })
    return text.translate(ascii_equivalents)


def fix_unescaped_quotes_in_values(text: str) -> str:
    """
    Replace unescaped double quotes inside JSON object values with a curly quote.

    The text is scanned character by character. A string that starts after a
    ``:`` (with no ``{``, ``[`` or ``,`` in between) is treated as an object
    value; every other string (object keys, array elements) is treated as a
    non-value and its content is never rewritten.

    A ``"`` met inside a string is taken as the closing quote when the next
    non-whitespace character is a valid follower:

    * for value strings: ``,``, ``}``, ``]`` or end of input;
    * for non-value strings: the same set plus ``:`` (a key is followed by
      a colon).

    A value can never be followed by ``:`` in JSON, so inside a value a quote
    followed by ``:`` is an inner quote, not the end of the string. Any inner
    quote found in a value string is replaced by the left curly quote ``“``.

    Args:
        text: Possibly malformed JSON text.

    Returns:
        The text with inner quotes of value strings replaced. This is a
        heuristic: an inner quote that happens to be followed by ``,``, ``}``
        or ``]`` is still mistaken for the closing quote.
    """
    whitespace = ' \t\r\n'
    out = []
    n = len(text)
    i = 0
    in_string = False
    is_value = False    # whether the current string is an object value
    colon_seen = False  # a ':' was seen and no string has started since

    while i < n:
        c = text[i]

        if not in_string:
            if c == '"':
                in_string = True
                # A string opened right after a colon is a value; otherwise
                # it is a key (or an array element).
                is_value = colon_seen
                colon_seen = False
            elif c == ':':
                colon_seen = True
            elif c in '{[,':
                # A new container or a new member: the next string is not
                # the value belonging to an earlier colon.
                colon_seen = False
            out.append(c)
        elif c == '\\':
            # Escape sequence: copy the backslash and the escaped character.
            out.append(c)
            i += 1
            if i < n:
                out.append(text[i])
        elif c == '"':
            # Look past whitespace to decide whether this quote ends the string.
            j = i + 1
            while j < n and text[j] in whitespace:
                j += 1

            if j >= n:
                closes = True
            elif is_value:
                closes = text[j] in ',}]'
            else:
                closes = text[j] in ',}]:'

            if closes:
                in_string = False
                out.append(c)
            elif is_value:
                # Inner quote of a value: swap it for a curly quote so the
                # surrounding string stays intact.
                out.append('“')
            else:
                out.append(c)
        else:
            out.append(c)
        i += 1

    return ''.join(out)


def _fix_by_escaping_inner_quotes(text: str) -> str:
    """
    尝试通过正则找到常见的字符串字段，并将值内部的双引号转义。
    主要处理中文文本字段，如 title/description/body/effects 等。
    """
    import re

    # 匹配 "key": "value" 结构，允许 value 中包含未转义的引号
    # 这里用一个宽松的模式：从字段开头到行尾/逗号/右括号前
    def repl(match):
        prefix = match.group(1)  # "key": "
        value = match.group(2)   # value 内容
        suffix = match.group(3)  # ", 或 "} 或 "]

        # 将 value 内部未转义的双引号转义
        fixed_value = value.replace('\\"', '__ESCAPED_QUOTE__')
        fixed_value = fixed_value.replace('"', '\\"')
        fixed_value = fixed_value.replace('__ESCAPED_QUOTE__', '\\"')

        return prefix + fixed_value + suffix

    # 处理常见的字符串字段
    patterns = [
        r'("(?:title|description|body|effects|visual_notes|execution_process|core_parameters|why|explanation|步骤描述|ability_description|ability_name|name|cluster_name)"\s*:\s*")([\s\S]*?)("\s*[},\]])',
    ]

    fixed = text
    for pattern in patterns:
        fixed = re.sub(pattern, repl, fixed)

    return fixed


def _fix_brute_force_escape(text: str) -> str:
    """
    暴力修复：逐字符扫描，在字符串值内部遇到未转义双引号时，尽量转义。
    比 `fix_unescaped_quotes_in_values` 更激进。
    """
    result = []
    i = 0
    in_string = False
    escaped = False

    while i < len(text):
        c = text[i]

        if escaped:
            result.append(c)
            escaped = False
        elif c == '\\':
            result.append(c)
            escaped = True
        elif c == '"':
            if not in_string:
                in_string = True
                result.append(c)
            else:
                # 向后看，判断是否可能是字符串结束
                j = i + 1
                while j < len(text) and text[j] in ' \t\r\n':
                    j += 1
                next_char = text[j] if j < len(text) else ''
                if next_char in ',}]' or j >= len(text):
                    in_string = False
                    result.append(c)
                else:
                    # 很可能是值内部的引号，转义它
                    result.append('\\"')
        else:
            result.append(c)
        i += 1

    return ''.join(result)


def try_fix_and_parse(content: str) -> Tuple[bool, Any, str]:
    """
    Parse ``content`` as JSON, escalating through repair strategies.

    Strategies are tried in this order and the first success is returned:

    1. ``"valid"``: parse the content unchanged.
    2. ``"fixed_chinese_quotes"``: parse after ``fix_chinese_quotes``.
    3. ``"fixed_by_json_repair"``: the optional ``json_repair`` package,
       applied to the raw content; used only if it returns a truthy result.
    4. ``"fixed_unescaped_quotes"``: ``fix_unescaped_quotes_in_values`` on
       the output of step 2.
    5. ``"fixed_escaped_inner_quotes"``: ``_fix_by_escaping_inner_quotes``
       (regex over known text fields) on the output of step 2.
    6. ``"fixed_brute_force"``: ``_fix_brute_force_escape`` on the raw content.

    Returns:
        ``(success, data, fix_description)``. On failure this is
        ``(False, None, "unfixable")``.
    """

    def _attempt(candidate: str, label: str) -> Optional[Tuple[bool, Any, str]]:
        # Success triple when ``candidate`` parses, otherwise ``None``.
        try:
            return True, json.loads(candidate), label
        except json.JSONDecodeError:
            return None

    outcome = _attempt(content, "valid")
    if outcome is not None:
        return outcome

    dequoted = fix_chinese_quotes(content)
    outcome = _attempt(dequoted, "fixed_chinese_quotes")
    if outcome is not None:
        return outcome

    # json_repair is optional: a missing package or any failure inside it
    # simply moves on to the hand-written strategies.
    try:
        import json_repair
        repaired = json_repair.repair_json(content, return_objects=True)
        if repaired:
            return True, repaired, "fixed_by_json_repair"
    except Exception:
        pass

    # Each repair runs only if all earlier strategies failed.
    remaining = (
        (fix_unescaped_quotes_in_values, dequoted, "fixed_unescaped_quotes"),
        (_fix_by_escaping_inner_quotes, dequoted, "fixed_escaped_inner_quotes"),
        (_fix_brute_force_escape, content, "fixed_brute_force"),
    )
    for repair, raw, label in remaining:
        outcome = _attempt(repair(raw), label)
        if outcome is not None:
            return outcome

    return False, None, "unfixable"


def fix_json_file(file_path: Path, backup: bool = True) -> Dict[str, Any]:
    """
    Repair a single JSON file in place.

    The file is read as UTF-8 and handed to ``try_fix_and_parse``. If a
    repair was needed, the parsed data is re-serialised (``ensure_ascii=False``,
    ``indent=2``) and written back over the original file.

    Args:
        file_path: File to repair.
        backup: When true and a repair is applied, the original content is
            first saved next to the file as ``<original name>.bak``.

    Returns:
        A dict with the keys:

        * ``"success"``: the file is valid JSON now (already valid or repaired).
        * ``"fixed"``: the file was rewritten.
        * ``"message"``: the fix description, ``"Already valid JSON"``,
          ``"unfixable"``, or a read/write error message.
        * ``"file"``: ``str(file_path)``.
        * ``"backup"``: backup path; present only when a backup was written.

        I/O failures are reported through ``"message"`` with ``"success"``
        left false instead of being raised, so a batch run can continue.
    """
    result = {"success": False, "message": "", "fixed": False, "file": str(file_path)}

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        result["message"] = f"File read error: {e}"
        return result

    success, data, fix_desc = try_fix_and_parse(content)

    if not success:
        result["message"] = "unfixable"
        return result

    if fix_desc == "valid":
        result["message"] = "Already valid JSON"
        result["success"] = True
        return result

    try:
        # Serialise before opening the target, so a serialisation failure
        # cannot leave a truncated file behind.
        serialized = json.dumps(data, ensure_ascii=False, indent=2)

        if backup:
            # Append ".bak" to the full name; this keeps the original
            # extension, so backups of "x.txt" and "x.json" cannot collide.
            backup_path = file_path.with_name(file_path.name + '.bak')
            backup_path.write_text(content, encoding='utf-8')
            result["backup"] = str(backup_path)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except (OSError, TypeError, ValueError) as e:
        result["message"] = f"File write error: {e}"
        return result

    result["fixed"] = True
    result["message"] = fix_desc
    result["success"] = True
    return result


def fix_directory(dir_path: Path, pattern: str = "case_*.json") -> Dict[str, Any]:
    """
    Repair every file in ``dir_path`` whose name matches ``pattern``.

    Files are processed in sorted order through ``fix_json_file`` (backups
    enabled) and one status line per file is printed: ``[FIXED]``, ``[OK]``
    or ``[FAILED]``.

    Returns:
        A dict with the counters ``"total"``, ``"fixed"`` and ``"failed"``,
        plus ``"results"``, the list of per-file result dicts.
    """
    results = []
    fixed = 0
    failed = 0

    for file_path in sorted(dir_path.glob(pattern)):
        result = fix_json_file(file_path, backup=True)
        results.append(result)

        if not result["success"]:
            failed += 1
            print(f"[FAILED] {file_path.name}: {result['message']}")
            continue

        if not result["fixed"]:
            print(f"[OK] {file_path.name}")
            continue

        fixed += 1
        print(f"[FIXED] {file_path.name}: {result['message']}")

    # One result is recorded per processed file, so the list length is the total.
    return {
        "total": len(results),
        "fixed": fixed,
        "failed": failed,
        "results": results,
    }


if __name__ == "__main__":
    import sys

    # The directory to process is the first command-line argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python fix_json_quotes.py <directory>")
        sys.exit(1)

    dir_path = Path(cli_args[0])
    rule = "=" * 60

    print(f"Fixing JSON files in: {dir_path}")
    print(rule)
    summary = fix_directory(dir_path)
    print(rule)
    print(f"Total: {summary['total']}, Fixed: {summary['fixed']}, Failed: {summary['failed']}")