""" Schema 管理工具:统一加载和验证 JSON Schema 设计原则: 1. 每个 prompt 文件对应一个 .schema.json 文件 2. Schema 文件和 prompt 文件放在同一目录 3. 每个 schema 文件完全独立,不依赖外部引用 4. 所有验证逻辑都通过 jsonschema 库自动完成,不再硬编码字段名 后缀约定(用于标注契约边界): - `-boundary`: 容器字段,名称不可变,内部元素可演进(如 abilities-boundary) - `-ref`: 被外部直接引用的字段,名称和类型都不可变(如 ability_id-ref) - 无后缀: 内部字段,可自由演进 校验时自动剥离后缀,实际匹配的 key 是去掉后缀的版本。 """ import json from pathlib import Path from typing import Any, Dict, Optional, Tuple import copy try: import jsonschema from jsonschema import Draft7Validator, ValidationError JSONSCHEMA_AVAILABLE = True except ImportError: JSONSCHEMA_AVAILABLE = False print("Warning: jsonschema not installed. Run: pip install jsonschema") class SchemaManager: """Schema 管理器,负责加载和验证 JSON Schema""" CONTRACT_SUFFIXES = ["-boundary", "-ref"] def __init__(self, prompts_dir: Path): """ 初始化 Schema 管理器 Args: prompts_dir: prompts 目录路径 """ self.prompts_dir = Path(prompts_dir) self._schema_cache: Dict[str, Dict] = {} def load_schema(self, prompt_name: str) -> Optional[Dict]: """ 加载指定 prompt 对应的 schema Args: prompt_name: prompt 文件名(不含 .prompt 后缀) Returns: Schema 字典,如果文件不存在则返回 None """ # 检查缓存 if prompt_name in self._schema_cache: return self._schema_cache[prompt_name] # 加载 schema 文件(先找 prompts/,再找 prompts/temp_schema/) schema_file = self.prompts_dir / f"{prompt_name}.schema.json" if not schema_file.exists(): schema_file = self.prompts_dir / "temp_schema" / f"{prompt_name}.schema.json" if not schema_file.exists(): return None try: with open(schema_file, "r", encoding="utf-8") as f: schema = json.load(f) self._schema_cache[prompt_name] = schema return schema except Exception as e: print(f"Error loading schema {schema_file}: {e}") return None @classmethod def _strip_suffix(cls, key: str) -> str: """剥离契约后缀,返回实际字段名""" for suffix in cls.CONTRACT_SUFFIXES: if key.endswith(suffix): return key[:-len(suffix)] return key @classmethod def _strip_schema(cls, schema: Any) -> Any: """ 递归遍历 schema,将所有带后缀的 key 替换为剥离后的版本。 返回一份新的 schema(不修改原始对象)。 """ if isinstance(schema, dict): result = {} for k, v in schema.items(): new_key = k # 只对 properties 和 required 里的 key 做剥离 if k == "properties": # properties 的 value 是 {field_name: field_schema} result[k] = { cls._strip_suffix(fk): cls._strip_schema(fv) for fk, fv in v.items() } elif k == "required": # required 是字段名数组 result[k] = [cls._strip_suffix(r) for r in v] else: result[k] = cls._strip_schema(v) return result elif isinstance(schema, list): return [cls._strip_schema(item) for item in schema] else: return schema def validate(self, data: Any, prompt_name: str) -> Tuple[bool, Optional[str]]: """ 使用 JSON Schema 验证数据 Args: data: 要验证的数据 prompt_name: prompt 文件名(不含 .prompt 后缀) Returns: (is_valid, error_message) 元组 """ if not JSONSCHEMA_AVAILABLE: return True, None schema = self.load_schema(prompt_name) if schema is None: return True, None try: clean_schema = self._strip_schema(schema) validator = Draft7Validator(clean_schema) validator.validate(data) return True, None except ValidationError as e: path = ".".join(str(p) for p in e.absolute_path) if e.absolute_path else "root" return False, f"{path}: {e.message}" except Exception as e: return False, f"Validation error: {str(e)}" def get_example_output(self, prompt_name: str) -> Optional[Dict]: """ 从 schema 中提取示例输出(如果有的话) Args: prompt_name: prompt 文件名(不含 .prompt 后缀) Returns: 示例输出字典,如果没有则返回 None """ schema = self.load_schema(prompt_name) if schema is None: return None # 尝试从 schema 中提取 examples if "examples" in schema: return schema["examples"][0] if schema["examples"] else None # 或者根据 schema 生成一个最小示例 return self._generate_minimal_example(schema) def get_stripped_schema(self, prompt_name: str) -> Optional[Dict]: """ 获取剥离后缀的 schema(用于传给 LLM 的 response_format) Args: prompt_name: prompt 文件名(不含 .prompt 后缀) Returns: 剥离后缀的 schema 字典,如果文件不存在则返回 None """ schema = self.load_schema(prompt_name) if schema is None: return None return self._strip_schema(schema) def _generate_minimal_example(self, schema: Dict) -> Dict: """ 根据 schema 生成一个最小示例 Args: schema: JSON Schema 字典 Returns: 最小示例字典 """ if schema.get("type") != "object": return {} example = {} required = schema.get("required", []) properties = schema.get("properties", {}) for key in required: if key in properties: prop = properties[key] prop_type = prop.get("type") if prop_type == "string": example[key] = prop.get("examples", [""])[0] if "examples" in prop else "" elif prop_type == "integer": example[key] = prop.get("examples", [0])[0] if "examples" in prop else 0 elif prop_type == "boolean": example[key] = prop.get("default", False) elif prop_type == "array": example[key] = [] elif prop_type == "object": example[key] = {} elif isinstance(prop_type, list) and "null" in prop_type: example[key] = None return example # 全局单例 _schema_manager: Optional[SchemaManager] = None def get_schema_manager(prompts_dir: Optional[Path] = None) -> SchemaManager: """ 获取全局 Schema 管理器单例 Args: prompts_dir: prompts 目录路径(首次调用时必须提供) Returns: SchemaManager 实例 """ global _schema_manager if _schema_manager is None: if prompts_dir is None: # 默认路径 base_dir = Path(__file__).parent.parent prompts_dir = base_dir / "prompts" _schema_manager = SchemaManager(prompts_dir) return _schema_manager def validate_with_schema(data: Any, prompt_name: str) -> Optional[str]: """ 便捷函数:使用 schema 验证数据 Args: data: 要验证的数据 prompt_name: prompt 文件名(不含 .prompt 后缀) Returns: 错误消息字符串,如果验证通过则返回 None """ manager = get_schema_manager() is_valid, error = manager.validate(data, prompt_name) return error if not is_valid else None # 示例用法 if __name__ == "__main__": # 测试加载 schema manager = get_schema_manager() # 测试 extract_workflow schema schema = manager.load_schema("extract_workflow") if schema: print("✓ Loaded extract_workflow.schema.json") # 测试验证 test_data = { "id": "strategy-001", "name": "测试工序", "description": "这是一个测试", "modality": "图文", "inputs": {}, "outputs": {}, "steps": [ { "order": 1, "type": "capability", "description": "测试步骤", "inputs": {}, "outputs": {} } ] } is_valid, error = manager.validate(test_data, "extract_workflow") if is_valid: print("✓ Validation passed") else: print(f"✗ Validation failed: {error}")