| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292 |
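"""
Schema management utility: load and validate JSON Schemas in one place.

Design principles:
1. Every prompt file has a matching .schema.json file.
2. The schema file lives in the same directory as its prompt file.
3. Each schema file is fully self-contained and does not rely on external references.
4. All validation is performed by the jsonschema library; field names are no longer hard-coded.

Suffix conventions (used to mark contract boundaries):
- `-boundary`: container field; the name is fixed, but the elements inside may evolve (e.g. abilities-boundary).
- `-ref`: field referenced directly from outside; both name and type are fixed (e.g. ability_id-ref).
- no suffix: internal field, free to evolve.

Suffixes are stripped automatically during validation, so the key that is actually
matched is the name without its suffix.
"""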
- import json
- from pathlib import Path
- from typing import Any, Dict, Optional, Tuple
- import copy
- try:
- import jsonschema
- from jsonschema import Draft7Validator, ValidationError
- JSONSCHEMA_AVAILABLE = True
- except ImportError:
- JSONSCHEMA_AVAILABLE = False
- print("Warning: jsonschema not installed. Run: pip install jsonschema")
class SchemaManager:
    """Load, cache and apply the JSON Schema that accompanies each prompt.

    Property names in a schema file may carry a contract suffix (see
    ``CONTRACT_SUFFIXES``). The suffix is an annotation for schema authors
    only: it is removed before the schema is used, so the keys matched against
    data are the un-suffixed names.
    """

    # Suffixes that mark contract boundaries on schema property names.
    CONTRACT_SUFFIXES = ["-boundary", "-ref"]

    # Keywords whose values are instance data, not sub-schemas; they must be
    # copied verbatim instead of being walked as if they were schema.
    _DATA_KEYWORDS = frozenset({"enum", "const", "default", "examples"})

    # Keywords whose values map an arbitrary name to a sub-schema; the names
    # are not schema keywords and must not be interpreted as such.
    _SCHEMA_MAP_KEYWORDS = frozenset({"definitions", "$defs", "patternProperties"})

    def __init__(self, prompts_dir: Path):
        """Create a manager that reads schema files below ``prompts_dir``.

        Args:
            prompts_dir: Directory that contains the prompt and schema files.
        """
        self.prompts_dir = Path(prompts_dir)
        # prompt name -> parsed (un-stripped) schema
        self._schema_cache: Dict[str, Dict] = {}

    def load_schema(self, prompt_name: str) -> Optional[Dict]:
        """Return the raw schema for ``prompt_name``, loading it on first use.

        The file ``<prompt_name>.schema.json`` is looked up in ``prompts_dir``
        first and in ``prompts_dir/temp_schema`` second. Successfully parsed
        schemas are cached; the cached object itself is returned, so callers
        must not mutate it.

        Args:
            prompt_name: Prompt file name without the ``.prompt`` suffix.

        Returns:
            The parsed schema, or None when no schema file exists or the file
            cannot be read or parsed (the error is printed in that case).
        """
        if prompt_name in self._schema_cache:
            return self._schema_cache[prompt_name]

        file_name = f"{prompt_name}.schema.json"
        candidates = (
            self.prompts_dir / file_name,
            self.prompts_dir / "temp_schema" / file_name,
        )
        schema_file = next((path for path in candidates if path.exists()), None)
        if schema_file is None:
            return None

        try:
            with open(schema_file, "r", encoding="utf-8") as f:
                schema = json.load(f)
        except Exception as e:
            # Deliberate best-effort: a broken schema file is reported and then
            # treated like a missing one.
            print(f"Error loading schema {schema_file}: {e}")
            return None

        self._schema_cache[prompt_name] = schema
        return schema

    @classmethod
    def _strip_suffix(cls, key: str) -> str:
        """Return ``key`` without its contract suffix, if it has one.

        Non-string values are returned unchanged so that a malformed
        ``required`` list cannot raise here.
        """
        if not isinstance(key, str):
            return key
        for suffix in cls.CONTRACT_SUFFIXES:
            if key.endswith(suffix):
                return key[:-len(suffix)]
        return key

    @classmethod
    def _strip_schema(cls, schema: Any) -> Any:
        """Return a copy of ``schema`` with contract suffixes removed.

        Only the property names under ``properties`` and the entries of
        ``required`` are rewritten. Values of ``enum``, ``const``, ``default``
        and ``examples`` are instance data and are deep-copied verbatim. The
        input object is never modified.
        """
        if isinstance(schema, list):
            return [cls._strip_schema(item) for item in schema]
        if not isinstance(schema, dict):
            return schema

        result = {}
        for key, value in schema.items():
            if key == "properties" and isinstance(value, dict):
                # {field_name: field_schema}: rename the field, walk its schema.
                result[key] = {
                    cls._strip_suffix(name): cls._strip_schema(sub_schema)
                    for name, sub_schema in value.items()
                }
            elif key == "required" and isinstance(value, list):
                result[key] = [cls._strip_suffix(name) for name in value]
            elif key in cls._DATA_KEYWORDS:
                # Data may legitimately contain keys such as "required" or
                # "properties"; walking it as schema would corrupt or crash.
                result[key] = copy.deepcopy(value)
            elif key in cls._SCHEMA_MAP_KEYWORDS and isinstance(value, dict):
                # The names here are free-form (a definition may be called
                # "properties"), so only the mapped schemas are walked.
                result[key] = {
                    name: cls._strip_schema(sub_schema)
                    for name, sub_schema in value.items()
                }
            else:
                result[key] = cls._strip_schema(value)
        return result

    def validate(self, data: Any, prompt_name: str) -> Tuple[bool, Optional[str]]:
        """Validate ``data`` against the suffix-stripped schema of a prompt.

        Args:
            data: The value to validate.
            prompt_name: Prompt file name without the ``.prompt`` suffix.

        Returns:
            ``(is_valid, error_message)``. ``(True, None)`` is also returned
            when jsonschema is not installed or no schema can be loaded, i.e.
            validation is skipped rather than failed.
        """
        if not JSONSCHEMA_AVAILABLE:
            return True, None

        schema = self.load_schema(prompt_name)
        if schema is None:
            return True, None

        try:
            clean_schema = self._strip_schema(schema)
            Draft7Validator(clean_schema).validate(data)
        except ValidationError as e:
            path = ".".join(str(p) for p in e.absolute_path) if e.absolute_path else "root"
            return False, f"{path}: {e.message}"
        except Exception as e:
            # Anything else (e.g. an invalid schema) is reported as a failure.
            return False, f"Validation error: {str(e)}"
        return True, None

    def get_example_output(self, prompt_name: str) -> Optional[Dict]:
        """Return an example output for a prompt, if one can be derived.

        A top-level ``examples`` entry in the schema wins: its first element
        is returned as a copy (None when the list is empty). Otherwise a
        minimal example is generated from the suffix-stripped schema, so its
        keys are the ones validation actually expects.

        Args:
            prompt_name: Prompt file name without the ``.prompt`` suffix.

        Returns:
            The example, or None when no schema or no example is available.
        """
        schema = self.load_schema(prompt_name)
        if schema is None:
            return None

        if "examples" in schema:
            examples = schema["examples"]
            if isinstance(examples, list) and examples:
                # Copy so callers cannot mutate the cached schema.
                return copy.deepcopy(examples[0])
            return None

        return self._generate_minimal_example(self._strip_schema(schema))

    def get_stripped_schema(self, prompt_name: str) -> Optional[Dict]:
        """Return the suffix-stripped schema of a prompt.

        Args:
            prompt_name: Prompt file name without the ``.prompt`` suffix.

        Returns:
            A fresh stripped copy of the schema, or None when no schema can be
            loaded.
        """
        schema = self.load_schema(prompt_name)
        if schema is None:
            return None
        return self._strip_schema(schema)

    def _generate_minimal_example(self, schema: Dict) -> Dict:
        """Build a minimal example containing the required fields of a schema.

        Only top-level required properties with a recognised ``type`` are
        filled in; anything else is left out. Keys are taken from ``schema``
        as given, so pass a stripped schema to get un-suffixed keys.

        Args:
            schema: A JSON Schema dictionary.

        Returns:
            The example dictionary; empty when the schema is not an object
            schema.
        """
        if not isinstance(schema, dict) or schema.get("type") != "object":
            return {}

        properties = schema.get("properties", {})
        example = {}
        for key in schema.get("required", []):
            prop = properties.get(key) if isinstance(properties, dict) else None
            if not isinstance(prop, dict):
                continue

            prop_type = prop.get("type")
            if prop_type in ("string", "integer", "number"):
                samples = prop.get("examples")
                if isinstance(samples, list) and samples:
                    example[key] = samples[0]
                else:
                    # No usable example (missing or empty list): type default.
                    example[key] = "" if prop_type == "string" else 0
            elif prop_type == "boolean":
                example[key] = prop.get("default", False)
            elif prop_type == "array":
                example[key] = []
            elif prop_type == "object":
                example[key] = {}
            elif prop_type == "null" or (isinstance(prop_type, list) and "null" in prop_type):
                example[key] = None
        return example
# Process-wide singleton, created lazily by get_schema_manager().
_schema_manager: Optional[SchemaManager] = None


def get_schema_manager(prompts_dir: Optional[Path] = None) -> SchemaManager:
    """Return the shared SchemaManager, creating it on the first call.

    Args:
        prompts_dir: Prompts directory used when the singleton is created.
            When omitted, ``<parent of this file's directory>/prompts`` is
            used. Once the singleton exists this argument is ignored.

    Returns:
        The shared SchemaManager instance.
    """
    global _schema_manager

    if _schema_manager is not None:
        return _schema_manager

    if prompts_dir is not None:
        resolved_dir = prompts_dir
    else:
        # Default location: a "prompts" folder next to this file's package dir.
        resolved_dir = Path(__file__).parent.parent / "prompts"

    _schema_manager = SchemaManager(resolved_dir)
    return _schema_manager
def validate_with_schema(data: Any, prompt_name: str) -> Optional[str]:
    """Validate ``data`` with the shared schema manager.

    Args:
        data: The value to validate.
        prompt_name: Prompt file name without the ``.prompt`` suffix.

    Returns:
        The error message when validation fails, otherwise None.
    """
    passed, message = get_schema_manager().validate(data, prompt_name)
    if passed:
        return None
    return message
# Example usage
if __name__ == "__main__":
    # NOTE(review): indentation was lost in this copy of the file; the nesting
    # below (validation demo inside `if schema:`) is assumed — confirm against
    # the original source.

    # Obtain the shared manager (default prompts directory).
    manager = get_schema_manager()

    # Try to load the extract_workflow schema.
    schema = manager.load_schema("extract_workflow")
    if schema:
        print("✓ Loaded extract_workflow.schema.json")

        # Validate a hand-written sample against that schema.
        test_data = {
            "id": "strategy-001",
            "name": "测试工序",
            "description": "这是一个测试",
            "modality": "图文",
            "inputs": {},
            "outputs": {},
            "steps": [
                {
                    "order": 1,
                    "type": "capability",
                    "description": "测试步骤",
                    "inputs": {},
                    "outputs": {}
                }
            ]
        }
        is_valid, error = manager.validate(test_data, "extract_workflow")
        if is_valid:
            print("✓ Validation passed")
        else:
            print(f"✗ Validation failed: {error}")
|