count_schema_registry.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. from __future__ import annotations
  2. import ast
  3. import json
  4. import re
  5. from dataclasses import dataclass
  6. from pathlib import Path
  7. from typing import Any
  8. ROOT = Path(__file__).resolve().parents[1]
  9. SQL_PATH = ROOT / "sql/content_agent_schema.sql"
  10. RUNTIME_FILES_PATH = ROOT / "content_agent/integrations/runtime_files.py"
  11. REGISTRY_PATH = ROOT / "tech_documents/数据库字段总览/content_agent_schema_registry.json"
  12. REPORT_JSON = ROOT / "tech_documents/数据库字段总览/content_agent_schema_registry_count_report.json"
  13. REPORT_MD = ROOT / "tech_documents/数据库字段总览/content_agent_schema_registry_count_report.md"
  14. EXPECTED_BUSINESS_MODULES = [
  15. "source_seed",
  16. "search_intent",
  17. "platform_access",
  18. "content_discovery",
  19. "rule_judgment",
  20. "walk_strategy",
  21. "policy_version",
  22. "result_source_lookup",
  23. "run_record",
  24. "learning_review",
  25. ]
  26. @dataclass(frozen=True)
  27. class ColumnDef:
  28. name: str
  29. column_type: str
  30. data_type: str
  31. nullable: bool
  32. default: str | None
  33. extra: str | None
  34. ordinal_position: int
  35. @property
  36. def is_json(self) -> bool:
  37. return self.data_type.upper() == "JSON"
  38. @dataclass(frozen=True)
  39. class IndexDef:
  40. name: str
  41. columns: list[str]
  42. unique: bool
  43. @dataclass(frozen=True)
  44. class TableDef:
  45. name: str
  46. columns: list[ColumnDef]
  47. unique_indexes: list[IndexDef]
  48. secondary_indexes: list[IndexDef]
  49. def main() -> None:
  50. sql_tables = parse_sql_schema(SQL_PATH)
  51. runtime_files = parse_runtime_filenames(RUNTIME_FILES_PATH)
  52. registry = _load_registry()
  53. report = build_report(sql_tables, runtime_files, registry)
  54. REPORT_JSON.parent.mkdir(parents=True, exist_ok=True)
  55. REPORT_JSON.write_text(
  56. json.dumps(report, ensure_ascii=False, indent=2) + "\n",
  57. encoding="utf-8",
  58. )
  59. REPORT_MD.write_text(_render_markdown(report), encoding="utf-8")
  60. print(json.dumps(report["summary"], ensure_ascii=False))
  61. def parse_sql_schema(path: Path = SQL_PATH) -> dict[str, TableDef]:
  62. text = path.read_text(encoding="utf-8")
  63. tables: dict[str, TableDef] = {}
  64. pattern = re.compile(
  65. r"CREATE TABLE IF NOT EXISTS\s+`?(content_agent_[A-Za-z0-9_]+)`?\s*\((.*?)\)\s+ENGINE=",
  66. re.S,
  67. )
  68. for table_name, body in pattern.findall(text):
  69. columns: list[ColumnDef] = []
  70. unique_indexes: list[IndexDef] = []
  71. secondary_indexes: list[IndexDef] = []
  72. for line in _split_table_body(body):
  73. parsed_column = _parse_column_line(line, len(columns) + 1)
  74. if parsed_column:
  75. columns.append(parsed_column)
  76. continue
  77. parsed_index = _parse_index_line(line)
  78. if parsed_index:
  79. if parsed_index.unique:
  80. unique_indexes.append(parsed_index)
  81. else:
  82. secondary_indexes.append(parsed_index)
  83. tables[table_name] = TableDef(
  84. name=table_name,
  85. columns=columns,
  86. unique_indexes=unique_indexes,
  87. secondary_indexes=secondary_indexes,
  88. )
  89. return tables
  90. def parse_runtime_filenames(path: Path = RUNTIME_FILES_PATH) -> list[str]:
  91. tree = ast.parse(path.read_text(encoding="utf-8"))
  92. for node in tree.body:
  93. if isinstance(node, ast.Assign):
  94. targets = [target.id for target in node.targets if isinstance(target, ast.Name)]
  95. if "RUNTIME_FILENAMES" in targets:
  96. value = ast.literal_eval(node.value)
  97. if isinstance(value, list):
  98. return [str(item) for item in value]
  99. raise ValueError("RUNTIME_FILENAMES not found")
  100. def build_report(
  101. sql_tables: dict[str, TableDef],
  102. runtime_files: list[str],
  103. registry: dict[str, Any] | None,
  104. ) -> dict[str, Any]:
  105. table_summaries = {
  106. name: {
  107. "column_count": len(table.columns),
  108. "json_column_count": sum(1 for column in table.columns if column.is_json),
  109. "unique_index_count": len(table.unique_indexes),
  110. "secondary_index_count": len(table.secondary_indexes),
  111. "json_columns": [column.name for column in table.columns if column.is_json],
  112. }
  113. for name, table in sql_tables.items()
  114. }
  115. summary = {
  116. "table_count": len(sql_tables),
  117. "sql_column_count": sum(len(table.columns) for table in sql_tables.values()),
  118. "json_column_count": sum(
  119. 1 for table in sql_tables.values() for column in table.columns if column.is_json
  120. ),
  121. "unique_index_count": sum(len(table.unique_indexes) for table in sql_tables.values()),
  122. "secondary_index_count": sum(
  123. len(table.secondary_indexes) for table in sql_tables.values()
  124. ),
  125. "business_module_count": len(EXPECTED_BUSINESS_MODULES),
  126. "runtime_file_count": len(runtime_files),
  127. }
  128. registry_summary: dict[str, Any] = {}
  129. if registry:
  130. registry_summary = {
  131. "table_count": len(registry.get("tables", {})),
  132. "business_module_count": len(registry.get("business_modules", {})),
  133. "runtime_file_count": len(registry.get("runtime_files", {})),
  134. "field_reference_count": sum(
  135. len(table.get("columns", [])) for table in registry.get("tables", {}).values()
  136. ),
  137. }
  138. return {
  139. "summary": summary,
  140. "source_files": {
  141. "sql": str(SQL_PATH.relative_to(ROOT)),
  142. "runtime_files": str(RUNTIME_FILES_PATH.relative_to(ROOT)),
  143. "registry": str(REGISTRY_PATH.relative_to(ROOT)),
  144. },
  145. "tables": table_summaries,
  146. "runtime_files": runtime_files,
  147. "business_modules": EXPECTED_BUSINESS_MODULES,
  148. "registry_summary": registry_summary,
  149. }
  150. def _load_registry() -> dict[str, Any] | None:
  151. if not REGISTRY_PATH.exists():
  152. return None
  153. return json.loads(REGISTRY_PATH.read_text(encoding="utf-8"))
  154. def _split_table_body(body: str) -> list[str]:
  155. lines: list[str] = []
  156. current: list[str] = []
  157. depth = 0
  158. for char in body:
  159. if char == "(":
  160. depth += 1
  161. elif char == ")":
  162. depth -= 1
  163. if char == "," and depth == 0:
  164. line = "".join(current).strip()
  165. if line:
  166. lines.append(line)
  167. current = []
  168. else:
  169. current.append(char)
  170. line = "".join(current).strip()
  171. if line:
  172. lines.append(line)
  173. return [line.replace("\n", " ").strip() for line in lines]
  174. def _parse_column_line(line: str, ordinal_position: int) -> ColumnDef | None:
  175. if line.startswith(("PRIMARY KEY", "UNIQUE KEY", "KEY", "CONSTRAINT")):
  176. return None
  177. match = re.match(r"`?([A-Za-z_][A-Za-z0-9_]*)`?\s+(.+)$", line)
  178. if not match:
  179. return None
  180. name, rest = match.groups()
  181. tokens = rest.split()
  182. column_type_tokens: list[str] = []
  183. for token in tokens:
  184. if token.upper() in {"NOT", "NULL", "DEFAULT", "AUTO_INCREMENT", "ON", "COMMENT"}:
  185. break
  186. column_type_tokens.append(token)
  187. column_type = " ".join(column_type_tokens)
  188. data_type = column_type.split("(", 1)[0].upper()
  189. nullable = "NOT NULL" not in rest.upper()
  190. default_match = re.search(
  191. r"\bDEFAULT\s+((?:'[^']*')|(?:[A-Za-z0-9_().+-]+))",
  192. rest,
  193. flags=re.I,
  194. )
  195. default = default_match.group(1) if default_match else None
  196. extras: list[str] = []
  197. if "AUTO_INCREMENT" in rest.upper():
  198. extras.append("AUTO_INCREMENT")
  199. if "ON UPDATE" in rest.upper():
  200. extras.append(rest[rest.upper().index("ON UPDATE") :])
  201. return ColumnDef(
  202. name=name,
  203. column_type=column_type,
  204. data_type=data_type,
  205. nullable=nullable,
  206. default=default,
  207. extra=" ".join(extras) or None,
  208. ordinal_position=ordinal_position,
  209. )
  210. def _parse_index_line(line: str) -> IndexDef | None:
  211. unique = line.startswith("UNIQUE KEY")
  212. if not unique and not line.startswith("KEY"):
  213. return None
  214. match = re.match(r"(?:UNIQUE\s+)?KEY\s+`?([A-Za-z0-9_]+)`?\s*\((.+)\)", line)
  215. if not match:
  216. return None
  217. name, columns_text = match.groups()
  218. columns = [
  219. re.sub(r"\(\d+\)$", "", part.strip().strip("`"))
  220. for part in columns_text.split(",")
  221. ]
  222. return IndexDef(name=name, columns=columns, unique=unique)
  223. def _render_markdown(report: dict[str, Any]) -> str:
  224. summary = report["summary"]
  225. lines = [
  226. "# Schema Registry 计数报告",
  227. "",
  228. f"- 数据库表数量:`{summary['table_count']}`",
  229. f"- SQL 字段数量:`{summary['sql_column_count']}`",
  230. f"- JSON 字段数量:`{summary['json_column_count']}`",
  231. f"- 唯一键数量:`{summary['unique_index_count']}`",
  232. f"- 二级索引数量:`{summary['secondary_index_count']}`",
  233. f"- 业务模块数量:`{summary['business_module_count']}`",
  234. f"- Runtime 文件数量:`{summary['runtime_file_count']}`",
  235. "",
  236. "## 按表计数",
  237. "",
  238. "| 表 | 字段 | JSON 字段 | 唯一键 | 二级索引 |",
  239. "|---|---:|---:|---:|---:|",
  240. ]
  241. for table_name, table in report["tables"].items():
  242. lines.append(
  243. "| {name} | {columns} | {json_columns} | {unique_indexes} | {secondary_indexes} |".format(
  244. name=f"`{table_name}`",
  245. columns=table["column_count"],
  246. json_columns=table["json_column_count"],
  247. unique_indexes=table["unique_index_count"],
  248. secondary_indexes=table["secondary_index_count"],
  249. )
  250. )
  251. lines.extend(["", "## Runtime 文件", ""])
  252. for filename in report["runtime_files"]:
  253. lines.append(f"- `{filename}`")
  254. lines.append("")
  255. return "\n".join(lines)
  256. if __name__ == "__main__":
  257. main()