check_naming_standards.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. from __future__ import annotations
  2. import json
  3. import re
  4. from dataclasses import dataclass
  5. from pathlib import Path
  6. from typing import Any
  7. ROOT = Path(__file__).resolve().parents[1]
  8. MANIFEST_PATH = ROOT / "tech_documents/命名规范/naming_standards_manifest.json"
  9. REPORT_JSON = ROOT / "tech_documents/命名规范/naming_standards_report.json"
  10. REPORT_MD = ROOT / "tech_documents/命名规范/naming_standards_report.md"
  11. SCAN_ROOTS = [
  12. "product_documents",
  13. "tech_documents",
  14. "content_agent",
  15. "tests",
  16. "sql",
  17. "scripts",
  18. ]
  19. TEXT_EXTENSIONS = {
  20. ".md",
  21. ".json",
  22. ".py",
  23. ".sql",
  24. ".toml",
  25. ".txt",
  26. }
  27. @dataclass(frozen=True)
  28. class Finding:
  29. term: str
  30. path: str
  31. line_number: int
  32. line: str
  33. reason: str
  34. def main() -> None:
  35. manifest = json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
  36. banned_terms = manifest["banned_terms"]
  37. exception_paths = manifest["allowed_exception_paths"]
  38. violations: list[Finding] = []
  39. allowed_exceptions: list[Finding] = []
  40. for path in _iter_scan_files():
  41. rel_path = path.relative_to(ROOT).as_posix()
  42. text = path.read_text(encoding="utf-8", errors="ignore")
  43. for line_number, line in enumerate(text.splitlines(), start=1):
  44. for term in banned_terms:
  45. if not _line_contains_term(line, term):
  46. continue
  47. if _is_allowed_exception(rel_path, line, exception_paths, term):
  48. allowed_exceptions.append(
  49. Finding(term, rel_path, line_number, line.strip(), "allowed_exception")
  50. )
  51. else:
  52. violations.append(
  53. Finding(term, rel_path, line_number, line.strip(), "violation")
  54. )
  55. payload = {
  56. "status": "fail" if violations else "pass",
  57. "violation_count": len(violations),
  58. "allowed_exception_count": len(allowed_exceptions),
  59. "violations": [finding.__dict__ for finding in violations],
  60. "allowed_exceptions": [finding.__dict__ for finding in allowed_exceptions],
  61. }
  62. REPORT_JSON.write_text(
  63. json.dumps(payload, ensure_ascii=False, indent=2) + "\n",
  64. encoding="utf-8",
  65. )
  66. REPORT_MD.write_text(_render_markdown(payload), encoding="utf-8")
  67. print(json.dumps({"status": payload["status"], "violations": len(violations)}, ensure_ascii=False))
  68. if violations:
  69. raise SystemExit(1)
  70. def _iter_scan_files() -> list[Path]:
  71. paths: list[Path] = []
  72. for root in SCAN_ROOTS:
  73. base = ROOT / root
  74. if not base.exists():
  75. continue
  76. for path in base.rglob("*"):
  77. if not path.is_file():
  78. continue
  79. if "__pycache__" in path.parts:
  80. continue
  81. if path.name in {
  82. "naming_standards_report.json",
  83. "naming_standards_report.md",
  84. }:
  85. continue
  86. if path.suffix not in TEXT_EXTENSIONS:
  87. continue
  88. paths.append(path)
  89. return sorted(paths)
  90. def _line_contains_term(line: str, term: str) -> bool:
  91. if term.endswith(".jsonl"):
  92. return term in line
  93. return re.search(
  94. rf"(?<![A-Za-z0-9_]){re.escape(term)}(?![A-Za-z0-9_])",
  95. line,
  96. ) is not None
  97. def _is_allowed_exception(rel_path: str, line: str, exception_paths: list[str], term: str = "") -> bool:
  98. if any(rel_path.startswith(path) or rel_path == path for path in exception_paths):
  99. return True
  100. legacy_runtime_aliases = {
  101. "queries.jsonl",
  102. "candidate_pool.jsonl",
  103. "media_assets.jsonl",
  104. "source_edges.jsonl",
  105. "trace_events.jsonl",
  106. }
  107. if rel_path == "tech_documents/数据库字段总览/content_agent_schema_registry.json" and any(
  108. alias in line for alias in legacy_runtime_aliases
  109. ):
  110. return True
  111. if rel_path in {"scripts/validate_schema_registry.py", "scripts/check_naming_standards.py"} and any(
  112. alias in line for alias in legacy_runtime_aliases
  113. ):
  114. return True
  115. if rel_path == "scripts/check_naming_standards.py" and term == "evidence_refs":
  116. return True
  117. if "platform_raw_payload" in line:
  118. return True
  119. if "source_post_id" in line or "matched_post_ids" in line or "video_ids" in line:
  120. return True
  121. if term == "evidence_refs" and rel_path in {
  122. "tech_documents/数据接口与来源/01_DemandAgent输入合同.md",
  123. "tech_documents/Pattern回扣与分类树/00_全链路说明.md",
  124. "tech_documents/Pattern回扣与分类树/02_前置坑与FAQ.md",
  125. }:
  126. return True
  127. return False
  128. def _render_markdown(payload: dict[str, Any]) -> str:
  129. lines = [
  130. "# 命名规范检查报告",
  131. "",
  132. f"- 状态:`{payload['status']}`",
  133. f"- 违规数量:`{payload['violation_count']}`",
  134. f"- 允许例外数量:`{payload['allowed_exception_count']}`",
  135. "",
  136. ]
  137. if payload["violations"]:
  138. lines.extend(["## 违规残留", ""])
  139. for finding in payload["violations"][:200]:
  140. lines.append(
  141. f"- `{finding['term']}` at `{finding['path']}:{finding['line_number']}`: {finding['line']}"
  142. )
  143. if len(payload["violations"]) > 200:
  144. lines.append(f"- 其余 {len(payload['violations']) - 200} 条见 JSON 报告。")
  145. lines.append("")
  146. if payload["allowed_exceptions"]:
  147. lines.extend(["## 允许例外", ""])
  148. for finding in payload["allowed_exceptions"][:100]:
  149. lines.append(
  150. f"- `{finding['term']}` at `{finding['path']}:{finding['line_number']}`"
  151. )
  152. if len(payload["allowed_exceptions"]) > 100:
  153. lines.append(f"- 其余 {len(payload['allowed_exceptions']) - 100} 条见 JSON 报告。")
  154. lines.append("")
  155. return "\n".join(lines) + "\n"
  156. if __name__ == "__main__":
  157. main()