build_static.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #!/usr/bin/env python3
"""Bundle extraction-viewer.html + data/search_result + data/extract_result + images
into a static build that can be shared and viewed offline.

Output:
    data/dist/extraction-viewer.html - single file; all JSON and marked.js are inlined
    data/dist/images/                - images copied as-is (not base64, to keep the HTML small)
    data/dist.zip                    - archive containing the two items above

Usage:
    python3 build_static.py              # build dist/ and dist.zip
    python3 build_static.py --no-zip     # build dist/ only
    python3 build_static.py --no-images  # do not copy images/
"""
  12. from __future__ import annotations
  13. import argparse
  14. import json
  15. import shutil
  16. import sys
  17. import urllib.request
  18. import zipfile
  19. from pathlib import Path
  20. ROOT = Path(__file__).resolve().parent
  21. PROJECT_ROOT = ROOT.parents[1]
  22. DATA_DIR = PROJECT_ROOT / "data"
  23. SEARCH_RESULT_DIR = DATA_DIR / "search_result"
  24. EXTRACT_RESULT_DIR = DATA_DIR / "extract_result"
  25. VIEWER = ROOT / "extraction-viewer.html"
  26. RESULT = SEARCH_RESULT_DIR / "result.json"
  27. EXTRACTED_CAP = EXTRACT_RESULT_DIR / "capability"
  28. EXTRACTED_STRAT = EXTRACT_RESULT_DIR / "strategy"
  29. BATCH = DATA_DIR / "batch_extracted"
  30. IMAGES = SEARCH_RESULT_DIR / "images"
  31. FIELD_SPECS = ROOT / "capability_strategy_fields.csv"
  32. DIST = DATA_DIR / "dist"
  33. DIST_ZIP = DATA_DIR / "dist.zip"
  34. MARKED_URL = "https://cdn.jsdelivr.net/npm/marked/marked.min.js"
  35. MARKED_LOCAL = ROOT / ".marked.min.js" # cache locally so reruns are offline
  36. def load_json_dir(p: Path) -> dict[str, object]:
  37. if not p.is_dir():
  38. return {}
  39. out: dict[str, object] = {}
  40. for f in sorted(p.glob("*.json")):
  41. with f.open(encoding="utf-8") as fh:
  42. out[f.name] = json.load(fh)
  43. return out
  44. def fetch_marked() -> str:
  45. if MARKED_LOCAL.exists():
  46. return MARKED_LOCAL.read_text(encoding="utf-8")
  47. print(f" 下载 marked.js: {MARKED_URL}")
  48. with urllib.request.urlopen(MARKED_URL, timeout=20) as r:
  49. text = r.read().decode("utf-8")
  50. MARKED_LOCAL.write_text(text, encoding="utf-8")
  51. return text
  52. def build_html(viewer_html: str, bundle: dict, marked_js: str) -> str:
  53. # JSON 安全嵌入: 把 </ 转义掉, 防止 </script> 提早闭合
  54. bundle_json = (
  55. json.dumps(bundle, ensure_ascii=False, separators=(",", ":"))
  56. .replace("</", "<\\/")
  57. )
  58. inject = (
  59. "<script>window.__BUNDLED__ = "
  60. + bundle_json
  61. + ";</script>\n"
  62. + "<script>"
  63. + marked_js
  64. + "</script>\n"
  65. )
  66. # 把 CDN 那行换成内联的 marked + 数据
  67. cdn_tag = '<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>'
  68. if cdn_tag not in viewer_html:
  69. print("WARN: 没找到 CDN marked 脚本标签, 直接在 </head> 前注入", file=sys.stderr)
  70. return viewer_html.replace("</head>", inject + "</head>")
  71. return viewer_html.replace(cdn_tag, inject)
  72. def main() -> int:
  73. ap = argparse.ArgumentParser()
  74. ap.add_argument("--no-zip", action="store_true", help="只输出 dist/, 不打 zip")
  75. ap.add_argument(
  76. "--no-images",
  77. action="store_true",
  78. help="不复制 images/(仅适合所有图片都来自外部 URL 的场景)",
  79. )
  80. args = ap.parse_args()
  81. if not VIEWER.exists():
  82. print(f"找不到 {VIEWER}", file=sys.stderr)
  83. return 1
  84. print("[1/5] 读取 viewer + JSON")
  85. viewer_html = VIEWER.read_text(encoding="utf-8")
  86. result = json.loads(RESULT.read_text(encoding="utf-8")) if RESULT.exists() else []
  87. capability = load_json_dir(EXTRACTED_CAP)
  88. strategy = load_json_dir(EXTRACTED_STRAT)
  89. batch = load_json_dir(BATCH)
  90. print(
  91. f" result.json: {len(result)} 条; "
  92. f"extracted/capability: {len(capability)}; "
  93. f"extracted/strategy: {len(strategy)}; "
  94. f"batch_extracted: {len(batch)}"
  95. )
  96. print("[2/5] 抓取 marked.js")
  97. marked_js = fetch_marked()
  98. print("[3/5] 内联生成 HTML")
  99. field_specs_csv = FIELD_SPECS.read_text(encoding="utf-8") if FIELD_SPECS.exists() else ""
  100. if field_specs_csv:
  101. print(f" capability_strategy_fields.csv: {len(field_specs_csv)} bytes")
  102. # 注: 数据 key 必须等于 viewer SOURCES[*].dir 的相对路径
  103. bundle = {
  104. "result": result,
  105. "extracted/capability": capability,
  106. "extracted/strategy": strategy,
  107. "batch_extracted": batch,
  108. "field_specs_csv": field_specs_csv,
  109. }
  110. new_html = build_html(viewer_html, bundle, marked_js)
  111. if DIST.exists():
  112. shutil.rmtree(DIST)
  113. DIST.mkdir()
  114. out_html = DIST / "extraction-viewer.html"
  115. out_html.write_text(new_html, encoding="utf-8")
  116. size_mb = out_html.stat().st_size / 1024 / 1024
  117. print(f" 写入 {out_html.relative_to(PROJECT_ROOT)} ({size_mb:.2f} MB)")
  118. print("[4/5] 复制 images/")
  119. if args.no_images:
  120. print(" 跳过(--no-images)")
  121. elif IMAGES.is_dir():
  122. shutil.copytree(IMAGES, DIST / "images")
  123. n = len(list((DIST / "images").iterdir()))
  124. size_mb = sum(p.stat().st_size for p in (DIST / "images").iterdir()) / 1024 / 1024
  125. print(f" 复制 {n} 张图片, {size_mb:.1f} MB")
  126. else:
  127. print(" 没有 images/, 跳过")
  128. if args.no_zip:
  129. print("[5/5] 跳过 zip(--no-zip)")
  130. else:
  131. print(f"[5/5] 打包 {DIST_ZIP.name}")
  132. if DIST_ZIP.exists():
  133. DIST_ZIP.unlink()
  134. with zipfile.ZipFile(DIST_ZIP, "w", zipfile.ZIP_DEFLATED, compresslevel=6) as zf:
  135. for p in DIST.rglob("*"):
  136. if p.is_file():
  137. zf.write(p, p.relative_to(DIST.parent))
  138. zip_mb = DIST_ZIP.stat().st_size / 1024 / 1024
  139. print(f" {DIST_ZIP.relative_to(PROJECT_ROOT)} ({zip_mb:.1f} MB)")
  140. print(
  141. "\n完成。同事拿到 dist.zip 后:\n"
  142. " 1. 解压\n"
  143. " 2. 双击 dist/extraction-viewer.html\n"
  144. " 3. 直接看(不需要起 server,不需要联网)"
  145. )
  146. return 0
  147. if __name__ == "__main__":
  148. sys.exit(main())