| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- #!/usr/bin/env python3
- """把 extraction-viewer.html + data/search_result + data/extract_result + images
- 打包成可离线分享的静态版本。
- 输出:
- data/dist/extraction-viewer.html - 单文件,所有 JSON 内联,marked.js 内联
- data/dist/images/ - 图片直接拷贝(不 base64,避免 HTML 巨大)
- data/dist.zip - 上面两项一起的压缩包
- 用法:
- python3 build_static.py # 生成 dist/ + dist.zip
- python3 build_static.py --no-zip # 只生成 dist/
- """
- from __future__ import annotations
- import argparse
- import json
- import shutil
- import sys
- import urllib.request
- import zipfile
- from pathlib import Path
# Directory that contains this script.
ROOT = Path(__file__).resolve().parent
# Two levels above the script directory; everything under data/ hangs off it.
PROJECT_ROOT = ROOT.parents[1]
DATA_DIR = PROJECT_ROOT.joinpath("data")

# Inputs read by the build.
SEARCH_RESULT_DIR = DATA_DIR.joinpath("search_result")
EXTRACT_RESULT_DIR = DATA_DIR.joinpath("extract_result")
VIEWER = ROOT.joinpath("extraction-viewer.html")
RESULT = SEARCH_RESULT_DIR.joinpath("result.json")
EXTRACTED_CAP = EXTRACT_RESULT_DIR.joinpath("capability")
EXTRACTED_STRAT = EXTRACT_RESULT_DIR.joinpath("strategy")
BATCH = DATA_DIR.joinpath("batch_extracted")
IMAGES = SEARCH_RESULT_DIR.joinpath("images")
FIELD_SPECS = ROOT.joinpath("capability_strategy_fields.csv")

# Outputs written by the build.
DIST = DATA_DIR.joinpath("dist")
DIST_ZIP = DATA_DIR.joinpath("dist.zip")

# marked.js is downloaded from the CDN and then kept next to the script.
MARKED_URL = "https://cdn.jsdelivr.net/npm/marked/marked.min.js"
MARKED_LOCAL = ROOT.joinpath(".marked.min.js")  # cache locally so reruns are offline
def load_json_dir(p: Path) -> dict[str, object]:
    """Parse every ``*.json`` file located directly inside directory *p*.

    Returns a dict that maps each file name (extension included) to its
    decoded JSON content, with entries inserted in sorted path order.
    When *p* is not an existing directory the result is an empty dict.
    Files are read as UTF-8.
    """
    if not p.is_dir():
        return {}
    return {
        json_path.name: json.loads(json_path.read_text(encoding="utf-8"))
        for json_path in sorted(p.glob("*.json"))
    }
def fetch_marked() -> str:
    """Return the marked.js source text, using the local cache when present.

    If ``MARKED_LOCAL`` exists its UTF-8 content is returned as-is.
    Otherwise the script is downloaded from ``MARKED_URL`` (20 second
    timeout), written to ``MARKED_LOCAL`` and then returned.
    """
    if not MARKED_LOCAL.exists():
        print(f" 下载 marked.js: {MARKED_URL}")
        with urllib.request.urlopen(MARKED_URL, timeout=20) as response:
            source = response.read().decode("utf-8")
        # Persist the download so later runs do not need the network.
        MARKED_LOCAL.write_text(source, encoding="utf-8")
        return source
    return MARKED_LOCAL.read_text(encoding="utf-8")
def build_html(viewer_html: str, bundle: dict, marked_js: str) -> str:
    """Return *viewer_html* with the data bundle and marked.js inlined.

    Args:
        viewer_html: Source text of the viewer page.
        bundle: JSON-serialisable data exposed to the page as
            ``window.__BUNDLED__``.
        marked_js: Source text of marked.js, embedded in its own
            ``<script>`` element.

    Returns:
        The page with every CDN ``<script>`` tag for marked.js replaced by
        the two inline scripts. If that tag is absent a warning is printed
        to stderr and the scripts are inserted before the first ``</head>``.
    """
    # "<" can only occur inside JSON string literals, so rewriting every one
    # as the \u003c escape keeps the payload valid JSON and JavaScript while
    # guaranteeing the HTML tokenizer never meets "</script", "<!--" or
    # "<script" inside the data. Escaping "</" alone is not enough: "<!--"
    # followed by "<script" stops the real closing tag from ending the
    # element.
    bundle_json = json.dumps(
        bundle, ensure_ascii=False, separators=(",", ":")
    ).replace("<", "\\u003c")
    inject = (
        "<script>window.__BUNDLED__ = "
        + bundle_json
        + ";</script>\n"
        + "<script>"
        + marked_js
        + "</script>\n"
    )
    # Swap the CDN line for the inline marked.js plus the data.
    cdn_tag = '<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>'
    if cdn_tag in viewer_html:
        return viewer_html.replace(cdn_tag, inject)
    print("WARN: 没找到 CDN marked 脚本标签, 直接在 </head> 前注入", file=sys.stderr)
    # Only the first "</head>" is the real end of the head; a later literal
    # occurrence must not receive a second copy of the bundle.
    return viewer_html.replace("</head>", inject + "</head>", 1)
def main() -> int:
    """Build the offline viewer under ``DIST`` and optionally zip it.

    Steps: read the viewer page and JSON inputs, obtain marked.js, write a
    single HTML file with everything inlined, copy the images directory
    (unless ``--no-images``) and pack ``DIST`` into ``DIST_ZIP`` (unless
    ``--no-zip``). Any existing ``DIST`` directory is deleted first.

    Returns:
        0 on success, 1 when the viewer page does not exist.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--no-zip", action="store_true", help="只输出 dist/, 不打 zip")
    ap.add_argument(
        "--no-images",
        action="store_true",
        help="不复制 images/(仅适合所有图片都来自外部 URL 的场景)",
    )
    args = ap.parse_args()

    if not VIEWER.exists():
        print(f"找不到 {VIEWER}", file=sys.stderr)
        return 1

    print("[1/5] 读取 viewer + JSON")
    viewer_html = VIEWER.read_text(encoding="utf-8")
    # Every data input is optional: a missing file or directory becomes an
    # empty value instead of an error.
    result = json.loads(RESULT.read_text(encoding="utf-8")) if RESULT.exists() else []
    capability = load_json_dir(EXTRACTED_CAP)
    strategy = load_json_dir(EXTRACTED_STRAT)
    batch = load_json_dir(BATCH)
    print(
        f" result.json: {len(result)} 条; "
        f"extracted/capability: {len(capability)}; "
        f"extracted/strategy: {len(strategy)}; "
        f"batch_extracted: {len(batch)}"
    )

    print("[2/5] 抓取 marked.js")
    marked_js = fetch_marked()

    print("[3/5] 内联生成 HTML")
    field_specs_csv = FIELD_SPECS.read_text(encoding="utf-8") if FIELD_SPECS.exists() else ""
    if field_specs_csv:
        # Report the encoded size; len() of the str would count characters.
        csv_bytes = len(field_specs_csv.encode("utf-8"))
        print(f" capability_strategy_fields.csv: {csv_bytes} bytes")
    # NOTE: the bundle keys must equal the relative paths used by the
    # viewer's SOURCES[*].dir entries.
    bundle = {
        "result": result,
        "extracted/capability": capability,
        "extracted/strategy": strategy,
        "batch_extracted": batch,
        "field_specs_csv": field_specs_csv,
    }
    new_html = build_html(viewer_html, bundle, marked_js)

    # Start from a clean output directory. parents=True lets the build
    # succeed even when the data directory itself does not exist yet.
    if DIST.exists():
        shutil.rmtree(DIST)
    DIST.mkdir(parents=True)
    out_html = DIST / "extraction-viewer.html"
    out_html.write_text(new_html, encoding="utf-8")
    size_mb = out_html.stat().st_size / 1024 / 1024
    print(f" 写入 {out_html.relative_to(PROJECT_ROOT)} ({size_mb:.2f} MB)")

    print("[4/5] 复制 images/")
    if args.no_images:
        print(" 跳过(--no-images)")
    elif IMAGES.is_dir():
        images_out = DIST / "images"
        shutil.copytree(IMAGES, images_out)
        # Walk recursively and keep regular files only, so sub-directories
        # are not counted as images and nested files are included.
        copied = [p for p in images_out.rglob("*") if p.is_file()]
        size_mb = sum(p.stat().st_size for p in copied) / 1024 / 1024
        print(f" 复制 {len(copied)} 张图片, {size_mb:.1f} MB")
    else:
        print(" 没有 images/, 跳过")

    if args.no_zip:
        print("[5/5] 跳过 zip(--no-zip)")
    else:
        print(f"[5/5] 打包 {DIST_ZIP.name}")
        if DIST_ZIP.exists():
            DIST_ZIP.unlink()
        with zipfile.ZipFile(DIST_ZIP, "w", zipfile.ZIP_DEFLATED, compresslevel=6) as zf:
            for p in DIST.rglob("*"):
                if p.is_file():
                    # Archive names are relative to DIST's parent, so every
                    # entry sits under a top-level "dist/" folder.
                    zf.write(p, p.relative_to(DIST.parent))
        zip_mb = DIST_ZIP.stat().st_size / 1024 / 1024
        print(f" {DIST_ZIP.relative_to(PROJECT_ROOT)} ({zip_mb:.1f} MB)")

    print(
        "\n完成。同事拿到 dist.zip 后:\n"
        " 1. 解压\n"
        " 2. 双击 dist/extraction-viewer.html\n"
        " 3. 直接看(不需要起 server,不需要联网)"
    )
    return 0
# Run the build when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|