#!/usr/bin/env python3
"""Bundle the extraction viewer into a static build that can be shared offline.

Packs extraction-viewer.html + data/search_result + data/extract_result +
images into a self-contained distribution.

Output:
    data/dist/extraction-viewer.html  - single file; all JSON and marked.js inlined
    data/dist/images/                 - images copied as-is (not base64, to keep
                                        the HTML from becoming huge)
    data/dist.zip                     - archive containing both of the above

Usage:
    python3 build_static.py           # build dist/ + dist.zip
    python3 build_static.py --no-zip  # build dist/ only
"""

from __future__ import annotations

import argparse
import json
import re
import shutil
import sys
import urllib.request
import zipfile
from pathlib import Path

ROOT = Path(__file__).resolve().parent
PROJECT_ROOT = ROOT.parents[1]
DATA_DIR = PROJECT_ROOT / "data"
SEARCH_RESULT_DIR = DATA_DIR / "search_result"
EXTRACT_RESULT_DIR = DATA_DIR / "extract_result"

VIEWER = ROOT / "extraction-viewer.html"
RESULT = SEARCH_RESULT_DIR / "result.json"
EXTRACTED_CAP = EXTRACT_RESULT_DIR / "capability"
EXTRACTED_STRAT = EXTRACT_RESULT_DIR / "strategy"
BATCH = DATA_DIR / "batch_extracted"
IMAGES = SEARCH_RESULT_DIR / "images"
FIELD_SPECS = ROOT / "capability_strategy_fields.csv"

DIST = DATA_DIR / "dist"
DIST_ZIP = DATA_DIR / "dist.zip"

MARKED_URL = "https://cdn.jsdelivr.net/npm/marked/marked.min.js"
MARKED_LOCAL = ROOT / ".marked.min.js"  # cache locally so reruns are offline

# Matches an external <script> tag whose src mentions "marked".
# NOTE(review): assumes the viewer loads marked.js through such a tag - confirm
# against extraction-viewer.html.
_CDN_MARKED_TAG = re.compile(
    r"""<script\b[^>]*\bsrc\s*=\s*["'][^"']*marked[^"']*["'][^>]*>\s*</script>""",
    re.IGNORECASE,
)


def load_json_dir(p: Path) -> dict[str, object]:
    """Load every ``*.json`` file directly inside *p*.

    Returns a mapping of file name -> parsed JSON, filled in sorted file-name
    order. A missing (or non-directory) *p* yields an empty dict.
    """
    if not p.is_dir():
        return {}
    out: dict[str, object] = {}
    for f in sorted(p.glob("*.json")):
        with f.open(encoding="utf-8") as fh:
            out[f.name] = json.load(fh)
    return out


def fetch_marked() -> str:
    """Return the marked.js source text.

    Reads the local cache file when it exists; otherwise downloads MARKED_URL
    (20 s timeout), writes the cache file, and returns the downloaded text.
    """
    if MARKED_LOCAL.exists():
        return MARKED_LOCAL.read_text(encoding="utf-8")
    print(f" 下载 marked.js: {MARKED_URL}")
    with urllib.request.urlopen(MARKED_URL, timeout=20) as r:
        text = r.read().decode("utf-8")
    MARKED_LOCAL.write_text(text, encoding="utf-8")
    return text


def build_html(viewer_html: str, bundle: dict, marked_js: str) -> str:
    """Return *viewer_html* with marked.js and the data bundle inlined.

    The bundle is serialized as compact JSON and assigned to
    ``window.__BUNDLED__`` in an inline ``<script>``; *marked_js* goes into a
    second inline ``<script>`` placed before it. Both replace the first
    external marked ``<script src=...>`` tag. When no such tag is found, a
    warning is printed to stderr and the scripts are inserted right before
    the first ``</head>`` instead.
    """
    # Safe JSON embedding: "<" can only occur inside JSON strings, so writing
    # it as \u003c keeps the data identical while making it impossible for the
    # payload to contain "</script>" (or "<!--") and close the element early.
    # U+2028/U+2029 are escaped as well because older JS engines reject them
    # raw inside string literals.
    bundle_json = (
        json.dumps(bundle, ensure_ascii=False, separators=(",", ":"))
        .replace("<", "\\u003c")
        .replace("\u2028", "\\u2028")
        .replace("\u2029", "\\u2029")
    )
    inject = (
        "<script>\n" + marked_js + "\n</script>\n"
        "<script>window.__BUNDLED__ = " + bundle_json + ";\n" + "</script>\n"
    )

    # Swap the CDN line for the inlined marked + data. A function is used as
    # the replacement so backslashes in `inject` are not treated as regex
    # escapes.
    new_html, replaced = _CDN_MARKED_TAG.subn(lambda _m: inject, viewer_html, count=1)
    if replaced:
        return new_html

    print("WARN: 没找到 CDN marked 脚本标签, 直接在 </head> 前注入", file=sys.stderr)
    return viewer_html.replace("</head>", inject + "</head>", 1)


def main() -> int:
    """Build ``dist/`` (and ``dist.zip`` unless ``--no-zip``).

    Returns the process exit code: 0 on success, 1 when the viewer HTML is
    missing.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--no-zip", action="store_true", help="只输出 dist/, 不打 zip")
    ap.add_argument(
        "--no-images",
        action="store_true",
        help="不复制 images/(仅适合所有图片都来自外部 URL 的场景)",
    )
    args = ap.parse_args()

    if not VIEWER.exists():
        print(f"找不到 {VIEWER}", file=sys.stderr)
        return 1

    print("[1/5] 读取 viewer + JSON")
    viewer_html = VIEWER.read_text(encoding="utf-8")
    result = json.loads(RESULT.read_text(encoding="utf-8")) if RESULT.exists() else []
    capability = load_json_dir(EXTRACTED_CAP)
    strategy = load_json_dir(EXTRACTED_STRAT)
    batch = load_json_dir(BATCH)
    print(
        f" result.json: {len(result)} 条; "
        f"extracted/capability: {len(capability)}; "
        f"extracted/strategy: {len(strategy)}; "
        f"batch_extracted: {len(batch)}"
    )

    print("[2/5] 抓取 marked.js")
    marked_js = fetch_marked()

    print("[3/5] 内联生成 HTML")
    field_specs_csv = FIELD_SPECS.read_text(encoding="utf-8") if FIELD_SPECS.exists() else ""
    if field_specs_csv:
        print(f" capability_strategy_fields.csv: {len(field_specs_csv)} bytes")
    # NOTE: bundle keys must equal the relative paths in the viewer's
    # SOURCES[*].dir.
    bundle = {
        "result": result,
        "extracted/capability": capability,
        "extracted/strategy": strategy,
        "batch_extracted": batch,
        "field_specs_csv": field_specs_csv,
    }
    new_html = build_html(viewer_html, bundle, marked_js)

    # Always start from a clean output directory. parents=True so a fresh
    # checkout without data/ still builds.
    if DIST.exists():
        shutil.rmtree(DIST)
    DIST.mkdir(parents=True)
    out_html = DIST / "extraction-viewer.html"
    out_html.write_text(new_html, encoding="utf-8")
    size_mb = out_html.stat().st_size / 1024 / 1024
    print(f" 写入 {out_html.relative_to(PROJECT_ROOT)} ({size_mb:.2f} MB)")

    print("[4/5] 复制 images/")
    if args.no_images:
        print(" 跳过(--no-images)")
    elif IMAGES.is_dir():
        images_out = DIST / "images"
        shutil.copytree(IMAGES, images_out)
        # Count real files only, recursively, so sub-directories are neither
        # counted as images nor left out of the size total.
        copied = [p for p in images_out.rglob("*") if p.is_file()]
        n = len(copied)
        size_mb = sum(p.stat().st_size for p in copied) / 1024 / 1024
        print(f" 复制 {n} 张图片, {size_mb:.1f} MB")
    else:
        print(" 没有 images/, 跳过")

    if args.no_zip:
        print("[5/5] 跳过 zip(--no-zip)")
    else:
        print(f"[5/5] 打包 {DIST_ZIP.name}")
        if DIST_ZIP.exists():
            DIST_ZIP.unlink()
        with zipfile.ZipFile(DIST_ZIP, "w", zipfile.ZIP_DEFLATED, compresslevel=6) as zf:
            # Sorted for a deterministic archive; entries are stored relative
            # to DIST's parent so they unpack under "dist/".
            for p in sorted(DIST.rglob("*")):
                if p.is_file():
                    zf.write(p, p.relative_to(DIST.parent))
        zip_mb = DIST_ZIP.stat().st_size / 1024 / 1024
        print(f" {DIST_ZIP.relative_to(PROJECT_ROOT)} ({zip_mb:.1f} MB)")

    print(
        "\n完成。同事拿到 dist.zip 后:\n"
        " 1. 解压\n"
        " 2. 双击 dist/extraction-viewer.html\n"
        " 3. 直接看(不需要起 server,不需要联网)"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())