#!/usr/bin/env python3
"""Bundle extraction-viewer.html, data/search_result, data/extract_result and the
images into a static build that can be shared and viewed offline.

Output:
    data/dist/extraction-viewer.html - single file; all JSON and marked.js are inlined
    data/dist/images/                - images copied as-is (not base64, to keep the HTML small)
    data/dist.zip                    - archive containing the two items above

Usage:
    python3 build_static.py            # build dist/ + dist.zip
    python3 build_static.py --no-zip   # build dist/ only
"""
from __future__ import annotations
import argparse
import json
import shutil
import sys
import urllib.request
import zipfile
from pathlib import Path
# --- Anchors: directory of this script and the project root two levels up ---
ROOT = Path(__file__).resolve().parent
PROJECT_ROOT = ROOT.parents[1]

# --- Inputs under <project>/data ---
DATA_DIR = PROJECT_ROOT.joinpath("data")
SEARCH_RESULT_DIR = DATA_DIR.joinpath("search_result")
EXTRACT_RESULT_DIR = DATA_DIR.joinpath("extract_result")
VIEWER = ROOT.joinpath("extraction-viewer.html")
RESULT = SEARCH_RESULT_DIR.joinpath("result.json")
EXTRACTED_CAP = EXTRACT_RESULT_DIR.joinpath("capability")
EXTRACTED_STRAT = EXTRACT_RESULT_DIR.joinpath("strategy")
BATCH = DATA_DIR.joinpath("batch_extracted")
IMAGES = SEARCH_RESULT_DIR.joinpath("images")
FIELD_SPECS = ROOT.joinpath("capability_strategy_fields.csv")

# --- Outputs ---
DIST = DATA_DIR.joinpath("dist")
DIST_ZIP = DATA_DIR.joinpath("dist.zip")

# --- marked.js: remote source and the on-disk copy kept next to this script ---
MARKED_URL = "https://cdn.jsdelivr.net/npm/marked/marked.min.js"
MARKED_LOCAL = ROOT.joinpath(".marked.min.js")  # cache locally so reruns are offline
def load_json_dir(p: Path) -> dict[str, object]:
    """Decode every ``*.json`` file located directly inside ``p``.

    Args:
        p: Directory to scan. Only its immediate children are considered.

    Returns:
        A dict mapping each file name (not the full path) to its parsed JSON
        content, inserted in sorted path order. An empty dict is returned
        when ``p`` is not a directory.
    """
    if p.is_dir():
        return {
            json_path.name: json.loads(json_path.read_text(encoding="utf-8"))
            for json_path in sorted(p.glob("*.json"))
        }
    return {}
def fetch_marked() -> str:
    """Return the marked.js source, downloading it only when no cached copy exists.

    When ``MARKED_LOCAL`` is absent the script is fetched from ``MARKED_URL``
    (20 second timeout), decoded as UTF-8, written to ``MARKED_LOCAL`` and
    returned. Otherwise the cached file is read and returned as-is.
    """
    if not MARKED_LOCAL.exists():
        print(f" 下载 marked.js: {MARKED_URL}")
        with urllib.request.urlopen(MARKED_URL, timeout=20) as response:
            source = response.read().decode("utf-8")
        # Persist the download so later runs do not need the network.
        MARKED_LOCAL.write_text(source, encoding="utf-8")
        return source
    return MARKED_LOCAL.read_text(encoding="utf-8")
def build_html(viewer_html: str, bundle: dict, marked_js: str) -> str:
    """Return ``viewer_html`` with an injected snippet replacing the CDN marked tag.

    ``bundle`` is serialized to compact JSON (non-ASCII characters kept as-is)
    and post-processed with a string replacement. If ``cdn_tag`` is not found
    in ``viewer_html`` a warning is printed to stderr and the snippet is
    inserted via a second replacement instead.

    NOTE(review): in this view several string literals are empty - the search
    argument of the escaping ``replace``, the text around ``inject``,
    ``cdn_tag`` and the fallback anchor - and ``bundle_json`` / ``marked_js``
    are never referenced after being computed/received. The neighbouring
    comments talk about escaping a sequence and swapping a CDN script tag, so
    the HTML markup inside these literals has presumably been stripped from
    this copy; confirm against the authoritative file before relying on the
    behaviour shown here.
    """
    # Safe JSON embedding: escape a sequence so that the surrounding element is
    # not closed early (presumably "</" and the closing script tag - the exact
    # text is missing from this view, TODO confirm).
    bundle_json = (
        json.dumps(bundle, ensure_ascii=False, separators=(",", ":"))
        .replace("", "<\\/")
    )
    inject = (
        "\n"
        + "\n"
    )
    # Swap the CDN line for the inlined marked + data.
    cdn_tag = ''
    if cdn_tag not in viewer_html:
        # Fallback path: warn, then inject in front of another anchor
        # (anchor text missing from this view - TODO confirm).
        print("WARN: 没找到 CDN marked 脚本标签, 直接在 前注入", file=sys.stderr)
        return viewer_html.replace("", inject + "")
    return viewer_html.replace(cdn_tag, inject)
def main() -> int:
    """Build the offline bundle under ``DIST`` and, unless disabled, ``DIST_ZIP``.

    Steps: read the viewer HTML and the JSON inputs, obtain marked.js, write
    the HTML returned by ``build_html`` into a freshly created ``DIST``, copy
    the images directory, and finally zip ``DIST``.

    Command-line flags:
        --no-zip     write ``DIST`` only and skip the archive.
        --no-images  do not copy the images directory.

    Returns:
        Process exit code: 1 when the viewer HTML is missing, otherwise 0.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--no-zip", action="store_true", help="只输出 dist/, 不打 zip")
    ap.add_argument(
        "--no-images",
        action="store_true",
        help="不复制 images/(仅适合所有图片都来自外部 URL 的场景)",
    )
    args = ap.parse_args()

    if not VIEWER.exists():
        print(f"找不到 {VIEWER}", file=sys.stderr)
        return 1

    print("[1/5] 读取 viewer + JSON")
    viewer_html = VIEWER.read_text(encoding="utf-8")
    # Every data input is optional: a missing file/directory yields an empty value.
    result = json.loads(RESULT.read_text(encoding="utf-8")) if RESULT.exists() else []
    capability = load_json_dir(EXTRACTED_CAP)
    strategy = load_json_dir(EXTRACTED_STRAT)
    batch = load_json_dir(BATCH)
    print(
        f" result.json: {len(result)} 条; "
        f"extracted/capability: {len(capability)}; "
        f"extracted/strategy: {len(strategy)}; "
        f"batch_extracted: {len(batch)}"
    )

    print("[2/5] 抓取 marked.js")
    marked_js = fetch_marked()

    print("[3/5] 内联生成 HTML")
    field_specs_csv = FIELD_SPECS.read_text(encoding="utf-8") if FIELD_SPECS.exists() else ""
    if field_specs_csv:
        print(f" capability_strategy_fields.csv: {len(field_specs_csv)} bytes")
    # NOTE: the data keys must equal the relative paths in the viewer's SOURCES[*].dir.
    bundle = {
        "result": result,
        "extracted/capability": capability,
        "extracted/strategy": strategy,
        "batch_extracted": batch,
        "field_specs_csv": field_specs_csv,
    }
    new_html = build_html(viewer_html, bundle, marked_js)

    # Always start from an empty output directory so stale files never ship.
    if DIST.exists():
        shutil.rmtree(DIST)
    # parents=True: all inputs under data/ are optional, so data/ itself may
    # not exist yet; a plain mkdir() would raise FileNotFoundError in that case.
    DIST.mkdir(parents=True)
    out_html = DIST / "extraction-viewer.html"
    out_html.write_text(new_html, encoding="utf-8")
    size_mb = out_html.stat().st_size / 1024 / 1024
    print(f" 写入 {out_html.relative_to(PROJECT_ROOT)} ({size_mb:.2f} MB)")

    print("[4/5] 复制 images/")
    if args.no_images:
        print(" 跳过(--no-images)")
    elif IMAGES.is_dir():
        images_out = DIST / "images"
        shutil.copytree(IMAGES, images_out)
        # copytree copies recursively, so the report walks the whole tree and
        # counts regular files only; sub-directories are not images and their
        # own stat size is not payload.
        copied = [p for p in images_out.rglob("*") if p.is_file()]
        size_mb = sum(p.stat().st_size for p in copied) / 1024 / 1024
        print(f" 复制 {len(copied)} 张图片, {size_mb:.1f} MB")
    else:
        print(" 没有 images/, 跳过")

    if args.no_zip:
        print("[5/5] 跳过 zip(--no-zip)")
    else:
        print(f"[5/5] 打包 {DIST_ZIP.name}")
        if DIST_ZIP.exists():
            DIST_ZIP.unlink()
        with zipfile.ZipFile(DIST_ZIP, "w", zipfile.ZIP_DEFLATED, compresslevel=6) as zf:
            for p in DIST.rglob("*"):
                if p.is_file():
                    # Archive names are relative to DIST's parent, so every
                    # entry sits under a top-level "dist/" folder.
                    zf.write(p, p.relative_to(DIST.parent))
        zip_mb = DIST_ZIP.stat().st_size / 1024 / 1024
        print(f" {DIST_ZIP.relative_to(PROJECT_ROOT)} ({zip_mb:.1f} MB)")

    print(
        "\n完成。同事拿到 dist.zip 后:\n"
        " 1. 解压\n"
        " 2. 双击 dist/extraction-viewer.html\n"
        " 3. 直接看(不需要起 server,不需要联网)"
    )
    return 0
# Script entry point: run main() and hand its return value to the interpreter
# as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())