build_html.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. #!/usr/bin/env python3
  2. """
  3. 批量渲染并上传 trace 日志 HTML,然后回写 MySQL。
  4. 约定:
  5. - output 目录下每个子目录名为 trace_id
  6. - 每个 trace_id 目录下有 log 文件(优先 log.txt,其次 run_log_*.txt,再其次任意 *.txt)
  7. 流程(对每个 trace_id):
  8. 1) log.txt -> render_log_html.generate_html(...) 生成 HTML 到同目录
  9. 2) 上传 HTML 到阿里云 OSS,拿到公网 URL
  10. 3) UPDATE demand_find_content_result.web_html_url = <url> WHERE trace_id = <trace_id>
  11. 安全默认:
  12. - 默认只 dry-run 打印候选,不生成/上传/写库
  13. - 加 --apply 才会执行生成 + 上传 + 写库
  14. """
  15. from __future__ import annotations
  16. import argparse
  17. import logging
  18. import os
  19. from dataclasses import dataclass
  20. from pathlib import Path
  21. from typing import Iterable, Optional
  22. from dotenv import load_dotenv
  23. logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class TraceJob:
    """One unit of work: a trace directory, its chosen log file, and the
    HTML path the log will be rendered to.

    Attributes:
        trace_id: Name of the subdirectory under output/; also the key used
            for the MySQL write-back (see _update_web_html_url).
        trace_dir: The trace_id subdirectory itself.
        log_path: Input log file selected by _pick_log_file.
        html_path: Destination of the rendered HTML (trace_dir / "log.html").
    """
    trace_id: str
    trace_dir: Path
    log_path: Path
    html_path: Path
  30. def _load_env() -> None:
  31. # 兼容从任意目录运行:优先加载 examples/content_finder/.env
  32. load_dotenv(override=False)
  33. load_dotenv(dotenv_path=Path(__file__).resolve().parent / ".env", override=False)
  34. def _resolve_output_dir(output_dir: Optional[str]) -> Path:
  35. """
  36. Resolve output directory.
  37. - If --output-dir provided:
  38. - absolute path: use it
  39. - relative path: resolve against current working directory
  40. - Else:
  41. - ENV OUTPUT_DIR (absolute/relative-to-cwd)
  42. - fallback to examples/content_finder/output (script sibling)
  43. """
  44. if output_dir is not None and str(output_dir).strip() != "":
  45. p = Path(output_dir).expanduser()
  46. return p.resolve() if p.is_absolute() else (Path.cwd() / p).resolve()
  47. raw_env = (os.getenv("OUTPUT_DIR") or "").strip()
  48. if raw_env:
  49. p = Path(raw_env).expanduser()
  50. return p.resolve() if p.is_absolute() else (Path.cwd() / p).resolve()
  51. base = Path(__file__).resolve().parent
  52. return (base / "output").resolve()
  53. def _iter_trace_dirs(output_dir: Path) -> Iterable[Path]:
  54. if not output_dir.exists() or not output_dir.is_dir():
  55. return []
  56. return (p for p in output_dir.iterdir() if p.is_dir())
  57. def _pick_log_file(trace_dir: Path) -> Optional[Path]:
  58. preferred = trace_dir / "log.txt"
  59. if preferred.exists() and preferred.is_file():
  60. return preferred
  61. candidates = sorted(
  62. trace_dir.glob("run_log_*.txt"),
  63. key=lambda p: p.stat().st_mtime,
  64. reverse=True,
  65. )
  66. if candidates:
  67. return candidates[0]
  68. candidates = sorted(
  69. trace_dir.glob("*.txt"),
  70. key=lambda p: p.stat().st_mtime,
  71. reverse=True,
  72. )
  73. if candidates:
  74. return candidates[0]
  75. return None
  76. def _build_job(trace_dir: Path) -> Optional[TraceJob]:
  77. trace_id = trace_dir.name
  78. log_path = _pick_log_file(trace_dir)
  79. if not log_path:
  80. return None
  81. html_path = trace_dir / "log.html"
  82. return TraceJob(
  83. trace_id=trace_id,
  84. trace_dir=trace_dir,
  85. log_path=log_path,
  86. html_path=html_path,
  87. )
  88. def _render_html(job: TraceJob) -> None:
  89. from render_log_html import (
  90. COLLAPSE_ALL_FOLDS,
  91. COLLAPSE_KEYWORDS,
  92. COLLAPSE_PREFIXES,
  93. generate_html,
  94. )
  95. generate_html(
  96. input_path=job.log_path,
  97. output_path=job.html_path,
  98. collapse_prefixes=COLLAPSE_PREFIXES,
  99. collapse_keywords=COLLAPSE_KEYWORDS,
  100. collapse_all=COLLAPSE_ALL_FOLDS,
  101. )
  102. def _upload_html(job: TraceJob) -> str:
  103. from utils.oss_upload import upload_html_to_oss
  104. # object_key 由 upload_html_to_oss 内部用 prefix + task_id 拼接
  105. return upload_html_to_oss(job.html_path, task_id=job.trace_id)
  106. def _update_web_html_url(trace_id: str, url: str) -> int:
  107. from db import update_web_html_url
  108. return update_web_html_url(trace_id=trace_id, web_html_url=url)
  109. def main() -> None:
  110. _load_env()
  111. parser = argparse.ArgumentParser()
  112. parser.add_argument(
  113. "--output-dir",
  114. default=None,
  115. help="Output directory containing trace_id subdirectories. Default: examples/content_finder/output",
  116. )
  117. parser.add_argument(
  118. "--apply",
  119. action="store_true",
  120. help="Actually generate HTML, upload to OSS, and update MySQL. Without this flag, dry-run only.",
  121. )
  122. parser.add_argument(
  123. "--limit",
  124. type=int,
  125. default=0,
  126. help="Process at most N trace dirs (0 means no limit).",
  127. )
  128. args = parser.parse_args()
  129. logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
  130. output_dir = _resolve_output_dir(args.output_dir)
  131. trace_dirs = list(_iter_trace_dirs(output_dir))
  132. jobs = [j for d in trace_dirs if (j := _build_job(d)) is not None]
  133. jobs = sorted(jobs, key=lambda x: x.trace_dir.stat().st_mtime, reverse=True)
  134. if args.limit and args.limit > 0:
  135. jobs = jobs[: args.limit]
  136. print(f"[output_dir] {output_dir}")
  137. print(f"[trace_dirs] {len(trace_dirs)}")
  138. print(f"[jobs] {len(jobs)}")
  139. if not jobs:
  140. return
  141. if not args.apply:
  142. print("[dry-run] Add --apply to generate+upload+update.")
  143. for j in jobs:
  144. print(f"- trace_id={j.trace_id} log={j.log_path.name} -> html={j.html_path.name}")
  145. return
  146. ok = 0
  147. failed = 0
  148. for job in jobs:
  149. try:
  150. _render_html(job)
  151. url = _upload_html(job)
  152. rows = _update_web_html_url(trace_id=job.trace_id, url=url)
  153. print(f"[ok] trace_id={job.trace_id} url={url} rows={rows}")
  154. ok += 1
  155. except Exception as e:
  156. print(f"[failed] trace_id={job.trace_id} err={e}")
  157. logger.exception("job failed: %s", job.trace_id)
  158. failed += 1
  159. print(f"[done] ok={ok} failed={failed}")
if __name__ == "__main__":
    # Script entry point: only run when executed directly, not on import.
    main()