#!/usr/bin/env python3
"""Convert a Feishu (Lark) document to Markdown.

Feishu documents are rendered client-side; the document content is embedded
in the page's JavaScript (``window.DATA``). This script extracts the
``block_map`` JSON from the HTML, walks the blocks in ``block_sequence``
order and converts them to Markdown.

Usage:
    python feishu_to_md.py <Feishu document URL> -o output.md

Install the dependency:
    pip install httpx
"""

import argparse
import json
import re
import sys
from pathlib import Path
from urllib.parse import unquote

try:
    import httpx
except ImportError:
    # Do not kill the interpreter at import time: the parsing functions below
    # are usable without httpx. The missing dependency is reported by main()
    # (same message, same exit code) and by fetch_html().
    httpx = None

_MISSING_HTTPX_MSG = "缺少依赖:pip install httpx"

# Matches ``clientVars: Object({...})`` followed by a closing brace.
_CLIENT_VARS_RE = re.compile(
    r'clientVars:\s*Object\((\{.+?\})\)\s*\}', re.DOTALL
)
_BLOCK_MAP_RE = re.compile(r'"block_map"\s*:\s*\{')
_BLOCK_SEQUENCE_RE = re.compile(r'"block_sequence"\s*:\s*\[')

# Block type -> ATX heading prefix for heading1..heading6.
_HEADING_PREFIX = {f"heading{level}": "#" * level for level in range(1, 7)}
# Block types rendered as single list-item lines (no trailing blank line).
_LIST_TYPES = frozenset({"bullet", "ordered", "todo"})


def fetch_html(url: str) -> str:
    """Download *url* and return the response body as text.

    Redirects are followed and a browser-like User-Agent is sent.

    Raises:
        RuntimeError: if httpx is not installed.
        httpx.HTTPStatusError: if the server answers with an error status.
    """
    if httpx is None:
        raise RuntimeError(_MISSING_HTTPX_MSG)
    print(f"正在抓取: {url}")
    with httpx.Client(follow_redirects=True, timeout=30.0) as client:
        resp = client.get(url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })
        resp.raise_for_status()
        return resp.text


def _decode_json_after(html: str, pattern: "re.Pattern"):
    """Decode the JSON value whose first character ends a *pattern* match.

    Every match is tried in order; the first one that decodes wins. Returns
    ``None`` when no match yields valid JSON.
    """
    decoder = json.JSONDecoder()
    for found in pattern.finditer(html):
        try:
            # raw_decode understands JSON strings, so braces/brackets inside
            # document text cannot confuse the end-of-value detection.
            value, _ = decoder.raw_decode(html, found.end() - 1)
        except json.JSONDecodeError:
            continue
        return value
    return None


def _recover_block_data(html: str) -> dict:
    """Fallback extraction used when the clientVars payload does not parse.

    Locates ``"block_map": {...}`` directly in the page and decodes it; also
    picks up ``"block_sequence": [...]`` when one can be decoded.

    Raises:
        ValueError: if no block_map can be located or decoded.
    """
    if not _BLOCK_MAP_RE.search(html):
        raise ValueError("无法解析 block_map")
    block_map = _decode_json_after(html, _BLOCK_MAP_RE)
    if block_map is None:
        raise ValueError("无法解析 block_map JSON")

    recovered = {"block_map": block_map}
    block_sequence = _decode_json_after(html, _BLOCK_SEQUENCE_RE)
    if isinstance(block_sequence, list):
        recovered["block_sequence"] = block_sequence
    return recovered


def extract_block_data(html: str) -> dict:
    """Extract the document data (block_map, block_sequence) from page HTML.

    The ``clientVars: Object({...})`` payload is parsed as JSON. If the regex
    capture is not valid JSON (the non-greedy match can stop early), the
    block_map is recovered directly from the page instead.

    Returns:
        The payload's ``"data"`` member when present, otherwise the payload
        itself.

    Raises:
        ValueError: if clientVars is absent or no block_map can be decoded.
    """
    match = _CLIENT_VARS_RE.search(html)
    if not match:
        raise ValueError("无法从页面中提取 clientVars 数据")

    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError:
        data = {"data": _recover_block_data(html)}
    return data.get("data", data)


def extract_text_from_block(block_data: dict) -> str:
    """Return the plain text stored in a block's ``text`` field.

    The text lives in ``text.initialAttributedTexts.text`` as a mapping of
    numeric string keys to segments (``{"0": "...", "1": "..."}``). Segments
    are concatenated in numeric key order; gaps in the numbering do not drop
    later segments. Returns ``""`` when the block carries no text.
    """
    text_info = block_data.get("text") or {}
    if not isinstance(text_info, dict):
        return ""
    segments = (text_info.get("initialAttributedTexts") or {}).get("text") or {}
    if not isinstance(segments, dict):
        return ""
    ordered_keys = sorted(
        (key for key in segments if isinstance(key, str) and key.isdecimal()),
        key=int,
    )
    return "".join(segments[key] for key in ordered_keys)


def extract_link_from_block(block_data: dict) -> str:
    """Return the first link found in a block's attribute pool, URL-decoded.

    Looks through ``text.apool.numToAttrib`` for a two-element entry whose
    first element is ``"link"``. Returns ``""`` when there is none.
    """
    text_info = block_data.get("text") or {}
    if not isinstance(text_info, dict):
        return ""
    attribs = (text_info.get("apool") or {}).get("numToAttrib") or {}
    if not isinstance(attribs, dict):
        return ""
    for attr in attribs.values():
        if isinstance(attr, list) and len(attr) == 2 and attr[0] == "link":
            return unquote(attr[1])
    return ""


def _fence(code_lang: str, code_lines: list) -> list:
    """Return the Markdown lines of a fenced code block plus a blank line."""
    return [f"```{code_lang}", *code_lines, "```", ""]


def _render_block(block_type: str, text: str, link: str, data: dict) -> list:
    """Return the Markdown lines for one non-page, non-code block."""
    hashes = _HEADING_PREFIX.get(block_type)
    if hashes:
        return [f"{hashes} {text}", ""]

    if block_type == "text":
        if not text:
            return [""]
        if link:
            return [f"{text}", f" 链接: {link}", ""]
        return [text, ""]

    if block_type == "bullet":
        return [f"- [{text}]({link})" if link else f"- {text}"]
    if block_type == "ordered":
        return [f"1. [{text}]({link})" if link else f"1. {text}"]
    if block_type == "todo":
        mark = "x" if data.get("checked", False) else " "
        return [f"- [{mark}] {text}"]

    if block_type == "callout":
        # A callout is a container; its content lives in child blocks.
        return [f"> {text}", ""] if text else []
    if block_type == "quote":
        return [f"> {text}", ""]
    if block_type == "divider":
        return ["---", ""]
    if block_type == "image":
        token = data.get("token", "")
        return [f"![image]({token})", ""] if token else []
    if block_type in ("table", "table_cell"):
        # Tables are rendered separately by extract_tables().
        return []

    # Unknown block type: emit its text if it has any.
    return [text, ""] if text else []


def blocks_to_markdown(block_map: dict, block_sequence: list, page_id: str) -> str:
    """Convert the blocks named in *block_sequence* to Markdown, in order.

    Args:
        block_map: mapping of block id to block dict.
        block_sequence: block ids in document order; ids missing from
            *block_map* are skipped.
        page_id: id of the page root block. Currently unused; kept for
            interface compatibility.

    Returns:
        The Markdown text. Consecutive ``code`` blocks are merged into one
        fenced block that uses the language of the first. A blank line is
        inserted between a run of list items and following non-list content
        so the content is not absorbed into the last list item.
    """
    lines = []
    code_lang = ""
    code_lines = []
    in_code_block = False
    in_list = False

    for block_id in block_sequence:
        block = block_map.get(block_id)
        if not block:
            continue

        data = block.get("data", {})
        block_type = data.get("type", "")
        text = extract_text_from_block(data)
        link = extract_link_from_block(data)

        # The page root only contributes the document title.
        if block_type == "page":
            if text:
                lines.extend([f"# {text}", ""])
            continue

        # Code blocks accumulate until a different block type shows up.
        if block_type == "code":
            if not in_code_block:
                in_code_block = True
                code_lang = data.get("language") or ""
                code_lines = []
            code_lines.append(text)
            continue

        chunk = []
        if in_code_block:
            chunk.extend(_fence(code_lang, code_lines))
            in_code_block = False
            code_lines = []
        chunk.extend(_render_block(block_type, text, link, data))
        if not chunk:
            continue

        is_list = block_type in _LIST_TYPES
        # Leaving a list: separate it from what follows unless the chunk
        # already begins with a blank line.
        if in_list and not is_list and chunk[0] != "":
            lines.append("")
        lines.extend(chunk)
        in_list = is_list

    # Flush a code block that runs to the end of the document.
    if in_code_block:
        lines.extend(_fence(code_lang, code_lines))

    return "\n".join(lines)


def _table_cell_text(block_map: dict, cell_block_id: str) -> str:
    """Return the text of a table cell, safe to embed in a Markdown row.

    The cell's content is the text of its child blocks joined by spaces.
    Newlines are flattened to spaces and ``|`` is escaped, because either
    would otherwise break the table row.
    """
    if not cell_block_id or cell_block_id not in block_map:
        return ""
    children = block_map[cell_block_id].get("data", {}).get("children", [])
    pieces = []
    for child_id in children:
        if child_id not in block_map:
            continue
        child_text = extract_text_from_block(block_map[child_id].get("data", {}))
        if child_text:
            pieces.append(child_text)
    joined = " ".join(pieces)
    return " ".join(joined.splitlines()).replace("|", "\\|")


def extract_tables(block_map: dict) -> list[str]:
    """Render every ``table`` block in *block_map* as a Markdown table.

    The first row of each table is used as the header row. Tables without
    columns or rows are skipped.

    Returns:
        One Markdown string per table, each ending with a newline.
    """
    tables = []

    for block in block_map.values():
        data = block.get("data", {})
        if data.get("type") != "table":
            continue

        columns = data.get("columns_id", [])
        rows = data.get("rows_id", [])
        cell_set = data.get("cell_set", {})
        if not columns or not rows:
            continue

        # Cells are keyed by the row id immediately followed by the column id.
        table_rows = [
            [
                _table_cell_text(
                    block_map,
                    cell_set.get(f"{row_id}{col_id}", {}).get("block_id", ""),
                )
                for col_id in columns
            ]
            for row_id in rows
        ]

        md_lines = ["| " + " | ".join(table_rows[0]) + " |"]
        md_lines.append("| " + " | ".join(["---"] * len(columns)) + " |")
        md_lines.extend("| " + " | ".join(row) + " |" for row in table_rows[1:])
        md_lines.append("")
        tables.append("\n".join(md_lines))

    return tables


def convert(html: str) -> str:
    """Convert the HTML of a Feishu document page to Markdown.

    Raises:
        ValueError: if the document data cannot be extracted or the
            block_map is empty.
    """
    data = extract_block_data(html)
    block_map = data.get("block_map", {})
    block_sequence = data.get("block_sequence", [])

    if not block_map:
        raise ValueError("block_map 为空,无法提取文档内容")

    # Locate the page root block.
    page_id = next(
        (
            bid
            for bid, block in block_map.items()
            if block.get("data", {}).get("type") == "page"
        ),
        "",
    )

    markdown = blocks_to_markdown(block_map, block_sequence, page_id)

    # Tables are skipped in the main body, so they are appended at the end.
    # Inserting them in place would be nicer, but the nested table structure
    # makes that harder; a simple appendix is used for now.
    tables = extract_tables(block_map)
    if tables:
        parts = [markdown, "\n\n## 附录:表格数据\n\n"]
        parts.extend(
            f"### 表格 {i}\n\n{table}\n" for i, table in enumerate(tables, 1)
        )
        markdown = "".join(parts)

    # Collapse runs of blank lines.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)
    return markdown.strip()


def main() -> None:
    """Command-line entry point: fetch a document and write it as Markdown."""
    if httpx is None:
        print(_MISSING_HTTPX_MSG)
        sys.exit(1)

    parser = argparse.ArgumentParser(description="飞书文档转 Markdown")
    parser.add_argument("url", help="飞书文档 URL")
    parser.add_argument("-o", "--output", default="output.md", help="输出文件路径")
    args = parser.parse_args()

    try:
        html = fetch_html(args.url)
        print("正在解析文档结构...")
        markdown = convert(html)

        output_path = Path(args.output)
        output_path.write_text(markdown, encoding="utf-8")
        print(f"成功保存到: {output_path.absolute()}")
        print(f"文件大小: {len(markdown)} 字符")
    except httpx.HTTPStatusError as e:
        print(f"HTTP 错误: {e.response.status_code}")
        print("可能是文档需要登录或权限不足")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report any failure and exit non-zero.
        print(f"错误: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()