#!/usr/bin/env python3
"""
飞书文档转 Markdown 工具
飞书文档是客户端渲染的,文档内容嵌在 JS 变量 window.DATA 中。
本脚本从 HTML 中提取 block_map JSON,解析文档结构树,转换为 Markdown。
使用方法:
    python feishu_to_md.py <飞书文档URL> -o output.md
依赖安装:
    pip install httpx
"""
# Standard-library imports.
import argparse
import json
import re
import sys
from pathlib import Path

# httpx is the only third-party dependency; fail fast with an install hint.
try:
    import httpx
except ImportError:
    print("缺少依赖:pip install httpx")
    sys.exit(1)
def fetch_html(url: str) -> str:
    """Download the document page and return its HTML text.

    Follows redirects and sends a desktop User-Agent header; raises
    ``httpx.HTTPStatusError`` on a non-2xx response.
    """
    print(f"正在抓取: {url}")
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    with httpx.Client(follow_redirects=True, timeout=30.0) as client:
        response = client.get(url, headers=request_headers)
        response.raise_for_status()
        return response.text
def extract_block_data(html: str) -> dict:
    """Extract the block_map / block_sequence payload embedded in the page HTML.

    Feishu pages are client-rendered: the document tree lives in a JS
    variable (``window.DATA`` -> ``clientVars``).  First try to parse the
    whole ``clientVars: Object({...})`` literal as JSON; when that fails
    (the literal is often truncated or not strict JSON), fall back to a
    brace-matching scan that cuts out just the ``"block_map": {...}`` span.

    Raises:
        ValueError: when neither strategy yields parseable data.
    """
    match = re.search(r'clientVars:\s*Object\((\{.+?\})\)\s*\}', html, re.DOTALL)
    if not match:
        raise ValueError("无法从页面中提取 clientVars 数据")
    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError:
        # Full literal is not valid JSON; recover just the block_map span.
        data = {"data": _scan_block_map(html)}
    # The payload of interest usually sits under the "data" key.
    return data.get("data", data)


def _scan_block_map(html: str) -> dict:
    """Fallback: locate ``"block_map": {...}`` by brace matching and parse it.

    Raises:
        ValueError: if no block_map key is found or no balanced span parses.
    """
    anchor = re.search(r'"block_map"\s*:\s*\{', html)
    if not anchor:
        raise ValueError("无法解析 block_map")
    start = anchor.start()
    depth = 0
    for i in range(start, len(html)):
        ch = html[i]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                # Wrap the span into a standalone JSON object and try it.
                candidate = html[start:i + 1].replace('"block_map":', '{"block_map":') + '}'
                try:
                    return json.loads(candidate)
                except json.JSONDecodeError:
                    # Balanced but not valid JSON — keep scanning for a
                    # later closing brace (original behavior preserved;
                    # was a bare `except:` that hid unrelated errors).
                    continue
    raise ValueError("无法解析 block_map JSON")
def extract_text_from_block(block_data: dict) -> str:
    """Return the plain text of a block, joining its text runs in index order."""
    runs = (
        block_data.get("text", {})
        .get("initialAttributedTexts", {})
        .get("text", {})
    )
    if not runs:
        return ""
    # Keys are stringified indices "0", "1", ...; a missing key contributes "".
    pieces = [runs.get(str(idx), "") for idx in range(len(runs))]
    return "".join(pieces)
def extract_link_from_block(block_data: dict) -> str:
    """Return the first link URL found in the block's attribute pool.

    Feishu stores rich-text attributes in ``text.apool.numToAttrib`` as
    ``{"0": ["link", "<url-encoded href>"], ...}``; link targets are
    URL-encoded, so decode before returning.  Returns "" when no link exists.
    """
    # Hoisted out of the loop: the original re-imported on every attribute.
    from urllib.parse import unquote

    apool = block_data.get("text", {}).get("apool", {}).get("numToAttrib", {})
    for attr in apool.values():
        if isinstance(attr, list) and len(attr) == 2 and attr[0] == "link":
            return unquote(attr[1])
    return ""
def blocks_to_markdown(block_map: dict, block_sequence: list, page_id: str) -> str:
    """Render blocks to Markdown following document order.

    Args:
        block_map: block_id -> block record (payload under ``block["data"]``).
        block_sequence: block ids in document order.
        page_id: id of the page root block (currently unused — the root is
            recognised by its ``type == "page"`` instead; kept for
            interface compatibility).

    Returns:
        The Markdown body as a single string.

    Consecutive ``code`` blocks are buffered and emitted as one fenced
    block; ``table`` blocks are skipped here (see ``extract_tables``).
    """
    lines: list[str] = []
    in_code_block = False
    code_lang = ""
    code_lines: list[str] = []

    heading_prefix = {
        "heading1": "#", "heading2": "##", "heading3": "###",
        "heading4": "####", "heading5": "#####", "heading6": "######",
    }

    def flush_code() -> None:
        # Emit buffered code lines as one fenced block and reset the buffer.
        nonlocal in_code_block, code_lines
        lines.append(f"```{code_lang}")
        lines.extend(code_lines)
        lines.append("```")
        lines.append("")
        in_code_block = False
        code_lines = []

    for block_id in block_sequence:
        block = block_map.get(block_id)
        if not block:
            continue
        data = block.get("data", {})
        block_type = data.get("type", "")
        text = extract_text_from_block(data)
        link = extract_link_from_block(data)

        # Page root: emit only the document title.
        # (Checked before the code-flush below, preserving original order.)
        if block_type == "page":
            if text:
                lines.append(f"# {text}")
                lines.append("")
            continue

        # Any non-code block terminates an open fenced code block.
        if in_code_block and block_type != "code":
            flush_code()

        if block_type in heading_prefix:
            lines.append(f"{heading_prefix[block_type]} {text}")
            lines.append("")
        elif block_type == "text":
            if text:
                if link:
                    lines.append(f"{text}")
                    lines.append(f" 链接: {link}")
                else:
                    lines.append(text)
                lines.append("")
            else:
                lines.append("")
        elif block_type == "bullet":
            lines.append(f"- [{text}]({link})" if link else f"- {text}")
        elif block_type == "ordered":
            # Markdown renderers renumber automatically, so "1." suffices.
            lines.append(f"1. [{text}]({link})" if link else f"1. {text}")
        elif block_type == "todo":
            mark = "x" if data.get("checked", False) else " "
            lines.append(f"- [{mark}] {text}")
        elif block_type == "code":
            if not in_code_block:
                in_code_block = True
                code_lang = data.get("language", "")
                code_lines = []
            code_lines.append(text)
        elif block_type == "callout":
            # Callout is a container; only its own text is rendered here.
            if text:
                lines.append(f"> {text}")
                lines.append("")
        elif block_type == "quote":
            lines.append(f"> {text}")
            lines.append("")
        elif block_type == "divider":
            lines.append("---")
            lines.append("")
        elif block_type == "image":
            token = data.get("token", "")
            if token:
                # BUGFIX: the original appended an empty f-string here,
                # silently dropping the image; emit a placeholder that
                # keeps the asset token so it can be resolved later.
                lines.append(f"![image]({token})")
                lines.append("")
        elif block_type in ("table", "table_cell"):
            # Tables are rendered separately by extract_tables().
            pass
        else:
            # Unknown block type: keep its text so content is not lost.
            if text:
                lines.append(text)
                lines.append("")

    # Document may end while still inside a code block.
    if in_code_block:
        flush_code()

    return "\n".join(lines)
def extract_tables(block_map: dict) -> list[str]:
    """Render every table block in *block_map* as a Markdown table string.

    The first row becomes the header; cell contents are gathered from the
    child blocks referenced through the table's ``cell_set``.
    """
    def cell_text(cell_info: dict) -> str:
        # Join the text of all child blocks of one cell, space separated.
        cell_block_id = cell_info.get("block_id", "")
        if not cell_block_id or cell_block_id not in block_map:
            return ""
        cell_data = block_map[cell_block_id].get("data", {})
        parts = []
        for child_id in cell_data.get("children", []):
            if child_id in block_map:
                child_text = extract_text_from_block(block_map[child_id].get("data", {}))
                if child_text:
                    parts.append(child_text)
        return " ".join(parts) if parts else ""

    tables = []
    for block in block_map.values():
        data = block.get("data", {})
        if data.get("type") != "table":
            continue
        col_ids = data.get("columns_id", [])
        row_ids = data.get("rows_id", [])
        cell_set = data.get("cell_set", {})
        if not col_ids or not row_ids:
            continue
        # Cell keys are the row id immediately followed by the column id.
        grid = [
            [cell_text(cell_set.get(f"{row_id}{col_id}", {})) for col_id in col_ids]
            for row_id in row_ids
        ]
        if not grid:
            continue
        header, *body = grid
        md_lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * len(col_ids)) + " |",
        ]
        md_lines.extend("| " + " | ".join(row) + " |" for row in body)
        md_lines.append("")
        tables.append("\n".join(md_lines))
    return tables
def convert(html: str) -> str:
    """End-to-end conversion: page HTML -> Markdown document string.

    Raises:
        ValueError: when no block_map can be extracted from the HTML.
    """
    payload = extract_block_data(html)
    block_map = payload.get("block_map", {})
    block_sequence = payload.get("block_sequence", [])
    if not block_map:
        raise ValueError("block_map 为空,无法提取文档内容")

    # Locate the page root block ("" when none is present).
    page_id = next(
        (bid for bid, blk in block_map.items()
         if blk.get("data", {}).get("type") == "page"),
        "",
    )

    # Main body, in block_sequence order.
    markdown = blocks_to_markdown(block_map, block_sequence, page_id)

    # Tables are appended at the end: Feishu nests them in a structure that
    # is hard to splice back in place, so the main pass skips them.
    tables = extract_tables(block_map)
    if tables:
        markdown += "\n\n## 附录:表格数据\n\n"
        for idx, table in enumerate(tables, 1):
            markdown += f"### 表格 {idx}\n\n{table}\n"

    # Collapse runs of 3+ newlines into a single blank line.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)
    return markdown.strip()
def main():
    """CLI entry point: fetch the document, convert it, write the Markdown file."""
    parser = argparse.ArgumentParser(description="飞书文档转 Markdown")
    parser.add_argument("url", help="飞书文档 URL")
    parser.add_argument("-o", "--output", default="output.md", help="输出文件路径")
    args = parser.parse_args()

    try:
        html = fetch_html(args.url)
        print("正在解析文档结构...")
        markdown = convert(html)
        target = Path(args.output)
        target.write_text(markdown, encoding="utf-8")
        print(f"成功保存到: {target.absolute()}")
        print(f"文件大小: {len(markdown)} 字符")
    except httpx.HTTPStatusError as e:
        # Non-2xx from Feishu — most likely an auth/permission wall.
        print(f"HTTP 错误: {e.response.status_code}")
        print("可能是文档需要登录或权限不足")
        sys.exit(1)
    except Exception as e:
        print(f"错误: {e}")
        sys.exit(1)
- if __name__ == "__main__":
- main()