feishu_to_md.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
#!/usr/bin/env python3
"""
Feishu (Lark) document to Markdown converter.

Feishu documents are rendered on the client side; the document content is
embedded in the JavaScript variable ``window.DATA``. This script extracts the
``block_map`` JSON from the page HTML, walks the document structure, and
converts it to Markdown.

Usage:
    python feishu_to_md.py <Feishu document URL> -o output.md

Install dependencies:
    pip install httpx
"""
import argparse
import json
import re
import sys
from pathlib import Path

try:
    import httpx
except ImportError:
    # httpx is the only third-party dependency: print an install hint and
    # exit with a non-zero status instead of surfacing a traceback.
    print("缺少依赖:pip install httpx")
    sys.exit(1)
  21. def fetch_html(url: str) -> str:
  22. print(f"正在抓取: {url}")
  23. with httpx.Client(follow_redirects=True, timeout=30.0) as client:
  24. resp = client.get(url, headers={
  25. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
  26. })
  27. resp.raise_for_status()
  28. return resp.text
  29. def extract_block_data(html: str) -> dict:
  30. """从 HTML 中提取 window.DATA 里的 block_map 和 block_sequence"""
  31. # 匹配 clientVars: Object({...})
  32. match = re.search(r'clientVars:\s*Object\((\{.+?\})\)\s*\}', html, re.DOTALL)
  33. if not match:
  34. raise ValueError("无法从页面中提取 clientVars 数据")
  35. raw = match.group(1)
  36. # 飞书的 JSON 用了 Unicode 转义,直接 parse
  37. try:
  38. data = json.loads(raw)
  39. except json.JSONDecodeError:
  40. # 有时候 JSON 被截断,尝试找完整的
  41. match2 = re.search(r'"block_map"\s*:\s*\{', html)
  42. if not match2:
  43. raise ValueError("无法解析 block_map")
  44. # 从 block_map 开始,找到对应的闭合括号
  45. start = match2.start()
  46. depth = 0
  47. for i in range(start, len(html)):
  48. if html[i] == '{':
  49. depth += 1
  50. elif html[i] == '}':
  51. depth -= 1
  52. if depth == 0:
  53. try:
  54. data = {"data": json.loads(html[start:i+1].replace('"block_map":', '{"block_map":') + '}')}
  55. break
  56. except:
  57. continue
  58. else:
  59. raise ValueError("无法解析 block_map JSON")
  60. return data.get("data", data)
  61. def extract_text_from_block(block_data: dict) -> str:
  62. """从 block 的 text 字段提取纯文本"""
  63. text_info = block_data.get("text", {})
  64. texts = text_info.get("initialAttributedTexts", {}).get("text", {})
  65. if not texts:
  66. return ""
  67. # texts 是 {"0": "...", "1": "..."} 格式,拼接所有
  68. return "".join(texts.get(str(i), "") for i in range(len(texts)))
  69. def extract_link_from_block(block_data: dict) -> str:
  70. """提取 block 中的链接"""
  71. text_info = block_data.get("text", {})
  72. apool = text_info.get("apool", {}).get("numToAttrib", {})
  73. for _, attr in apool.items():
  74. if isinstance(attr, list) and len(attr) == 2:
  75. if attr[0] == "link":
  76. from urllib.parse import unquote
  77. return unquote(attr[1])
  78. return ""
  79. def blocks_to_markdown(block_map: dict, block_sequence: list, page_id: str) -> str:
  80. """将 block_map 按 block_sequence 顺序转换为 Markdown"""
  81. lines = []
  82. in_code_block = False
  83. code_lang = ""
  84. code_lines = []
  85. for block_id in block_sequence:
  86. block = block_map.get(block_id)
  87. if not block:
  88. continue
  89. data = block.get("data", {})
  90. block_type = data.get("type", "")
  91. text = extract_text_from_block(data)
  92. link = extract_link_from_block(data)
  93. # 跳过页面根节点本身(标题已单独处理)
  94. if block_type == "page":
  95. if text:
  96. lines.append(f"# {text}")
  97. lines.append("")
  98. continue
  99. # 处理代码块的结束
  100. if in_code_block and block_type != "code":
  101. lines.append(f"```{code_lang}")
  102. for cl in code_lines:
  103. lines.append(cl)
  104. lines.append("```")
  105. lines.append("")
  106. in_code_block = False
  107. code_lines = []
  108. if block_type == "heading1":
  109. lines.append(f"# {text}")
  110. lines.append("")
  111. elif block_type == "heading2":
  112. lines.append(f"## {text}")
  113. lines.append("")
  114. elif block_type == "heading3":
  115. lines.append(f"### {text}")
  116. lines.append("")
  117. elif block_type == "heading4":
  118. lines.append(f"#### {text}")
  119. lines.append("")
  120. elif block_type == "heading5":
  121. lines.append(f"##### {text}")
  122. lines.append("")
  123. elif block_type == "heading6":
  124. lines.append(f"###### {text}")
  125. lines.append("")
  126. elif block_type == "text":
  127. if text:
  128. if link:
  129. lines.append(f"{text}")
  130. lines.append(f" 链接: {link}")
  131. else:
  132. lines.append(text)
  133. lines.append("")
  134. else:
  135. lines.append("")
  136. elif block_type == "bullet":
  137. if link:
  138. lines.append(f"- [{text}]({link})")
  139. else:
  140. lines.append(f"- {text}")
  141. elif block_type == "ordered":
  142. if link:
  143. lines.append(f"1. [{text}]({link})")
  144. else:
  145. lines.append(f"1. {text}")
  146. elif block_type == "todo":
  147. checked = data.get("checked", False)
  148. mark = "x" if checked else " "
  149. lines.append(f"- [{mark}] {text}")
  150. elif block_type == "code":
  151. if not in_code_block:
  152. in_code_block = True
  153. code_lang = data.get("language", "")
  154. code_lines = []
  155. code_lines.append(text)
  156. elif block_type == "callout":
  157. # callout 容器,内容在子节点中
  158. if text:
  159. lines.append(f"> {text}")
  160. lines.append("")
  161. elif block_type == "quote":
  162. lines.append(f"> {text}")
  163. lines.append("")
  164. elif block_type == "divider":
  165. lines.append("---")
  166. lines.append("")
  167. elif block_type == "image":
  168. token = data.get("token", "")
  169. if token:
  170. lines.append(f"![image]({token})")
  171. lines.append("")
  172. elif block_type in ("table", "table_cell"):
  173. # 表格结构复杂,提取子节点文本
  174. pass
  175. else:
  176. # 其他未知类型,如果有文本就输出
  177. if text:
  178. lines.append(text)
  179. lines.append("")
  180. # 处理最后一个代码块
  181. if in_code_block:
  182. lines.append(f"```{code_lang}")
  183. for cl in code_lines:
  184. lines.append(cl)
  185. lines.append("```")
  186. lines.append("")
  187. return "\n".join(lines)
  188. def extract_tables(block_map: dict) -> list[str]:
  189. """提取表格内容为 Markdown 格式"""
  190. tables = []
  191. for block_id, block in block_map.items():
  192. data = block.get("data", {})
  193. if data.get("type") != "table":
  194. continue
  195. columns = data.get("columns_id", [])
  196. rows = data.get("rows_id", [])
  197. cell_set = data.get("cell_set", {})
  198. if not columns or not rows:
  199. continue
  200. table_rows = []
  201. for row_id in rows:
  202. row_cells = []
  203. for col_id in columns:
  204. cell_key = f"{row_id}{col_id}"
  205. cell_info = cell_set.get(cell_key, {})
  206. cell_block_id = cell_info.get("block_id", "")
  207. if cell_block_id and cell_block_id in block_map:
  208. cell_block = block_map[cell_block_id]
  209. cell_data = cell_block.get("data", {})
  210. # 单元格内容在 children 中
  211. children = cell_data.get("children", [])
  212. cell_texts = []
  213. for child_id in children:
  214. if child_id in block_map:
  215. child_text = extract_text_from_block(block_map[child_id].get("data", {}))
  216. if child_text:
  217. cell_texts.append(child_text)
  218. row_cells.append(" ".join(cell_texts) if cell_texts else "")
  219. else:
  220. row_cells.append("")
  221. table_rows.append(row_cells)
  222. if table_rows:
  223. # 生成 Markdown 表格
  224. md_lines = []
  225. # 表头
  226. md_lines.append("| " + " | ".join(table_rows[0]) + " |")
  227. md_lines.append("| " + " | ".join(["---"] * len(columns)) + " |")
  228. # 数据行
  229. for row in table_rows[1:]:
  230. md_lines.append("| " + " | ".join(row) + " |")
  231. md_lines.append("")
  232. tables.append("\n".join(md_lines))
  233. return tables
  234. def convert(html: str) -> str:
  235. """主转换函数"""
  236. data = extract_block_data(html)
  237. block_map = data.get("block_map", {})
  238. block_sequence = data.get("block_sequence", [])
  239. if not block_map:
  240. raise ValueError("block_map 为空,无法提取文档内容")
  241. # 找到页面根节点
  242. page_id = ""
  243. for bid, block in block_map.items():
  244. if block.get("data", {}).get("type") == "page":
  245. page_id = bid
  246. break
  247. # 按 block_sequence 转换主体内容
  248. markdown = blocks_to_markdown(block_map, block_sequence, page_id)
  249. # 提取表格
  250. tables = extract_tables(block_map)
  251. # 如果有表格,附加到文档末尾(因为表格在主体中被跳过)
  252. # 实际上更好的做法是在原位插入,但飞书的表格嵌套结构比较复杂
  253. # 这里先简单追加
  254. if tables:
  255. markdown += "\n\n## 附录:表格数据\n\n"
  256. for i, table in enumerate(tables, 1):
  257. markdown += f"### 表格 {i}\n\n{table}\n"
  258. # 清理多余空行
  259. markdown = re.sub(r"\n{3,}", "\n\n", markdown)
  260. return markdown.strip()
  261. def main():
  262. parser = argparse.ArgumentParser(description="飞书文档转 Markdown")
  263. parser.add_argument("url", help="飞书文档 URL")
  264. parser.add_argument("-o", "--output", default="output.md", help="输出文件路径")
  265. args = parser.parse_args()
  266. try:
  267. html = fetch_html(args.url)
  268. print("正在解析文档结构...")
  269. markdown = convert(html)
  270. output_path = Path(args.output)
  271. output_path.write_text(markdown, encoding="utf-8")
  272. print(f"成功保存到: {output_path.absolute()}")
  273. print(f"文件大小: {len(markdown)} 字符")
  274. except httpx.HTTPStatusError as e:
  275. print(f"HTTP 错误: {e.response.status_code}")
  276. print("可能是文档需要登录或权限不足")
  277. sys.exit(1)
  278. except Exception as e:
  279. print(f"错误: {e}")
  280. sys.exit(1)
  281. if __name__ == "__main__":
  282. main()