#!/usr/bin/env python3
"""
飞书文档转 Markdown 工具
飞书文档是客户端渲染的,文档内容嵌在 JS 变量 window.DATA 中。
本脚本从 HTML 中提取 block_map JSON,解析文档结构树,转换为 Markdown。
使用方法:
    python feishu_to_md.py <飞书文档URL> -o output.md
依赖安装:
    pip install httpx
"""
# Standard-library imports.
import argparse
import json
import re
import sys
from pathlib import Path

# httpx is the only third-party dependency; fail fast with an install hint.
try:
    import httpx
except ImportError:
    print("缺少依赖:pip install httpx")
    sys.exit(1)
def fetch_html(url: str) -> str:
    """Download the document page and return its HTML text.

    Follows redirects and sends a desktop User-Agent header; raises
    ``httpx.HTTPStatusError`` on a non-2xx response.
    """
    print(f"正在抓取: {url}")
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    with httpx.Client(follow_redirects=True, timeout=30.0) as client:
        response = client.get(url, headers=request_headers)
        response.raise_for_status()
        return response.text
def extract_block_data(html: str) -> dict:
    """Extract the block_map / block_sequence payload embedded in the page HTML.

    Feishu pages are client-rendered: the document tree lives in a JS
    variable (``window.DATA`` -> ``clientVars``).  First try to parse the
    whole ``clientVars: Object({...})`` literal as JSON; when that fails
    (the literal is often truncated or not strict JSON), fall back to a
    brace-matching scan that cuts out just the ``"block_map": {...}`` span.

    Raises:
        ValueError: when neither strategy yields parseable data.
    """
    match = re.search(r'clientVars:\s*Object\((\{.+?\})\)\s*\}', html, re.DOTALL)
    if not match:
        raise ValueError("无法从页面中提取 clientVars 数据")
    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError:
        # Full literal is not valid JSON; recover just the block_map span.
        data = {"data": _scan_block_map(html)}
    # The payload of interest usually sits under the "data" key.
    return data.get("data", data)


def _scan_block_map(html: str) -> dict:
    """Fallback: locate ``"block_map": {...}`` by brace matching and parse it.

    Raises:
        ValueError: if no block_map key is found or no balanced span parses.
    """
    anchor = re.search(r'"block_map"\s*:\s*\{', html)
    if not anchor:
        raise ValueError("无法解析 block_map")
    start = anchor.start()
    depth = 0
    for i in range(start, len(html)):
        ch = html[i]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                # Wrap the span into a standalone JSON object and try it.
                candidate = html[start:i + 1].replace('"block_map":', '{"block_map":') + '}'
                try:
                    return json.loads(candidate)
                except json.JSONDecodeError:
                    # Balanced but not valid JSON — keep scanning for a
                    # later closing brace (original behavior preserved;
                    # was a bare `except:` that hid unrelated errors).
                    continue
    raise ValueError("无法解析 block_map JSON")
def extract_text_from_block(block_data: dict) -> str:
    """Return the plain text of a block, joining its text runs in index order."""
    runs = (
        block_data.get("text", {})
        .get("initialAttributedTexts", {})
        .get("text", {})
    )
    if not runs:
        return ""
    # Keys are stringified indices "0", "1", ...; a missing key contributes "".
    pieces = [runs.get(str(idx), "") for idx in range(len(runs))]
    return "".join(pieces)
def extract_link_from_block(block_data: dict) -> str:
    """Return the first link URL found in the block's attribute pool.

    Feishu stores rich-text attributes in ``text.apool.numToAttrib`` as
    ``{"0": ["link", "<url-encoded href>"], ...}``; link targets are
    URL-encoded, so decode before returning.  Returns "" when no link exists.
    """
    # Hoisted out of the loop: the original re-imported on every attribute.
    from urllib.parse import unquote

    apool = block_data.get("text", {}).get("apool", {}).get("numToAttrib", {})
    for attr in apool.values():
        if isinstance(attr, list) and len(attr) == 2 and attr[0] == "link":
            return unquote(attr[1])
    return ""
def blocks_to_markdown(block_map: dict, block_sequence: list, page_id: str) -> str:
    """Render blocks to Markdown following document order.

    Args:
        block_map: block_id -> block record (payload under ``block["data"]``).
        block_sequence: block ids in document order.
        page_id: id of the page root block (currently unused — the root is
            recognised by its ``type == "page"`` instead; kept for
            interface compatibility).

    Returns:
        The Markdown body as a single string.

    Consecutive ``code`` blocks are buffered and emitted as one fenced
    block; ``table`` blocks are skipped here (see ``extract_tables``).
    """
    lines: list[str] = []
    in_code_block = False
    code_lang = ""
    code_lines: list[str] = []

    heading_prefix = {
        "heading1": "#", "heading2": "##", "heading3": "###",
        "heading4": "####", "heading5": "#####", "heading6": "######",
    }

    def flush_code() -> None:
        # Emit buffered code lines as one fenced block and reset the buffer.
        nonlocal in_code_block, code_lines
        lines.append(f"```{code_lang}")
        lines.extend(code_lines)
        lines.append("```")
        lines.append("")
        in_code_block = False
        code_lines = []

    for block_id in block_sequence:
        block = block_map.get(block_id)
        if not block:
            continue
        data = block.get("data", {})
        block_type = data.get("type", "")
        text = extract_text_from_block(data)
        link = extract_link_from_block(data)

        # Page root: emit only the document title.
        # (Checked before the code-flush below, preserving original order.)
        if block_type == "page":
            if text:
                lines.append(f"# {text}")
                lines.append("")
            continue

        # Any non-code block terminates an open fenced code block.
        if in_code_block and block_type != "code":
            flush_code()

        if block_type in heading_prefix:
            lines.append(f"{heading_prefix[block_type]} {text}")
            lines.append("")
        elif block_type == "text":
            if text:
                if link:
                    lines.append(f"{text}")
                    lines.append(f" 链接: {link}")
                else:
                    lines.append(text)
                lines.append("")
            else:
                lines.append("")
        elif block_type == "bullet":
            lines.append(f"- [{text}]({link})" if link else f"- {text}")
        elif block_type == "ordered":
            # Markdown renderers renumber automatically, so "1." suffices.
            lines.append(f"1. [{text}]({link})" if link else f"1. {text}")
        elif block_type == "todo":
            mark = "x" if data.get("checked", False) else " "
            lines.append(f"- [{mark}] {text}")
        elif block_type == "code":
            if not in_code_block:
                in_code_block = True
                code_lang = data.get("language", "")
                code_lines = []
            code_lines.append(text)
        elif block_type == "callout":
            # Callout is a container; only its own text is rendered here.
            if text:
                lines.append(f"> {text}")
                lines.append("")
        elif block_type == "quote":
            lines.append(f"> {text}")
            lines.append("")
        elif block_type == "divider":
            lines.append("---")
            lines.append("")
        elif block_type == "image":
            token = data.get("token", "")
            if token:
                # BUGFIX: the original appended an empty f-string here,
                # silently dropping the image; emit a placeholder that
                # keeps the asset token so it can be resolved later.
                lines.append(f"![image]({token})")
                lines.append("")
        elif block_type in ("table", "table_cell"):
            # Tables are rendered separately by extract_tables().
            pass
        else:
            # Unknown block type: keep its text so content is not lost.
            if text:
                lines.append(text)
                lines.append("")

    # Document may end while still inside a code block.
    if in_code_block:
        flush_code()

    return "\n".join(lines)
def extract_tables(block_map: dict) -> list[str]:
    """Render every table block in *block_map* as a Markdown table string.

    The first row becomes the header; cell contents are gathered from the
    child blocks referenced through the table's ``cell_set``.
    """
    def cell_text(cell_info: dict) -> str:
        # Join the text of all child blocks of one cell, space separated.
        cell_block_id = cell_info.get("block_id", "")
        if not cell_block_id or cell_block_id not in block_map:
            return ""
        cell_data = block_map[cell_block_id].get("data", {})
        parts = []
        for child_id in cell_data.get("children", []):
            if child_id in block_map:
                child_text = extract_text_from_block(block_map[child_id].get("data", {}))
                if child_text:
                    parts.append(child_text)
        return " ".join(parts) if parts else ""

    tables = []
    for block in block_map.values():
        data = block.get("data", {})
        if data.get("type") != "table":
            continue
        col_ids = data.get("columns_id", [])
        row_ids = data.get("rows_id", [])
        cell_set = data.get("cell_set", {})
        if not col_ids or not row_ids:
            continue
        # Cell keys are the row id immediately followed by the column id.
        grid = [
            [cell_text(cell_set.get(f"{row_id}{col_id}", {})) for col_id in col_ids]
            for row_id in row_ids
        ]
        if not grid:
            continue
        header, *body = grid
        md_lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * len(col_ids)) + " |",
        ]
        md_lines.extend("| " + " | ".join(row) + " |" for row in body)
        md_lines.append("")
        tables.append("\n".join(md_lines))
    return tables
def convert(html: str) -> str:
    """End-to-end conversion: page HTML -> Markdown document string.

    Raises:
        ValueError: when no block_map can be extracted from the HTML.
    """
    payload = extract_block_data(html)
    block_map = payload.get("block_map", {})
    block_sequence = payload.get("block_sequence", [])
    if not block_map:
        raise ValueError("block_map 为空,无法提取文档内容")

    # Locate the page root block ("" when none is present).
    page_id = next(
        (bid for bid, blk in block_map.items()
         if blk.get("data", {}).get("type") == "page"),
        "",
    )

    # Main body, in block_sequence order.
    markdown = blocks_to_markdown(block_map, block_sequence, page_id)

    # Tables are appended at the end: Feishu nests them in a structure that
    # is hard to splice back in place, so the main pass skips them.
    tables = extract_tables(block_map)
    if tables:
        markdown += "\n\n## 附录:表格数据\n\n"
        for idx, table in enumerate(tables, 1):
            markdown += f"### 表格 {idx}\n\n{table}\n"

    # Collapse runs of 3+ newlines into a single blank line.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)
    return markdown.strip()
def main():
    """CLI entry point: fetch the document, convert it, write the Markdown file."""
    parser = argparse.ArgumentParser(description="飞书文档转 Markdown")
    parser.add_argument("url", help="飞书文档 URL")
    parser.add_argument("-o", "--output", default="output.md", help="输出文件路径")
    args = parser.parse_args()

    try:
        html = fetch_html(args.url)
        print("正在解析文档结构...")
        markdown = convert(html)
        target = Path(args.output)
        target.write_text(markdown, encoding="utf-8")
        print(f"成功保存到: {target.absolute()}")
        print(f"文件大小: {len(markdown)} 字符")
    except httpx.HTTPStatusError as e:
        # Non-2xx from Feishu — most likely an auth/permission wall.
        print(f"HTTP 错误: {e.response.status_code}")
        print("可能是文档需要登录或权限不足")
        sys.exit(1)
    except Exception as e:
        print(f"错误: {e}")
        sys.exit(1)
- if __name__ == "__main__":
- main()