
feat(skills): self-contained odps-skills package with 3 CLI skills

Copied code from the repository root into skills/src/odps_skills/ and
adjusted imports so the package stands alone:
- client.py: ODPSClient (extracted from lib/odps_module.py)
- dataworks.py: DataWorksClient (extracted from lib/odps_module.py)
- feishu.py: Feishu API client (copied as-is)
- run_sql.py / fetch_daily.py / fetch_table_code.py: CLI entry points

The output base directory is configured via the ODPS_SKILLS_DATA_DIR
environment variable (required); table metadata is consolidated under
production_code/{project.table}.{sql,json}.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
yangxiaohui, 4 days ago
Commit 1b4ed7c195

+ 94 - 0
skills/README.md

@@ -0,0 +1,94 @@
+# ODPS Skills: Self-Contained Data Analysis Toolkit
+
+A standalone Python package with 3 CLI tools (skills) for ODPS/MaxCompute data analysis.
+
+## Installation & Configuration
+
+### 1. Install the package
+
+```bash
+cd skills
+uv pip install -e .
+```
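+
+If you prefer plain pip, an editable install should work the same way, since the package uses a standard setuptools build backend:
+
+```bash
+cd skills
+pip install -e .
+```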
+
+### 2. Configure the output path (required)
+
+All files produced by the skills (production_code/, output/, etc.) are rooted at a single base path.
+It must be configured after installation, and only once:
+
+```bash
+export ODPS_SKILLS_DATA_DIR=/your/project/root
+```
+
+Add it to a shell config file (e.g. `~/.zshrc`, `~/.bashrc`) or a `.env` file to make it permanent.
+
+> Maintenance note: if the project directory ever moves, this environment variable is the only thing that needs updating.
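+
+For example, a `.env` file is picked up automatically (config.py loads `skills/.env` first, then the project root `.env`):
+
+```bash
+# skills/.env
+ODPS_SKILLS_DATA_DIR=/your/project/root
+```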
+
+### 3. Verify
+
+```bash
+run-sql --help
+fetch-daily --help
+fetch-table-code --help
+```
+
+## Skills Overview
+
+| Skill | CLI command | Purpose |
+|-------|-------------|---------|
+| `odps-run-query` | `run-sql` | Run a SQL query and write the result to CSV |
+| `odps-fetch-daily` | `fetch-daily` | Fetch data incrementally by day (concurrent) |
+| `odps-fetch-table-code` | `fetch-table-code` | Fetch a table's production code and schema |
+
+## Output Layout
+
+All outputs are rooted at `$ODPS_SKILLS_DATA_DIR` (abbreviated `$ROOT` below):
+
+```
+$ROOT/
+├── tasks/<some analysis>/
+│   ├── query.sql
+│   └── output/
+│       ├── query_20260301_20260305.csv    ← run-sql
+│       └── query/                          ← fetch-daily
+│           ├── 20260301.csv
+│           └── 20260302.csv
+│
+└── production_code/                        ← fetch-table-code
+    ├── loghubods.my_table.sql              ← production ETL code
+    └── loghubods.my_table.json             ← schema (columns, types, upstream dependencies)
+```
+
+| Directory | Written by | Notes |
+|-----------|------------|-------|
+| `{sql_dir}/output/` | run-sql / fetch-daily | Follows the location of the SQL file |
+| `$ROOT/production_code/` | fetch-table-code | Table code (`.sql`) and schema (`.json`), kept in one place |
+
+## Path Configuration
+
+| Priority | Method | Use case |
+|----------|--------|----------|
+| 1 | `--output-dir /path` | One-off override (fetch-table-code only) |
+| 2 | `ODPS_SKILLS_DATA_DIR=/path` | **Recommended**: configure once, applies from then on |
+
+If the environment variable is not set and `--output-dir` is not passed, the CLI exits with an error explaining how to configure it.
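+
+A minimal sketch of both styles, assuming `fetch-table-code` takes the table name as a positional argument (see the SKILL.md files for the exact flags):
+
+```bash
+# one-off override (fetch-table-code only)
+fetch-table-code loghubods.my_table --output-dir /tmp/table_meta
+
+# recommended: rely on the environment variable configured above
+export ODPS_SKILLS_DATA_DIR=/your/project/root
+fetch-table-code loghubods.my_table
+```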
+
+## Package Layout
+
+```
+skills/
+├── pyproject.toml
+├── README.md
+├── src/odps_skills/
+│   ├── __init__.py
+│   ├── config.py              # path configuration (ODPS_SKILLS_DATA_DIR)
+│   ├── client.py              # ODPSClient
+│   ├── dataworks.py           # DataWorksClient
+│   ├── feishu.py              # Feishu API client
+│   ├── run_sql.py             # run-sql CLI
+│   ├── fetch_daily.py         # fetch-daily CLI
+│   └── fetch_table_code.py    # fetch-table-code CLI
+├── odps-run-query/SKILL.md
+├── odps-fetch-daily/SKILL.md
+└── odps-fetch-table-code/SKILL.md
+```

+ 28 - 0
skills/pyproject.toml

@@ -0,0 +1,28 @@
+[project]
+name = "odps-skills"
+version = "0.1.0"
+description = "ODPS data analysis CLI skills - self-contained package"
+requires-python = ">=3.10,<3.14"
+dependencies = [
+    "pyodps>=0.12",
+    "pandas>=2.0",
+    "pyarrow>=14.0",
+    "tqdm>=4.60",
+    "requests>=2.28",
+    "curl-cffi>=0.5",
+    "python-dotenv>=1.0",
+    "Pillow>=10.0",
+    "alibabacloud-dataworks-public20240518>=8.0.3",
+]
+
+[project.scripts]
+run-sql = "odps_skills.run_sql:main"
+fetch-daily = "odps_skills.fetch_daily:main"
+fetch-table-code = "odps_skills.fetch_table_code:main"
+
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["src"]

+ 14 - 0
skills/src/odps_skills.egg-info/PKG-INFO

@@ -0,0 +1,14 @@
+Metadata-Version: 2.4
+Name: odps-skills
+Version: 0.1.0
+Summary: ODPS data analysis CLI skills - self-contained package
+Requires-Python: <3.14,>=3.10
+Requires-Dist: pyodps>=0.12
+Requires-Dist: pandas>=2.0
+Requires-Dist: pyarrow>=14.0
+Requires-Dist: tqdm>=4.60
+Requires-Dist: requests>=2.28
+Requires-Dist: curl-cffi>=0.5
+Requires-Dist: python-dotenv>=1.0
+Requires-Dist: Pillow>=10.0
+Requires-Dist: alibabacloud-dataworks-public20240518>=8.0.3

+ 16 - 0
skills/src/odps_skills.egg-info/SOURCES.txt

@@ -0,0 +1,16 @@
+README.md
+pyproject.toml
+src/odps_skills/__init__.py
+src/odps_skills/client.py
+src/odps_skills/config.py
+src/odps_skills/dataworks.py
+src/odps_skills/feishu.py
+src/odps_skills/fetch_daily.py
+src/odps_skills/fetch_table_code.py
+src/odps_skills/run_sql.py
+src/odps_skills.egg-info/PKG-INFO
+src/odps_skills.egg-info/SOURCES.txt
+src/odps_skills.egg-info/dependency_links.txt
+src/odps_skills.egg-info/entry_points.txt
+src/odps_skills.egg-info/requires.txt
+src/odps_skills.egg-info/top_level.txt

+ 1 - 0
skills/src/odps_skills.egg-info/dependency_links.txt

@@ -0,0 +1 @@
+

+ 4 - 0
skills/src/odps_skills.egg-info/entry_points.txt

@@ -0,0 +1,4 @@
+[console_scripts]
+fetch-daily = odps_skills.fetch_daily:main
+fetch-table-code = odps_skills.fetch_table_code:main
+run-sql = odps_skills.run_sql:main

+ 9 - 0
skills/src/odps_skills.egg-info/requires.txt

@@ -0,0 +1,9 @@
+pyodps>=0.12
+pandas>=2.0
+pyarrow>=14.0
+tqdm>=4.60
+requests>=2.28
+curl-cffi>=0.5
+python-dotenv>=1.0
+Pillow>=10.0
+alibabacloud-dataworks-public20240518>=8.0.3

+ 1 - 0
skills/src/odps_skills.egg-info/top_level.txt

@@ -0,0 +1 @@
+odps_skills

+ 2 - 0
skills/src/odps_skills/__init__.py

@@ -0,0 +1,2 @@
+"""ODPS data analysis CLI skills."""
+__version__ = "0.1.0"

+ 197 - 0
skills/src/odps_skills/client.py

@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import os
+import time
+import uuid
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from odps import ODPS, options
+from odps.tunnel import TableTunnel
+from tqdm import tqdm
+import pyarrow as pa
+from pyarrow import csv as pa_csv
+
+# Enable Instance Tunnel to lift the 10,000-row result limit
+options.tunnel.use_instance_tunnel = True
+options.tunnel.limit_instance_tunnel = False
+
+# ODPS credentials / project configuration
+ODPS_CONFIGS = {
+    "default": {
+        "access_id": "LTAIWYUujJAm7CbH",
+        "access_secret": "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P",
+        "project": "loghubods",
+    },
+    "piaoquan_api": {
+        "access_id": "LTAI5tKyXxh7C6349c1wbwUX",
+        "access_secret": "H8doQDC20KugToRA3giERgRyRD1KR9",
+        "project": "piaoquan_api",
+    },
+}
+
+
+class ODPSClient(object):
+    def __init__(self, project="loghubods", config="default"):
+        """
+        Initialize the ODPS client.
+
+        Args:
+            project: project name (overrides the default project of the chosen config)
+            config: config name, either "default" or "piaoquan_api"
+        """
+        cfg = ODPS_CONFIGS.get(config, ODPS_CONFIGS["default"])
+
+        self.accessId = cfg["access_id"]
+        self.accessSecret = cfg["access_secret"]
+        self.endpoint = "http://service.odps.aliyun.com/api"
+        self.tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com"
+
+        # If a non-default project was passed, use it; otherwise use the project from the config
+        actual_project = project if project != "loghubods" else cfg["project"]
+
+        self.odps = ODPS(
+            self.accessId,
+            self.accessSecret,
+            actual_project,
+            self.endpoint
+        )
+
+    def execute_sql(self, sql: str, print_logview: bool = True):
+        """执行 SQL 并返回 DataFrame"""
+        hints = {'odps.sql.submit.mode': 'script'}
+        instance = self.odps.execute_sql(sql, hints=hints)
+
+        if print_logview:
+            print(f"LogView: {instance.get_logview_address()}")
+
+        with instance.open_reader(tunnel=True, limit=False) as reader:
+            pd_df = reader.to_pandas()
+        return pd_df
+
+    def execute_sql_result_save_file(self, sql: str, output_file: str):
+        """执行 SQL 并保存到文件(Arrow 直接写 CSV,速度最快)"""
+        hints = {'odps.sql.submit.mode': 'script'}
+
+        start_time = time.time()
+        instance = self.odps.execute_sql(sql, hints=hints)
+        sql_time = time.time() - start_time
+
+        print(f"LogView: {instance.get_logview_address()}")
+        print(f"SQL 执行耗时: {sql_time:.1f}s")
+
+        with instance.open_reader(tunnel=True, limit=False, arrow=True) as reader:
+            total = reader.count
+
+            # Write while downloading, using pyarrow to emit CSV directly
+            with open(output_file, 'wb') as f:
+                first = True
+                with tqdm(total=total, unit='行', desc='下载中') as pbar:
+                    for batch in reader:
+                        # Write CSV with pyarrow (much faster than pandas); use a
+                        # distinct name so the module-level odps `options` is not shadowed
+                        write_options = pa_csv.WriteOptions(include_header=first)
+                        pa_csv.write_csv(pa.Table.from_batches([batch]), f, write_options=write_options)
+                        first = False
+                        pbar.update(batch.num_rows)
+
+        total_time = time.time() - start_time
+        print(f"总耗时: {total_time:.1f}s")
+        print(f"完成: {output_file}")
+
+    def execute_sql_result_save_file_parallel(self, sql: str, output_file: str, workers: int = 4):
+        """执行 SQL 并保存到文件(多线程并行下载,速度最快)"""
+        hints = {'odps.sql.submit.mode': 'script'}
+
+        # Generate a temporary table name
+        tmp_table = f"tmp_download_{uuid.uuid4().hex[:8]}"
+        create_sql = f"CREATE TABLE {tmp_table} LIFECYCLE 1 AS {sql}"
+
+        start_time = time.time()
+
+        # 1. Create the temporary table
+        print(f"创建临时表: {tmp_table}")
+        instance = self.odps.execute_sql(create_sql, hints=hints)
+        print(f"LogView: {instance.get_logview_address()}")
+        instance.wait_for_success()
+        sql_time = time.time() - start_time
+        print(f"SQL 执行耗时: {sql_time:.1f}s")
+
+        try:
+            # 2. Fetch table info and open a download session
+            table = self.odps.get_table(tmp_table)
+            tunnel = TableTunnel(self.odps)
+            download_session = tunnel.create_download_session(table.name)
+            total = download_session.count
+            print(f"总行数: {total}")
+
+            if total == 0:
+                # Empty table: just write a header-only CSV
+                with open(output_file, 'w') as f:
+                    columns = [col.name for col in table.table_schema.columns]
+                    f.write(','.join(columns) + '\n')
+                print(f"完成: {output_file} (空表)")
+                return
+
+            # 3. Split the rows into chunks
+            chunk_size = (total + workers - 1) // workers
+            chunks = []
+            for i in range(workers):
+                start = i * chunk_size
+                end = min((i + 1) * chunk_size, total)
+                if start < end:
+                    chunks.append((i, start, end - start))  # (index, start, count)
+
+            print(f"并行下载: {len(chunks)} 个分片, {workers} 线程")
+
+            # 4. Download each chunk to a temp file in the output directory
+            output_dir = os.path.dirname(output_file)
+            tmp_prefix = os.path.join(output_dir, f".tmp_{os.path.basename(output_file)}_")
+            pbar = tqdm(total=total, unit='行', desc='下载中')
+            pbar_lock = threading.Lock()
+            session_id = download_session.id
+            tmp_files = {}
+
+            def download_chunk(chunk_info):
+                idx, start, count = chunk_info
+                tmp_file = f"{tmp_prefix}{idx:04d}"
+                session = tunnel.create_download_session(table.name, download_id=session_id)
+                with session.open_arrow_reader(start, count) as reader:
+                    batches = []
+                    for batch in reader:
+                        batches.append(batch)
+                        with pbar_lock:
+                            pbar.update(batch.num_rows)
+                    if batches:
+                        tbl = pa.Table.from_batches(batches)
+                        pa_csv.write_csv(tbl, tmp_file)
+                return idx, tmp_file
+
+            # Download chunks in parallel
+            with ThreadPoolExecutor(max_workers=workers) as executor:
+                futures = [executor.submit(download_chunk, chunk) for chunk in chunks]
+                for future in as_completed(futures):
+                    idx, tmp_file = future.result()
+                    tmp_files[idx] = tmp_file
+
+            pbar.close()
+
+            # 5. Merge the chunks in order
+            print("合并文件中...")
+            with open(output_file, 'wb') as outf:
+                for idx in range(len(chunks)):
+                    tmp_file = tmp_files.get(idx)
+                    if tmp_file and os.path.exists(tmp_file):
+                        with open(tmp_file, 'rb') as inf:
+                            if idx > 0:
+                                inf.readline()  # skip the header row
+                            outf.write(inf.read())
+                        os.remove(tmp_file)
+
+        finally:
+            # 6. Drop the temporary table
+            print(f"删除临时表: {tmp_table}")
+            self.odps.delete_table(tmp_table, if_exists=True)
+
+        total_time = time.time() - start_time
+        print(f"总耗时: {total_time:.1f}s")
+        print(f"完成: {output_file}")

+ 37 - 0
skills/src/odps_skills/config.py

@@ -0,0 +1,37 @@
+"""Skills 配置:产出目录等全局设置。
+
+产出基础目录(必选):
+  1. CLI --output-dir 参数(单次覆盖)
+  2. 环境变量 ODPS_SKILLS_DATA_DIR(安装后配置一次,长期生效)
+
+未配置时报错退出,避免产出散落到不确定的位置。
+"""
+
+import os
+import sys
+from pathlib import Path
+from dotenv import load_dotenv
+
+# Package layout: src/odps_skills/config.py
+# Three levels up: odps_skills → src → skills → <project_root>
+_PACKAGE_DIR = Path(__file__).parent          # odps_skills/
+_SKILLS_DIR  = _PACKAGE_DIR.parent.parent     # skills/
+_PROJECT_ROOT = _SKILLS_DIR.parent            # <project_root>/
+
+# Load .env (skills/ first, then the project root)
+load_dotenv(_SKILLS_DIR / ".env", override=False)
+load_dotenv(_PROJECT_ROOT / ".env", override=False)
+
+
+def get_data_dir() -> str:
+    """Return the base output directory.
+
+    Reads ODPS_SKILLS_DATA_DIR from the environment (or from a .env file
+    loaded above). If it is not set, print instructions and exit, so that
+    outputs never land in an unpredictable location.
+    """
+    data_dir = os.environ.get("ODPS_SKILLS_DATA_DIR")
+    if not data_dir:
+        print(
+            "ODPS_SKILLS_DATA_DIR is not set. Configure the output base path once, e.g.:\n"
+            "  export ODPS_SKILLS_DATA_DIR=/your/project/root",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    return data_dir
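+
+
+# Usage sketch (mirrors how dataworks.py derives its cache directory):
+#
+#   import os
+#   from odps_skills.config import get_data_dir
+#   production_code_dir = os.path.join(get_data_dir(), "production_code")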

+ 405 - 0
skills/src/odps_skills/dataworks.py

@@ -0,0 +1,405 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""DataWorks 客户端:根据表名获取生产代码"""
+
+import os
+import time
+from pathlib import Path
+
+from odps_skills.client import ODPSClient, ODPS_CONFIGS
+from odps_skills.config import get_data_dir
+
+# DataWorks SDK (optional dependency, only needed by DataWorksClient)
+try:
+    from alibabacloud_dataworks_public20240518.client import Client as _DWClient
+    from alibabacloud_tea_openapi import models as _open_api_models
+    from alibabacloud_dataworks_public20240518 import models as _dw_models
+    _DW_AVAILABLE = True
+except ImportError:
+    _DW_AVAILABLE = False
+
+# All DataWorks projects accessible to the account (project_id → name)
+_DW_PROJECTS = {
+    4858:   "loghubods",
+    11300:  "DWH",
+    5477:   "videocdm",
+    548768: "piaoquan_api",
+    148813: "content_safety",
+    96094:  "algo",
+    52578:  "majin",
+    5057:   "useractionbi",
+    5034:   "user_video_action_cdm",
+    4868:   "usercdm",
+    4859:   "videoods",
+    6025:   "videoads",
+    5535:   "user_video_tag",
+    19288:  "RecallEmbedding",
+    10762:  "Test_model1",
+    193831: "cost_mgt_1894469520484605",
+    156474: "dyp_1",
+    156475: "dyp_2",
+    343868: "pq_data_space",
+    343957: "pq_grafana_se",
+}
+
+def _get_cache_dir():
+    """缓存目录:基于可配置的 data_dir。"""
+    return os.path.join(get_data_dir(), "production_code")
+
+
+def _call_with_retry(fn, max_retries=3, base_delay=2):
+    """带限流重试的 API 调用包装。"""
+    for attempt in range(max_retries):
+        try:
+            return fn()
+        except Exception as e:
+            if "Throttling" in str(e) and attempt < max_retries - 1:
+                delay = base_delay * (2 ** attempt)
+                print(f"  [throttled] 等待 {delay}s 后重试...")
+                time.sleep(delay)
+                continue
+            raise
+
+
+class DataWorksClient:
+    def __init__(self):
+        if not _DW_AVAILABLE:
+            raise ImportError(
+                "请先安装 DataWorks SDK:\n"
+                "pip install alibabacloud-dataworks-public20240518"
+            )
+        # Create one client per AK (different AKs have access to different projects)
+        self._clients = {}
+        for config_name, cfg in ODPS_CONFIGS.items():
+            dw_config = _open_api_models.Config(
+                access_key_id=cfg["access_id"],
+                access_key_secret=cfg["access_secret"],
+                endpoint="dataworks.cn-hangzhou.aliyuncs.com",
+            )
+            self._clients[config_name] = _DWClient(dw_config)
+        self.client = self._clients["default"]
+
+    @staticmethod
+    def _build_entity_id(table_name: str) -> str:
+        """构造 GetTable 的 entity ID。
+
+        支持格式:
+          - project.table  → maxcompute-table:::project::table
+          - table           → maxcompute-table:::loghubods::table
+        """
+        parts = table_name.split(".", 1)
+        if len(parts) == 2:
+            project, table = parts
+        else:
+            project, table = "loghubods", parts[0]
+        return f"maxcompute-table:::{project}::{table}"
+
+    def get_table_info(self, table_name: str) -> dict:
+        """获取表的元信息(含上游任务列表)。
+
+        Returns:
+            dict with keys: name, comment, dataworks_tasks[{id, name}], ...
+        """
+        entity_id = self._build_entity_id(table_name)
+        resp = _call_with_retry(lambda: self.client.get_table(
+            _dw_models.GetTableRequest(id=entity_id, include_business_metadata=True)
+        ))
+        table = resp.body.to_map().get("Table", {})
+        biz = table.get("BusinessMetadata", {})
+        return {
+            "id": table.get("Id"),
+            "name": table.get("Name"),
+            "comment": table.get("Comment"),
+            "project_id": biz.get("Extension", {}).get("ProjectId"),
+            "dataworks_tasks": biz.get("UpstreamTasks", []),
+            "partition_keys": table.get("PartitionKeys", []),
+        }
+
+    def _get_task_code(self, task_id: int) -> dict:
+        """尝试用所有 AK 获取任务代码,返回第一个成功的结果。"""
+        for config_name, client in self._clients.items():
+            try:
+                resp = _call_with_retry(lambda c=client: c.get_task(
+                    _dw_models.GetTaskRequest(id=task_id, project_env="Prod")
+                ))
+                task = resp.body.to_map().get("Task", {})
+                return {
+                    "task_id": task_id,
+                    "task_name": task.get("Name"),
+                    "task_type": task.get("Type"),
+                    "content": (task.get("Script") or {}).get("Content", ""),
+                    "config": config_name,
+                }
+            except Exception as e:
+                if "11020205003" in str(e):
+                    continue  # no permission with this AK, try the next one
+                raise
+        return None
+
+    @staticmethod
+    def _normalize_table_name(table_name: str) -> str:
+        """补全 project 前缀:table → loghubods.table"""
+        if "." not in table_name:
+            return f"loghubods.{table_name}"
+        return table_name
+
+    @staticmethod
+    def _cache_path(table_name: str) -> str:
+        return os.path.join(_get_cache_dir(), f"{table_name}.sql")
+
+    @staticmethod
+    def _schema_cache_path(table_name: str) -> str:
+        return os.path.join(_get_cache_dir(), f"{table_name}.json")
+
+    def _read_cache(self, table_name: str) -> str | None:
+        path = self._cache_path(table_name)
+        if os.path.exists(path):
+            with open(path, "r", encoding="utf-8") as f:
+                return f.read()
+        return None
+
+    def _write_cache(self, table_name: str, content: str):
+        os.makedirs(_get_cache_dir(), exist_ok=True)
+        with open(self._cache_path(table_name), "w", encoding="utf-8") as f:
+            f.write(content)
+
+    def _read_schema_cache(self, table_name: str) -> dict | None:
+        import json
+        path = self._schema_cache_path(table_name)
+        if os.path.exists(path):
+            with open(path, "r", encoding="utf-8") as f:
+                return json.load(f)
+        return None
+
+    def _write_schema_cache(self, table_name: str, schema: dict):
+        import json
+        os.makedirs(_get_cache_dir(), exist_ok=True)
+        with open(self._schema_cache_path(table_name), "w", encoding="utf-8") as f:
+            json.dump(schema, f, ensure_ascii=False, indent=2)
+
+    def _ensure_schema_cache(self, table_name: str, force: bool = False,
+                              dataworks_tasks: list | None = None):
+        """确保 schema 缓存存在,无则拉取并写入。
+
+        Args:
+            dataworks_tasks: 预获取的上游任务列表,避免重复调用 get_table_info()
+        """
+        if not force:
+            cached = self._read_schema_cache(table_name)
+            if cached is not None:
+                return
+        try:
+            schema = self.get_table_schema(table_name, dataworks_tasks=dataworks_tasks)
+            self._write_schema_cache(table_name, schema)
+            print(f"[saved] {self._schema_cache_path(table_name)}")
+        except Exception as e:
+            print(f"[WARN] 获取表结构失败 {table_name}: {e}")
+
+    def get_table_schema(self, table_name: str,
+                          dataworks_tasks: list | None = None) -> dict:
+        """通过 ODPS SDK 获取表结构元信息。
+
+        Args:
+            table_name: 表名(支持 project.table 格式)
+            dataworks_tasks: 预获取的上游任务列表,避免重复 API 调用
+
+        Returns:
+            dict: {name, project, comment, columns, partition_keys, dataworks_tasks}
+        """
+        table_name = self._normalize_table_name(table_name)
+        parts = table_name.split(".", 1)
+        project, table = parts[0], parts[1]
+
+        # Use the ODPSClient for the default AK to read the ODPS table schema
+        odps_client = ODPSClient(project=project)
+        t = odps_client.odps.get_table(table)
+
+        columns = [
+            {"name": c.name, "type": str(c.type), "comment": c.comment or ""}
+            for c in t.table_schema.columns
+        ]
+        partition_keys = [
+            {"name": c.name, "type": str(c.type), "comment": c.comment or ""}
+            for c in t.table_schema.partitions
+        ]
+
+        # Upstream tasks: prefer the ones passed in, otherwise query the DataWorks API
+        if dataworks_tasks is None:
+            try:
+                info = self.get_table_info(table_name)
+                dataworks_tasks = [
+                    {"id": task.get("Id"), "name": task.get("Name")}
+                    for task in info.get("dataworks_tasks", [])
+                ]
+            except Exception:
+                dataworks_tasks = []
+
+        # Direct upstream tables (lineage)
+        try:
+            upstream_tables = self.get_upstream_tables(table_name)
+        except Exception:
+            upstream_tables = []
+
+        return {
+            "name": table,
+            "project": project,
+            "comment": t.comment or "",
+            "columns": columns,
+            "partition_keys": partition_keys,
+            "dataworks_tasks": dataworks_tasks,
+            "upstream_tables": upstream_tables,
+        }
+
+    def get_node_code(self, table_name: str, force: bool = False) -> list:
+        """根据表名获取生产代码(优先读本地缓存)。
+
+        流程:本地缓存 → GetTable → GetTask → 写缓存 → 返回代码
+
+        Args:
+            table_name: 表名(支持 project.table 格式)
+            force: True 时跳过缓存,强制从 API 拉取
+
+        Returns:
+            list of dict,每条包含:
+                task_id, task_name, task_type, content
+        """
+        table_name = self._normalize_table_name(table_name)
+
+        # Read from cache
+        if not force:
+            cached = self._read_cache(table_name)
+            if cached is not None:
+                print(f"[cache] {self._cache_path(table_name)}")
+                # Also make sure the schema cache exists; fetch it if missing
+                self._ensure_schema_cache(table_name, force=False)
+                return [{"task_id": None, "task_name": "(cached)", "task_type": None, "content": cached}]
+
+        # Pull from the API
+        info = self.get_table_info(table_name)
+        upstream = info.get("dataworks_tasks", [])
+        if not upstream:
+            print(f"表 '{table_name}' 没有上游任务")
+            return []
+
+        results = []
+        for task in upstream:
+            task_id = task.get("Id")
+            task_name = task.get("Name")
+            result = self._get_task_code(task_id)
+            if result:
+                results.append(result)
+            else:
+                print(f"[WARN] 任务 {task_name}({task_id}) 所有 AK 均无权限")
+
+        # Write to cache
+        if results:
+            parts = []
+            for r in results:
+                header = f"-- Task: {r['task_name']}  ID: {r['task_id']}  Type: {r['task_type']}"
+                parts.append(f"{header}\n{r['content']}")
+            self._write_cache(table_name, "\n\n".join(parts))
+            print(f"[saved] {self._cache_path(table_name)}")
+
+        # Fetch and cache the schema (reuse the upstream task info to avoid extra API calls)
+        up_tasks = [
+            {"id": task.get("Id"), "name": task.get("Name")}
+            for task in upstream
+        ]
+        self._ensure_schema_cache(table_name, force=force, dataworks_tasks=up_tasks)
+
+        return results
+
+    def get_upstream_tables(self, table_name: str) -> list[str]:
+        """通过血缘 API 获取表的直接上游表列表。
+
+        Returns:
+            list of str,如 ["loghubods.user_share_log_flow", ...]
+        """
+        entity_id = self._build_entity_id(table_name)
+        resp = _call_with_retry(lambda: self.client.list_lineages(
+            _dw_models.ListLineagesRequest(dst_entity_id=entity_id, page_size=50)
+        ))
+        lineages = resp.body.to_map().get("PagingInfo", {}).get("Lineages", [])
+
+        tables = []
+        for l in lineages:
+            src_id = l.get("SrcEntity", {}).get("Id", "")
+            # maxcompute-table:::project::table → project.table
+            parts = src_id.replace("maxcompute-table:::", "").split("::")
+            if len(parts) == 2:
+                tables.append(f"{parts[0]}.{parts[1]}")
+        return sorted(set(tables))
+
+    def get_node_code_recursive(self, table_name: str, max_depth: int = 3,
+                                force: bool = False) -> dict:
+        """BFS 逐层获取表及其所有上游表的生产代码。
+
+        通过血缘 API(ListLineages)逐层追溯上游依赖,
+        每层的代码和上游表都会被缓存到 production_code/。
+
+        Args:
+            table_name: 表名(支持 project.table 格式)
+            max_depth: 最大追溯层数,默认 3
+            force: True 时跳过缓存
+
+        Returns:
+            dict: {
+                "project.table": {
+                    "code": [...],            # get_node_code 返回值
+                    "upstream": ["a.b", ...],  # 上游表名列表
+                    "depth": int
+                }, ...
+            }
+        """
+        from collections import deque
+
+        table_name = self._normalize_table_name(table_name)
+        result = {}
+        queue = deque([(table_name, 0)])
+        visited = {table_name}
+
+        while queue:
+            tbl, depth = queue.popleft()
+            indent = "  " * depth
+            print(f"{indent}[depth={depth}] {tbl}")
+
+            # Fetch the code
+            code = self.get_node_code(tbl, force=force)
+
+            # Fetch upstream tables
+            upstream = []
+            if depth < max_depth:
+                try:
+                    upstream = self.get_upstream_tables(tbl)
+                except Exception:
+                    pass
+
+            result[tbl] = {"code": code, "upstream": upstream, "depth": depth}
+
+            # Enqueue the next level
+            for up_tbl in upstream:
+                if up_tbl not in visited:
+                    visited.add(up_tbl)
+                    queue.append((up_tbl, depth + 1))
+
+        # Print a summary
+        print(f"\n共追溯 {len(result)} 张表:")
+        for tbl, info in result.items():
+            has_code = "有代码" if info["code"] else "无代码"
+            n_up = len(info["upstream"])
+            print(f"  {'  ' * info['depth']}{tbl}  ({has_code}, {n_up} 个上游)")
+
+        return result
+
+    def print_node_code(self, table_name: str):
+        """打印表的生产代码(人类可读格式)"""
+        results = self.get_node_code(table_name)
+        if not results:
+            print(f"未找到 '{table_name}' 的生产代码")
+            return
+        for r in results:
+            print(f"\n{'='*60}")
+            print(f"任务: {r['task_name']}  ID: {r['task_id']}  "
+                  f"类型: {r['task_type']}")
+            print(f"{'='*60}")
+            print(r["content"] or "(无内容)")

+ 1943 - 0
skills/src/odps_skills/feishu.py

@@ -0,0 +1,1943 @@
+# -*- coding: UTF-8 -*-
+import json
+import base64
+import hashlib
+import os
+from curl_cffi import requests as mj_requests
+import requests
+from dotenv import load_dotenv, find_dotenv
+
+load_dotenv(find_dotenv())
+
+# Feishu app credentials (hardcoded; not actually read from the environment)
+APP_ID = 'cli_a22acf2916b8500e'
+APP_SECRET = 'tE0xAB2gZTMlBGdPczCGLcmpRlZQm5CQ'
+LARK_HOST = 'https://open.feishu.cn'
+APP_HOST = 'https://open.feishu.cn'
+EMAIL = 'semsevens@email.com'
+
+class LarkException(Exception):
+    def __init__(self, code=0, msg=None):
+        self.code = code
+        self.msg = msg
+
+    def __str__(self) -> str:
+        return "{}:{}".format(self.code, self.msg)
+
+    __repr__ = __str__
+
+def request(method, url, headers, payload={}):
+    response = requests.request(method, url, headers=headers, json=payload)
+    # logging.info("URL: " + url)
+    # logging.info("X-Tt-Logid: " + response.headers['X-Tt-Logid'])
+    # logging.info("headers:\n"+json.dumps(headers,indent=2, ensure_ascii=False))
+    # logging.info("payload:\n"+json.dumps(payload,indent=2, ensure_ascii=False))
+    resp = {}
+    if response.text[0] == '{':
+        resp = response.json()
+        # logging.info("response:\n"+json.dumps(resp,indent=2, ensure_ascii=False))
+    else:
+        pass
+        # logging.info("response:\n"+response.text)
+    code = resp.get("code", -1)
+    if code == -1:
+        code = resp.get("StatusCode", -1)
+    if code == -1 and response.status_code != 200:
+        response.raise_for_status()
+    if code != 0:
+        raise LarkException(code=code, msg=resp.get("msg", ""))
+    return resp
+def get_image_data_from_url(img_url, use_cache=True):
+    # Compute the MD5 hash of the URL to key the cache
+    url_hash = hashlib.md5(img_url.encode()).hexdigest()
+    cache_dir = 'image_cache'
+    cache_file = os.path.join(cache_dir, f'{url_hash}.json')
+
+    if use_cache:
+        # Create the cache directory if it does not exist
+        if not os.path.exists(cache_dir):
+            os.makedirs(cache_dir)
+
+        # Return the cached image data if a cache file exists
+        if os.path.exists(cache_file):
+            with open(cache_file, 'r') as f:
+                cached_data = json.load(f)
+            return cached_data['image_data']
+
+    # Cache miss: fetch the image from the URL
+    if 'midjourney' in img_url:
+        proxies = {
+            'http': 'http://127.0.0.1:7890',
+            'https': 'http://127.0.0.1:7890',
+        }
+        # response = mj_requests.get(img_url, impersonate="chrome100", proxies=proxies)
+        response = mj_requests.get(img_url.replace("https://", "http://"), impersonate="chrome100")
+    else:
+        # proxies = {
+        #     'http': 'http://t10952018781111:1ap37oc3@d844.kdltps.com:15818',
+        #     'https': 'http://t10952018781111:1ap37oc3@d844.kdltps.com:15818',
+        # }
+        # proxies = {
+        #     'http': None,
+        #     'https': None,
+        # }
+        # response = requests.get(img_url.replace("https://", "http://"), proxies=proxies)
+        response = requests.get(img_url)
+        # response = requests.get(img_url, proxies=proxies)
+    if response.status_code == 200:
+        image_content = response.content
+        missing_padding = 4 - len(image_content) % 4
+        if missing_padding:
+            image_content += b'=' * missing_padding
+        image_data = base64.b64encode(image_content).decode('utf-8')
+
+        # Save the image data to the cache
+        with open(cache_file, 'w') as f:
+            json.dump({'image_data': image_data}, f)
+
+        return image_data
+    else:
+        # import traceback
+        # traceback.print_exc()
+        raise Exception(f"无法从URL获取图片: {img_url}")
+from PIL import Image
+import io
+import os
+def get_image_size(img_url):
+    img_data = get_image_data_from_url(img_url)
+    img = Image.open(io.BytesIO(base64.b64decode(img_data)))
+    width, height = img.size
+    return width, height
+
+if __name__ == "__main__":
+    img_url = "https://sns-webpic.xhscdn.com/1040g2sg31c4vs26n12a05ph3cdp3cutm5prqo90"
+    img_data = get_image_data_from_url(img_url)
+
+    save_path = "/Users/nieqi/Downloads/save.json"
+    with open(save_path, 'w') as f:
+        f.write(img_data)
+
+def column_id(col):
+    '''Convert a 1-based column number to its spreadsheet letter id.'''
+    ans = ""
+    i = col
+    while i > 0:
+        m = int((i-1) % 26)
+        i = int((i-1) / 26)
+        ans = chr(m+65) + ans
+    return ans
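+# e.g. column_id(1) -> "A", column_id(26) -> "Z", column_id(27) -> "AA", column_id(28) -> "AB"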
+
+def do_compress_image(image_data, image_type):
+    # Compress the image
+    from PIL import Image
+    import io
+    import base64
+    Image.MAX_IMAGE_PIXELS = None  # disable the decompression-bomb size limit
+
+    # Decode the base64 string into a PIL image
+    image = Image.open(io.BytesIO(base64.b64decode(image_data)))
+
+    # Compute the resized dimensions, preserving the aspect ratio
+    max_size = 1600
+    ratio = min(max_size/image.width, max_size/image.height)
+    if ratio < 1:
+        new_size = (int(image.width * ratio), int(image.height * ratio))
+        image = image.resize(new_size, Image.Resampling.LANCZOS)
+
+    # Convert RGBA to RGB before saving
+    if image.mode == 'RGBA':
+        # Create a white background
+        background = Image.new('RGB', image.size, (255, 255, 255))
+        # Composite the RGBA image onto the white background
+        background.paste(image, mask=image.split()[3])  # use the alpha channel as the mask
+        image = background
+
+    buffer = io.BytesIO()
+
+    # Map 'JPG' to 'JPEG' (the name PIL expects)
+    if image_type and image_type.upper() == 'JPG':
+        image_type = 'JPEG'
+    image_type = 'JPEG'
+    # image.save(buffer, format=image_type.upper(), quality=95, optimize=True)
+    image.save(buffer, format=image_type.upper(), quality=85, optimize=True)
+    image_data = base64.b64encode(buffer.getvalue()).decode()
+    return image_data
+
+class Client(object):
+    def __init__(self, lark_host):
+        self._host = lark_host
+
+    def get_tenant_access_token(self, app_id, app_secret):
+        url = self._host+"/open-apis/auth/v3/app_access_token/internal/"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8'
+        }
+        payload = {
+            'app_id': app_id,
+            'app_secret': app_secret
+        }
+        resp = request("POST", url, headers, payload)
+        return resp['tenant_access_token']
+
+    def get_user_access_token(self, tenant_access_token, code):
+        url = self._host+"/open-apis/authen/v1/access_token"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8'
+        }
+        payload = {
+            "grant_type": "authorization_code",
+            "code": code,
+            "app_access_token": tenant_access_token
+        }
+        resp = request("POST", url, headers, payload)
+        return resp['data']['access_token']
+
+    def get_root_folder_token(self, access_token):
+        url = self._host+"/open-apis/drive/explorer/v2/root_folder/meta"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': 'Bearer '+access_token
+        }
+        resp = request("GET", url, headers)
+        return resp['data']['token']
+
+    def create_spreadsheet(self, access_token, foldertoken, title):
+        url =self._host+"/open-apis/sheets/v3/spreadsheets"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': 'Bearer '+access_token
+        }
+        payload={
+            "title": title,
+            "folder_token": foldertoken
+        }
+        resp = request("POST", url, headers, payload)
+        return resp['data']['spreadsheet']['spreadsheet_token'], resp['data']['spreadsheet']['url']
+
+    def get_sheetid(self, access_token, doctoken, sheet_index=0):
+        url = self._host+"/open-apis/sheets/v2/spreadsheets/"+doctoken+"/metainfo"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': 'Bearer '+access_token
+        }
+        resp = request("GET", url, headers)
+        return resp['data']['sheets'][sheet_index]["sheetId"]
+
+    def batch_update_values(self, access_token, doctoken, data):
+        url =self._host+"/open-apis/sheets/v2/spreadsheets/"+doctoken+"/values_batch_update"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': 'Bearer '+access_token
+        }
+        payload=data
+        resp = request("POST", url, headers, payload)
+        return resp['data']['spreadsheetToken']
+
+    def batch_update_styles(self, access_token, doctoken, data):
+        url =self._host+"/open-apis/sheets/v2/spreadsheets/"+doctoken+"/styles_batch_update"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': 'Bearer '+access_token
+        }
+        payload=data
+        resp = request("PUT", url, headers, payload)
+        return resp['data']['spreadsheetToken']
+
+    def add_permissions_member(self, access_token, doctoken, doctype, member_type, member_id, perm):
+        url = self._host+"/open-apis/drive/v1/permissions/"+doctoken+"/members?type="+doctype+"&need_notification=false"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': 'Bearer '+access_token
+        }
+        payload = {
+            "member_type": member_type,
+            "member_id": member_id,
+            "perm": perm
+        }
+        request("POST", url, headers, payload)
+
+    def write_image_to_cell(self, access_token, doctoken, sheetid, img_url, row, col, image_type, compress_image=True):
+        url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/values_image"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': f'Bearer {access_token}'
+        }
+        try:
+            image_data = get_image_data_from_url(img_url)
+        except Exception as e:
+            print(img_url)
+            print(e)
+            return None, None
+        if compress_image:
+            image_data = do_compress_image(image_data, image_type)
+
+        image_name = img_url.split('/')[-1].replace(f'.{image_type}', '')  # extract the file name from the URL
+        if compress_image:
+            image_type = 'JPEG'
+        cell_start = column_id(col)+str(row)
+        range = f'{sheetid}!{cell_start}:{cell_start}'
+        payload = {
+            "range": range,
+            "image": image_data,
+            "name": f"{image_name}.{image_type}"
+        }
+        try:
+            resp = request("POST", url, headers, payload)
+        except Exception as e:
+            print(img_url)
+            print(image_name)
+            print(image_type)
+            print(e)
+            return None, None
+        return resp['data']['revision'], resp['data']['updateRange']
+
+    def merge_cells(self, access_token, doctoken, sheetid, start_row, end_row, start_col, end_col):
+        print(f"merge  start_row = {start_row} end_row = {end_row} start_col = {start_col} end_col = {end_col}")
+        url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/merge_cells"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': f'Bearer {access_token}'
+        }
+
+        start_col_id = column_id(start_col)
+        end_col_id = column_id(end_col)
+
+        payload = {
+            "range": f"{sheetid}!{start_col_id}{start_row}:{end_col_id}{end_row}",
+            "mergeType": "MERGE_ALL",
+        }
+        try:
+            resp = request("POST", url, headers, payload)
+        except Exception as e:
+            print(e)
+            return None
+        return None
+
+    def write_images_to_cell(self, access_token, doctoken, sheetid, img_url_list, row, col, compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)):
+        """
+        将多张图片拼接后写入单元格
+        
+        Args:
+            img_url_list: 图片URL列表
+            row: 目标单元格行号
+            col: 目标单元格列号
+            compress_image: 是否压缩图片
+            grid_width: 拼接图片的列数,如果为None则自动计算
+            grid_height: 拼接图片的行数,如果为None则自动计算
+            border_width: 边框宽度,像素
+            border_color: 边框颜色,RGB元组
+        """
+        from PIL import Image, ImageDraw
+        import io
+        import base64
+        import math
+
+        # Download all images
+        images = []
+        for img_url in img_url_list:
+            try:
+                image_type = get_image_type(img_url)
+                if not image_type:
+                    continue
+
+                image_data = get_image_data_from_url(img_url)
+                image = Image.open(io.BytesIO(base64.b64decode(image_data)))
+                images.append(image)
+            except Exception as e:
+                print(f"下载图片失败: {img_url}")
+                print(e)
+                continue
+
+        if not images:
+            return None, None
+
+        # Work out the grid dimensions for the stitched image
+        img_count = len(images)
+        if grid_width is None and grid_height is None:
+            # Neither given: use the grid closest to a square
+            grid_width = math.ceil(math.sqrt(img_count))
+            grid_height = math.ceil(img_count / grid_width)
+        elif grid_width is None:
+            # Only the row count given: derive the column count
+            grid_width = math.ceil(img_count / grid_height)
+        elif grid_height is None:
+            # Only the column count given: derive the row count
+            grid_height = math.ceil(img_count / grid_width)
+
+        # 确保网格能容纳所有图片
+        while grid_width * grid_height < img_count:
+            if grid_width <= grid_height:
+                grid_width += 1
+            else:
+                grid_height += 1
+
+        # 调整所有图片到相同尺寸,保持原始比例
+        if images:
+            # 计算目标尺寸(使用平均尺寸作为参考)
+            avg_width = sum(img.width for img in images) // len(images)
+            avg_height = sum(img.height for img in images) // len(images)
+            target_size = (avg_width, avg_height)
+            
+            # 调整图片尺寸,保持原始比例
+            resized_images = []
+            for img in images:
+                # 计算保持比例的缩放尺寸
+                img_ratio = img.width / img.height
+                target_ratio = target_size[0] / target_size[1]
+                
+                if img_ratio > target_ratio:
+                    # 图片比目标更宽,以宽度为准
+                    new_width = target_size[0]
+                    new_height = int(target_size[0] / img_ratio)
+                else:
+                    # 图片比目标更高,以高度为准
+                    new_height = target_size[1]
+                    new_width = int(target_size[1] * img_ratio)
+                
+                # 缩放图片,保持比例
+                resized_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+                resized_images.append(resized_img)
+            
+            # 创建拼接画布
+            canvas_width = grid_width * avg_width + (grid_width + 1) * border_width
+            canvas_height = grid_height * avg_height + (grid_height + 1) * border_width
+            canvas = Image.new('RGB', (canvas_width, canvas_height), border_color)
+            
+            # 拼接图片
+            for i, img in enumerate(resized_images):
+                row_idx = i // grid_width
+                col_idx = i % grid_width
+                
+                # 计算每个网格单元的位置
+                cell_x = col_idx * avg_width + (col_idx + 1) * border_width
+                cell_y = row_idx * avg_height + (row_idx + 1) * border_width
+                
+                # 在网格单元中居中放置图片
+                center_x = cell_x + (avg_width - img.width) // 2
+                center_y = cell_y + (avg_height - img.height) // 2
+                
+                canvas.paste(img, (center_x, center_y))
+            
+            # 将拼接后的图片转换为base64
+            output = io.BytesIO()
+            if compress_image:
+                canvas.save(output, format='JPEG', quality=85)
+                image_type = 'JPEG'
+            else:
+                canvas.save(output, format='PNG')
+                image_type = 'PNG'
+            
+            output.seek(0)
+            image_data = base64.b64encode(output.getvalue()).decode()
+            
+            # 调用写入图片的API
+            url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/values_image"
+            headers = {
+                'Content-Type': 'application/json; charset=utf-8',
+                'Authorization': f'Bearer {access_token}'
+            }
+            
+            cell_start = column_id(col) + str(row)
+            range_val = f'{sheetid}!{cell_start}:{cell_start}'
+            payload = {
+                "range": range_val,
+                "image": image_data,
+                "name": f"combined_image.{image_type}"
+            }
+            
+            try:
+                resp = request("POST", url, headers, payload)
+                return resp['data']['revision'], resp['data']['updateRange']
+            except Exception as e:
+                print(f"写入拼接图片失败: {e}")
+                return None, None
+        
+        return None, None
+
+    def read_range_values(self, access_token, doctoken, range_val):
+        """
+        读取指定范围的数据
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token
+            range_val: 范围,格式如 "Sheet1!A1:C10"
+            
+        Returns:
+            读取到的数据列表
+        """
+        url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/values/{range_val}"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': f'Bearer {access_token}'
+        }
+        
+        try:
+            resp = request("GET", url, headers)
+            return resp['data']['valueRange']['values']
+        except Exception as e:
+            print(f"读取数据失败: {e}")
+            return []
+
+    def prepend_data(self, access_token, doctoken, range_val, values):
+        """
+        在指定位置前面插入数据
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token
+            range_val: 插入范围,格式如 "Sheet1!A1:C1"
+            values: 要插入的数据
+            
+        Returns:
+            操作结果
+        """
+        url = f"{self._host}/open-apis/sheets/v3/spreadsheets/{doctoken}/sheets/{range_val.split('!')[0]}/prepend"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': f'Bearer {access_token}'
+        }
+        
+        # Extract the cell range from range_val
+        range_part = range_val.split('!')[1]  # e.g. "A1:Z1"
+        start_cell = range_part.split(':')[0]  # e.g. "A1"
+        
+        payload = {
+            "values": values
+        }
+        
+        try:
+            resp = request("POST", url, headers, payload)
+            return resp
+        except Exception as e:
+            print(f"插入数据失败: {e}")
+            return None
+
+    def insert_data_at_row(self, access_token, doctoken, sheetid, row, values):
+        """
+        在指定行插入数据(使用批量更新方式)
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token  
+            sheetid: 工作表ID
+            row: 目标行号
+            values: 要插入的数据
+            
+        Returns:
+            操作结果
+        """
+        # 使用批量更新的方式插入数据
+        cols = len(values[0]) if values else 1
+        end_col = column_id(cols)
+        range_val = f"{sheetid}!A{row}:{end_col}{row}"
+        
+        body = {
+            "valueRanges": [
+                {
+                    "range": range_val,
+                    "values": values
+                }
+            ]
+        }
+        
+        try:
+            result = self.batch_update_values(access_token, doctoken, body)
+            return result
+        except Exception as e:
+            print(f"插入数据到第{row}行失败: {e}")
+            return None
+
+    def insert_rows_before(self, access_token, doctoken, sheetid, row_index, count=1):
+        """
+        在指定行前插入新行(基于飞书官方API)
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token
+            sheetid: 工作表ID
+            row_index: 插入位置的行号(从1开始,在此行前插入)
+            count: 插入行数(默认1行)
+            
+        Returns:
+            操作结果
+        """
+        # First fetch the sheet properties to check the current row count
+        sheet_props = self.get_sheet_properties(access_token, doctoken, sheetid)
+        if not sheet_props:
+            print("无法获取工作表信息,尝试直接插入")
+            current_row_count = 1000  # 默认值
+        else:
+            current_row_count = sheet_props['row_count']
+            print(f"当前工作表行数: {current_row_count}")
+        
+        # If the insert position is beyond the current row count, fall back to append mode
+        if row_index > current_row_count:
+            print(f"插入位置({row_index})超过当前行数({current_row_count}),使用追加模式")
+            # 使用追加方式在末尾添加行
+            return self.append_empty_rows(access_token, doctoken, sheetid, count)
+        
+        url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/insert_dimension_range"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': f'Bearer {access_token}'
+        }
+        
+        # Convert to 0-based indices: row_index=3 means row 3, i.e. startIndex=2
+        start_index = row_index - 1  # 0-based
+        end_index = start_index + count  # exclusive end
+        
+        # 确保 endIndex 不超过当前工作表的行数限制
+        if end_index > current_row_count:
+            print(f"警告:计算的endIndex({end_index})超过当前行数({current_row_count}),调整为追加模式")
+            return self.append_empty_rows(access_token, doctoken, sheetid, count)
+        
+        # Choose which style to inherit: when inserting at row 2, inherit from the data row below; otherwise from the row above
+        inherit_style = "AFTER" if row_index == 2 else "BEFORE"
+        
+        payload = {
+            "dimension": {
+                "sheetId": sheetid,
+                "majorDimension": "ROWS",
+                "startIndex": start_index,  # 从0开始计数
+                "endIndex": end_index  # 结束位置(不包含此行)
+            },
+            "inheritStyle": inherit_style  # 智能继承样式
+        }
+        
+        try:
+            resp = request("POST", url, headers, payload)
+            print(f"在第{row_index}行前成功插入{count}行(startIndex={start_index}, endIndex={end_index}, inheritStyle={inherit_style})")
+            return resp
+        except Exception as e:
+            print(f"在第{row_index}行前插入{count}行失败: {e}")
+            # 如果插入失败,尝试追加模式
+            print("尝试使用追加模式...")
+            return self.append_empty_rows(access_token, doctoken, sheetid, count)
+
+    def insert_row_with_images(self, access_token, doctoken, sheetid, row, values, compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)):
+        """
+        在指定行插入数据并同时处理图片写入(覆盖方式)
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token  
+            sheetid: 工作表ID
+            row: 目标行号
+            values: 要插入的数据
+            compress_image: 是否压缩图片
+            grid_width: 拼接图片的列数
+            grid_height: 拼接图片的行数
+            border_width: 边框宽度
+            border_color: 边框颜色
+            
+        Returns:
+            操作结果
+        """
+        # 1. 先插入文本数据(覆盖指定行)
+        result = self.insert_data_at_row(access_token, doctoken, sheetid, row, values)
+        
+        if not result:
+            return None
+            
+        # 2. Then write any image cells
+        if values and len(values) > 0:
+            row_data = values[0]
+            for col_index, cell in enumerate(row_data, start=1):
+                if is_image_list_cell_url(cell):
+                    # Handle a list of image URLs
+                    try:
+                        img_urls = eval(cell)
+                        self.write_images_to_cell(access_token, doctoken, sheetid, img_urls, row, col_index, compress_image, grid_width, grid_height, border_width, border_color)
+                    except Exception as e:
+                        print(f"写入图片列表失败 (第{row}行第{col_index}列): {e}")
+                elif is_image_cell(cell):
+                    # Handle a single image URL
+                    image_type = get_image_type(cell)
+                    if image_type:
+                        try:
+                            self.write_image_to_cell(access_token, doctoken, sheetid, cell, row, col_index, image_type, compress_image)
+                        except Exception as e:
+                            print(f"写入单张图片失败 (第{row}行第{col_index}列): {e}")
+        
+        return result
+
+    def update_specific_fields(self, access_token, doctoken, sheetid, row, field_updates, headers=None):
+        """
+        只更新指定字段,其他字段保持不变
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token  
+            sheetid: 工作表ID
+            row: 目标行号(从1开始)
+            field_updates: 字段更新字典,格式如 {"列名": "新值", "列B": "新值B"} 
+                          或者 {列索引: "新值", 2: "新值B"}(从1开始计数)
+            headers: 表头列表,用于列名到列索引的映射。如果为None,则field_updates的key必须是列索引
+            
+        Returns:
+            操作结果
+        """
+        try:
+            # 如果提供了headers且field_updates的key是列名,则转换为列索引
+            if headers and field_updates:
+                column_updates = {}
+                for field_name, value in field_updates.items():
+                    if isinstance(field_name, str):  # key is a column name
+                        try:
+                            col_index = headers.index(field_name) + 1  # 转为1基索引
+                            column_updates[col_index] = value
+                        except ValueError:
+                            print(f"警告:找不到列名 '{field_name}',跳过更新")
+                            continue
+                    else:  # key is already a column index
+                        column_updates[field_name] = value
+            else:
+                column_updates = field_updates
+            
+            # 构建批量更新请求
+            value_ranges = []
+            for col_index, value in column_updates.items():
+                col_letter = column_id(col_index)
+                range_val = f"{sheetid}!{col_letter}{row}:{col_letter}{row}"
+                value_ranges.append({
+                    "range": range_val,
+                    "values": [[value]]
+                })
+            
+            body = {
+                "valueRanges": value_ranges
+            }
+            
+            result = self.batch_update_values(access_token, doctoken, body)
+            
+            if result:
+                updated_fields = list(column_updates.keys())
+                print(f"成功更新第{row}行的字段: {updated_fields}")
+            
+            return result
+        except Exception as e:
+            print(f"更新第{row}行指定字段失败: {e}")
+            return None
+
+    def update_row_with_specific_fields_and_images(self, access_token, doctoken, sheetid, row, field_updates, headers=None, compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)):
+        """
+        更新指定字段并处理图片
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token  
+            sheetid: 工作表ID
+            row: 目标行号
+            field_updates: 字段更新字典
+            headers: 表头列表
+            compress_image: 是否压缩图片
+            grid_width: 拼接图片的列数
+            grid_height: 拼接图片的行数
+            border_width: 边框宽度
+            border_color: 边框颜色
+            
+        Returns:
+            操作结果
+        """
+        # 1. 先更新文本数据
+        result = self.update_specific_fields(access_token, doctoken, sheetid, row, field_updates, headers)
+        
+        if not result:
+            return None
+        
+        # 2. 处理图片写入
+        column_updates = {}
+        if headers and field_updates:
+            for field_name, value in field_updates.items():
+                if isinstance(field_name, str):  # 如果是列名
+                    try:
+                        col_index = headers.index(field_name) + 1
+                        column_updates[col_index] = value
+                    except ValueError:
+                        continue
+                else:  # 如果已经是列索引
+                    column_updates[field_name] = value
+        else:
+            column_updates = field_updates
+        
+        for col_index, cell in column_updates.items():
+            if is_image_list_cell_url(cell):
+                # 处理图片列表
+                try:
+                    img_urls = eval(cell)
+                    self.write_images_to_cell(access_token, doctoken, sheetid, img_urls, row, col_index, compress_image, grid_width, grid_height, border_width, border_color)
+                except Exception as e:
+                    print(f"写入图片列表失败 (第{row}行第{col_index}列): {e}")
+            elif is_image_cell(cell):
+                # 处理单张图片
+                image_type = get_image_type(cell)
+                if image_type:
+                    try:
+                        self.write_image_to_cell(access_token, doctoken, sheetid, cell, row, col_index, image_type, compress_image)
+                    except Exception as e:
+                        print(f"写入单张图片失败 (第{row}行第{col_index}列): {e}")
+        
+        return result
+
+    def insert_row_with_data_at_position(self, access_token, doctoken, sheetid, row_position, values, compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)):
+        """
+        在指定位置真正插入新行并填入数据
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token  
+            sheetid: 工作表ID
+            row_position: 插入位置(从1开始,在此行前插入)
+            values: 要插入的数据
+            compress_image: 是否压缩图片
+            grid_width: 拼接图片的列数
+            grid_height: 拼接图片的行数
+            border_width: 边框宽度
+            border_color: 边框颜色
+            
+        Returns:
+            操作结果
+        """
+        # 获取当前工作表行数
+        sheet_props = self.get_sheet_properties(access_token, doctoken, sheetid)
+        current_row_count = sheet_props['row_count'] if sheet_props else 1
+        
+        # 1. 先插入空行
+        insert_result = self.insert_rows_before(access_token, doctoken, sheetid, row_position, 1)
+        
+        if not insert_result:
+            print(f"插入空行失败,无法在第{row_position}行插入数据")
+            return None
+        
+        # 如果是追加模式(插入位置超过了原有行数),实际数据位置是当前行数+1
+        actual_row_position = row_position
+        if row_position > current_row_count:
+            actual_row_position = current_row_count + 1
+            print(f"追加模式:实际数据插入位置调整为第{actual_row_position}行")
+            
+        # 2. 再在新插入的行中填入数据
+        result = self.insert_data_at_row(access_token, doctoken, sheetid, actual_row_position, values)
+        
+        if not result:
+            print(f"插入数据失败")
+            return None
+            
+        # 3. 同时处理图片写入
+        if values and len(values) > 0:
+            row_data = values[0]
+            for col_index, cell in enumerate(row_data, start=1):
+                if is_image_list_cell_url(cell):
+                    # 处理图片列表
+                    try:
+                        img_urls = eval(cell)
+                        self.write_images_to_cell(access_token, doctoken, sheetid, img_urls, actual_row_position, col_index, compress_image, grid_width, grid_height, border_width, border_color)
+                    except Exception as e:
+                        print(f"写入图片列表失败 (第{actual_row_position}行第{col_index}列): {e}")
+                elif is_image_cell(cell):
+                    # 处理单张图片
+                    image_type = get_image_type(cell)
+                    if image_type:
+                        try:
+                            self.write_image_to_cell(access_token, doctoken, sheetid, cell, actual_row_position, col_index, image_type, compress_image)
+                        except Exception as e:
+                            print(f"写入单张图片失败 (第{actual_row_position}行第{col_index}列): {e}")
+        
+        return result
+
+    def get_sheet_info(self, access_token, doctoken, sheetid):
+        """
+        获取工作表的基础信息
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token
+            sheetid: 工作表ID
+            
+        Returns:
+            工作表信息,包含行数、列数等
+        """
+        url = f"{self._host}/open-apis/sheets/v3/spreadsheets/{doctoken}/sheets/{sheetid}"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': f'Bearer {access_token}'
+        }
+        
+        try:
+            resp = request("GET", url, headers)
+            return resp['data']['sheet']
+        except Exception as e:
+            print(f"获取工作表信息失败: {e}")
+            return None
+
+    def get_sheet_properties(self, access_token, doctoken, sheetid):
+        """
+        获取工作表属性,包括行数和列数
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token
+            sheetid: 工作表ID
+            
+        Returns:
+            dict: 包含 row_count, column_count 等信息
+        """
+        sheet_info = self.get_sheet_info(access_token, doctoken, sheetid)
+        if sheet_info:
+            grid_properties = sheet_info.get('grid_properties', {})
+            return {
+                'row_count': grid_properties.get('row_count', 0),
+                'column_count': grid_properties.get('column_count', 0),
+                'title': sheet_info.get('title', ''),
+                'sheet_id': sheet_info.get('sheet_id', ''),
+                'sheet_type': sheet_info.get('sheet_type', '')
+            }
+        return None
+
+    def append_data(self, access_token, doctoken, range_val, values):
+        """
+        在指定位置后面追加数据
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token
+            range_val: 追加范围,格式如 "Sheet1!A1:C1"
+            values: 要追加的数据
+            
+        Returns:
+            操作结果
+        """
+        url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/values_append"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': f'Bearer {access_token}'
+        }
+        
+        payload = {
+            "valueRange": {
+                "range": range_val,
+                "values": values
+            }
+        }
+        
+        try:
+            resp = request("POST", url, headers, payload)
+            return resp
+        except Exception as e:
+            print(f"追加数据失败: {e}")
+            return None
+
+    def delete_rows(self, access_token, doctoken, sheetid, start_row, end_row):
+        """
+        删除指定范围的行
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token
+            sheetid: 工作表ID
+            start_row: 开始行号(从1开始)
+            end_row: 结束行号(从1开始,包含)
+            
+        Returns:
+            操作结果
+        """
+        url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/dimension_range"
+        headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': f'Bearer {access_token}'
+        }
+        
+        payload = {
+            "dimension": {
+                "sheetId": sheetid,
+                "majorDimension": "ROWS",
+                "startIndex": start_row,  # 从1开始计数,包含
+                "endIndex": end_row       # 从1开始计数,包含
+            }
+        }
+        
+        try:
+            resp = request("DELETE", url, headers, payload)
+            return resp
+        except Exception as e:
+            print(f"删除第{start_row}-{end_row}行失败: {e}")
+            return None
+
+    def delete_single_row(self, access_token, doctoken, sheetid, row):
+        """
+        删除单行
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token
+            sheetid: 工作表ID
+            row: 行号(从1开始)
+            
+        Returns:
+            操作结果
+        """
+        return self.delete_rows(access_token, doctoken, sheetid, row, row)
+
+    def append_empty_rows(self, access_token, doctoken, sheetid, count=1):
+        """
+        在工作表末尾追加空行
+        
+        Args:
+            access_token: 访问令牌
+            doctoken: 表格token
+            sheetid: 工作表ID
+            count: 追加行数(默认1行)
+            
+        Returns:
+            操作结果
+        """
+        # 获取当前工作表信息
+        sheet_props = self.get_sheet_properties(access_token, doctoken, sheetid)
+        if not sheet_props:
+            print("无法获取工作表信息,追加失败")
+            return None
+        
+        current_row_count = sheet_props['row_count']
+        current_col_count = sheet_props['column_count']
+        
+        print(f"在工作表末尾追加{count}行,当前行数: {current_row_count}")
+        
+        # 构造空数据行
+        empty_values = [[''] * max(current_col_count, 1) for _ in range(count)]
+        
+        # 使用append_data在末尾追加
+        range_val = f"{sheetid}!A{current_row_count + 1}:{column_id(max(current_col_count, 1))}{current_row_count + count}"
+        
+        try:
+            result = self.append_data(access_token, doctoken, range_val, empty_values)
+            if result:
+                print(f"成功在末尾追加{count}行空行")
+            return result
+        except Exception as e:
+            print(f"追加空行失败: {e}")
+            return None
+
+# -*- coding: UTF-8 -*-
+import json
+import logging
+from datetime import datetime
+import re
+import os
+import requests
+from urllib.parse import urlparse
+
+LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
+logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
+
+logging.info(os.getcwd())
+
+def column_id(col):
+    '''column int to string id'''
+    ans = ""
+    i = col
+    while i > 0:
+        m = int((i-1) % 26)
+        i = int((i-1) / 26)
+        ans = chr(m+65) + ans
+    return ans
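+
+# column_id 行为示例(说明性注释):column_id(1) == "A"、column_id(26) == "Z"、
+# column_id(27) == "AA",用于把 1 基列号转换为表格范围中的字母列标。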
+
+def get_image_type(url):
+    '''根据图片URL获取图片类型'''
+    try:
+        # 从 URL 路径的扩展名推断图片类型(不发起网络请求)
+        path = urlparse(url).path
+        ext = path.split('.')[-1].lower()
+        if ext in ['jpg', 'jpeg', 'png', 'gif']:
+            return ext
+        ext = 'jpeg'
+        if 'jpg' in url:
+            ext = 'jpg'
+        if 'jpeg' in url:
+            ext = 'jpeg'
+        if 'png' in url:
+            ext = 'png'
+        if 'gif' in url:
+            ext = 'gif'
+        if "webp" in url:
+            ext = "webp"
+        # 未能按扩展名/关键字精确识别时,返回默认值 'jpeg'
+        return ext
+    except Exception as e:
+        print(f"获取图片类型时出错: {str(e)}")
+        return None
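+
+# get_image_type 行为示例(说明性注释,URL 仅为示意):
+#   "https://example.com/a.png"         -> "png"(按路径扩展名)
+#   "https://example.com/a?format/jpg"  -> "jpg"(按 URL 关键字回退)
+#   其余情况回退为 "jpeg"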
+
+def is_image_cell(cell):
+    # 判断是否包含中文字符
+    if isinstance(cell, str):
+        for char in cell:
+            if '\u4e00' <= char <= '\u9fff':
+                return False
+    is_image = False
+    if (
+        isinstance(cell, str) and
+        cell.startswith('http') and
+        (
+            re.match(r'https?://.+\.(jpg|jpeg|png|gif|webp).*', cell, re.I) or
+            ('xhscdn.com' in cell and 'format/jpg' in cell) or
+            ('rescdn.yishihui.com' in cell and 'jpg' in cell) or
+            'sns-webpic-qc.xhscdn.com' in cell or 'ci.xiaohongshu.com' in cell
+        )
+    ):
+        is_image = True
+    return is_image
+
+def is_image_list_cell_url(cell):
+    if isinstance(cell, str) and cell.strip() and cell[0] == '[' and cell[-1] == ']':
+        try:
+            cell_obj = eval(cell)
+        except Exception:
+            return False
+        if isinstance(cell_obj, list):
+            for c in cell_obj:
+                if not is_image_cell(c):
+                    return False
+            return True
+    return False
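+
+# is_image_list_cell_url 行为示例(说明性注释):字符串形如
+#   '["https://example.com/a.jpg", "https://example.com/b.png"]'
+# 且每个元素都能通过 is_image_cell 判定时返回 True,否则返回 False。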
+
+def write_images(client, access_token, token, sheetid, data, start_row=1, start_col=1, skip_col=[], compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)):
+    '''将图片URL写入单元格'''
+    for row_index, row in enumerate(data, start=1):
+        if row_index < start_row:
+            print(f"跳过行: {row_index}")
+            continue
+        for col_index, cell in enumerate(row, start=1):
+            # if cell is not None and "http" in cell and is_image_cell(cell) is False:
+            #     print(f"is_image_cell = {is_image_cell(cell)}, {cell}")
+            if col_index < start_col:
+                continue
+            if col_index in skip_col:
+                continue
+            if is_image_list_cell_url(cell):
+                # print(f"is_image_list_cell_url = True , {cell}")
+                client.write_images_to_cell(access_token, token, sheetid, eval(cell), row_index, col_index, compress_image, grid_width, grid_height, border_width, border_color)
+            elif is_image_cell(cell):
+                image_type = get_image_type(cell)
+                if image_type:
+                    client.write_image_to_cell(access_token, token, sheetid, cell, row_index, col_index,image_type, compress_image)
+
+def merge_cells(client, access_token, token, sheetid, data):
+    '''按列把相邻且取值相同的单元格合并'''
+    row_cnt = len(data)
+    col_cnt = len(data[0])
+
+    for col in range(0, col_cnt):
+        previous_row = 0
+        previous_value = None
+        for row in range(0, row_cnt):
+            cell_value = data[row][col]
+
+            if cell_value != previous_value:
+                if row - previous_row > 1:
+                    client.merge_cells(access_token, token, sheetid, previous_row + 1, row, col + 1, col + 1)
+                previous_row = row
+                previous_value = cell_value
+        # 列尾连续相同值的最后一段同样需要合并
+        if row_cnt - previous_row > 1:
+            client.merge_cells(access_token, token, sheetid, previous_row + 1, row_cnt, col + 1, col + 1)
+
+
+def pack_data(data, sheetid, start_row=1, start_col=1):
+    rows = len(data)
+    cols = len(data[0])
+    range1 = f"{sheetid}!{column_id(start_col)}{start_row}:{column_id(cols)}{rows}"
+    body = {
+        "valueRanges": [
+            {
+                "range": range1,
+                "values": []
+            },
+        ]
+    }
+    print(range1)
+    for d in data[start_row-1:]:
+        row = []
+        for c in d[start_col-1:]:
+            row.append(c)
+        body["valueRanges"][0]["values"].append(row)
+    return body
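+
+# pack_data 产出示例(说明性注释):对 2 行 3 列数据、start_row=1、start_col=1,
+# 返回形如 {"valueRanges": [{"range": "SHEET_ID!A1:C2", "values": [[...], [...]]}]} 的请求体,
+# 可直接传给 batch_update_values。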
+
+def write_data_to_sheet(data, sheet_token='IoTOsjZ4khIqlOtTxnec8oTbn7c', sheetid=None, skip_text=False, skip_images=False, start_row=1, start_col=1, skip_col=[], compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)):
+    '''将带表头的二维数据写入飞书表格,并按需写入图片'''
+    # 初始化 API 客户端
+    client = Client(LARK_HOST)
+
+    # 获取租户访问令牌
+    access_token = client.get_tenant_access_token(APP_ID, APP_SECRET)
+
+    # 获取第一个 sheet_id
+    if sheetid is None:
+        sheetid = client.get_sheetid(access_token, sheet_token)
+    print(f"Sheet ID: {sheetid}")
+
+
+    # 构建并写入测试数据
+    body = pack_data(data,
+                     sheetid, start_row=start_row, start_col=start_col)
+    if not skip_text:
+        client.batch_update_values(access_token, sheet_token, body)
+
+    # merge_cells(client, access_token, sheet_token, sheetid, data)
+
+    # 写入图片
+    if not skip_images:
+        write_images(client, access_token, sheet_token, sheetid, data, start_row=start_row, start_col=start_col, skip_col=skip_col, compress_image=compress_image, grid_width=grid_width, grid_height=grid_height, border_width=border_width, border_color=border_color)
+
+
+def get_test_data():
+    data = [
+        ["标题1", "标题2", "标题3", "图片"],
+        [1, 2,2, "http://sns-webpic.xhscdn.com/1040g2sg316vc6tdrk4705o8h0c2095f1else4i8?imageView2/2/w/0/format/jpg/v3"],
+        [4, "https://cdn.midjourney.com/f78df4d5-9b8b-4ec7-ae34-5cc04d176f87/0_0.png", 6, "dd"],
+        # [7, 8, 9,  "https://sns-webpic.xhscdn.com/1040g2sg317l7814ck4705n3aa5ik4jgjahhcam0?imageView2/2/w/0/format/jpg/v3"],
+    ]
+    return data
+
+from typing import List, Dict
+import pandas as pd
+import json
+def to_feishu(
+    res_list: List[Dict], 
+    sheet_id: str = 'Qn9MAs',
+    sheet_token: str = 'Rbsysi6FChzCp7tfv19crkWNnEb',
+    start_row: int = 1,
+    start_col: int = 1,
+    grid_width: int = None,
+    grid_height: int = None,
+    border_width: int = 3,
+    border_color: tuple = (200, 200, 200),
+) -> None:
+    """
+    将数据导出到飞书表格
+    
+    Args:
+        res_list: 数据列表
+        sheet_id: 表格ID
+        sheet_token: 表格token
+        start_row: 起始行
+        start_col: 起始列
+        grid_width: 拼接图片的列数,如果为None则自动计算
+        grid_height: 拼接图片的行数,如果为None则自动计算
+        border_width: 边框宽度,像素
+        border_color: 边框颜色,RGB元组
+    """
+    from tqdm import tqdm
+    
+    def truncate_by_bytes(text, max_bytes=450000):
+        """按字节长度截断文本"""
+        if not text:
+            return ""
+        text_str = str(text)
+        encoded = text_str.encode('utf-8')
+        if len(encoded) <= max_bytes:
+            return text_str
+        # 安全截断,避免截断多字节字符
+        truncated = encoded[:max_bytes]
+        while len(truncated) > 0:
+            try:
+                return truncated.decode('utf-8') + "...[已截断]"
+            except UnicodeDecodeError:
+                truncated = truncated[:-1]
+        return ""
+    
+    res_new_v4 = []
+    for row in tqdm(res_list):
+        if not row:
+            continue
+        for k, v in row.items():
+            if isinstance(v, list):
+                if len(v) > 0 and v[0] and str(v[0]).startswith('http'):
+                    row[k] = truncate_by_bytes(str(v))
+                else:
+                    json_str = json.dumps(v, ensure_ascii=False, separators=(',', ':'))
+                    row[k] = truncate_by_bytes(json_str)
+            elif isinstance(v, dict):
+                json_str = json.dumps(v, ensure_ascii=False, indent=2)
+                row[k] = truncate_by_bytes(json_str)
+            else:
+                row[k] = truncate_by_bytes(v)
+        res_new_v4.append(row)
+    df = pd.DataFrame(res_new_v4)
+    df.fillna('', inplace=True)
+    header = df.columns.tolist()
+    data_rows = df.values.tolist()
+    data_with_header = [header] + data_rows
+    
+    write_data_to_sheet(
+        data_with_header, 
+        sheet_token=sheet_token, 
+        sheetid=sheet_id, 
+        start_col=start_col,
+        start_row=start_row,
+        grid_width=grid_width,
+        grid_height=grid_height,
+        border_width=border_width,
+        border_color=border_color,
+    )
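+
+# to_feishu 使用示例(假设性示例,sheet_id / sheet_token 需替换为实际表格):
+#   to_feishu(
+#       res_list=[{"内容ID": "1001", "标题": "示例", "封面": "https://example.com/a.jpg"}],
+#       sheet_id="Qn9MAs",
+#       sheet_token="Rbsysi6FChzCp7tfv19crkWNnEb",
+#   )
+# 列表/字典类型的值会被序列化为 JSON 字符串并按字节长度截断后写入。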
+
+def to_feishu_incremental(
+    res_list: List[Dict],
+    sort_field: str = '内容ID',
+    sheet_id: str = 'Qn9MAs', 
+    sheet_token: str = 'Rbsysi6FChzCp7tfv19crkWNnEb',
+    unique_field: str = None,  # 用于去重的唯一字段,默认使用sort_field
+    duplicate_strategy: str = 'skip',  # 重复数据处理策略:'skip'跳过, 'delete'删除后插入, 'update'更新
+    update_fields: List[str] = None,  # 当duplicate_strategy='update'时,指定要更新的字段列表。None表示更新所有字段
+    cleanup_duplicates: bool = True,  # 是否先清理现有表格中的重复数据
+    keep_first: bool = True,  # 清理重复数据时保留第一个(True)还是最后一个(False)
+    sort_ascending: bool = False,  # 排序顺序:True为升序(从小到大),False为降序(从大到小)
+    grid_width: int = None,
+    grid_height: int = None,
+    border_width: int = 3,
+    border_color: tuple = (200, 200, 200),
+) -> None:
+    """
+    逐行增量插入数据到飞书表格,按指定字段查找插入位置
+    
+    Args:
+        res_list: 数据列表
+        sort_field: 用于排序的字段名,如 '内容ID'
+        sheet_id: 表格ID  
+        sheet_token: 表格token
+        unique_field: 用于去重的唯一字段,默认使用sort_field
+        duplicate_strategy: 重复数据处理策略
+            - 'skip': 跳过重复数据(默认)
+            - 'delete': 删除重复数据后插入新数据
+            - 'update': 更新重复数据的指定字段
+        update_fields: 当duplicate_strategy='update'时,指定要更新的字段列表
+            - None: 更新所有字段(除了unique_field)
+            - ['字段1', '字段2']: 只更新指定的字段
+        cleanup_duplicates: 是否先清理现有表格中的重复数据
+        keep_first: 清理重复数据时保留第一个(True)还是最后一个(False)
+        sort_ascending: 排序顺序,True为升序(从小到大),False为降序(从大到小),默认False
+        grid_width: 拼接图片的列数,如果为None则自动计算
+        grid_height: 拼接图片的行数,如果为None则自动计算
+        border_width: 边框宽度,像素
+        border_color: 边框颜色,RGB元组
+    """
+    from tqdm import tqdm
+    
+    def truncate_by_bytes(text, max_bytes=450000):
+        """按字节长度截断文本"""
+        if not text:
+            return ""
+        text_str = str(text)
+        encoded = text_str.encode('utf-8')
+        if len(encoded) <= max_bytes:
+            return text_str
+        # 安全截断,避免截断多字节字符
+        truncated = encoded[:max_bytes]
+        while len(truncated) > 0:
+            try:
+                return truncated.decode('utf-8') + "...[已截断]"
+            except UnicodeDecodeError:
+                truncated = truncated[:-1]
+        return ""
+    
+    # 初始化 API 客户端
+    client = Client(LARK_HOST)
+    access_token = client.get_tenant_access_token(APP_ID, APP_SECRET)
+    
+    # 设置去重字段,默认使用排序字段
+    if unique_field is None:
+        unique_field = sort_field
+    
+    # 1. 获取工作表基础信息
+    print("正在获取工作表信息...")
+    sheet_props = client.get_sheet_properties(access_token, sheet_token, sheet_id)
+    
+    if not sheet_props:
+        print("获取工作表信息失败,使用默认范围")
+        max_col = 'ZZ'
+        max_row = 1000
+    else:
+        print(f"工作表信息: 行数={sheet_props['row_count']}, 列数={sheet_props['column_count']}")
+        max_col = column_id(sheet_props['column_count']) if sheet_props['column_count'] > 0 else 'ZZ'
+        max_row = sheet_props['row_count'] if sheet_props['row_count'] > 0 else 1000
+    
+    # 2. 读取表头(使用精确范围)
+    print("正在读取表头...")
+    header_range = f"{sheet_id}!A1:{max_col}1"  # 表头总是从A列开始读取
+    header_data = client.read_range_values(access_token, sheet_token, header_range)
+    
+    table_was_empty = not header_data or not header_data[0] or all(not str(cell).strip() for cell in header_data[0] if cell)
+    if table_was_empty:
+        print("表格为空,需要根据数据创建表头")
+        # 从第一条数据中提取字段名作为表头
+        if not res_list or not res_list[0]:
+            print("错误:无法从空数据中创建表头")
+            return
+        
+        # 提取字段名
+        headers = list(res_list[0].keys())
+        print(f"创建表头: {headers}")
+        
+        # 写入表头(表头不包含图片,使用普通插入即可)
+        header_range = f"{sheet_id}!A1:{column_id(len(headers))}1"
+        client.insert_data_at_row(access_token, sheet_token, sheet_id, 1, [headers])
+        
+        # 表头创建后,从第二行开始插入数据
+        print("表头创建完成,开始插入数据...")
+    else:
+        # 解析现有表头
+        headers = [cell.strip() for cell in header_data[0] if cell is not None]
+        headers = [h for h in headers if h]  # 移除空字段
+        print(f"读取到现有表头: {headers}")
+    
+    # 检查排序字段和去重字段是否存在
+    if sort_field not in headers:
+        print(f"警告: 排序字段 '{sort_field}' 未在表头中找到。可用字段: {headers}")
+        # 如果找不到排序字段,就直接追加到末尾:
+        # 刚创建表头的空表从第二行开始写,否则接在工作表现有行数之后
+        start_row = 2 if table_was_empty else (max_row + 1 if sheet_props else 2)
+        to_feishu(res_list, sheet_id, sheet_token, start_row, 1, grid_width, grid_height, border_width, border_color)
+        return
+    
+    if unique_field not in headers:
+        print(f"警告: 去重字段 '{unique_field}' 未在表头中找到,将使用排序字段 '{sort_field}' 进行去重")
+        unique_field = sort_field
+    
+    sort_field_index = headers.index(sort_field)
+    sort_field_col = column_id(sort_field_index + 1)  # 转换为列标识符,如A, B, C...
+    
+    unique_field_index = headers.index(unique_field)
+    unique_field_col = column_id(unique_field_index + 1)  # 转换为列标识符,如A, B, C...
+    
+    # 3. 读取排序字段和去重字段的数据
+    print(f"正在读取排序字段 '{sort_field}' 和去重字段 '{unique_field}' 列数据...")
+    
+    # 读取排序字段数据
+    sort_data_range = f"{sheet_id}!{sort_field_col}2:{sort_field_col}{max_row}"
+    all_sort_data = client.read_range_values(access_token, sheet_token, sort_data_range)
+    
+    # 读取去重字段数据(如果与排序字段不同)
+    if unique_field != sort_field:
+        unique_data_range = f"{sheet_id}!{unique_field_col}2:{unique_field_col}{max_row}"
+        all_unique_data = client.read_range_values(access_token, sheet_token, unique_data_range)
+    else:
+        all_unique_data = all_sort_data
+    
+    # 先清理空白行(排序字段和去重字段都为空的行)
+    print("检查并清理空白行...")
+    empty_rows_to_delete = []
+    
+    if all_unique_data and all_sort_data:
+        for i in range(min(len(all_unique_data), len(all_sort_data))):
+            unique_row = all_unique_data[i] if i < len(all_unique_data) else None
+            sort_row = all_sort_data[i] if i < len(all_sort_data) else None
+            
+            # 检查去重字段值
+            unique_value = ""
+            if unique_row and len(unique_row) > 0 and unique_row[0]:
+                unique_value = str(unique_row[0]).strip()
+            
+            # 检查排序字段值
+            sort_value = ""
+            if sort_row and len(sort_row) > 0 and sort_row[0]:
+                sort_value = str(sort_row[0]).strip()
+            
+            # 如果排序字段和去重字段都为空,标记为空白行
+            if not unique_value and not sort_value:
+                row_number = i + 2  # +2 因为从第2行开始,且行号从1开始
+                empty_rows_to_delete.append(row_number)
+                print(f"标记删除空白行: 第{row_number}行")
+    
+    # 删除空白行
+    if empty_rows_to_delete:
+        print(f"开始删除 {len(empty_rows_to_delete)} 个空白行...")
+        # 按行号倒序删除,避免删除后行号变化的问题
+        empty_rows_to_delete.sort(reverse=True)
+        
+        for row_to_delete in empty_rows_to_delete:
+            delete_result = client.delete_single_row(access_token, sheet_token, sheet_id, row_to_delete)
+            if delete_result:
+                print(f"成功删除空白行: 第{row_to_delete}行")
+            else:
+                print(f"删除空白行失败: 第{row_to_delete}行")
+        
+        # 重新读取数据(删除后数据已经改变)
+        print("重新读取数据(清理空白行后)...")
+        # 重新读取排序字段数据
+        sort_data_range = f"{sheet_id}!{sort_field_col}2:{sort_field_col}{max_row}"
+        all_sort_data = client.read_range_values(access_token, sheet_token, sort_data_range)
+        
+        # 重新读取去重字段数据
+        if unique_field != sort_field:
+            unique_data_range = f"{sheet_id}!{unique_field_col}2:{unique_field_col}{max_row}"
+            all_unique_data = client.read_range_values(access_token, sheet_token, unique_data_range)
+        else:
+            all_unique_data = all_sort_data
+    
+    # 构建现有数据的去重集合
+    duplicate_rows_to_delete = []
+    
+    if cleanup_duplicates and all_unique_data:
+        # 先分析重复数据
+        seen_unique_values = {}  # 记录已见过的唯一值和对应行号
+        actual_data_rows = []  # 记录实际有数据的行号
+        
+        print(f"开始分析重复数据,总共读取了 {len(all_unique_data)} 行数据")
+        
+        # 先找出所有有效数据行及其对应的实际行号(必须同时有排序字段和去重字段的值)
+        for i in range(min(len(all_unique_data), len(all_sort_data) if all_sort_data else 0)):
+            unique_row = all_unique_data[i] if i < len(all_unique_data) else None
+            sort_row = all_sort_data[i] if i < len(all_sort_data) else None
+            
+            # 检查去重字段值
+            unique_value = ""
+            if unique_row and len(unique_row) > 0 and unique_row[0]:
+                unique_value = str(unique_row[0]).strip()
+            
+            # 检查排序字段值
+            sort_value = ""
+            if sort_row and len(sort_row) > 0 and sort_row[0]:
+                sort_value = str(sort_row[0]).strip()
+            
+            # 只有当排序字段和去重字段都有值时,才认为是有效数据
+            if unique_value and sort_value:
+                actual_row_number = i + 2  # +2 因为从第2行开始,且行号从1开始
+                actual_data_rows.append((actual_row_number, unique_value, sort_value))
+        
+        print(f"找到 {len(actual_data_rows)} 行有效数据")
+        
+        # 分析重复数据
+        for actual_row_number, unique_value, sort_value in actual_data_rows:
+            if unique_value in seen_unique_values:
+                # 发现重复数据
+                if keep_first:
+                    # 保留第一个,删除当前这个
+                    duplicate_rows_to_delete.append(actual_row_number)
+                    print(f"标记删除重复行: 第{actual_row_number}行 ({unique_field}={unique_value}, {sort_field}={sort_value})")
+                else:
+                    # 保留最后一个,删除之前的
+                    previous_row = seen_unique_values[unique_value]
+                    duplicate_rows_to_delete.append(previous_row)
+                    print(f"标记删除重复行: 第{previous_row}行 ({unique_field}={unique_value}, {sort_field}={sort_value})")
+                    seen_unique_values[unique_value] = actual_row_number
+            else:
+                # 第一次见到这个唯一值
+                seen_unique_values[unique_value] = actual_row_number
+        
+        # 执行清理:删除重复行
+        if duplicate_rows_to_delete:
+            print(f"开始清理 {len(duplicate_rows_to_delete)} 行重复数据...")
+            # 按行号倒序删除,避免删除后行号变化的问题
+            duplicate_rows_to_delete.sort(reverse=True)
+            
+            for row_to_delete in duplicate_rows_to_delete:
+                delete_result = client.delete_single_row(access_token, sheet_token, sheet_id, row_to_delete)
+                if delete_result:
+                    print(f"成功删除重复行: 第{row_to_delete}行")
+                else:
+                    print(f"删除重复行失败: 第{row_to_delete}行")
+            
+            # 重新读取数据(删除后数据已经改变)
+            print("重新读取排序和去重字段数据...")
+            # 重新读取排序字段数据
+            sort_data_range = f"{sheet_id}!{sort_field_col}2:{sort_field_col}{max_row}"
+            all_sort_data = client.read_range_values(access_token, sheet_token, sort_data_range)
+            
+            # 重新读取去重字段数据
+            if unique_field != sort_field:
+                unique_data_range = f"{sheet_id}!{unique_field_col}2:{unique_field_col}{max_row}"
+                all_unique_data = client.read_range_values(access_token, sheet_token, unique_data_range)
+            else:
+                all_unique_data = all_sort_data
+    
+    # 构建最终的去重集合(处理清理后的数据,必须同时有排序字段和去重字段的值)
+    existing_unique_values = set()
+    existing_unique_rows = {}  # 用于update策略:{unique_value: row_number}
+    if all_unique_data and all_sort_data:
+        for i in range(min(len(all_unique_data), len(all_sort_data))):
+            unique_row = all_unique_data[i] if i < len(all_unique_data) else None
+            sort_row = all_sort_data[i] if i < len(all_sort_data) else None
+            
+            # 检查去重字段值
+            unique_value = ""
+            if unique_row and len(unique_row) > 0 and unique_row[0]:
+                unique_value = str(unique_row[0]).strip()
+            
+            # 检查排序字段值
+            sort_value = ""
+            if sort_row and len(sort_row) > 0 and sort_row[0]:
+                sort_value = str(sort_row[0]).strip()
+            
+            # 只有当排序字段和去重字段都有值时,才添加到去重集合
+            if unique_value and sort_value:
+                actual_row_number = i + 2  # +2 因为从第2行开始,且行号从1开始
+                existing_unique_values.add(unique_value)
+                existing_unique_rows[unique_value] = actual_row_number
+    
+    print(f"现有去重值数量: {len(existing_unique_values)}")
+    print(existing_unique_values)
+    
+    # 获取排序数据用于插入位置计算(基于清理后的最新数据)
+    sort_data = []
+    if all_sort_data:
+        # 同时检查排序字段和去重字段,确保数据完整性
+        for i in range(min(len(all_sort_data), len(all_unique_data) if all_unique_data else 0)):
+            sort_row = all_sort_data[i] if i < len(all_sort_data) else None
+            unique_row = all_unique_data[i] if i < len(all_unique_data) else None
+            
+            # 检查排序字段值
+            sort_value = ""
+            if sort_row and len(sort_row) > 0 and sort_row[0]:
+                sort_value = str(sort_row[0]).strip()
+            
+            # 检查去重字段值
+            unique_value = ""
+            if unique_row and len(unique_row) > 0 and unique_row[0]:
+                unique_value = str(unique_row[0]).strip()
+            
+            # 只有当排序字段和去重字段都有值时,才加入排序数据
+            if sort_value and unique_value:
+                sort_data.append([sort_value])
+    
+    if not sort_data:
+        print("未读取到排序字段数据,所有新数据将从第二行开始插入")
+    
+    # 处理新数据
+    processed_data = []
+    for row in tqdm(res_list, desc="处理数据"):
+        if not row:
+            continue
+        processed_row = {}
+        for k, v in row.items():
+            if isinstance(v, list):
+                if len(v) > 0 and v[0] and str(v[0]).startswith('http'):
+                    processed_row[k] = truncate_by_bytes(str(v))
+                else:
+                    json_str = json.dumps(v, ensure_ascii=False, indent=1)
+                    processed_row[k] = truncate_by_bytes(json_str)
+            elif isinstance(v, dict):
+                json_str = json.dumps(v, ensure_ascii=False, indent=1)
+                processed_row[k] = truncate_by_bytes(json_str)
+            else:
+                processed_row[k] = truncate_by_bytes(v)
+        processed_data.append(processed_row)
+    
+    # 转换为DataFrame以便操作
+    df_new = pd.DataFrame(processed_data)
+    df_new.fillna('', inplace=True)
+    
+    # 确保新数据包含所有必要的列
+    for header in headers:
+        if header not in df_new.columns:
+            df_new[header] = ''
+    
+    # 按表头顺序重新排列列
+    df_new = df_new.reindex(columns=headers, fill_value='')
+    
+    # 预处理:过滤重复数据并确定插入顺序
+    print(f"预处理新数据:过滤重复并排序...")
+    print(f"传入数据总量: {len(df_new)} 行")
+    print(f"现有去重集合大小: {len(existing_unique_values)}")
+    
+    valid_rows = []
+    update_rows = []  # 需要更新的行:[{row_number, values, unique_value}, ...]
+    skipped_count = 0
+    new_data_duplicates = 0  # 新数据内部重复计数
+    updated_count = 0  # 更新计数
+    
+    for idx, new_row in df_new.iterrows():
+        new_row_values = new_row.tolist()
+        new_sort_value = str(new_row_values[sort_field_index])
+        new_unique_value = str(new_row_values[unique_field_index])
+        
+        # 检查是否与现有数据重复
+        if new_unique_value in existing_unique_values:
+            if duplicate_strategy == 'update':
+                # 更新策略:记录需要更新的行
+                target_row = existing_unique_rows[new_unique_value]
+                update_rows.append({
+                    'row_number': target_row,
+                    'values': new_row_values,
+                    'unique_value': new_unique_value
+                })
+                print(f"标记更新现有数据: 第{target_row}行 {unique_field}={new_unique_value}")
+                updated_count += 1
+                continue
+            elif duplicate_strategy == 'delete':
+                # 删除策略:先删除现有行,再插入新数据
+                target_row = existing_unique_rows[new_unique_value]
+                delete_result = client.delete_single_row(access_token, sheet_token, sheet_id, target_row)
+                if delete_result:
+                    print(f"成功删除重复行: 第{target_row}行 {unique_field}={new_unique_value}")
+                    # 从去重集合中移除,允许后续插入
+                    existing_unique_values.remove(new_unique_value)
+                    # 更新所有行号(删除后后面的行号会前移)
+                    for key, row_num in existing_unique_rows.items():
+                        if row_num > target_row:
+                            existing_unique_rows[key] = row_num - 1
+                    del existing_unique_rows[new_unique_value]
+                else:
+                    print(f"删除重复行失败: 第{target_row}行 {unique_field}={new_unique_value}")
+                    skipped_count += 1
+                    continue
+            else:  # 'skip' 策略
+                print(f"跳过与现有数据重复: {unique_field}={new_unique_value}")
+                skipped_count += 1
+                continue
+        
+        # 检查新数据内部是否重复
+        already_processed = any(row['unique_value'] == new_unique_value for row in valid_rows)
+        if already_processed:
+            print(f"跳过新数据内部重复: {unique_field}={new_unique_value}")
+            new_data_duplicates += 1
+            continue
+        
+        # 添加到待插入列表
+        valid_rows.append({
+            'values': new_row_values,
+            'sort_value': new_sort_value,
+            'unique_value': new_unique_value
+        })
+    
+    print(f"预处理完成:有效数据 {len(valid_rows)} 行,需要更新 {len(update_rows)} 行,跳过与现有重复 {skipped_count} 行,跳过新数据内部重复 {new_data_duplicates} 行")
+    
+    # 处理更新操作
+    if update_rows:
+        print(f"开始执行更新操作,共 {len(update_rows)} 行...")
+        for update_data in tqdm(update_rows, desc="更新数据"):
+            row_number = update_data['row_number']
+            new_values = update_data['values']
+            unique_value = update_data['unique_value']
+            
+            # 构建字段更新字典
+            if update_fields is None:
+                # 更新所有字段,但排除unique_field(避免修改关键字段)
+                field_updates = {}
+                for i, header in enumerate(headers):
+                    if header != unique_field:  # 不更新去重字段
+                        field_updates[header] = new_values[i]
+                print(f"更新第{row_number}行所有字段(除了{unique_field}): {unique_value}")
+            else:
+                # 只更新指定字段
+                field_updates = {}
+                for field_name in update_fields:
+                    if field_name in headers:
+                        field_index = headers.index(field_name)
+                        field_updates[field_name] = new_values[field_index]
+                    else:
+                        print(f"警告:字段 '{field_name}' 不存在于表头中,跳过")
+                print(f"更新第{row_number}行指定字段 {list(field_updates.keys())}: {unique_value}")
+            
+            # 执行更新
+            if field_updates:
+                result = client.update_row_with_specific_fields_and_images(
+                    access_token, sheet_token, sheet_id, row_number, 
+                    field_updates, headers, True, grid_width, grid_height, border_width, border_color
+                )
+                if result:
+                    print(f"✅ 成功更新第{row_number}行")
+                else:
+                    print(f"❌ 更新第{row_number}行失败")
+    
+    if not valid_rows:
+        if update_rows:
+            print("所有数据均为更新操作,无新数据需要插入")
+        else:
+            print("没有新数据需要插入")
+        return
+    
+    # 按排序字段排序新数据(根据sort_ascending参数决定排序方向)
+    if sort_ascending:
+        # 升序排序:小的值先插入(reverse=False)
+        valid_rows.sort(key=lambda x: x['sort_value'], reverse=False)
+        print(f"新数据排序完成,将按升序插入")
+    else:
+        # 降序排序:大的值先插入(reverse=True)
+        valid_rows.sort(key=lambda x: x['sort_value'], reverse=True)
+        print(f"新数据排序完成,将按降序插入")
+    
+    # 逐行插入已排序的数据
+    for i, row_data in tqdm(enumerate(valid_rows), total=len(valid_rows), desc="插入数据"):
+        new_row_values = row_data['values']
+        new_sort_value = row_data['sort_value']
+        new_unique_value = row_data['unique_value']
+        
+        # 找到合适的插入位置(根据sort_ascending参数确定排序方向)
+        insert_row = len(sort_data) + 2  # 默认插入到末尾
+        
+        print(f"查找插入位置,新值: {new_sort_value}")
+        
+        # 找到两个相邻ID之间的正确插入位置
+        if sort_ascending:
+            # 升序排列:小 → 大,需要找到 prev_value < new_value < current_value 的位置
+            for j in range(len(sort_data)):
+                current_value = str(sort_data[j][0]) if sort_data[j] and len(sort_data[j]) > 0 else ""
+                prev_value = str(sort_data[j-1][0]) if j > 0 and sort_data[j-1] and len(sort_data[j-1]) > 0 else None
+                
+                # 检查是否应该插入到当前位置
+                if prev_value is None:
+                    # 这是第一个位置,检查是否应该插入到最前面
+                    if new_sort_value < current_value:
+                        insert_row = j + 2  # +2 因为表头偏移
+                        print(f"  插入到最前面第{insert_row}行: 新值{new_sort_value} < 第一个值{current_value}")
+                        break
+                else:
+                    # 检查是否在两个相邻值之间
+                    if new_sort_value >= prev_value and new_sort_value < current_value:
+                        insert_row = j + 2  # +2 因为表头偏移
+                        print(f"  插入到第{insert_row}行: {prev_value} <= {new_sort_value} < {current_value}")
+                        break
+                    elif new_sort_value == current_value:
+                        # 值相等时插入到相等值之后
+                        insert_row = j + 3  # +2(表头偏移) +1(插入到此行之后)
+                        print(f"  插入到第{insert_row}行: 新值{new_sort_value} = 现有值{current_value},插入其后")
+                        break
+            
+            # 如果遍历完都没有找到位置,说明新值是最大的,插入到末尾
+            if insert_row == len(sort_data) + 2:
+                last_value = str(sort_data[-1][0]) if sort_data and sort_data[-1] and len(sort_data[-1]) > 0 else "无"
+                print(f"  插入到末尾第{insert_row}行: 新值{new_sort_value} > 最后一个值{last_value}")
+        else:
+            # 降序排列:大 → 小,需要找到 prev_value > new_value > current_value 的位置
+            for j in range(len(sort_data)):
+                current_value = str(sort_data[j][0]) if sort_data[j] and len(sort_data[j]) > 0 else ""
+                prev_value = str(sort_data[j-1][0]) if j > 0 and sort_data[j-1] and len(sort_data[j-1]) > 0 else None
+                
+                # 检查是否应该插入到当前位置
+                if prev_value is None:
+                    # 这是第一个位置,检查是否应该插入到最前面
+                    if new_sort_value > current_value:
+                        insert_row = j + 2  # +2 因为表头偏移
+                        print(f"  插入到最前面第{insert_row}行: 新值{new_sort_value} > 第一个值{current_value}")
+                        break
+                else:
+                    # 检查是否在两个相邻值之间
+                    if new_sort_value <= prev_value and new_sort_value > current_value:
+                        insert_row = j + 2  # +2 因为表头偏移
+                        print(f"  插入到第{insert_row}行: {prev_value} >= {new_sort_value} > {current_value}")
+                        break
+                    elif new_sort_value == current_value:
+                        # 值相等时插入到相等值之后
+                        insert_row = j + 3  # +2(表头偏移) +1(插入到此行之后)
+                        print(f"  插入到第{insert_row}行: 新值{new_sort_value} = 现有值{current_value},插入其后")
+                        break
+            
+            # 如果遍历完都没有找到位置,说明新值是最小的,插入到末尾
+            if insert_row == len(sort_data) + 2:
+                last_value = str(sort_data[-1][0]) if sort_data and sort_data[-1] and len(sort_data[-1]) > 0 else "无"
+                print(f"  插入到末尾第{insert_row}行: 新值{new_sort_value} < 最后一个值{last_value}")
+        
+        print(f"[{i+1}/{len(valid_rows)}] 最终插入位置: 第 {insert_row} 行: {sort_field}={new_sort_value}")
+        
+        # 插入数据到指定行(真正插入新行)
+        result = client.insert_row_with_data_at_position(access_token, sheet_token, sheet_id, insert_row, [new_row_values], True, grid_width, grid_height, border_width, border_color)
+        
+        if result:
+            print(f"成功插入数据和图片到第 {insert_row} 行")
+            # 更新sort_data:在正确的位置添加新的排序值
+            sort_data_index = insert_row - 2  # 转换为sort_data的索引(-2因为表头偏移)
+            sort_data.insert(sort_data_index, [new_sort_value])
+            # 更新去重集合
+            existing_unique_values.add(new_unique_value)
+        else:
+            print(f"插入数据到第 {insert_row} 行失败")
+
+
+
+if __name__ == "__main__":
+    # data = get_test_data()
+    # sheet_token = 'IoTOsjZ4khIqlOtTxnec8oTbn7c'
+    # sheetid = 'K9c4LG'
+    # write_data_to_sheet(data, sheetid=sheetid)
+
+    # is_image_cell_result = is_image_cell('["http://sns-webpic-qc.xhscdn.com/202501021415/1a6e88908930afce92b09206d5a482f8/1040g2sg31b74rf6k7g5g5oo7i8vkgev59lkjet0!nd_whlt34_webp_wm_1","http://sns-webpic-qc.xhscdn.com/202501021415/1a6e88908930afce92b09206d5a482f8/1040g2sg31b74rf6k7g5g5oo7i8vkgev59lkjet0!nd_whlt34_webp_wm_1"]')
+    # print(is_image_cell_result)
+    
+    # 新增函数使用示例
+    """
+    示例:使用 to_feishu_incremental 增量插入数据
+    
+    # 测试数据
+    test_data = [
+        {
+            '内容ID': '1001', 
+            '标题': '测试标题1', 
+            '内容': '测试内容1',
+            '图片': '["http://example.com/image1.jpg", "http://example.com/image2.jpg"]'
+        },
+        {
+            '内容ID': '1003', 
+            '标题': '测试标题2', 
+            '内容': '测试内容2',
+            '图片': 'http://example.com/image3.jpg'
+        }
+    ]
+    
+    # 调用增量插入函数
+    to_feishu_incremental(
+        res_list=test_data,
+        sort_field='内容ID',  # 按此字段排序
+        sheet_id='your_sheet_id',
+        sheet_token='your_sheet_token',
+        unique_field='内容ID',  # 去重字段,默认使用sort_field
+        duplicate_strategy='update',  # 重复处理策略:'skip'跳过, 'delete'删除后插入, 'update'更新指定字段
+        update_fields=['标题', '内容', '图片'],  # 当strategy='update'时,只更新这些字段
+        cleanup_duplicates=True,  # 先清理现有表格中的重复数据
+        keep_first=True,  # 清理时保留第一个重复项
+        sort_ascending=False,  # 排序顺序:False为降序(大→小),True为升序(小→大)
+        grid_width=2,  # 图片拼接列数
+        grid_height=2,  # 图片拼接行数
+    )
+    
+    # 排序方向示例:
+    
+    # 示例1:按时间戳降序排序(最新的在前面)- 适合新闻、动态等时间敏感内容
+    to_feishu_incremental(
+        res_list=news_data,
+        sort_field='发布时间',
+        sort_ascending=False,  # 降序,最新时间在前面
+        # ... 其他参数
+    )
+    
+    # 示例2:按ID升序排序(从小到大)- 适合有明确编号顺序的内容
+    to_feishu_incremental(
+        res_list=product_data,
+        sort_field='产品ID',
+        sort_ascending=True,  # 升序,小ID在前面
+        # ... 其他参数
+    )
+    
+    # 示例3:按优先级降序排序(高优先级在前面)- 适合任务、问题等需要优先级管理的内容
+    to_feishu_incremental(
+        res_list=task_data,
+        sort_field='优先级',
+        sort_ascending=False,  # 降序,高优先级在前面
+        # ... 其他参数
+    )
+    
+    功能说明:
+    1. **智能表头处理**:
+       - 如果表格为空,自动从数据中提取字段名创建表头
+       - 如果表格已有数据,读取现有表头结构
+    2. **空白行清理**:
+       - 自动检测并删除排序字段和去重字段都为空的空白行
+       - 确保数据的连续性和逻辑一致性
+    3. **重复数据清理**:
+       - cleanup_duplicates=True: 先清理现有表格中的重复数据
+       - keep_first: 保留第一个或最后一个重复项
+    4. **智能去重检查**:
+       - 基于 unique_field 字段检查数据是否已存在
+       - 预处理阶段过滤重复数据,避免插入过程中的状态变化问题
+    5. **排序插入**:根据指定的 sort_field 字段和 sort_ascending 参数查找插入位置
+       - sort_ascending=False(默认):降序排序,较大的值插入到较前面的位置
+       - sort_ascending=True:升序排序,较小的值插入到较前面的位置
+    6. **逐行数据插入**:按排序顺序逐行插入数据,保持表格整体有序
+    7. **完整图片支持**:自动处理图片写入,支持单张图片和图片列表
+    8. **图片拼接功能**:支持多图拼接,可设置拼接的行列数和边框样式
+    
+    适用场景:
+    - ✅ 空表格:自动创建表头并插入数据
+    - ✅ 已有重复数据的表格:先清理重复,再智能插入
+    - ✅ 增量数据更新:逐条插入,保持排序,自动去重
+    - ✅ 重复运行安全:不会插入重复数据
+    - ✅ 数据清理:一键清理现有重复数据
+    - ✅ 灵活排序:支持升序和降序两种排序方式
+    """

+ 769 - 0
skills/src/odps_skills/fetch_daily.py

@@ -0,0 +1,769 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+按天增量获取数据 - 通用版本
+支持并发获取,自动跳过已有数据
+
+用法:
+    fetch-daily tasks/xxx/query.sql                    # 获取最近7天
+    fetch-daily tasks/xxx/query.sql --days 30          # 获取最近30天
+    fetch-daily tasks/xxx/query.sql --start 20260101 --end 20260107
+    fetch-daily tasks/xxx/query.sql --date 20260105    # 单天
+    fetch-daily tasks/xxx/query.sql --date 20260105 --hh 08  # 单天单小时
+    fetch-daily tasks/xxx/query.sql --force            # 强制重新获取
+    fetch-daily tasks/xxx/query.sql --workers 10       # 设置天级并发数
+    fetch-daily tasks/xxx/query.sql --parallel 50      # 单天多线程下载(默认50,大数据量推荐)
+    fetch-daily tasks/xxx/query.sql --parallel 0       # 关闭多线程,使用单线程下载
+    fetch-daily tasks/xxx/query.sql --feishu           # 获取后上传到飞书表格
+    fetch-daily tasks/xxx/query.sql --feishu TOKEN     # 指定飞书表格token
+    fetch-daily tasks/xxx/query.sql --merge --feishu   # 仅合并并上传飞书
+    fetch-daily tasks/xxx/query.sql --config piaoquan_api  # 切换 ODPS 配置
+"""
+import argparse
+from datetime import datetime, timedelta
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+from odps_skills.client import ODPSClient
+import csv
+
+# 线程安全的计数器
+counter_lock = threading.Lock()
+success_count = 0
+fail_count = 0
+
+
+def get_existing_dates(daily_dir, hh=None):
+    """获取已下载的日期列表(可选指定小时)"""
+    existing = set()
+    if not daily_dir.exists():
+        return existing
+    for f in daily_dir.glob("*.csv"):
+        try:
+            stem = f.stem
+            if hh is not None:
+                # 带小时格式:20250101_08
+                if len(stem) == 11 and stem[8] == '_':
+                    dt = stem[:8]
+                    file_hh = stem[9:11]
+                    if dt.isdigit() and file_hh == hh:
+                        existing.add(dt)
+            else:
+                # 仅日期格式:20250101
+                if len(stem) == 8 and stem.isdigit():
+                    existing.add(stem)
+        except:
+            pass
+    return existing
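+
+# get_existing_dates 行为示例(说明性注释):目录下存在 20260101.csv、20260102_08.csv 时,
+# 不传 hh 返回 {"20260101"};传 hh="08" 返回 {"20260102"}。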
+
+
+def merge_csv_files(daily_dir, output_file=None):
+    """合并目录下所有日期 CSV 文件,只保留一个表头"""
+    csv_files = sorted(daily_dir.glob("*.csv"))
+    if not csv_files:
+        print("没有找到 CSV 文件")
+        return None
+
+    if output_file is None:
+        output_file = daily_dir.parent / f"{daily_dir.name}_merged.csv"
+
+    with open(output_file, "w", encoding="utf-8") as out:
+        header_written = False
+        total_rows = 0
+
+        for csv_file in csv_files:
+            with open(csv_file, "r", encoding="utf-8") as f:
+                lines = f.readlines()
+                if not lines:
+                    continue
+
+                if not header_written:
+                    out.write(lines[0])
+                    header_written = True
+
+                for line in lines[1:]:
+                    out.write(line)
+                    total_rows += 1
+
+    print(f"合并完成: {len(csv_files)} 个文件, {total_rows} 行数据")
+    print(f"输出文件: {output_file}")
+    return output_file
+
+
+def infer_column_types(rows):
+    """推断每列的类型:int, float, 或 str"""
+    if not rows:
+        return []
+
+    num_cols = len(rows[0])
+    col_types = []
+
+    for col_idx in range(num_cols):
+        has_float = False
+        all_numeric = True
+
+        for row in rows:
+            if col_idx >= len(row):
+                continue
+            v = row[col_idx].strip() if row[col_idx] else ""
+            if not v:  # 空值不影响类型判断
+                continue
+            try:
+                if '.' in v or 'e' in v.lower():
+                    float(v)
+                    has_float = True
+                else:
+                    int(v)
+            except ValueError:
+                all_numeric = False
+                break
+
+        if all_numeric:
+            col_types.append('float' if has_float else 'int')
+        else:
+            col_types.append('str')
+
+    return col_types
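+
+# infer_column_types 行为示例(说明性注释):
+#   [["1", "2.5", "abc"], ["3", "", "20260101x"]] -> ['int', 'float', 'str']
+# 空字符串不参与类型判断,整列都能解析为数值时才视为 int/float。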
+
+
+def convert_row_by_types(row, col_types):
+    """按列类型转换一行数据"""
+    result = []
+    for i, cell in enumerate(row):
+        if i >= len(col_types):
+            result.append(cell)
+            continue
+
+        v = cell.strip() if cell else ""
+        if not v:
+            result.append("")
+            continue
+
+        col_type = col_types[i]
+        if col_type == 'int':
+            result.append(int(v))
+        elif col_type == 'float':
+            result.append(float(v))
+        else:
+            result.append(cell)
+    return result
+
+
+def load_feishu_config(sql_file):
+    """加载飞书配置,优先级: {sql名}.json > sql目录/default.json > 根目录/default.json > 默认值"""
+    import json
+
+    defaults = {
+        "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
+        "sheet_id": None,
+        "sort": "dt:desc",
+        "cols": None,
+        "filter": None,
+        "limit": None,
+    }
+
+    root_dir = Path.cwd()
+    sql_dir = sql_file.parent
+    sql_name = sql_file.stem
+
+    def load_json(path, name):
+        if path.exists():
+            try:
+                with open(path, "r", encoding="utf-8") as f:
+                    defaults.update(json.load(f))
+            except Exception as e:
+                print(f"警告: 读取 {name} 失败: {e}")
+
+    # 按优先级从低到高加载(后加载的覆盖先加载的)
+    load_json(root_dir / "default.json", "根目录/default.json")
+    load_json(sql_dir / "default.json", "sql目录/default.json")
+    load_json(sql_dir / f"{sql_name}.json", f"{sql_name}.json")
+
+    return defaults
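+
+# 配置文件示例(假设性示例,字段值需按实际表格修改),内容为 JSON,
+# 放在 {sql名}.json 或 default.json 中即可被 load_feishu_config 读取:
+#   {
+#       "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
+#       "sheet_id": "Qn9MAs",
+#       "sort": "dt:desc,score:asc",
+#       "cols": "dt:日期,score:得分",
+#       "filter": {"app": "示例"},
+#       "limit": 1000
+#   }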
+
+
+def parse_sort_spec(sort_spec):
+    """解析排序规格,如 'dt:desc,name:asc' -> [('dt', True), ('name', False)]"""
+    if not sort_spec:
+        return []
+    result = []
+    for part in sort_spec.split(","):
+        part = part.strip()
+        if not part:
+            continue
+        if ":" in part:
+            field, order = part.rsplit(":", 1)
+            desc = order.lower() != "asc"
+        else:
+            field, desc = part, True  # 默认逆序
+        result.append((field.strip(), desc))
+    return result
+
+
+def parse_cols_spec(cols_spec):
+    """解析列映射规格,如 'dt:日期,name,value:数值' -> [('dt', '日期'), ('name', 'name'), ('value', '数值')]"""
+    if not cols_spec:
+        return []
+    result = []
+    for part in cols_spec.split(","):
+        part = part.strip()
+        if not part:
+            continue
+        if ":" in part:
+            old_name, new_name = part.split(":", 1)
+            result.append((old_name.strip(), new_name.strip()))
+        else:
+            result.append((part, part))
+    return result
+
+
+def apply_cols_mapping(header, data_rows, cols_spec):
+    """应用列映射:筛选、排序、重命名"""
+    col_mapping = parse_cols_spec(cols_spec)
+    if not col_mapping:
+        return header, data_rows
+
+    # 构建索引映射
+    header_index = {name: i for i, name in enumerate(header)}
+    new_header = []
+    col_indices = []
+
+    for old_name, new_name in col_mapping:
+        if old_name in header_index:
+            col_indices.append(header_index[old_name])
+            new_header.append(new_name)
+        else:
+            print(f"警告: 字段 '{old_name}' 不存在,已跳过")
+
+    if not col_indices:
+        print("警告: 没有有效的列映射,保持原样")
+        return header, data_rows
+
+    # 应用映射
+    new_rows = []
+    for row in data_rows:
+        new_row = [row[i] if i < len(row) else "" for i in col_indices]
+        new_rows.append(new_row)
+
+    print(f"列映射: {len(col_indices)} 列")
+    return new_header, new_rows
+
+
+def column_index_to_letter(col_idx):
+    """列索引转字母,如 1->A, 26->Z, 27->AA"""
+    result = ""
+    while col_idx > 0:
+        col_idx, remainder = divmod(col_idx - 1, 26)
+        result = chr(65 + remainder) + result
+    return result
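+
+# column_index_to_letter 行为示例(说明性注释):1 -> "A",26 -> "Z",27 -> "AA",
+# 与 feishu.py 中的 column_id 等价。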
+
+
+def upload_to_feishu(csv_file, sheet_token, sheet_id=None, sort_spec="dt:desc", cols_spec=None, filter_spec=None, limit=None):
+    """上传 CSV 文件到飞书表格(通过模板行继承样式)
+
+    第1行: 表头
+    第2行: 样式模板(用于继承,最后删除)
+    第3行起: 数据
+
+    Args:
+        csv_file: CSV 文件路径
+        sheet_token: 飞书表格 token
+        sheet_id: 工作表 ID(None 时自动获取第一个)
+        sort_spec: 排序规格,如 "dt:desc,name:asc"
+        cols_spec: 列映射规格,如 "dt:日期,name,value:数值"
+        filter_spec: 过滤条件,dict {"字段": "值"} 或 str "字段=值,字段=值"
+        limit: 上传行数上限
+    """
+    from odps_skills.feishu import Client, LARK_HOST, APP_ID, APP_SECRET, request
+
+    # 读取 CSV
+    with open(csv_file, "r", encoding="utf-8") as f:
+        reader = csv.reader(f)
+        rows = list(reader)
+
+    if len(rows) < 2:
+        print("CSV 文件为空,跳过上传")
+        return
+
+    header = rows[0]
+    data_rows = rows[1:]
+
+    # 排序(在列映射之前,使用原始列名)
+    sort_fields = parse_sort_spec(sort_spec)
+    if sort_fields:
+        applied = []
+        for field, desc in reversed(sort_fields):
+            if field in header:
+                idx = header.index(field)
+                data_rows.sort(key=lambda row: row[idx] if idx < len(row) else "", reverse=desc)
+                applied.append(f"{field}:{'desc' if desc else 'asc'}")
+        if applied:
+            print(f"排序: {', '.join(reversed(applied))}")
+
+    # 过滤(排序之后)
+    if filter_spec:
+        # 支持 dict(来自 JSON 配置)或 str(来自命令行 "字段=值,字段!=值")
+        if isinstance(filter_spec, str):
+            filters = []
+            for part in filter_spec.split(","):
+                if "!=" in part:
+                    k, v = part.split("!=", 1)
+                    filters.append((k.strip(), v.strip(), "!="))
+                elif "=" in part:
+                    k, v = part.split("=", 1)
+                    filters.append((k.strip(), v.strip(), "=="))
+        elif isinstance(filter_spec, dict):
+            filters = [(k, v, "==") for k, v in filter_spec.items()]
+        else:
+            filters = []
+
+        before_count = len(data_rows)
+        for field, value, op in filters:
+            if field in header:
+                idx = header.index(field)
+                if op == "!=":
+                    data_rows = [row for row in data_rows if idx < len(row) and row[idx] != str(value)]
+                else:
+                    data_rows = [row for row in data_rows if idx < len(row) and row[idx] == str(value)]
+        print(f"过滤: {filters} → {before_count} → {len(data_rows)} 行")
+
+    # limit(过滤之后)
+    if limit and len(data_rows) > limit:
+        print(f"限制行数: {len(data_rows)} → {limit}")
+        data_rows = data_rows[:limit]
+
+    # 列映射(排序之后)
+    header, data_rows = apply_cols_mapping(header, data_rows, cols_spec)
+
+    # 按列推断类型并转换
+    col_types = infer_column_types(data_rows)
+    converted_rows = [convert_row_by_types(row, col_types) for row in data_rows]
+
+    # 初始化飞书客户端
+    client = Client(LARK_HOST)
+    access_token = client.get_tenant_access_token(APP_ID, APP_SECRET)
+
+    # 获取 sheet_id
+    if sheet_id is None:
+        sheet_id = client.get_sheetid(access_token, sheet_token)
+    print(f"Sheet ID: {sheet_id}")
+
+    # 获取表格信息
+    sheet_props = client.get_sheet_properties(access_token, sheet_token, sheet_id)
+    current_cols = sheet_props['column_count'] if sheet_props else 26
+    header_end_col = column_index_to_letter(current_cols)
+
+    # 扩展列数(CSV 列数超过当前 sheet 列数时)
+    num_csv_cols = len(header)
+    if num_csv_cols > current_cols:
+        add_cols = num_csv_cols - current_cols
+        expand_headers = {
+            'Content-Type': 'application/json; charset=utf-8',
+            'Authorization': f'Bearer {access_token}'
+        }
+        expand_payload = {
+            "dimension": {
+                "sheetId": sheet_id,
+                "majorDimension": "COLUMNS",
+                "length": add_cols
+            }
+        }
+        try:
+            request("POST", f"{LARK_HOST}/open-apis/sheets/v2/spreadsheets/{sheet_token}/dimension_range",
+                    expand_headers, expand_payload)
+            print(f"扩展列数: {current_cols} -> {num_csv_cols} (+{add_cols}列)")
+            current_cols = num_csv_cols
+            header_end_col = column_index_to_letter(current_cols)
+        except Exception as e:
+            print(f"  扩展列数失败: {e}")
+
+    # 读取飞书表头(获取所有列)
+    feishu_header = client.read_range_values(access_token, sheet_token, f"{sheet_id}!A1:{header_end_col}1")
+    feishu_cols = []
+    if feishu_header and feishu_header[0]:
+        feishu_cols = [c for c in feishu_header[0] if c]  # 过滤 None 和空字符串
+
+    # 富文本列转纯文本(飞书表头可能含带链接的 list 结构)
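+    # 例: [{"text": "标题", "link": "..."}] → "标题"(text 以外的键会被忽略)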
+    def _col_to_str(col):
+        if isinstance(col, list):
+            return "".join(item.get("text", "") for item in col if isinstance(item, dict))
+        return col
+
+    if feishu_cols:
+        feishu_cols_str = [_col_to_str(c) for c in feishu_cols]
+        print(f"飞书表头: {feishu_cols_str}")
+        print(f"CSV表头: {header}")
+
+        # 校验字段一致性(警告但继续,以飞书表头为准)
+        feishu_set = set(feishu_cols_str)
+        csv_set = set(header)
+
+        missing_in_csv = feishu_set - csv_set
+        missing_in_feishu = csv_set - feishu_set
+
+        if missing_in_csv:
+            print(f"警告: CSV缺少字段(将填空值): {missing_in_csv}")
+        if missing_in_feishu:
+            print(f"警告: 飞书缺少字段(将忽略): {missing_in_feishu}")
+
+        # 按飞书表头顺序重排数据(用纯文本版本做匹配)
+        csv_col_index = {name: i for i, name in enumerate(header)}
+        new_converted_rows = []
+        for row in converted_rows:
+            new_row = []
+            for col_name in feishu_cols_str:
+                if col_name in csv_col_index:
+                    new_row.append(row[csv_col_index[col_name]])
+                else:
+                    new_row.append("")  # CSV缺少的字段填空
+            new_converted_rows.append(new_row)
+
+        converted_rows = new_converted_rows
+        header = feishu_cols
+        print(f"已按飞书表头顺序重排数据")
+    else:
+        # 飞书表头为空,用 CSV 表头写入(飞书单次最多写100列,需分批)
+        print(f"飞书表头为空,使用 CSV 表头写入")
+        col_batch = 100
+        for start in range(0, len(header), col_batch):
+            end = min(start + col_batch, len(header))
+            start_col = column_index_to_letter(start + 1)
+            end_col = column_index_to_letter(end)
+            batch_range = f"{sheet_id}!{start_col}1:{end_col}1"
+            client.batch_update_values(access_token, sheet_token, {
+                "valueRanges": [{"range": batch_range, "values": [header[start:end]]}]
+            })
+
+    total_rows = len(converted_rows)
+    num_cols = len(header)
+    end_col = column_index_to_letter(num_cols)
+
+    # 飞书单 sheet 上限 5,000,000 cells,预留表头+模板行
+    CELL_LIMIT = 5_000_000
+    max_data_rows = (CELL_LIMIT // num_cols) - 2
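+    # 例: 50 列时 max_data_rows = 5_000_000 // 50 - 2 = 99_998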
+    if total_rows > max_data_rows:
+        print(f"飞书 cell 上限 {CELL_LIMIT:,}({num_cols}列 × {max_data_rows}行),截断 {total_rows} → {max_data_rows} 行")
+        converted_rows = converted_rows[:max_data_rows]
+        total_rows = max_data_rows
+
+    print(f"上传到飞书: {total_rows} 行数据")
+
+    batch_size = 500
+
+    # 获取当前行数(复用之前获取的 sheet_props)
+    current_rows = sheet_props['row_count'] if sheet_props else 2
+    print(f"当前行数: {current_rows}, 需要数据行: {total_rows}")
+
+    headers = {
+        'Content-Type': 'application/json; charset=utf-8',
+        'Authorization': f'Bearer {access_token}'
+    }
+
+    # 第1步:删除旧数据行(保留第1行表头 + 第2行样式模板),分批删除
+    if current_rows > 2:
+        print(f"清理旧数据({current_rows - 2}行)...")
+        rows_to_delete = current_rows - 2
+        delete_batch = 5000
+        while rows_to_delete > 0:
+            # 每次从第3行开始删除,删除后行号会自动调整
+            batch = min(rows_to_delete, delete_batch)
+            try:
+                client.delete_rows(access_token, sheet_token, sheet_id, 3, 2 + batch)
+                rows_to_delete -= batch
+                if rows_to_delete > 0:
+                    print(f"  已删除 {current_rows - 2 - rows_to_delete}/{current_rows - 2}")
+            except Exception as e:
+                print(f"  清理失败: {e}")
+                break
+
+    # 第2步:扩展表格容量(insert 不会自动扩展)
+    # 删除后当前只有2行(表头+模板),需要扩展到 2 + total_rows 行
+    add_url = f"{LARK_HOST}/open-apis/sheets/v2/spreadsheets/{sheet_token}/dimension_range"
+    expand_batch = 5000
+    remaining = total_rows
+    expanded = 0
+    while remaining > 0:
+        chunk = min(remaining, expand_batch)
+        add_payload = {
+            "dimension": {
+                "sheetId": sheet_id,
+                "majorDimension": "ROWS",
+                "length": chunk
+            }
+        }
+        try:
+            request("POST", add_url, headers, add_payload)
+            expanded += chunk
+            remaining -= chunk
+        except Exception as e:
+            print(f"  扩展容量失败(已扩展{expanded}): {e}")
+            break
+    if expanded > 0:
+        print(f"扩展容量: +{expanded} 行")
+
+    # 第3步:分批写入数据到扩展的空行(不再 insert,避免 expand+insert 双重加行超 cell 上限)
+    print(f"写入 {total_rows} 行...")
+    batches = [converted_rows[i:i + batch_size] for i in range(0, total_rows, batch_size)]
+    processed = 0
+
+    for i, batch in enumerate(batches):
+        batch_count = len(batch)
+        start_row = 3 + i * batch_size  # 从第3行开始,顺序写入
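+        # 例: batch_size=500 时,第 0 批写入第 3~502 行,第 1 批写入第 503~1002 行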
+
+        # 写入数据(飞书单次最多100列,需按列分批)
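+        # 例: 230 列 → 拆成 3 个 valueRange(100 + 100 + 30 列)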
+        col_batch = 100
+        value_ranges = []
+        for col_start in range(0, num_cols, col_batch):
+            col_end = min(col_start + col_batch, num_cols)
+            sc = column_index_to_letter(col_start + 1)
+            ec = column_index_to_letter(col_end)
+            col_range = f"{sheet_id}!{sc}{start_row}:{ec}{start_row + batch_count - 1}"
+            col_values = [row[col_start:col_end] for row in batch]
+            value_ranges.append({"range": col_range, "values": col_values})
+        client.batch_update_values(access_token, sheet_token, {
+            "valueRanges": value_ranges
+        })
+
+        processed += batch_count
+        print(f"  处理: {processed}/{total_rows}")
+
+    # 第4步:删除模板行(第2行),仅当初始存在模板行时
+    if current_rows >= 2:
+        print(f"删除模板行...")
+        try:
+            client.delete_rows(access_token, sheet_token, sheet_id, 2, 2)
+        except Exception as e:
+            print(f"  删除模板行失败: {e}")
+
+    print(f"飞书上传完成: {sheet_token}")
+
+
+def get_date_range(start_str, end_str):
+    """生成日期范围列表"""
+    start = datetime.strptime(start_str, "%Y%m%d")
+    end = datetime.strptime(end_str, "%Y%m%d")
+    dates = []
+    current = start
+    while current <= end:
+        dates.append(current.strftime("%Y%m%d"))
+        current += timedelta(days=1)
+    return dates
+
+
+def fetch_single_day(dt, sql_template, daily_dir, parallel_threads=0, config="default", hh=None):
+    """获取单天数据(可选指定小时)"""
+    global success_count, fail_count
+
+    try:
+        client = ODPSClient(config=config)
+        sql = sql_template.replace("${dt}", dt)
+        if hh is not None:
+            sql = sql.replace("${hh}", hh)
+            output_file = daily_dir / f"{dt}_{hh}.csv"
+        else:
+            output_file = daily_dir / f"{dt}.csv"
+
+        # 下载到文件
+        if parallel_threads > 0:
+            # 多线程并行下载(适合大数据量)
+            client.execute_sql_result_save_file_parallel(sql, str(output_file), workers=parallel_threads)
+        else:
+            # 单线程下载
+            client.execute_sql_result_save_file(sql, str(output_file))
+
+        # 检查结果
+        if output_file.exists():
+            with open(output_file, encoding="utf-8") as f:
+                row_count = sum(1 for _ in f) - 1  # 减去表头
+            with counter_lock:
+                success_count += 1
+            if row_count > 0:
+                return (dt, "success", row_count)
+            else:
+                return (dt, "empty", 0)
+        else:
+            with counter_lock:
+                fail_count += 1
+            return (dt, "fail", 0)
+
+    except Exception as e:
+        with counter_lock:
+            fail_count += 1
+        return (dt, "error", str(e))
+
+
+def main():
+    global success_count, fail_count
+
+    parser = argparse.ArgumentParser(description="按天增量获取数据")
+    parser.add_argument("sql_file", type=str, help="SQL文件路径")
+    parser.add_argument("--days", type=int, default=7, help="获取最近N天 (默认7)")
+    parser.add_argument("--start", type=str, help="开始日期 YYYYMMDD")
+    parser.add_argument("--end", type=str, help="结束日期 YYYYMMDD")
+    parser.add_argument("--date", type=str, help="单天日期 YYYYMMDD")
+    parser.add_argument("--hh", type=str, default=None, help="小时 HH (00-23),需配合 --date 使用")
+    parser.add_argument("--force", action="store_true", help="强制重新获取")
+    parser.add_argument("--workers", type=int, default=5, help="天级并发数 (默认5)")
+    parser.add_argument("--parallel", type=int, default=50, help="单天多线程下载 (默认50, 大数据量推荐)")
+    parser.add_argument("--merge", action="store_true", help="合并所有日期数据到一个文件")
+    parser.add_argument("--feishu", nargs="?", const="__USE_CONFIG__",
+                        help="上传到飞书表格")
+    parser.add_argument("--sheet-id", type=str, default=None, help="飞书工作表ID")
+    parser.add_argument("--sort", type=str, default=None, help="排序: 字段:asc/desc")
+    parser.add_argument("--cols", type=str, default=None, help="列映射: 原名:新名,...")
+    parser.add_argument("--filter", type=str, default=None, help="过滤: 字段=值,字段=值")
+    parser.add_argument("--limit", type=int, default=None, help="上传行数上限")
+    parser.add_argument("--config", type=str, default="default", help="ODPS配置: default 或 piaoquan_api")
+    args = parser.parse_args()
+
+    # 解析 SQL 文件路径
+    sql_file = Path(args.sql_file).resolve()
+    if not sql_file.exists():
+        print(f"错误: 找不到 {sql_file}")
+        return
+
+    # 加载飞书配置(优先级: 命令行 > {sql名}.json > sql目录/default.json > 根目录/default.json > 默认值)
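+    # 配置键与下方 feishu_config 取值一一对应(具体以 load_feishu_config 实现为准,值仅为示意):
+    # {"token": "shtxxxx", "sheet_id": null, "sort": "字段:desc", "cols": null, "filter": null, "limit": null}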
+    feishu_config = load_feishu_config(sql_file)
+    if args.feishu == "__USE_CONFIG__":
+        args.feishu = feishu_config["token"]
+    elif args.feishu is None:
+        pass  # 未启用飞书上传
+    # 命令行参数覆盖配置文件
+    if args.sheet_id is None:
+        args.sheet_id = feishu_config["sheet_id"]
+    if args.sort is None:
+        args.sort = feishu_config["sort"]
+    if args.cols is None:
+        args.cols = feishu_config["cols"]
+    if args.filter is None:
+        args.filter = feishu_config["filter"]
+    if args.limit is None:
+        args.limit = feishu_config["limit"]
+
+    # 打印飞书配置
+    if args.feishu:
+        print(f"飞书配置: token={args.feishu}, sheet_id={args.sheet_id}, sort={args.sort}, cols={args.cols}")
+
+    # 输出目录:SQL 同目录下的 output/SQL文件名/
+    output_dir = sql_file.parent / "output"
+    daily_dir = output_dir / sql_file.stem
+    daily_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"SQL文件: {sql_file}")
+    print(f"数据目录: {daily_dir}")
+
+    # 仅合并模式:不获取数据,直接合并已有文件
+    if args.merge:
+        existing_dates = get_existing_dates(daily_dir)
+        print(f"已有数据: {len(existing_dates)}天")
+        if existing_dates:
+            merged_file = merge_csv_files(daily_dir)
+            # 如果指定了飞书上传
+            if args.feishu and merged_file:
+                upload_to_feishu(merged_file, args.feishu, args.sheet_id, args.sort, args.cols, args.filter, args.limit)
+        else:
+            print("没有可合并的数据")
+        return
+
+    # 确定日期范围
+    if args.date:
+        target_dates = [args.date]
+    elif args.start and args.end:
+        target_dates = get_date_range(args.start, args.end)
+    else:
+        today = datetime.now()
+        end_date = (today - timedelta(days=1)).strftime("%Y%m%d")
+        start_date = (today - timedelta(days=args.days)).strftime("%Y%m%d")
+        target_dates = get_date_range(start_date, end_date)
+
+    print(f"目标日期: {target_dates[0]} ~ {target_dates[-1]} ({len(target_dates)}天)")
+
+    # 检查已有数据
+    existing_dates = get_existing_dates(daily_dir, args.hh)
+    if args.hh:
+        print(f"已有数据: {len(existing_dates)}天 (hh={args.hh})")
+    else:
+        print(f"已有数据: {len(existing_dates)}天")
+
+    # 确定需要获取的日期
+    if args.force:
+        missing_dates = target_dates
+        print(f"强制模式: 重新获取所有 {len(missing_dates)} 天")
+    else:
+        missing_dates = [d for d in target_dates if d not in existing_dates]
+        print(f"需要获取: {len(missing_dates)}天")
+
+    if not missing_dates:
+        print("没有需要获取的数据,退出")
+        return
+
+    # 读取 SQL 模板
+    sql_template = sql_file.read_text(encoding="utf-8")
+
+    # 检测 SQL 中是否包含 ${dt} 变量
+    has_dt_var = "${dt}" in sql_template
+
+    # 重置计数器
+    success_count = 0
+    fail_count = 0
+
+    # 如果 SQL 中没有 ${dt},只需执行一次
+    if not has_dt_var:
+        print("\n检测到 SQL 中不含 ${dt} 变量,只执行一次...")
+        target_dates = ["20000101"]  # 用虚拟日期
+        missing_dates = target_dates
+        output_file = output_dir / f"{sql_file.stem}.csv"
+        output_file.parent.mkdir(parents=True, exist_ok=True)
+
+        try:
+            client = ODPSClient(config=args.config)
+            if args.parallel > 0:
+                client.execute_sql_result_save_file_parallel(sql_template, str(output_file), workers=args.parallel)
+            else:
+                client.execute_sql_result_save_file(sql_template, str(output_file))
+            print(f"数据目录: {output_file}")
+            # 如果指定了飞书上传
+            if args.feishu and output_file.exists():
+                upload_to_feishu(output_file, args.feishu, args.sheet_id, args.sort, args.cols, args.filter, args.limit)
+        except Exception as e:
+            print(f"执行失败: {e}")
+        return
+
+    # 并发获取
+    print(f"目标日期: {target_dates[0]} ~ {target_dates[-1]} ({len(target_dates)}天)")
+    workers = min(args.workers, len(missing_dates))
+    if args.parallel > 0:
+        print(f"\n开始获取 (天级并发: {workers}, 单天多线程: {args.parallel})...")
+    else:
+        print(f"\n开始获取 (并发数: {workers})...")
+
+    with ThreadPoolExecutor(max_workers=workers) as executor:
+        futures = {
+            executor.submit(fetch_single_day, dt, sql_template, daily_dir, args.parallel, args.config, args.hh): dt
+            for dt in missing_dates
+        }
+
+        completed = 0
+        for future in as_completed(futures):
+            completed += 1
+            dt, status, info = future.result()
+
+            if status == "success":
+                print(f"  [{completed}/{len(missing_dates)}] {dt}: {info} 行")
+            elif status == "empty":
+                print(f"  [{completed}/{len(missing_dates)}] {dt}: 无数据")
+            elif status == "error":
+                print(f"  [{completed}/{len(missing_dates)}] {dt}: {info}")
+            else:
+                print(f"  [{completed}/{len(missing_dates)}] {dt}: 失败")
+
+    print(f"\n完成! 成功: {success_count}, 失败: {fail_count}")
+    print(f"数据目录: {daily_dir}")
+
+    # 如果指定了飞书上传,先合并再上传
+    if args.feishu:
+        merged_file = merge_csv_files(daily_dir)
+        if merged_file:
+            upload_to_feishu(merged_file, args.feishu, args.sheet_id, args.sort, args.cols, args.filter, args.limit)
+
+
+if __name__ == "__main__":
+    main()

+ 47 - 0
skills/src/odps_skills/fetch_table_code.py

@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""根据表名从 DataWorks 获取生产代码,保存到 production_code/ 目录。
+
+用法:
+    fetch-table-code loghubods.dwd_recsys_alg_exposure_base_20250108
+    fetch-table-code loghubods.dwd_recsys_alg_exposure_base_20250108 --force
+    fetch-table-code loghubods.dwd_recsys_alg_exposure_base_20250108 --recursive
+    fetch-table-code loghubods.dwd_recsys_alg_exposure_base_20250108 --recursive --depth 5
+"""
+
+import os
+import sys
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser(description="获取表的 DataWorks 生产代码")
+    parser.add_argument("table_name", help="表名,格式: project.table 或 table")
+    parser.add_argument("--force", action="store_true", help="跳过缓存,强制从 API 拉取")
+    parser.add_argument("--recursive", "-r", action="store_true", help="递归获取所有上游表的代码")
+    parser.add_argument("--depth", type=int, default=3, help="递归最大深度(默认 3)")
+    parser.add_argument("--output-dir", type=str, default=None,
+                        help="产出基础目录(默认 $ODPS_SKILLS_DATA_DIR 或 cwd)")
+    args = parser.parse_args()
+
+    # 设置 output-dir(需在 import dataworks 之前,因为模块级会读取 env)
+    if args.output_dir:
+        os.environ["ODPS_SKILLS_DATA_DIR"] = args.output_dir
+
+    from odps_skills.dataworks import DataWorksClient
+
+    dw = DataWorksClient()
+
+    if args.recursive:
+        dw.get_node_code_recursive(args.table_name, max_depth=args.depth, force=args.force)
+    else:
+        results = dw.get_node_code(args.table_name, force=args.force)
+        if not results:
+            print(f"未找到 '{args.table_name}' 的生产代码")
+            sys.exit(1)
+        for r in results:
+            print(f"任务: {r['task_name']}  代码长度: {len(r['content'])} chars")
+
+
+if __name__ == "__main__":
+    main()

+ 154 - 0
skills/src/odps_skills/run_sql.py

@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+SQL 执行工具 - 输入 SQL 文件,输出查询结果到同目录下的 CSV
+
+使用示例:
+    run-sql tasks/渠道效果分析/渠道再分享回流.sql
+    run-sql tasks/渠道效果分析/渠道再分享回流.sql --start 20251222 --end 20260103
+"""
+import argparse
+from datetime import datetime, timedelta
+from pathlib import Path
+
+from odps_skills.client import ODPSClient
+
+
+def get_default_dates():
+    """获取默认日期范围:最近 7 天(start=7天前, end=昨天)"""
+    today = datetime.now()
+    end_date = today - timedelta(days=1)
+    start_date = today - timedelta(days=7)
+    return start_date.strftime('%Y%m%d'), end_date.strftime('%Y%m%d')
+
+
+def parse_variables(var_list: list) -> dict:
+    """解析变量列表为字典"""
+    if not var_list:
+        return {}
+    variables = {}
+    for item in var_list:
+        if '=' in item:
+            key, value = item.split('=', 1)
+            variables[key.strip()] = value.strip()
+    return variables
+
+
+def replace_variables(sql: str, variables: dict) -> str:
+    """替换 SQL 中的 ${variable} 占位符"""
+    for key, value in variables.items():
+        sql = sql.replace(f'${{{key}}}', value)
+    return sql
+
+
+def run_sql(sql_file: str, output_file: str = None, variables: dict = None,
+            start: str = None, end: str = None, dry_run: bool = False):
+    """
+    执行 SQL 文件并保存结果
+
+    Args:
+        sql_file: SQL 文件路径
+        output_file: 输出文件路径(默认与 SQL 同目录同名)
+        variables: 变量替换字典
+        start: dt 分区起始日期
+        end: dt 分区结束日期
+        dry_run: 仅打印 SQL,不执行
+    """
+    sql_path = Path(sql_file)
+
+    # 合并 start/end 到 variables
+    if variables is None:
+        variables = {}
+    if start:
+        variables['start'] = start
+    if end:
+        variables['end'] = end
+
+    # 输出目录:SQL 同目录下的 output/;文件名:[sql前缀_]日期.csv
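+    # 例: query.sql + start=20260301, end=20260305 → output/query_20260301_20260305.csv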
+    if output_file is None:
+        output_dir = sql_path.parent / "output"
+        output_dir.mkdir(exist_ok=True)
+        # SQL 文件名作为前缀
+        sql_stem = sql_path.stem  # 去掉 .sql 后缀
+        prefix = f"{sql_stem}_"
+        if start and end:
+            output_file = output_dir / f"{prefix}{start}_{end}.csv"
+        elif start:
+            output_file = output_dir / f"{prefix}{start}.csv"
+        else:
+            output_file = output_dir / f"{prefix}result.csv"
+    else:
+        output_file = Path(output_file)
+
+    # 读取 SQL
+    with open(sql_path, 'r', encoding='utf-8') as f:
+        sql = f.read()
+
+    # 变量替换
+    if variables:
+        sql = replace_variables(sql, variables)
+
+    # Dry run 模式
+    if dry_run:
+        print("=" * 50)
+        print("SQL 预览 (dry-run 模式)")
+        print("=" * 50)
+        print(sql)
+        print("=" * 50)
+        print(f"输出文件: {output_file}")
+        return
+
+    # 执行 SQL
+    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] 开始执行: {sql_path.name}")
+
+    odps_client = ODPSClient()
+    odps_client.execute_sql_result_save_file(sql, str(output_file))
+
+    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] 完成,结果保存至: {output_file}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='执行 SQL 文件并输出结果',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例:
+  run-sql tasks/渠道效果分析/渠道再分享回流.sql
+  run-sql tasks/渠道效果分析/渠道再分享回流.sql --start 20251222 --end 20260103
+  run-sql tasks/渠道效果分析/渠道再分享回流.sql --dry-run
+        """
+    )
+    parser.add_argument('sql_file', type=str, help='SQL 文件路径')
+    parser.add_argument('--start', type=str, help='dt 分区起始日期,替换 ${start}')
+    parser.add_argument('--end', type=str, help='dt 分区结束日期,替换 ${end}')
+    parser.add_argument('-o', '--output', type=str, help='自定义输出路径')
+    parser.add_argument('--vars', nargs='*', metavar='KEY=VALUE', help='额外变量,如: apptype=36')
+    parser.add_argument('--dry-run', action='store_true', help='仅打印 SQL,不执行')
+
+    args = parser.parse_args()
+
+    # 解析变量
+    variables = parse_variables(args.vars)
+
+    # 默认日期
+    start = args.start
+    end = args.end
+    if start is None or end is None:
+        default_start, default_end = get_default_dates()
+        start = start or default_start
+        end = end or default_end
+        print(f"使用默认日期范围: {start} ~ {end}")
+
+    # 执行
+    run_sql(
+        sql_file=args.sql_file,
+        output_file=args.output,
+        variables=variables,
+        start=start,
+        end=end,
+        dry_run=args.dry_run
+    )
+
+
+if __name__ == "__main__":
+    main()