# -*- coding: UTF-8 -*- import json import base64 import hashlib import os from curl_cffi import requests as mj_requests import requests import os from dotenv import load_dotenv, find_dotenv load_dotenv(find_dotenv()) # load from env APP_ID = 'cli_a22acf2916b8500e' APP_SECRET = 'tE0xAB2gZTMlBGdPczCGLcmpRlZQm5CQ' LARK_HOST = 'https://open.feishu.cn' APP_HOST = 'https://open.feishu.cn' EMAIL = 'semsevens@email.com' class LarkException(Exception): def __init__(self, code=0, msg=None): self.code = code self.msg = msg def __str__(self) -> str: return "{}:{}".format(self.code, self.msg) __repr__ = __str__ def request(method, url, headers, payload={}): response = requests.request(method, url, headers=headers, json=payload) # logging.info("URL: " + url) # logging.info("X-Tt-Logid: " + response.headers['X-Tt-Logid']) # logging.info("headers:\n"+json.dumps(headers,indent=2, ensure_ascii=False)) # logging.info("payload:\n"+json.dumps(payload,indent=2, ensure_ascii=False)) resp = {} if response.text[0] == '{': resp = response.json() # logging.info("response:\n"+json.dumps(resp,indent=2, ensure_ascii=False)) else: pass # logging.info("response:\n"+response.text) code = resp.get("code", -1) if code == -1: code = resp.get("StatusCode", -1) if code == -1 and response.status_code != 200: response.raise_for_status() if code != 0: raise LarkException(code=code, msg=resp.get("msg", "")) return resp def get_image_data_from_url(img_url, use_cache=True): # 计算URL的MD5哈希值 url_hash = hashlib.md5(img_url.encode()).hexdigest() cache_dir = 'image_cache' cache_file = os.path.join(cache_dir, f'{url_hash}.json') if use_cache: # 检查缓存目录是否存在,如果不存在则创建 if not os.path.exists(cache_dir): os.makedirs(cache_dir) # 检查缓存文件是否存在 if os.path.exists(cache_file): with open(cache_file, 'r') as f: cached_data = json.load(f) return cached_data['image_data'] # 如果缓存不存在,从URL获取图片 if 'midjourney' in img_url: proxies = { 'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890', } # response = mj_requests.get(img_url, impersonate="chrome100", proxies=proxies) response = mj_requests.get(img_url.replace("https://", "http://"), impersonate="chrome100") else: # proxies = { # 'http': 'http://t10952018781111:1ap37oc3@d844.kdltps.com:15818', # 'https': 'http://t10952018781111:1ap37oc3@d844.kdltps.com:15818', # } # proxies = { # 'http': None, # 'https': None, # } # response = requests.get(img_url.replace("https://", "http://"), proxies=proxies) response = requests.get(img_url) # response = requests.get(img_url, proxies=proxies) if response.status_code == 200: image_content = response.content missing_padding = 4 - len(image_content) % 4 if missing_padding: image_content += b'=' * missing_padding image_data = base64.b64encode(image_content).decode('utf-8') # 将图片数据保存到缓存 with open(cache_file, 'w') as f: json.dump({'image_data': image_data}, f) return image_data else: # import traceback # traceback.print_exc() raise Exception(f"无法从URL获取图片: {img_url}") from PIL import Image import io import os def get_image_size(img_url): img_data = get_image_data_from_url(img_url) img = Image.open(io.BytesIO(base64.b64decode(img_data))) width, height = img.size return width, height if __name__ == "__main__": img_url = "https://sns-webpic.xhscdn.com/1040g2sg31c4vs26n12a05ph3cdp3cutm5prqo90" img_data = get_image_data_from_url(img_url) save_path = "/Users/nieqi/Downloads/save.json" with open(save_path, 'w') as f: f.write(img_data) def column_id(col): '''column int to string id''' ans = "" i = col while i > 0: m = int((i-1) % 26) i = int((i-1) / 26) ans = chr(m+65) + ans return ans def do_compress_image(image_data, image_type): # 压缩图片 from PIL import Image import io import base64 Image.MAX_IMAGE_PIXELS = None # 禁用图片大小限制 # 将base64转为图片对象 image = Image.open(io.BytesIO(base64.b64decode(image_data))) # 计算压缩后的尺寸,保持宽高比 max_size = 1600 ratio = min(max_size/image.width, max_size/image.height) if ratio < 1: new_size = (int(image.width * ratio), int(image.height * ratio)) image = image.resize(new_size, Image.Resampling.LANCZOS) # 在保存之前转换RGBA为RGB if image.mode == 'RGBA': # 创建白色背景 background = Image.new('RGB', image.size, (255, 255, 255)) # 将RGBA图片合成到白色背景上 background.paste(image, mask=image.split()[3]) # 使用alpha通道作为mask image = background buffer = io.BytesIO() # 将 'JPG' 转换为 'JPEG' if image_type and image_type.upper() == 'JPG': image_type = 'JPEG' image_type = 'JPEG' # image.save(buffer, format=image_type.upper(), quality=95, optimize=True) image.save(buffer, format=image_type.upper(), quality=85, optimize=True) image_data = base64.b64encode(buffer.getvalue()).decode() return image_data class Client(object): def __init__(self, lark_host): self._host = lark_host def get_tenant_access_token(self, app_id, app_secret): url = self._host+"/open-apis/auth/v3/app_access_token/internal/" headers = { 'Content-Type': 'application/json; charset=utf-8' } payload = { 'app_id': app_id, 'app_secret': app_secret } resp = request("POST", url, headers, payload) return resp['tenant_access_token'] def get_user_access_token(self, tenant_access_token, code): url = self._host+"/open-apis/authen/v1/access_token" headers = { 'Content-Type': 'application/json; charset=utf-8' } payload = { "grant_type": "authorization_code", "code": code, "app_access_token": tenant_access_token } resp = request("POST", url, headers, payload) return resp['data']['access_token'] def get_root_folder_token(self, access_token): url = self._host+"/open-apis/drive/explorer/v2/root_folder/meta" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': 'Bearer '+access_token } resp = request("GET", url, headers) return resp['data']['token'] def create_spreadsheet(self, access_token, foldertoken, title): url =self._host+"/open-apis/sheets/v3/spreadsheets" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': 'Bearer '+access_token } payload={ "title": title, "folder_token": foldertoken } resp = request("POST", url, headers, payload) return resp['data']['spreadsheet']['spreadsheet_token'], resp['data']['spreadsheet']['url'] def get_sheetid(self, access_token, doctoken, sheet_index=0): url = self._host+"/open-apis/sheets/v2/spreadsheets/"+doctoken+"/metainfo" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': 'Bearer '+access_token } resp = request("GET", url, headers) return resp['data']['sheets'][sheet_index]["sheetId"] def batch_update_values(self, access_token, doctoken, data): url =self._host+"/open-apis/sheets/v2/spreadsheets/"+doctoken+"/values_batch_update" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': 'Bearer '+access_token } payload=data resp = request("POST", url, headers, payload) return resp['data']['spreadsheetToken'] def batch_update_styles(self, access_token, doctoken, data): url =self._host+"/open-apis/sheets/v2/spreadsheets/"+doctoken+"/styles_batch_update" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': 'Bearer '+access_token } payload=data resp = request("PUT", url, headers, payload) return resp['data']['spreadsheetToken'] def add_permissions_member(self, access_token, doctoken, doctype, member_type, member_id, perm): url = self._host+"/open-apis/drive/v1/permissions/"+doctoken+"/members?type="+doctype+"&need_notification=false" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': 'Bearer '+access_token } payload = { "member_type": member_type, "member_id": member_id, "perm": perm } request("POST", url, headers, payload) def write_image_to_cell(self, access_token, doctoken, sheetid, img_url, row, col, image_type, compress_image=True): url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/values_image" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': f'Bearer {access_token}' } try: image_data = get_image_data_from_url(img_url) except Exception as e: print(img_url) print(e) return None, None if compress_image: image_data = do_compress_image(image_data, image_type) image_name = img_url.split('/')[-1].replace(f'.{image_type}', '') # 从URL中提取文件名 if compress_image: image_type = 'JPEG' cell_start = column_id(col)+str(row) range = f'{sheetid}!{cell_start}:{cell_start}' payload = { "range": range, "image": image_data, "name": f"{image_name}.{image_type}" } try: resp = request("POST", url, headers, payload) except Exception as e: print(img_url) print(image_name) print(image_type) print(e) return None, None return resp['data']['revision'], resp['data']['updateRange'] def merge_cells(self, access_token, doctoken, sheetid, start_row, end_row, start_col, end_col): print(f"merge start_row = {start_row} end_row = {end_row} start_col = {start_col} end_col = {end_col}") url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/merge_cells" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': f'Bearer {access_token}' } start_col_id = column_id(start_col) end_col_id = column_id(end_col) payload = { "range": f"{sheetid}!{start_col_id}{start_row}:{end_col_id}{end_row}", "mergeType": "MERGE_ALL", } try: resp = request("POST", url, headers, payload) except Exception as e: print(e) return None return None def write_images_to_cell(self, access_token, doctoken, sheetid, img_url_list, row, col, compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)): """ 将多张图片拼接后写入单元格 Args: img_url_list: 图片URL列表 row: 目标单元格行号 col: 目标单元格列号 compress_image: 是否压缩图片 grid_width: 拼接图片的列数,如果为None则自动计算 grid_height: 拼接图片的行数,如果为None则自动计算 border_width: 边框宽度,像素 border_color: 边框颜色,RGB元组 """ from PIL import Image, ImageDraw import io import base64 import math # 下载所有图片 images = [] for img_url in img_url_list: try: image_type = get_image_type(img_url) if not image_type: continue image_data = get_image_data_from_url(img_url) image = Image.open(io.BytesIO(base64.b64decode(image_data))) images.append(image) except Exception as e: print(f"下载图片失败: {img_url}") print(e) continue if not images: return None, None # 计算拼接图片的行列数 img_count = len(images) if grid_width is None and grid_height is None: # 如果未指定行列数,计算最接近正方形的网格 grid_width = math.ceil(math.sqrt(img_count)) grid_height = math.ceil(img_count / grid_width) elif grid_width is None: # 如果只指定了行数,计算列数 grid_width = math.ceil(img_count / grid_height) elif grid_height is None: # 如果只指定了列数,计算行数 grid_height = math.ceil(img_count / grid_width) # 确保网格能容纳所有图片 while grid_width * grid_height < img_count: if grid_width <= grid_height: grid_width += 1 else: grid_height += 1 # 调整所有图片到相同尺寸,保持原始比例 if images: # 计算目标尺寸(使用平均尺寸作为参考) avg_width = sum(img.width for img in images) // len(images) avg_height = sum(img.height for img in images) // len(images) target_size = (avg_width, avg_height) # 调整图片尺寸,保持原始比例 resized_images = [] for img in images: # 计算保持比例的缩放尺寸 img_ratio = img.width / img.height target_ratio = target_size[0] / target_size[1] if img_ratio > target_ratio: # 图片比目标更宽,以宽度为准 new_width = target_size[0] new_height = int(target_size[0] / img_ratio) else: # 图片比目标更高,以高度为准 new_height = target_size[1] new_width = int(target_size[1] * img_ratio) # 缩放图片,保持比例 resized_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS) resized_images.append(resized_img) # 创建拼接画布 canvas_width = grid_width * avg_width + (grid_width + 1) * border_width canvas_height = grid_height * avg_height + (grid_height + 1) * border_width canvas = Image.new('RGB', (canvas_width, canvas_height), border_color) # 拼接图片 for i, img in enumerate(resized_images): row_idx = i // grid_width col_idx = i % grid_width # 计算每个网格单元的位置 cell_x = col_idx * avg_width + (col_idx + 1) * border_width cell_y = row_idx * avg_height + (row_idx + 1) * border_width # 在网格单元中居中放置图片 center_x = cell_x + (avg_width - img.width) // 2 center_y = cell_y + (avg_height - img.height) // 2 canvas.paste(img, (center_x, center_y)) # 将拼接后的图片转换为base64 output = io.BytesIO() if compress_image: canvas.save(output, format='JPEG', quality=85) image_type = 'JPEG' else: canvas.save(output, format='PNG') image_type = 'PNG' output.seek(0) image_data = base64.b64encode(output.getvalue()).decode() # 调用写入图片的API url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/values_image" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': f'Bearer {access_token}' } cell_start = column_id(col) + str(row) range_val = f'{sheetid}!{cell_start}:{cell_start}' payload = { "range": range_val, "image": image_data, "name": f"combined_image.{image_type}" } try: resp = request("POST", url, headers, payload) return resp['data']['revision'], resp['data']['updateRange'] except Exception as e: print(f"写入拼接图片失败: {e}") return None, None return None, None def read_range_values(self, access_token, doctoken, range_val): """ 读取指定范围的数据 Args: access_token: 访问令牌 doctoken: 表格token range_val: 范围,格式如 "Sheet1!A1:C10" Returns: 读取到的数据列表 """ url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/values/{range_val}" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': f'Bearer {access_token}' } try: resp = request("GET", url, headers) return resp['data']['valueRange']['values'] except Exception as e: print(f"读取数据失败: {e}") return [] def prepend_data(self, access_token, doctoken, range_val, values): """ 在指定位置前面插入数据 Args: access_token: 访问令牌 doctoken: 表格token range_val: 插入范围,格式如 "Sheet1!A1:C1" values: 要插入的数据 Returns: 操作结果 """ url = f"{self._host}/open-apis/sheets/v3/spreadsheets/{doctoken}/sheets/{range_val.split('!')[0]}/prepend" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': f'Bearer {access_token}' } # 从range_val中提取行数 range_part = range_val.split('!')[1] # 如 "A1:Z1" start_cell = range_part.split(':')[0] # 如 "A1" payload = { "values": values } try: resp = request("POST", url, headers, payload) return resp except Exception as e: print(f"插入数据失败: {e}") return None def insert_data_at_row(self, access_token, doctoken, sheetid, row, values): """ 在指定行插入数据(使用批量更新方式) Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID row: 目标行号 values: 要插入的数据 Returns: 操作结果 """ # 使用批量更新的方式插入数据 cols = len(values[0]) if values else 1 end_col = column_id(cols) range_val = f"{sheetid}!A{row}:{end_col}{row}" body = { "valueRanges": [ { "range": range_val, "values": values } ] } try: result = self.batch_update_values(access_token, doctoken, body) return result except Exception as e: print(f"插入数据到第{row}行失败: {e}") return None def insert_rows_before(self, access_token, doctoken, sheetid, row_index, count=1): """ 在指定行前插入新行(基于飞书官方API) Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID row_index: 插入位置的行号(从1开始,在此行前插入) count: 插入行数(默认1行) Returns: 操作结果 """ # 先获取工作表信息,检查当前行数 sheet_props = self.get_sheet_properties(access_token, doctoken, sheetid) if not sheet_props: print("无法获取工作表信息,尝试直接插入") current_row_count = 1000 # 默认值 else: current_row_count = sheet_props['row_count'] print(f"当前工作表行数: {current_row_count}") # 如果要插入的位置超过了当前行数,使用追加模式 if row_index > current_row_count: print(f"插入位置({row_index})超过当前行数({current_row_count}),使用追加模式") # 使用追加方式在末尾添加行 return self.append_empty_rows(access_token, doctoken, sheetid, count) url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/insert_dimension_range" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': f'Bearer {access_token}' } # 转换为0基索引:row_index=3表示第3行,对应startIndex=2 start_index = row_index - 1 # 从0开始计数 end_index = start_index + count # 结束位置(不包含) # 确保 endIndex 不超过当前工作表的行数限制 if end_index > current_row_count: print(f"警告:计算的endIndex({end_index})超过当前行数({current_row_count}),调整为追加模式") return self.append_empty_rows(access_token, doctoken, sheetid, count) # 智能选择继承样式:插入第2行时继承后面的数据行样式,其他情况继承前面的样式 inherit_style = "AFTER" if row_index == 2 else "BEFORE" payload = { "dimension": { "sheetId": sheetid, "majorDimension": "ROWS", "startIndex": start_index, # 从0开始计数 "endIndex": end_index # 结束位置(不包含此行) }, "inheritStyle": inherit_style # 智能继承样式 } try: resp = request("POST", url, headers, payload) print(f"在第{row_index}行前成功插入{count}行(startIndex={start_index}, endIndex={end_index}, inheritStyle={inherit_style})") return resp except Exception as e: print(f"在第{row_index}行前插入{count}行失败: {e}") # 如果插入失败,尝试追加模式 print("尝试使用追加模式...") return self.append_empty_rows(access_token, doctoken, sheetid, count) def insert_row_with_images(self, access_token, doctoken, sheetid, row, values, compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)): """ 在指定行插入数据并同时处理图片写入(覆盖方式) Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID row: 目标行号 values: 要插入的数据 compress_image: 是否压缩图片 grid_width: 拼接图片的列数 grid_height: 拼接图片的行数 border_width: 边框宽度 border_color: 边框颜色 Returns: 操作结果 """ # 1. 先插入文本数据(覆盖指定行) result = self.insert_data_at_row(access_token, doctoken, sheetid, row, values) if not result: return None # 2. 同时处理图片写入 if values and len(values) > 0: row_data = values[0] for col_index, cell in enumerate(row_data, start=1): if is_image_list_cell_url(cell): # 处理图片列表 try: img_urls = eval(cell) self.write_images_to_cell(access_token, doctoken, sheetid, img_urls, row, col_index, compress_image, grid_width, grid_height, border_width, border_color) except Exception as e: print(f"写入图片列表失败 (第{row}行第{col_index}列): {e}") elif is_image_cell(cell): # 处理单张图片 image_type = get_image_type(cell) if image_type: try: self.write_image_to_cell(access_token, doctoken, sheetid, cell, row, col_index, image_type, compress_image) except Exception as e: print(f"写入单张图片失败 (第{row}行第{col_index}列): {e}") return result def update_specific_fields(self, access_token, doctoken, sheetid, row, field_updates, headers=None): """ 只更新指定字段,其他字段保持不变 Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID row: 目标行号(从1开始) field_updates: 字段更新字典,格式如 {"列名": "新值", "列B": "新值B"} 或者 {列索引: "新值", 2: "新值B"}(从1开始计数) headers: 表头列表,用于列名到列索引的映射。如果为None,则field_updates的key必须是列索引 Returns: 操作结果 """ try: # 如果提供了headers且field_updates的key是列名,则转换为列索引 if headers and field_updates: column_updates = {} for field_name, value in field_updates.items(): if isinstance(field_name, str): # 如果是列名 try: col_index = headers.index(field_name) + 1 # 转为1基索引 column_updates[col_index] = value except ValueError: print(f"警告:找不到列名 '{field_name}',跳过更新") continue else: # 如果已经是列索引 column_updates[field_name] = value else: column_updates = field_updates # 构建批量更新请求 value_ranges = [] for col_index, value in column_updates.items(): col_letter = column_id(col_index) range_val = f"{sheetid}!{col_letter}{row}:{col_letter}{row}" value_ranges.append({ "range": range_val, "values": [[value]] }) body = { "valueRanges": value_ranges } result = self.batch_update_values(access_token, doctoken, body) if result: updated_fields = list(column_updates.keys()) print(f"成功更新第{row}行的字段: {updated_fields}") return result except Exception as e: print(f"更新第{row}行指定字段失败: {e}") return None def update_row_with_specific_fields_and_images(self, access_token, doctoken, sheetid, row, field_updates, headers=None, compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)): """ 更新指定字段并处理图片 Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID row: 目标行号 field_updates: 字段更新字典 headers: 表头列表 compress_image: 是否压缩图片 grid_width: 拼接图片的列数 grid_height: 拼接图片的行数 border_width: 边框宽度 border_color: 边框颜色 Returns: 操作结果 """ # 1. 先更新文本数据 result = self.update_specific_fields(access_token, doctoken, sheetid, row, field_updates, headers) if not result: return None # 2. 处理图片写入 column_updates = {} if headers and field_updates: for field_name, value in field_updates.items(): if isinstance(field_name, str): # 如果是列名 try: col_index = headers.index(field_name) + 1 column_updates[col_index] = value except ValueError: continue else: # 如果已经是列索引 column_updates[field_name] = value else: column_updates = field_updates for col_index, cell in column_updates.items(): if is_image_list_cell_url(cell): # 处理图片列表 try: img_urls = eval(cell) self.write_images_to_cell(access_token, doctoken, sheetid, img_urls, row, col_index, compress_image, grid_width, grid_height, border_width, border_color) except Exception as e: print(f"写入图片列表失败 (第{row}行第{col_index}列): {e}") elif is_image_cell(cell): # 处理单张图片 image_type = get_image_type(cell) if image_type: try: self.write_image_to_cell(access_token, doctoken, sheetid, cell, row, col_index, image_type, compress_image) except Exception as e: print(f"写入单张图片失败 (第{row}行第{col_index}列): {e}") return result def insert_row_with_data_at_position(self, access_token, doctoken, sheetid, row_position, values, compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)): """ 在指定位置真正插入新行并填入数据 Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID row_position: 插入位置(从1开始,在此行前插入) values: 要插入的数据 compress_image: 是否压缩图片 grid_width: 拼接图片的列数 grid_height: 拼接图片的行数 border_width: 边框宽度 border_color: 边框颜色 Returns: 操作结果 """ # 获取当前工作表行数 sheet_props = self.get_sheet_properties(access_token, doctoken, sheetid) current_row_count = sheet_props['row_count'] if sheet_props else 1 # 1. 先插入空行 insert_result = self.insert_rows_before(access_token, doctoken, sheetid, row_position, 1) if not insert_result: print(f"插入空行失败,无法在第{row_position}行插入数据") return None # 如果是追加模式(插入位置超过了原有行数),实际数据位置是当前行数+1 actual_row_position = row_position if row_position > current_row_count: actual_row_position = current_row_count + 1 print(f"追加模式:实际数据插入位置调整为第{actual_row_position}行") # 2. 再在新插入的行中填入数据 result = self.insert_data_at_row(access_token, doctoken, sheetid, actual_row_position, values) if not result: print(f"插入数据失败") return None # 3. 同时处理图片写入 if values and len(values) > 0: row_data = values[0] for col_index, cell in enumerate(row_data, start=1): if is_image_list_cell_url(cell): # 处理图片列表 try: img_urls = eval(cell) self.write_images_to_cell(access_token, doctoken, sheetid, img_urls, actual_row_position, col_index, compress_image, grid_width, grid_height, border_width, border_color) except Exception as e: print(f"写入图片列表失败 (第{actual_row_position}行第{col_index}列): {e}") elif is_image_cell(cell): # 处理单张图片 image_type = get_image_type(cell) if image_type: try: self.write_image_to_cell(access_token, doctoken, sheetid, cell, actual_row_position, col_index, image_type, compress_image) except Exception as e: print(f"写入单张图片失败 (第{actual_row_position}行第{col_index}列): {e}") return result def get_sheet_info(self, access_token, doctoken, sheetid): """ 获取工作表的基础信息 Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID Returns: 工作表信息,包含行数、列数等 """ url = f"{self._host}/open-apis/sheets/v3/spreadsheets/{doctoken}/sheets/{sheetid}" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': f'Bearer {access_token}' } try: resp = request("GET", url, headers) return resp['data']['sheet'] except Exception as e: print(f"获取工作表信息失败: {e}") return None def get_sheet_properties(self, access_token, doctoken, sheetid): """ 获取工作表属性,包括行数和列数 Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID Returns: dict: 包含 row_count, column_count 等信息 """ sheet_info = self.get_sheet_info(access_token, doctoken, sheetid) if sheet_info: grid_properties = sheet_info.get('grid_properties', {}) return { 'row_count': grid_properties.get('row_count', 0), 'column_count': grid_properties.get('column_count', 0), 'title': sheet_info.get('title', ''), 'sheet_id': sheet_info.get('sheet_id', ''), 'sheet_type': sheet_info.get('sheet_type', '') } return None def append_data(self, access_token, doctoken, range_val, values): """ 在指定位置后面追加数据 Args: access_token: 访问令牌 doctoken: 表格token range_val: 追加范围,格式如 "Sheet1!A1:C1" values: 要追加的数据 Returns: 操作结果 """ url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/values_append" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': f'Bearer {access_token}' } payload = { "valueRange": { "range": range_val, "values": values } } try: resp = request("POST", url, headers, payload) return resp except Exception as e: print(f"追加数据失败: {e}") return None def delete_rows(self, access_token, doctoken, sheetid, start_row, end_row): """ 删除指定范围的行 Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID start_row: 开始行号(从1开始) end_row: 结束行号(从1开始,包含) Returns: 操作结果 """ url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/dimension_range" headers = { 'Content-Type': 'application/json; charset=utf-8', 'Authorization': f'Bearer {access_token}' } payload = { "dimension": { "sheetId": sheetid, "majorDimension": "ROWS", "startIndex": start_row, # 从1开始计数,包含 "endIndex": end_row # 从1开始计数,包含 } } try: resp = request("DELETE", url, headers, payload) return resp except Exception as e: print(f"删除第{start_row}-{end_row}行失败: {e}") return None def delete_single_row(self, access_token, doctoken, sheetid, row): """ 删除单行 Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID row: 行号(从1开始) Returns: 操作结果 """ return self.delete_rows(access_token, doctoken, sheetid, row, row) def append_empty_rows(self, access_token, doctoken, sheetid, count=1): """ 在工作表末尾追加空行 Args: access_token: 访问令牌 doctoken: 表格token sheetid: 工作表ID count: 追加行数(默认1行) Returns: 操作结果 """ # 获取当前工作表信息 sheet_props = self.get_sheet_properties(access_token, doctoken, sheetid) if not sheet_props: print("无法获取工作表信息,追加失败") return None current_row_count = sheet_props['row_count'] current_col_count = sheet_props['column_count'] print(f"在工作表末尾追加{count}行,当前行数: {current_row_count}") # 构造空数据行 empty_values = [[''] * max(current_col_count, 1) for _ in range(count)] # 使用append_data在末尾追加 range_val = f"{sheetid}!A{current_row_count + 1}:{column_id(max(current_col_count, 1))}{current_row_count + count}" try: result = self.append_data(access_token, doctoken, range_val, empty_values) if result: print(f"成功在末尾追加{count}行空行") return result except Exception as e: print(f"追加空行失败: {e}") return None # -*- coding: UTF-8 -*- import json import logging from datetime import datetime import re import os import requests from urllib.parse import urlparse LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s" logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) import os logging.info(os.getcwd()) def column_id(col): '''column int to string id''' ans = "" i = col while i > 0: m = int((i-1) % 26) i = int((i-1) / 26) ans = chr(m+65) + ans return ans def get_image_type(url): '''根据图片URL获取图片类型''' try: # 发送 HEAD 请求以获取头信息 path = urlparse(url).path ext = path.split('.')[-1].lower() if ext in ['jpg', 'jpeg', 'png', 'gif']: return ext ext = 'jpeg' if 'jpg' in url: ext = 'jpg' if 'jpeg' in url: ext = 'jpeg' if 'png' in url: ext = 'png' if 'gif' in url: ext = 'gif' if "webp" in url: ext = "webp" # 如果无法确定类型,返回 None return ext except Exception as e: print(f"获取图片类型时出错: {str(e)}") return None def is_image_cell(cell): # 判断是否包含中文字符 if isinstance(cell, str): for char in cell: if '\u4e00' <= char <= '\u9fff': return False is_image = False if ( isinstance(cell, str) and cell.startswith('http') and ( re.match(r'https?://.+\.(jpg|jpeg|png|gif|webp).*', cell, re.I) or re.match(r'http?://.+\.(jpg|jpeg|png|gif|webp).*', cell, re.I) or ('xhscdn.com' in cell and 'format/jpg' in cell) or ('rescdn.yishihui.com' in cell and 'jpg' in cell) or 'sns-webpic-qc.xhscdn.com' in cell or 'ci.xiaohongshu.com' in cell ) ): is_image = True return is_image def is_image_list_cell_url(cell): if isinstance(cell, str) and cell.strip() and cell[0] == '[' and cell[-1] == ']': try: cell_obj = eval(cell) except: return False if type(cell_obj) == list: for c in cell_obj: if not is_image_cell(c): return False return True return False def write_images(client, access_token, token, sheetid, data, start_row=1, start_col=1, skip_col=[], compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)): '''将图片URL写入单元格''' for row_index, row in enumerate(data, start=1): if row_index < start_row: print(f"跳过行: {row_index}") continue for col_index, cell in enumerate(row, start=1): # if cell is not None and "http" in cell and is_image_cell(cell) is False: # print(f"is_image_cell = {is_image_cell(cell)}, {cell}") if col_index < start_col: continue if col_index in skip_col: continue if is_image_list_cell_url(cell): # print(f"is_image_list_cell_url = True , {cell}") client.write_images_to_cell(access_token, token, sheetid, eval(cell), row_index, col_index, compress_image, grid_width, grid_height, border_width, border_color) elif is_image_cell(cell): image_type = get_image_type(cell) if image_type: client.write_image_to_cell(access_token, token, sheetid, cell, row_index, col_index,image_type, compress_image) def merge_cells(client, access_token, token, sheetid, data ): row_cnt = len(data) col_cnt = len(data[0]) for col in range(0,col_cnt): previous_row = 0 previous_value = None for row in range(0,row_cnt): cell_value = data[row][col] if cell_value != previous_value : if row - previous_row > 1: client.merge_cells(access_token, token, sheetid, previous_row+1, row, col+1, col+1) previous_row = row previous_value= cell_value def pack_data(data, sheetid, start_row=1, start_col=1): rows = len(data) cols = len(data[0]) range1 = f"{sheetid}!{column_id(start_col)}{start_row}:{column_id(cols)}{rows}" body = { "valueRanges": [ { "range": range1, "values": [] }, ] } print(range1) for d in data[start_row-1:]: row = [] for c in d[start_col-1:]: row.append(c) body["valueRanges"][0]["values"].append(row) return body def write_data_to_sheet(data, sheet_token='IoTOsjZ4khIqlOtTxnec8oTbn7c', sheetid=None, skip_text=False, skip_images=False, start_row=1, start_col=1, skip_col=[], compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)): '''测试函数''' # 初始化 API 客户端 client = Client(LARK_HOST) # 获取租户访问令牌 access_token = client.get_tenant_access_token(APP_ID, APP_SECRET) # 获取第一个 sheet_id if sheetid is None: sheetid = client.get_sheetid(access_token, sheet_token) print(f"Sheet ID: {sheetid}") # 构建并写入测试数据 body = pack_data(data, sheetid, start_row=start_row, start_col=start_col) if not skip_text: client.batch_update_values(access_token, sheet_token, body) # merge_cells(client, access_token, sheet_token, sheetid, data) # 写入图片 if not skip_images: write_images(client, access_token, sheet_token, sheetid, data, start_row=start_row, start_col=start_col, skip_col=skip_col, compress_image=compress_image, grid_width=grid_width, grid_height=grid_height, border_width=border_width, border_color=border_color) def get_test_data(): data = [ ["标题1", "标题2", "标题3", "图片"], [1, 2,2, "http://sns-webpic.xhscdn.com/1040g2sg316vc6tdrk4705o8h0c2095f1else4i8?imageView2/2/w/0/format/jpg/v3"], [4, "https://cdn.midjourney.com/f78df4d5-9b8b-4ec7-ae34-5cc04d176f87/0_0.png", 6, "dd"], # [7, 8, 9, "https://sns-webpic.xhscdn.com/1040g2sg317l7814ck4705n3aa5ik4jgjahhcam0?imageView2/2/w/0/format/jpg/v3"], ] return data from typing import List, Dict import pandas as pd import json def to_feishu( res_list: List[Dict], sheet_id: str = 'Qn9MAs', sheet_token: str = 'Rbsysi6FChzCp7tfv19crkWNnEb', start_row: int = 1, start_col: int = 1, grid_width: int = None, grid_height: int = None, border_width: int = 3, border_color: tuple = (200, 200, 200), ) -> None: """ 将数据导出到飞书表格 Args: res_list: 数据列表 sheet_id: 表格ID sheet_token: 表格token start_row: 起始行 start_col: 起始列 grid_width: 拼接图片的列数,如果为None则自动计算 grid_height: 拼接图片的行数,如果为None则自动计算 border_width: 边框宽度,像素 border_color: 边框颜色,RGB元组 """ from tqdm import tqdm def truncate_by_bytes(text, max_bytes=450000): """按字节长度截断文本""" if not text: return "" text_str = str(text) encoded = text_str.encode('utf-8') if len(encoded) <= max_bytes: return text_str # 安全截断,避免截断多字节字符 truncated = encoded[:max_bytes] while len(truncated) > 0: try: return truncated.decode('utf-8') + "...[已截断]" except UnicodeDecodeError: truncated = truncated[:-1] return "" res_new_v4 = [] for row in tqdm(res_list): if not row: continue for k, v in row.items(): if isinstance(v, list): if len(v) > 0 and v[0] and v[0].startswith('http'): row[k] = truncate_by_bytes(str(v)) else: json_str = json.dumps(v, ensure_ascii=False, separators=(',', ':')) row[k] = truncate_by_bytes(json_str) elif isinstance(v, dict): json_str = json.dumps(v, ensure_ascii=False, indent=2) row[k] = truncate_by_bytes(json_str) else: row[k] = truncate_by_bytes(v) res_new_v4.append(row) df = pd.DataFrame(res_new_v4) df.fillna('', inplace=True) header = df.columns.tolist() data_rows = df.values.tolist() data_with_header = [header] + data_rows write_data_to_sheet( data_with_header, sheet_token=sheet_token, sheetid=sheet_id, start_col=start_col, start_row=start_row, grid_width=grid_width, grid_height=grid_height, border_width=border_width, border_color=border_color, ) def to_feishu_incremental( res_list: List[Dict], sort_field: str = '内容ID', sheet_id: str = 'Qn9MAs', sheet_token: str = 'Rbsysi6FChzCp7tfv19crkWNnEb', unique_field: str = None, # 用于去重的唯一字段,默认使用sort_field duplicate_strategy: str = 'skip', # 重复数据处理策略:'skip'跳过, 'delete'删除后插入, 'update'更新 update_fields: List[str] = None, # 当duplicate_strategy='update'时,指定要更新的字段列表。None表示更新所有字段 cleanup_duplicates: bool = True, # 是否先清理现有表格中的重复数据 keep_first: bool = True, # 清理重复数据时保留第一个(True)还是最后一个(False) sort_ascending: bool = False, # 排序顺序:True为升序(从小到大),False为降序(从大到小) grid_width: int = None, grid_height: int = None, border_width: int = 3, border_color: tuple = (200, 200, 200), ) -> None: """ 逐行增量插入数据到飞书表格,按指定字段查找插入位置 Args: res_list: 数据列表 sort_field: 用于排序的字段名,如 '内容ID' sheet_id: 表格ID sheet_token: 表格token unique_field: 用于去重的唯一字段,默认使用sort_field duplicate_strategy: 重复数据处理策略 - 'skip': 跳过重复数据(默认) - 'delete': 删除重复数据后插入新数据 - 'update': 更新重复数据的指定字段 update_fields: 当duplicate_strategy='update'时,指定要更新的字段列表 - None: 更新所有字段(除了unique_field) - ['字段1', '字段2']: 只更新指定的字段 cleanup_duplicates: 是否先清理现有表格中的重复数据 keep_first: 清理重复数据时保留第一个(True)还是最后一个(False) sort_ascending: 排序顺序,True为升序(从小到大),False为降序(从大到小),默认False grid_width: 拼接图片的列数,如果为None则自动计算 grid_height: 拼接图片的行数,如果为None则自动计算 border_width: 边框宽度,像素 border_color: 边框颜色,RGB元组 """ from tqdm import tqdm import pandas as pd import json from typing import List def truncate_by_bytes(text, max_bytes=450000): """按字节长度截断文本""" if not text: return "" text_str = str(text) encoded = text_str.encode('utf-8') if len(encoded) <= max_bytes: return text_str # 安全截断,避免截断多字节字符 truncated = encoded[:max_bytes] while len(truncated) > 0: try: return truncated.decode('utf-8') + "...[已截断]" except UnicodeDecodeError: truncated = truncated[:-1] return "" # 初始化 API 客户端 client = Client(LARK_HOST) access_token = client.get_tenant_access_token(APP_ID, APP_SECRET) # 设置去重字段,默认使用排序字段 if unique_field is None: unique_field = sort_field # 1. 获取工作表基础信息 print("正在获取工作表信息...") sheet_props = client.get_sheet_properties(access_token, sheet_token, sheet_id) if not sheet_props: print("获取工作表信息失败,使用默认范围") max_col = 'ZZ' max_row = 1000 else: print(f"工作表信息: 行数={sheet_props['row_count']}, 列数={sheet_props['column_count']}") max_col = column_id(sheet_props['column_count']) if sheet_props['column_count'] > 0 else 'ZZ' max_row = sheet_props['row_count'] if sheet_props['row_count'] > 0 else 1000 # 2. 读取表头(使用精确范围) print("正在读取表头...") header_range = f"{sheet_id}!A1:{max_col}1" # 表头总是从A列开始读取 header_data = client.read_range_values(access_token, sheet_token, header_range) if not header_data or not header_data[0] or all(not cell.strip() for cell in header_data[0] if cell): print("表格为空,需要根据数据创建表头") # 从第一条数据中提取字段名作为表头 if not res_list or not res_list[0]: print("错误:无法从空数据中创建表头") return # 提取字段名 headers = list(res_list[0].keys()) print(f"创建表头: {headers}") # 写入表头(表头不包含图片,使用普通插入即可) header_range = f"{sheet_id}!A1:{column_id(len(headers))}1" client.insert_data_at_row(access_token, sheet_token, sheet_id, 1, [headers]) # 表头创建后,从第二行开始插入数据 print("表头创建完成,开始插入数据...") else: # 解析现有表头 headers = [cell.strip() for cell in header_data[0] if cell is not None] headers = [h for h in headers if h] # 移除空字段 print(f"读取到现有表头: {headers}") # 检查排序字段和去重字段是否存在 if sort_field not in headers: print(f"警告: 排序字段 '{sort_field}' 未在表头中找到。可用字段: {headers}") # 如果找不到排序字段,就直接追加到末尾 # 使用工作表信息中的行数,或从第二行开始(如果刚创建了表头) start_row = len(headers) + 1 if 'headers' in locals() else (max_row + 1 if sheet_props else 2) to_feishu(res_list, sheet_id, sheet_token, start_row, 1, grid_width, grid_height, border_width, border_color) return if unique_field not in headers: print(f"警告: 去重字段 '{unique_field}' 未在表头中找到,将使用排序字段 '{sort_field}' 进行去重") unique_field = sort_field sort_field_index = headers.index(sort_field) sort_field_col = column_id(sort_field_index + 1) # 转换为列标识符,如A, B, C... unique_field_index = headers.index(unique_field) unique_field_col = column_id(unique_field_index + 1) # 转换为列标识符,如A, B, C... # 3. 读取排序字段和去重字段的数据 print(f"正在读取排序字段 '{sort_field}' 和去重字段 '{unique_field}' 列数据...") # 读取排序字段数据 sort_data_range = f"{sheet_id}!{sort_field_col}2:{sort_field_col}{max_row}" all_sort_data = client.read_range_values(access_token, sheet_token, sort_data_range) # 读取去重字段数据(如果与排序字段不同) if unique_field != sort_field: unique_data_range = f"{sheet_id}!{unique_field_col}2:{unique_field_col}{max_row}" all_unique_data = client.read_range_values(access_token, sheet_token, unique_data_range) else: all_unique_data = all_sort_data # 先清理空白行(排序字段和去重字段都为空的行) print("检查并清理空白行...") empty_rows_to_delete = [] if all_unique_data and all_sort_data: for i in range(min(len(all_unique_data), len(all_sort_data))): unique_row = all_unique_data[i] if i < len(all_unique_data) else None sort_row = all_sort_data[i] if i < len(all_sort_data) else None # 检查去重字段值 unique_value = "" if unique_row and len(unique_row) > 0 and unique_row[0]: unique_value = str(unique_row[0]).strip() # 检查排序字段值 sort_value = "" if sort_row and len(sort_row) > 0 and sort_row[0]: sort_value = str(sort_row[0]).strip() # 如果排序字段和去重字段都为空,标记为空白行 if not unique_value and not sort_value: row_number = i + 2 # +2 因为从第2行开始,且行号从1开始 empty_rows_to_delete.append(row_number) print(f"标记删除空白行: 第{row_number}行") # 删除空白行 if empty_rows_to_delete: print(f"开始删除 {len(empty_rows_to_delete)} 个空白行...") # 按行号倒序删除,避免删除后行号变化的问题 empty_rows_to_delete.sort(reverse=True) for row_to_delete in empty_rows_to_delete: delete_result = client.delete_single_row(access_token, sheet_token, sheet_id, row_to_delete) if delete_result: print(f"成功删除空白行: 第{row_to_delete}行") else: print(f"删除空白行失败: 第{row_to_delete}行") # 重新读取数据(删除后数据已经改变) print("重新读取数据(清理空白行后)...") # 重新读取排序字段数据 sort_data_range = f"{sheet_id}!{sort_field_col}2:{sort_field_col}{max_row}" all_sort_data = client.read_range_values(access_token, sheet_token, sort_data_range) # 重新读取去重字段数据 if unique_field != sort_field: unique_data_range = f"{sheet_id}!{unique_field_col}2:{unique_field_col}{max_row}" all_unique_data = client.read_range_values(access_token, sheet_token, unique_data_range) else: all_unique_data = all_sort_data # 构建现有数据的去重集合 duplicate_rows_to_delete = [] if cleanup_duplicates and all_unique_data: # 先分析重复数据 seen_unique_values = {} # 记录已见过的唯一值和对应行号 actual_data_rows = [] # 记录实际有数据的行号 print(f"开始分析重复数据,总共读取了 {len(all_unique_data)} 行数据") # 先找出所有有效数据行及其对应的实际行号(必须同时有排序字段和去重字段的值) for i in range(min(len(all_unique_data), len(all_sort_data) if all_sort_data else 0)): unique_row = all_unique_data[i] if i < len(all_unique_data) else None sort_row = all_sort_data[i] if i < len(all_sort_data) else None # 检查去重字段值 unique_value = "" if unique_row and len(unique_row) > 0 and unique_row[0]: unique_value = str(unique_row[0]).strip() # 检查排序字段值 sort_value = "" if sort_row and len(sort_row) > 0 and sort_row[0]: sort_value = str(sort_row[0]).strip() # 只有当排序字段和去重字段都有值时,才认为是有效数据 if unique_value and sort_value: actual_row_number = i + 2 # +2 因为从第2行开始,且行号从1开始 actual_data_rows.append((actual_row_number, unique_value, sort_value)) print(f"找到 {len(actual_data_rows)} 行有效数据") # 分析重复数据 for actual_row_number, unique_value, sort_value in actual_data_rows: if unique_value in seen_unique_values: # 发现重复数据 if keep_first: # 保留第一个,删除当前这个 duplicate_rows_to_delete.append(actual_row_number) print(f"标记删除重复行: 第{actual_row_number}行 ({unique_field}={unique_value}, {sort_field}={sort_value})") else: # 保留最后一个,删除之前的 previous_row = seen_unique_values[unique_value] duplicate_rows_to_delete.append(previous_row) print(f"标记删除重复行: 第{previous_row}行 ({unique_field}={unique_value}, {sort_field}={sort_value})") seen_unique_values[unique_value] = actual_row_number else: # 第一次见到这个唯一值 seen_unique_values[unique_value] = actual_row_number # 执行清理:删除重复行 if duplicate_rows_to_delete: print(f"开始清理 {len(duplicate_rows_to_delete)} 行重复数据...") # 按行号倒序删除,避免删除后行号变化的问题 duplicate_rows_to_delete.sort(reverse=True) for row_to_delete in duplicate_rows_to_delete: delete_result = client.delete_single_row(access_token, sheet_token, sheet_id, row_to_delete) if delete_result: print(f"成功删除重复行: 第{row_to_delete}行") else: print(f"删除重复行失败: 第{row_to_delete}行") # 重新读取数据(删除后数据已经改变) print("重新读取排序和去重字段数据...") # 重新读取排序字段数据 sort_data_range = f"{sheet_id}!{sort_field_col}2:{sort_field_col}{max_row}" all_sort_data = client.read_range_values(access_token, sheet_token, sort_data_range) # 重新读取去重字段数据 if unique_field != sort_field: unique_data_range = f"{sheet_id}!{unique_field_col}2:{unique_field_col}{max_row}" all_unique_data = client.read_range_values(access_token, sheet_token, unique_data_range) else: all_unique_data = all_sort_data # 构建最终的去重集合(处理清理后的数据,必须同时有排序字段和去重字段的值) existing_unique_values = set() existing_unique_rows = {} # 用于update策略:{unique_value: row_number} if all_unique_data and all_sort_data: for i in range(min(len(all_unique_data), len(all_sort_data))): unique_row = all_unique_data[i] if i < len(all_unique_data) else None sort_row = all_sort_data[i] if i < len(all_sort_data) else None # 检查去重字段值 unique_value = "" if unique_row and len(unique_row) > 0 and unique_row[0]: unique_value = str(unique_row[0]).strip() # 检查排序字段值 sort_value = "" if sort_row and len(sort_row) > 0 and sort_row[0]: sort_value = str(sort_row[0]).strip() # 只有当排序字段和去重字段都有值时,才添加到去重集合 if unique_value and sort_value: actual_row_number = i + 2 # +2 因为从第2行开始,且行号从1开始 existing_unique_values.add(unique_value) existing_unique_rows[unique_value] = actual_row_number print(f"现有去重值数量: {len(existing_unique_values)}") print(existing_unique_values) # 获取排序数据用于插入位置计算(基于清理后的最新数据) sort_data = [] if all_sort_data: # 同时检查排序字段和去重字段,确保数据完整性 for i in range(min(len(all_sort_data), len(all_unique_data) if all_unique_data else 0)): sort_row = all_sort_data[i] if i < len(all_sort_data) else None unique_row = all_unique_data[i] if i < len(all_unique_data) else None # 检查排序字段值 sort_value = "" if sort_row and len(sort_row) > 0 and sort_row[0]: sort_value = str(sort_row[0]).strip() # 检查去重字段值 unique_value = "" if unique_row and len(unique_row) > 0 and unique_row[0]: unique_value = str(unique_row[0]).strip() # 只有当排序字段和去重字段都有值时,才加入排序数据 if sort_value and unique_value: sort_data.append([sort_value]) if not sort_data: print("未读取到排序字段数据,所有新数据将从第二行开始插入") # 处理新数据 processed_data = [] for row in tqdm(res_list, desc="处理数据"): if not row: continue processed_row = {} for k, v in row.items(): if isinstance(v, list): if len(v) > 0 and v[0] and str(v[0]).startswith('http'): processed_row[k] = truncate_by_bytes(str(v)) else: json_str = json.dumps(v, ensure_ascii=False, indent=1) processed_row[k] = truncate_by_bytes(json_str) elif isinstance(v, dict): json_str = json.dumps(v, ensure_ascii=False, indent=1) processed_row[k] = truncate_by_bytes(json_str) else: processed_row[k] = truncate_by_bytes(v) processed_data.append(processed_row) # 转换为DataFrame以便操作 df_new = pd.DataFrame(processed_data) df_new.fillna('', inplace=True) # 确保新数据包含所有必要的列 for header in headers: if header not in df_new.columns: df_new[header] = '' # 按表头顺序重新排列列 df_new = df_new.reindex(columns=headers, fill_value='') # 预处理:过滤重复数据并确定插入顺序 print(f"预处理新数据:过滤重复并排序...") print(f"传入数据总量: {len(df_new)} 行") print(f"现有去重集合大小: {len(existing_unique_values)}") valid_rows = [] update_rows = [] # 需要更新的行:[{row_number, values, unique_value}, ...] skipped_count = 0 new_data_duplicates = 0 # 新数据内部重复计数 updated_count = 0 # 更新计数 for idx, new_row in df_new.iterrows(): new_row_values = new_row.tolist() new_sort_value = str(new_row_values[sort_field_index]) new_unique_value = str(new_row_values[unique_field_index]) # 检查是否与现有数据重复 if new_unique_value in existing_unique_values: if duplicate_strategy == 'update': # 更新策略:记录需要更新的行 target_row = existing_unique_rows[new_unique_value] update_rows.append({ 'row_number': target_row, 'values': new_row_values, 'unique_value': new_unique_value }) print(f"标记更新现有数据: 第{target_row}行 {unique_field}={new_unique_value}") updated_count += 1 continue elif duplicate_strategy == 'delete': # 删除策略:先删除现有行,再插入新数据 target_row = existing_unique_rows[new_unique_value] delete_result = client.delete_single_row(access_token, sheet_token, sheet_id, target_row) if delete_result: print(f"成功删除重复行: 第{target_row}行 {unique_field}={new_unique_value}") # 从去重集合中移除,允许后续插入 existing_unique_values.remove(new_unique_value) # 更新所有行号(删除后后面的行号会前移) for key, row_num in existing_unique_rows.items(): if row_num > target_row: existing_unique_rows[key] = row_num - 1 del existing_unique_rows[new_unique_value] else: print(f"删除重复行失败: 第{target_row}行 {unique_field}={new_unique_value}") skipped_count += 1 continue else: # 'skip' 策略 print(f"跳过与现有数据重复: {unique_field}={new_unique_value}") skipped_count += 1 continue # 检查新数据内部是否重复 already_processed = any(row['unique_value'] == new_unique_value for row in valid_rows) if already_processed: print(f"跳过新数据内部重复: {unique_field}={new_unique_value}") new_data_duplicates += 1 continue # 添加到待插入列表 valid_rows.append({ 'values': new_row_values, 'sort_value': new_sort_value, 'unique_value': new_unique_value }) print(f"预处理完成:有效数据 {len(valid_rows)} 行,需要更新 {len(update_rows)} 行,跳过与现有重复 {skipped_count} 行,跳过新数据内部重复 {new_data_duplicates} 行") # 处理更新操作 if update_rows: print(f"开始执行更新操作,共 {len(update_rows)} 行...") for update_data in tqdm(update_rows, desc="更新数据"): row_number = update_data['row_number'] new_values = update_data['values'] unique_value = update_data['unique_value'] # 构建字段更新字典 if update_fields is None: # 更新所有字段,但排除unique_field(避免修改关键字段) field_updates = {} for i, header in enumerate(headers): if header != unique_field: # 不更新去重字段 field_updates[header] = new_values[i] print(f"更新第{row_number}行所有字段(除了{unique_field}): {unique_value}") else: # 只更新指定字段 field_updates = {} for field_name in update_fields: if field_name in headers: field_index = headers.index(field_name) field_updates[field_name] = new_values[field_index] else: print(f"警告:字段 '{field_name}' 不存在于表头中,跳过") print(f"更新第{row_number}行指定字段 {list(field_updates.keys())}: {unique_value}") # 执行更新 if field_updates: result = client.update_row_with_specific_fields_and_images( access_token, sheet_token, sheet_id, row_number, field_updates, headers, True, grid_width, grid_height, border_width, border_color ) if result: print(f"✅ 成功更新第{row_number}行") else: print(f"❌ 更新第{row_number}行失败") if not valid_rows: if update_rows: print("所有数据均为更新操作,无新数据需要插入") else: print("没有新数据需要插入") return # 按排序字段排序新数据(根据sort_ascending参数决定排序方向) if sort_ascending: # 升序排序:小的值先插入(reverse=False) valid_rows.sort(key=lambda x: x['sort_value'], reverse=False) print(f"新数据排序完成,将按升序插入") else: # 降序排序:大的值先插入(reverse=True) valid_rows.sort(key=lambda x: x['sort_value'], reverse=True) print(f"新数据排序完成,将按降序插入") # 逐行插入已排序的数据 for i, row_data in tqdm(enumerate(valid_rows), total=len(valid_rows), desc="插入数据"): new_row_values = row_data['values'] new_sort_value = row_data['sort_value'] new_unique_value = row_data['unique_value'] # 找到合适的插入位置(根据sort_ascending参数确定排序方向) insert_row = len(sort_data) + 2 # 默认插入到末尾 print(f"查找插入位置,新值: {new_sort_value}") # 找到两个相邻ID之间的正确插入位置 if sort_ascending: # 升序排列:小 → 大,需要找到 prev_value < new_value < current_value 的位置 for j in range(len(sort_data)): current_value = str(sort_data[j][0]) if sort_data[j] and len(sort_data[j]) > 0 else "" prev_value = str(sort_data[j-1][0]) if j > 0 and sort_data[j-1] and len(sort_data[j-1]) > 0 else None # 检查是否应该插入到当前位置 if prev_value is None: # 这是第一个位置,检查是否应该插入到最前面 if new_sort_value < current_value: insert_row = j + 2 # +2 因为表头偏移 print(f" 插入到最前面第{insert_row}行: 新值{new_sort_value} < 第一个值{current_value}") break else: # 检查是否在两个相邻值之间 if new_sort_value >= prev_value and new_sort_value < current_value: insert_row = j + 2 # +2 因为表头偏移 print(f" 插入到第{insert_row}行: {prev_value} <= {new_sort_value} < {current_value}") break elif new_sort_value == current_value: # 值相等时插入到相等值之后 insert_row = j + 3 # +2(表头偏移) +1(插入到此行之后) print(f" 插入到第{insert_row}行: 新值{new_sort_value} = 现有值{current_value},插入其后") break # 如果遍历完都没有找到位置,说明新值是最大的,插入到末尾 if insert_row == len(sort_data) + 2: last_value = str(sort_data[-1][0]) if sort_data and sort_data[-1] and len(sort_data[-1]) > 0 else "无" print(f" 插入到末尾第{insert_row}行: 新值{new_sort_value} > 最后一个值{last_value}") else: # 降序排列:大 → 小,需要找到 prev_value > new_value > current_value 的位置 for j in range(len(sort_data)): current_value = str(sort_data[j][0]) if sort_data[j] and len(sort_data[j]) > 0 else "" prev_value = str(sort_data[j-1][0]) if j > 0 and sort_data[j-1] and len(sort_data[j-1]) > 0 else None # 检查是否应该插入到当前位置 if prev_value is None: # 这是第一个位置,检查是否应该插入到最前面 if new_sort_value > current_value: insert_row = j + 2 # +2 因为表头偏移 print(f" 插入到最前面第{insert_row}行: 新值{new_sort_value} > 第一个值{current_value}") break else: # 检查是否在两个相邻值之间 if new_sort_value <= prev_value and new_sort_value > current_value: insert_row = j + 2 # +2 因为表头偏移 print(f" 插入到第{insert_row}行: {prev_value} >= {new_sort_value} > {current_value}") break elif new_sort_value == current_value: # 值相等时插入到相等值之后 insert_row = j + 3 # +2(表头偏移) +1(插入到此行之后) print(f" 插入到第{insert_row}行: 新值{new_sort_value} = 现有值{current_value},插入其后") break # 如果遍历完都没有找到位置,说明新值是最小的,插入到末尾 if insert_row == len(sort_data) + 2: last_value = str(sort_data[-1][0]) if sort_data and sort_data[-1] and len(sort_data[-1]) > 0 else "无" print(f" 插入到末尾第{insert_row}行: 新值{new_sort_value} < 最后一个值{last_value}") print(f"[{i+1}/{len(valid_rows)}] 最终插入位置: 第 {insert_row} 行: {sort_field}={new_sort_value}") # 插入数据到指定行(真正插入新行) result = client.insert_row_with_data_at_position(access_token, sheet_token, sheet_id, insert_row, [new_row_values], True, grid_width, grid_height, border_width, border_color) if result: print(f"成功插入数据和图片到第 {insert_row} 行") # 更新sort_data:在正确的位置添加新的排序值 sort_data_index = insert_row - 2 # 转换为sort_data的索引(-2因为表头偏移) sort_data.insert(sort_data_index, [new_sort_value]) # 更新去重集合 existing_unique_values.add(new_unique_value) else: print(f"插入数据到第 {insert_row} 行失败") if __name__ == "__main__": # data = get_test_data() # sheet_token = 'IoTOsjZ4khIqlOtTxnec8oTbn7c' # sheetid = 'K9c4LG' # write_data_to_sheet(data, sheetid=sheetid) # is_image_cell_result = is_image_cell('["http://sns-webpic-qc.xhscdn.com/202501021415/1a6e88908930afce92b09206d5a482f8/1040g2sg31b74rf6k7g5g5oo7i8vkgev59lkjet0!nd_whlt34_webp_wm_1","http://sns-webpic-qc.xhscdn.com/202501021415/1a6e88908930afce92b09206d5a482f8/1040g2sg31b74rf6k7g5g5oo7i8vkgev59lkjet0!nd_whlt34_webp_wm_1"]') # print(is_image_cell_result) # 新增函数使用示例 """ 示例:使用 to_feishu_incremental 增量插入数据 # 测试数据 test_data = [ { '内容ID': '1001', '标题': '测试标题1', '内容': '测试内容1', '图片': '["http://example.com/image1.jpg", "http://example.com/image2.jpg"]' }, { '内容ID': '1003', '标题': '测试标题2', '内容': '测试内容2', '图片': 'http://example.com/image3.jpg' } ] # 调用增量插入函数 to_feishu_incremental( res_list=test_data, sort_field='内容ID', # 按此字段排序 sheet_id='your_sheet_id', sheet_token='your_sheet_token', unique_field='内容ID', # 去重字段,默认使用sort_field duplicate_strategy='update', # 重复处理策略:'skip'跳过, 'delete'删除后插入, 'update'更新指定字段 update_fields=['标题', '内容', '图片'], # 当strategy='update'时,只更新这些字段 cleanup_duplicates=True, # 先清理现有表格中的重复数据 keep_first=True, # 清理时保留第一个重复项 sort_ascending=False, # 排序顺序:False为降序(大→小),True为升序(小→大) grid_width=2, # 图片拼接列数 grid_height=2, # 图片拼接行数 ) # 排序方向示例: # 示例1:按时间戳降序排序(最新的在前面)- 适合新闻、动态等时间敏感内容 to_feishu_incremental( res_list=news_data, sort_field='发布时间', sort_ascending=False, # 降序,最新时间在前面 # ... 其他参数 ) # 示例2:按ID升序排序(从小到大)- 适合有明确编号顺序的内容 to_feishu_incremental( res_list=product_data, sort_field='产品ID', sort_ascending=True, # 升序,小ID在前面 # ... 其他参数 ) # 示例3:按优先级降序排序(高优先级在前面)- 适合任务、问题等需要优先级管理的内容 to_feishu_incremental( res_list=task_data, sort_field='优先级', sort_ascending=False, # 降序,高优先级在前面 # ... 其他参数 ) 功能说明: 1. **智能表头处理**: - 如果表格为空,自动从数据中提取字段名创建表头 - 如果表格已有数据,读取现有表头结构 2. **空白行清理**: - 自动检测并删除排序字段和去重字段都为空的空白行 - 确保数据的连续性和逻辑一致性 3. **重复数据清理**: - cleanup_duplicates=True: 先清理现有表格中的重复数据 - keep_first: 保留第一个或最后一个重复项 4. **智能去重检查**: - 基于 unique_field 字段检查数据是否已存在 - 预处理阶段过滤重复数据,避免插入过程中的状态变化问题 5. **排序插入**:根据指定的 sort_field 字段和 sort_ascending 参数查找插入位置 - sort_ascending=False(默认):降序排序,较大的值插入到较前面的位置 - sort_ascending=True:升序排序,较小的值插入到较前面的位置 6. **逐行数据插入**:按排序顺序逐行插入数据,保持表格整体有序 7. **完整图片支持**:自动处理图片写入,支持单张图片和图片列表 8. **图片拼接功能**:支持多图拼接,可设置拼接的行列数和边框样式 适用场景: - ✅ 空表格:自动创建表头并插入数据 - ✅ 已有重复数据的表格:先清理重复,再智能插入 - ✅ 增量数据更新:逐条插入,保持排序,自动去重 - ✅ 重复运行安全:不会插入重复数据 - ✅ 数据清理:一键清理现有重复数据 - ✅ 灵活排序:支持升序和降序两种排序方式 """