# 读取 文件/视频分析报告.slxs文件,逐行将第5列数据的JSON字符串读取出来,取出 segments 字段 写到第7列,剩余字段写到第6列 import pandas as pd import json import logging import re # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) try: # 读取 Excel 文件 input_file = '文件/视频分析报告.xlsx' # Changed extension to .xlsx output_file = '文件/视频分析报告_new.xlsx' logger.info(f"Reading file: {input_file}") df = pd.read_excel(input_file) # 获取第5列的列名 column_6 = df.columns[5] # 0-based index for 5th column # 逐行读取第5列数据的JSON字符串 for index, row in df.iterrows(): try: json_str = str(row[column_6]) if not json_str or json_str == 'nan': logger.warning(f"Empty or invalid JSON string at row {index + 1}") continue # Debug print with more details logger.info(f"=== Row {index + 1} Debug ===") logger.info(f"Raw data type: {type(row[column_6])}") logger.info(f"Raw data first 10 chars: {repr(str(row[column_6])[:10])}") logger.info(f"Raw data last 10 chars: {repr(str(row[column_6])[-10:])}") logger.info(f"String length: {len(json_str)}") logger.info(f"First 100 chars: {repr(json_str[:100])}") logger.info("===================") # Try to clean the string json_str = json_str.strip() # Remove any potential BOM or special characters at the start json_str = json_str.lstrip('\ufeff') # Convert Python-style single quotes to JSON double quotes # First, replace all single quotes that are not inside double quotes json_str = re.sub(r'(?