# 读取 文件/视频分析报告.slxs文件，逐行将第5列数据的JSON字符串读取出来，取出 segments 字段 写到第7列，剩余字段写到第6列

import pandas as pd
import json
import logging
import re

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    # 读取 Excel 文件
    input_file = '文件/视频分析报告.xlsx'  # Changed extension to .xlsx
    output_file = '文件/视频分析报告_new.xlsx'
    
    logger.info(f"Reading file: {input_file}")
    df = pd.read_excel(input_file)
    
    # 获取第5列的列名
    column_6 = df.columns[5]  # 0-based index for 5th column
    
    # 逐行读取第5列数据的JSON字符串
    for index, row in df.iterrows():
        try:
            json_str = str(row[column_6])
            if not json_str or json_str == 'nan':
                logger.warning(f"Empty or invalid JSON string at row {index + 1}")
                continue
            
            # Debug print with more details
            logger.info(f"=== Row {index + 1} Debug ===")
            logger.info(f"Raw data type: {type(row[column_6])}")
            logger.info(f"Raw data first 10 chars: {repr(str(row[column_6])[:10])}")
            logger.info(f"Raw data last 10 chars: {repr(str(row[column_6])[-10:])}")
            logger.info(f"String length: {len(json_str)}")
            logger.info(f"First 100 chars: {repr(json_str[:100])}")
            logger.info("===================")
            
            # Try to clean the string
            json_str = json_str.strip()
            # Remove any potential BOM or special characters at the start
            json_str = json_str.lstrip('\ufeff')
            
            # Convert Python-style single quotes to JSON double quotes
            # First, replace all single quotes that are not inside double quotes
            json_str = re.sub(r'(?<!\\)"([^"]*?)(?<!\\)"', r'__DOUBLE_QUOTED__\1__END__', json_str)  # Save double-quoted strings
            json_str = json_str.replace("'", '"')  # Convert single quotes to double quotes
            json_str = re.sub(r'__DOUBLE_QUOTED__([^_]*?)__END__', r'"\1"', json_str)  # Restore double-quoted strings
            
            # Ensure it starts with { and ends with }
            if not json_str.startswith('{'):
                json_str = '{' + json_str
            if not json_str.endswith('}'):
                json_str = json_str + '}'
            
            # Add quotes around property names if missing
            json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', json_str)
            
            logger.info(f"Cleaned JSON string first 100 chars: {repr(json_str[:100])}")
            
            try:
                data = json.loads(json_str)
            except json.JSONDecodeError as je:
                logger.error(f"JSON decode error details: {str(je)}")
                logger.error(f"Error position: {je.pos}")
                logger.error(f"Line: {je.lineno}, Column: {je.colno}")
                logger.error(f"Character at error: {repr(json_str[je.pos:je.pos+10])}")
                raise
            
            print(data)
            segments = data.get('segments', [])
            
            # 移除segments字段后的剩余数据
            remaining_data = {k: v for k, v in data.items() if k != 'segments'}
            remaining_json = json.dumps(remaining_data, ensure_ascii=False)
            
            # 更新数据
            df.loc[index, '第7列'] = json.dumps(segments, ensure_ascii=False)
            df.loc[index, '第6列'] = remaining_json
            
        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing error at row {index + 1}: {str(e)}")
        except Exception as e:
            logger.error(f"Error processing row {index + 1}: {str(e)}")
    
    # 保存到新的Excel文件
    logger.info(f"Saving to: {output_file}")
    df.to_excel(output_file, index=False)
    logger.info("Processing completed successfully")
    
except Exception as e:
    logger.error(f"An error occurred: {str(e)}")