12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- # 读取 文件/视频分析报告.slxs文件,逐行将第5列数据的JSON字符串读取出来,取出 segments 字段 写到第7列,剩余字段写到第6列
- import pandas as pd
- import json
- import logging
- import re
- # Set up logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
- try:
- # 读取 Excel 文件
- input_file = '文件/视频分析报告.xlsx' # Changed extension to .xlsx
- output_file = '文件/视频分析报告_new.xlsx'
-
- logger.info(f"Reading file: {input_file}")
- df = pd.read_excel(input_file)
-
- # 获取第5列的列名
- column_6 = df.columns[5] # 0-based index for 5th column
-
- # 逐行读取第5列数据的JSON字符串
- for index, row in df.iterrows():
- try:
- json_str = str(row[column_6])
- if not json_str or json_str == 'nan':
- logger.warning(f"Empty or invalid JSON string at row {index + 1}")
- continue
-
- # Debug print with more details
- logger.info(f"=== Row {index + 1} Debug ===")
- logger.info(f"Raw data type: {type(row[column_6])}")
- logger.info(f"Raw data first 10 chars: {repr(str(row[column_6])[:10])}")
- logger.info(f"Raw data last 10 chars: {repr(str(row[column_6])[-10:])}")
- logger.info(f"String length: {len(json_str)}")
- logger.info(f"First 100 chars: {repr(json_str[:100])}")
- logger.info("===================")
-
- # Try to clean the string
- json_str = json_str.strip()
- # Remove any potential BOM or special characters at the start
- json_str = json_str.lstrip('\ufeff')
-
- # Convert Python-style single quotes to JSON double quotes
- # First, replace all single quotes that are not inside double quotes
- json_str = re.sub(r'(?<!\\)"([^"]*?)(?<!\\)"', r'__DOUBLE_QUOTED__\1__END__', json_str) # Save double-quoted strings
- json_str = json_str.replace("'", '"') # Convert single quotes to double quotes
- json_str = re.sub(r'__DOUBLE_QUOTED__([^_]*?)__END__', r'"\1"', json_str) # Restore double-quoted strings
-
- # Ensure it starts with { and ends with }
- if not json_str.startswith('{'):
- json_str = '{' + json_str
- if not json_str.endswith('}'):
- json_str = json_str + '}'
-
- # Add quotes around property names if missing
- json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', json_str)
-
- logger.info(f"Cleaned JSON string first 100 chars: {repr(json_str[:100])}")
-
- try:
- data = json.loads(json_str)
- except json.JSONDecodeError as je:
- logger.error(f"JSON decode error details: {str(je)}")
- logger.error(f"Error position: {je.pos}")
- logger.error(f"Line: {je.lineno}, Column: {je.colno}")
- logger.error(f"Character at error: {repr(json_str[je.pos:je.pos+10])}")
- raise
-
- print(data)
- segments = data.get('segments', [])
-
- # 移除segments字段后的剩余数据
- remaining_data = {k: v for k, v in data.items() if k != 'segments'}
- remaining_json = json.dumps(remaining_data, ensure_ascii=False)
-
- # 更新数据
- df.loc[index, '第7列'] = json.dumps(segments, ensure_ascii=False)
- df.loc[index, '第6列'] = remaining_json
-
- except json.JSONDecodeError as e:
- logger.error(f"JSON parsing error at row {index + 1}: {str(e)}")
- except Exception as e:
- logger.error(f"Error processing row {index + 1}: {str(e)}")
-
- # 保存到新的Excel文件
- logger.info(f"Saving to: {output_file}")
- df.to_excel(output_file, index=False)
- logger.info("Processing completed successfully")
-
- except Exception as e:
- logger.error(f"An error occurred: {str(e)}")
|