test.py 4.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. # 读取 文件/视频分析报告.slxs文件,逐行将第5列数据的JSON字符串读取出来,取出 segments 字段 写到第7列,剩余字段写到第6列
  2. import pandas as pd
  3. import json
  4. import logging
  5. import re
  6. # Set up logging
  7. logging.basicConfig(level=logging.INFO)
  8. logger = logging.getLogger(__name__)
  9. try:
  10. # 读取 Excel 文件
  11. input_file = '文件/视频分析报告.xlsx' # Changed extension to .xlsx
  12. output_file = '文件/视频分析报告_new.xlsx'
  13. logger.info(f"Reading file: {input_file}")
  14. df = pd.read_excel(input_file)
  15. # 获取第5列的列名
  16. column_6 = df.columns[5] # 0-based index for 5th column
  17. # 逐行读取第5列数据的JSON字符串
  18. for index, row in df.iterrows():
  19. try:
  20. json_str = str(row[column_6])
  21. if not json_str or json_str == 'nan':
  22. logger.warning(f"Empty or invalid JSON string at row {index + 1}")
  23. continue
  24. # Debug print with more details
  25. logger.info(f"=== Row {index + 1} Debug ===")
  26. logger.info(f"Raw data type: {type(row[column_6])}")
  27. logger.info(f"Raw data first 10 chars: {repr(str(row[column_6])[:10])}")
  28. logger.info(f"Raw data last 10 chars: {repr(str(row[column_6])[-10:])}")
  29. logger.info(f"String length: {len(json_str)}")
  30. logger.info(f"First 100 chars: {repr(json_str[:100])}")
  31. logger.info("===================")
  32. # Try to clean the string
  33. json_str = json_str.strip()
  34. # Remove any potential BOM or special characters at the start
  35. json_str = json_str.lstrip('\ufeff')
  36. # Convert Python-style single quotes to JSON double quotes
  37. # First, replace all single quotes that are not inside double quotes
  38. json_str = re.sub(r'(?<!\\)"([^"]*?)(?<!\\)"', r'__DOUBLE_QUOTED__\1__END__', json_str) # Save double-quoted strings
  39. json_str = json_str.replace("'", '"') # Convert single quotes to double quotes
  40. json_str = re.sub(r'__DOUBLE_QUOTED__([^_]*?)__END__', r'"\1"', json_str) # Restore double-quoted strings
  41. # Ensure it starts with { and ends with }
  42. if not json_str.startswith('{'):
  43. json_str = '{' + json_str
  44. if not json_str.endswith('}'):
  45. json_str = json_str + '}'
  46. # Add quotes around property names if missing
  47. json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', json_str)
  48. logger.info(f"Cleaned JSON string first 100 chars: {repr(json_str[:100])}")
  49. try:
  50. data = json.loads(json_str)
  51. except json.JSONDecodeError as je:
  52. logger.error(f"JSON decode error details: {str(je)}")
  53. logger.error(f"Error position: {je.pos}")
  54. logger.error(f"Line: {je.lineno}, Column: {je.colno}")
  55. logger.error(f"Character at error: {repr(json_str[je.pos:je.pos+10])}")
  56. raise
  57. print(data)
  58. segments = data.get('segments', [])
  59. # 移除segments字段后的剩余数据
  60. remaining_data = {k: v for k, v in data.items() if k != 'segments'}
  61. remaining_json = json.dumps(remaining_data, ensure_ascii=False)
  62. # 更新数据
  63. df.loc[index, '第7列'] = json.dumps(segments, ensure_ascii=False)
  64. df.loc[index, '第6列'] = remaining_json
  65. except json.JSONDecodeError as e:
  66. logger.error(f"JSON parsing error at row {index + 1}: {str(e)}")
  67. except Exception as e:
  68. logger.error(f"Error processing row {index + 1}: {str(e)}")
  69. # 保存到新的Excel文件
  70. logger.info(f"Saving to: {output_file}")
  71. df.to_excel(output_file, index=False)
  72. logger.info("Processing completed successfully")
  73. except Exception as e:
  74. logger.error(f"An error occurred: {str(e)}")