test.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. # 读取 文件/视频分析报告.xlsx文件,逐行将第5列数据的JSON字符串读取出来,提取segments字段并处理
  2. import pandas as pd
  3. import json
  4. import logging
  5. import re
  6. # Set up logging
  7. logging.basicConfig(level=logging.INFO)
  8. logger = logging.getLogger(__name__)
  9. def clean_json_string(json_str):
  10. """清理和格式化JSON字符串"""
  11. # 移除BOM和空白字符
  12. json_str = json_str.strip().lstrip('\ufeff')
  13. # 处理Python风格的字符串(单引号转双引号)
  14. json_str = re.sub(r'(?<!\\)"([^"]*?)(?<!\\)"', r'__DOUBLE_QUOTED__\1__END__', json_str)
  15. json_str = json_str.replace("'", '"')
  16. json_str = re.sub(r'__DOUBLE_QUOTED__([^_]*?)__END__', r'"\1"', json_str)
  17. # 确保是有效的JSON数组或对象
  18. if not (json_str.startswith('[') or json_str.startswith('{')):
  19. json_str = '[' + json_str
  20. if not (json_str.endswith(']') or json_str.endswith('}')):
  21. json_str = json_str + ']'
  22. return json_str
  23. try:
  24. # 读取 Excel 文件
  25. input_file = '文件/视频分析报告.xlsx'
  26. output_file = '文件/视频分析报告_new.xlsx'
  27. logger.info(f"Reading file: {input_file}")
  28. df = pd.read_excel(input_file)
  29. # 获取第5列的列名
  30. column_5 = df.columns[4] # 0-based index for 5th column
  31. # 逐行处理数据
  32. for index, row in df.iterrows():
  33. try:
  34. json_str = str(row[column_5])
  35. if not json_str or json_str == 'nan':
  36. logger.warning(f"Empty or invalid JSON string at row {index + 1}")
  37. continue
  38. # 清理和格式化JSON字符串
  39. json_str = clean_json_string(json_str)
  40. # 解析JSON数据
  41. try:
  42. data = json.loads(json_str)
  43. # 如果数据是字典,尝试获取segments字段
  44. if isinstance(data, dict):
  45. segments = data.get('segments', [])
  46. remaining_data = {k: v for k, v in data.items() if k != 'segments'}
  47. # 如果数据是列表,直接使用
  48. elif isinstance(data, list):
  49. segments = data
  50. remaining_data = {}
  51. else:
  52. logger.warning(f"Unexpected data type at row {index + 1}: {type(data)}")
  53. continue
  54. # 更新数据
  55. df.loc[index, '第7列'] = json.dumps(segments, ensure_ascii=False)
  56. df.loc[index, '第6列'] = json.dumps(remaining_data, ensure_ascii=False)
  57. except json.JSONDecodeError as je:
  58. logger.error(f"JSON decode error at row {index + 1}: {str(je)}")
  59. logger.error(f"Error position: {je.pos}")
  60. logger.error(f"Character at error: {repr(json_str[je.pos:je.pos+10])}")
  61. continue
  62. except Exception as e:
  63. logger.error(f"Error processing row {index + 1}: {str(e)}")
  64. continue
  65. # 保存到新的Excel文件
  66. logger.info(f"Saving to: {output_file}")
  67. df.to_excel(output_file, index=False)
  68. logger.info("Processing completed successfully")
  69. except Exception as e:
  70. logger.error(f"An error occurred: {str(e)}")