read_txt_files.py 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. import os
  2. import json
  3. def read_text_file(filepath):
  4. """尝试多种编码读取文本文件"""
  5. encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5']
  6. for encoding in encodings:
  7. try:
  8. with open(filepath, 'r', encoding=encoding) as f:
  9. content = f.read()
  10. if content and len(content) > 100:
  11. return content, encoding
  12. except:
  13. continue
  14. # 最后尝试忽略错误
  15. try:
  16. with open(filepath, 'rb') as f:
  17. raw_data = f.read()
  18. content = raw_data.decode('utf-8', errors='ignore')
  19. return content, 'utf-8-ignore'
  20. except:
  21. return None, None
  22. input_dir = "input"
  23. results = {}
  24. txt_files = [f for f in os.listdir(input_dir) if f.endswith('.txt')]
  25. for filename in txt_files:
  26. filepath = os.path.join(input_dir, filename)
  27. print(f"Reading: {filename}")
  28. content, encoding = read_text_file(filepath)
  29. if content:
  30. results[filename] = {
  31. 'encoding': encoding,
  32. 'length': len(content),
  33. 'first_3000': content[:3000]
  34. }
  35. print(f" Success - Encoding: {encoding}, Length: {len(content)}")
  36. else:
  37. results[filename] = {'error': 'Failed to read'}
  38. print(f" Failed")
  39. # 保存结果
  40. with open('samples_data.json', 'w', encoding='utf-8') as f:
  41. json.dump(results, f, ensure_ascii=False, indent=2)
  42. print(f"\nProcessed {len(results)} files")