| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- import os
- import json
def read_text_file(filepath, encodings=None, min_length=100):
    """Read a text file, trying several encodings in turn.

    Each candidate encoding is tried in order. The first one that decodes
    the whole file without error AND yields more than ``min_length``
    characters wins. If no candidate qualifies, the raw bytes are decoded
    as UTF-8 with undecodable bytes silently dropped.

    Note that with the default ``min_length`` a short file never passes the
    strict decode and always ends up in the lossy fallback; pass
    ``min_length=0`` to accept any non-empty strict decode.

    Args:
        filepath: Path of the file to read.
        encodings: Optional iterable of codec names to try, in order.
            Defaults to utf-8, gbk, gb2312, gb18030, utf-16, big5.
        min_length: A strict decode is accepted only when it is longer
            than this many characters.

    Returns:
        ``(content, encoding)`` where ``encoding`` is the codec that worked,
        or ``'utf-8-ignore'`` when the lossy fallback was used.
        ``(None, None)`` if the file cannot be opened or read at all.
    """
    if encodings is None:
        encodings = ('utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5')

    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                content = f.read()
        except (UnicodeError, LookupError):
            # Wrong guess (or unknown codec name): move on to the next one.
            # UnicodeError must be listed before ValueError, its base class.
            continue
        except (OSError, ValueError):
            # The path itself is unusable; no other encoding will help.
            return None, None
        if content and len(content) > min_length:
            return content, encoding

    # Last resort: decode as UTF-8 and drop whatever bytes do not fit.
    # Unlike the text-mode reads above, this path does not translate
    # line endings.
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()
    except (OSError, ValueError):
        return None, None
    return raw_data.decode('utf-8', errors='ignore'), 'utf-8-ignore'
# Scan the input directory and record a short sample of every .txt file.
input_dir = "input"
results = {}
txt_files = [name for name in os.listdir(input_dir) if name.endswith('.txt')]

for filename in txt_files:
    filepath = os.path.join(input_dir, filename)
    print(f"Reading: {filename}")

    content, encoding = read_text_file(filepath)
    if not content:
        # Covers both an unreadable file and one that decoded to nothing.
        results[filename] = {'error': 'Failed to read'}
        print(" Failed")
        continue

    length = len(content)
    results[filename] = {
        'encoding': encoding,
        'length': length,
        'first_3000': content[:3000],
    }
    print(f" Success - Encoding: {encoding}, Length: {length}")

# Save the results as indented JSON, keeping non-ASCII text unescaped.
with open('samples_data.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\nProcessed {len(results)} files")
|