import os
import json

# Encodings attempted by read_text_file, in this exact order.
_CANDIDATE_ENCODINGS = ('utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5')

# A strict decode is only accepted when it yields more than this many
# characters; shorter results fall through to the next candidate.
_MIN_ACCEPTED_CHARS = 100

# Number of leading characters of each file stored in the JSON report.
_PREVIEW_CHARS = 3000


def read_text_file(filepath):
    """Read a text file, trying several encodings in turn.

    Each encoding in ``_CANDIDATE_ENCODINGS`` is tried in order with strict
    decoding. The first one that decodes without error AND produces more
    than ``_MIN_ACCEPTED_CHARS`` characters wins. If none qualifies, the raw
    bytes are decoded as UTF-8 with undecodable bytes dropped.

    Args:
        filepath: Path of the file to read.

    Returns:
        A ``(content, encoding)`` tuple. ``encoding`` is the name of the
        codec that succeeded, or ``'utf-8-ignore'`` when the lossy fallback
        was used. Returns ``(None, None)`` if the file cannot be opened or
        read.
    """
    for encoding in _CANDIDATE_ENCODINGS:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                content = f.read()
        except (UnicodeError, LookupError):
            # Wrong (or unavailable) codec for this file: try the next one.
            continue
        except OSError:
            # The file itself is unreadable; no other encoding will help.
            return None, None
        if len(content) > _MIN_ACCEPTED_CHARS:
            return content, encoding

    # Last resort: decode as UTF-8 and silently drop undecodable bytes.
    # Files of _MIN_ACCEPTED_CHARS characters or fewer also end up here.
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()
    except OSError:
        return None, None
    return raw_data.decode('utf-8', errors='ignore'), 'utf-8-ignore'


def main(input_dir="input", output_path='samples_data.json'):
    """Sample every ``.txt`` file in *input_dir* and write a JSON report.

    For each file the report stores the detected encoding, the decoded
    length in characters and the first ``_PREVIEW_CHARS`` characters. Files
    that could not be read, or whose content is empty, get an ``error``
    entry instead. Progress is printed to stdout.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.txt`` files.
        output_path: Path of the JSON report to write.

    Returns:
        The dict that was written to *output_path*, keyed by file name.
    """
    results = {}
    txt_files = [f for f in os.listdir(input_dir) if f.endswith('.txt')]

    for filename in txt_files:
        filepath = os.path.join(input_dir, filename)
        print(f"Reading: {filename}")
        content, encoding = read_text_file(filepath)

        # Truthiness check on purpose: an empty file is reported as failed.
        if content:
            results[filename] = {
                'encoding': encoding,
                'length': len(content),
                'first_3000': content[:_PREVIEW_CHARS],
            }
            print(f" Success - Encoding: {encoding}, Length: {len(content)}")
        else:
            results[filename] = {'error': 'Failed to read'}
            print(" Failed")

    # Save the results.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\nProcessed {len(results)} files")
    return results


if __name__ == "__main__":
    main()