"""Sample the text content of every .txt, .pdf and .docx file in a directory.

For each supported file the script records the detected format, the total
text length and the first 3000 characters, then writes the collected
summary to a JSON file.
"""

import os
import json

# Number of leading characters stored under the 'first_3000' key.
_PREVIEW_CHARS = 3000

# Candidate encodings, tried in this order by read_text_file().
_TEXT_ENCODINGS = ('utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5')


def read_text_file(filepath, min_length=100):
    """Read a text file, trying several encodings in turn.

    Each encoding in ``_TEXT_ENCODINGS`` is tried in order. The first one
    that decodes the file without error AND yields more than ``min_length``
    characters wins. If none qualifies, the raw bytes are decoded as UTF-8
    with undecodable bytes dropped.

    Args:
        filepath: Path of the file to read.
        min_length: A decoded result is accepted only when it is strictly
            longer than this many characters. Files at or below the
            threshold always end up in the lossy UTF-8 fallback.

    Returns:
        A ``(content, encoding)`` tuple. ``encoding`` is the name of the
        encoding that succeeded, or ``'utf-8-ignore'`` when the fallback
        was used. ``(None, None)`` is returned when the file cannot be
        read at all.
    """
    for encoding in _TEXT_ENCODINGS:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                content = f.read()
        except (OSError, ValueError):
            # ValueError covers UnicodeDecodeError; OSError covers I/O
            # failures. KeyboardInterrupt/SystemExit are deliberately not
            # swallowed here.
            continue
        if len(content) > min_length:
            return content, encoding

    # Last resort: decode as UTF-8 and silently drop invalid bytes.
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()
    except OSError:
        return None, None
    return raw_data.decode('utf-8', errors='ignore'), 'utf-8-ignore'


def _extract_pdf_text(reader_cls, filepath):
    """Return the text of every page, each followed by a newline."""
    with open(filepath, 'rb') as f:
        pdf_reader = reader_cls(f)
        # `or ""` is defensive: if a backend returns None for a page with
        # no extractable text, concatenating it would raise TypeError.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
        )


def read_pdf_file(filepath):
    """Extract the text of a PDF file using pypdf, falling back to PyPDF2.

    Args:
        filepath: Path of the PDF file.

    Returns:
        ``(text, 'PDF')`` on success. If both backends fail (missing
        package or unreadable file), ``(error_message, None)`` where the
        message contains both failures.
    """
    # Broad `except Exception` is intentional: the backends are optional
    # third-party packages and can fail with many different error types.
    try:
        import pypdf
        return _extract_pdf_text(pypdf.PdfReader, filepath), 'PDF'
    except Exception as e:
        try:
            import PyPDF2
            return _extract_pdf_text(PyPDF2.PdfReader, filepath), 'PDF'
        except Exception as e2:
            return f"Error: {str(e)}, {str(e2)}", None


def read_docx_file(filepath):
    """Extract the paragraph text of a DOCX file using python-docx.

    Args:
        filepath: Path of the DOCX file.

    Returns:
        ``(text, 'DOCX')`` with paragraphs joined by newlines on success,
        or ``(error_message, None)`` if the package is missing or the file
        cannot be parsed.
    """
    try:
        import docx
        doc = docx.Document(filepath)
        text = "\n".join(para.text for para in doc.paragraphs)
        return text, 'DOCX'
    except Exception as e:
        return f"Error: {str(e)}", None


def _build_entry(file_format, content, encoding=None):
    """Build the summary dict stored for one successfully read file."""
    entry = {'format': file_format}
    if encoding is not None:
        entry['encoding'] = encoding
    entry['length'] = len(content)
    entry['first_3000'] = content[:_PREVIEW_CHARS]
    return entry


def main(input_dir="input", output_path="samples_data.json"):
    """Summarise every supported file in ``input_dir`` into a JSON file.

    Regular files whose names end in .txt, .pdf or .docx (matched
    case-insensitively) are read; everything else is skipped. Entries are
    processed in sorted name order so the output is deterministic.

    Args:
        input_dir: Directory to scan (not recursive).
        output_path: Path of the JSON file to write.

    Returns:
        The results dict that was written, keyed by file name.
    """
    results = {}

    for filename in sorted(os.listdir(input_dir)):
        filepath = os.path.join(input_dir, filename)

        if not os.path.isfile(filepath):
            continue

        print(f"Processing: {filename}")

        # Compare on the lowercased name so 'REPORT.PDF' is not skipped.
        lowered = filename.lower()

        if lowered.endswith('.txt'):
            content, encoding = read_text_file(filepath)
            if content:
                results[filename] = _build_entry('TXT', content, encoding)
                print(f" TXT - Encoding: {encoding}, Length: {len(content)}")
            else:
                results[filename] = {'error': 'Failed to read'}

        elif lowered.endswith(('.pdf', '.docx')):
            if lowered.endswith('.pdf'):
                label, reader = 'PDF', read_pdf_file
            else:
                label, reader = 'DOCX', read_docx_file

            # On failure the reader returns (error_message, None).
            content, file_type = reader(filepath)
            if file_type:
                results[filename] = _build_entry(label, content)
                print(f" {label} - Length: {len(content)}")
            else:
                results[filename] = {'error': content}
                print(f" {label} Error: {content}")

    # Save the results.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\nTotal: {len(results)} files processed")
    return results


if __name__ == "__main__":
    main()