#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Sample the documents in an input directory and dump previews to JSON.

Supported inputs are ``.txt`` (with encoding detection), ``.pdf`` (via
PyPDF2) and ``.docx`` (via python-docx).  For every readable file the total
length plus the first 500 and first 3000 characters are recorded.
"""

import codecs
import json
import os

# Candidate encodings, tried in this order with *strict* decoding.  latin1
# maps every byte to a code point, so it can never fail and must stay last.
_TEXT_ENCODINGS = ('utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5', 'latin1')

_DEFAULT_INPUT_DIR = "examples/analyze_story/input"
_DEFAULT_OUTPUT_PATH = "examples/analyze_story/samples_data.json"

# Only the leading pages of a PDF are sampled.
_MAX_PDF_PAGES = 50


def read_text_file(filepath):
    """Read a text file, detecting its encoding by strict trial decoding.

    The file is read once as bytes.  A UTF-16 byte-order mark is honoured
    first; otherwise each encoding in ``_TEXT_ENCODINGS`` is tried in order
    and the first one that decodes without error wins.  Decoding is strict on
    purpose: with ``errors='ignore'`` the first candidate would always
    "succeed" and silently drop every byte it could not decode.

    Line endings are normalised to ``\\n``, matching what text-mode ``open``
    would have produced.

    Args:
        filepath: Path of the file to read.

    Returns:
        A ``(content, encoding)`` tuple, or ``(None, None)`` when the file
        cannot be opened or read.
    """
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()
    except OSError:
        return None, None

    def _decode(encoding, errors='strict'):
        text = raw_data.decode(encoding, errors=errors)
        return text.replace('\r\n', '\n').replace('\r', '\n')

    # A BOM is an explicit signal; trust it before guessing.
    if raw_data.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        try:
            return _decode('utf-16'), 'utf-16'
        except UnicodeError:
            pass  # Malformed UTF-16 despite the BOM: fall back to guessing.

    for encoding in _TEXT_ENCODINGS:
        try:
            return _decode(encoding), encoding
        except UnicodeError:
            continue

    # Only reachable if the always-succeeding latin1 entry is ever removed.
    return _decode('utf-8', errors='ignore'), 'utf-8 (with errors ignored)'


def read_pdf_file(filepath):
    """Extract the text of the first ``_MAX_PDF_PAGES`` pages of a PDF.

    Args:
        filepath: Path of the PDF file.

    Returns:
        ``(text, 'PDF')`` on success, or ``(error_message, None)`` when
        PyPDF2 is not installed or the file cannot be parsed.
    """
    try:
        import PyPDF2
        with open(filepath, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            pages = pdf_reader.pages[:_MAX_PDF_PAGES]
            # Guard against a page yielding None instead of a string.
            text = "".join(page.extract_text() or "" for page in pages)
        return text, 'PDF'
    except ImportError:
        return "需要安装PyPDF2库", None
    except Exception as e:
        return f"PDF读取错误: {str(e)}", None


def read_docx_file(filepath):
    """Extract the paragraph text of a DOCX file, one paragraph per line.

    Args:
        filepath: Path of the DOCX file.

    Returns:
        ``(text, 'DOCX')`` on success, or ``(error_message, None)`` when
        python-docx is not installed or the file cannot be parsed.
    """
    try:
        import docx
        doc = docx.Document(filepath)
        text = "\n".join(para.text for para in doc.paragraphs)
        return text, 'DOCX'
    except ImportError:
        return "需要安装python-docx库", None
    except Exception as e:
        return f"DOCX读取错误: {str(e)}", None


def _build_sample(label_key, label, content):
    """Return the JSON record stored for one successfully read file."""
    return {
        label_key: label,
        'length': len(content),
        'preview': content[:500],
        'first_3000': content[:3000],
    }


def main(input_dir=_DEFAULT_INPUT_DIR, output_path=_DEFAULT_OUTPUT_PATH):
    """Sample every supported file in *input_dir* and write a JSON summary.

    Extensions are matched case-insensitively.  Files with an unsupported
    extension are announced on stdout but not recorded.  Files are processed
    in sorted name order so the output is deterministic.

    Args:
        input_dir: Directory whose regular files are sampled.
        output_path: Destination JSON file; its parent directory is created
            when missing.

    Raises:
        OSError: If *input_dir* cannot be listed or *output_path* cannot be
            written.
    """
    document_readers = {'.pdf': read_pdf_file, '.docx': read_docx_file}
    results = {}

    for filename in sorted(os.listdir(input_dir)):
        filepath = os.path.join(input_dir, filename)
        if not os.path.isfile(filepath):
            continue

        print(f"\n处理文件: {filename}")
        extension = os.path.splitext(filename)[1].lower()

        if extension == '.txt':
            content, encoding = read_text_file(filepath)
            if content:
                results[filename] = _build_sample('encoding', encoding, content)
                print(f" 编码: {encoding}, 长度: {len(content)}")
            else:
                print(f" 读取失败")
                results[filename] = {'error': '无法读取'}
        elif extension in document_readers:
            content, file_type = document_readers[extension](filepath)
            if file_type:
                results[filename] = _build_sample('type', file_type, content)
                print(f" 类型: {file_type}, 长度: {len(content)}")
            else:
                # On failure the reader returns the error message as content.
                results[filename] = {'error': content}
                print(f" {content}")

    # Save the results.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n\n读取完成,共处理 {len(results)} 个文件")
    print(f"结果已保存到 {output_path}")


if __name__ == '__main__':
    main()