| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- import os
- import json
def read_text_file(filepath):
    """Read a text file, detecting its encoding by strict trial decoding.

    Each candidate encoding is tried in order and the first one that decodes
    the whole file without error wins. Decoding is deliberately strict: with
    ``errors='ignore'`` the first candidate (UTF-8) can never fail, so files
    in GBK/Big5/etc. would be reported as UTF-8 with their undecodable bytes
    silently dropped.

    Args:
        filepath: Path of the file to read.

    Returns:
        A ``(content, encoding)`` tuple. ``encoding`` is the candidate that
        decoded the file, or ``'utf-8 (with errors ignored)'`` when no
        candidate succeeded and undecodable bytes were dropped as a last
        resort. Returns ``(None, None)`` if the file cannot be opened or read.
    """
    encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5', 'latin1']

    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                return f.read(), encoding
        except UnicodeError:
            # Wrong guess for this file; move on to the next candidate.
            # (Must precede the ValueError handler: UnicodeError subclasses it.)
            continue
        except (OSError, TypeError, ValueError):
            # The path itself is unusable (missing, unreadable, not a valid
            # path); retrying with another encoding cannot help.
            return None, None

    # Last resort if every candidate was rejected: decode as UTF-8 and drop
    # whatever bytes do not fit, flagging that in the reported encoding.
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()
    except OSError:
        return None, None
    return raw_data.decode('utf-8', errors='ignore'), 'utf-8 (with errors ignored)'
def read_pdf_file(filepath, max_pages=50):
    """Extract text from the leading pages of a PDF using PyPDF2.

    Args:
        filepath: Path of the PDF file.
        max_pages: Maximum number of pages to read, counted from the start
            of the document. ``None`` reads every page. Defaults to 50.

    Returns:
        ``(text, 'PDF')`` on success, where ``text`` is the concatenated
        text of the pages read. On failure the first element is a
        human-readable error message and the second is ``None``; callers
        distinguish the two cases by the second element.
    """
    try:
        import PyPDF2
        with open(filepath, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # Join once instead of repeated `+=`. The `or ""` keeps a page
            # that yields no text from failing the whole document
            # (NOTE(review): confirm whether the installed PyPDF2 can return
            # None here; the guard is harmless either way).
            text = "".join(
                page.extract_text() or ""
                for page in pdf_reader.pages[:max_pages]
            )
        return text, 'PDF'
    except ImportError:
        return "需要安装PyPDF2库", None
    except Exception as e:
        # Best-effort reader: any parsing/IO problem is reported as a message
        # rather than raised, so one bad file does not stop a batch run.
        return f"PDF读取错误: {str(e)}", None
def read_docx_file(filepath):
    """Read the paragraph text of a DOCX file via python-docx.

    Args:
        filepath: Path of the DOCX file.

    Returns:
        ``(text, 'DOCX')`` on success, with the document's paragraphs joined
        by newlines. On failure the first element is an error message and
        the second is ``None``.
    """
    try:
        import docx
        document = docx.Document(filepath)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts), 'DOCX'
    except ImportError:
        return "需要安装python-docx库", None
    except Exception as err:
        return "DOCX读取错误: " + str(err), None
- def main():
- input_dir = "examples/analyze_story/input"
- files = os.listdir(input_dir)
-
- results = {}
-
- for filename in files:
- filepath = os.path.join(input_dir, filename)
- if not os.path.isfile(filepath):
- continue
-
- print(f"\n处理文件: {filename}")
-
- if filename.endswith('.txt'):
- content, encoding = read_text_file(filepath)
- if content:
- results[filename] = {
- 'encoding': encoding,
- 'length': len(content),
- 'preview': content[:500],
- 'first_3000': content[:3000]
- }
- print(f" 编码: {encoding}, 长度: {len(content)}")
- else:
- print(f" 读取失败")
- results[filename] = {'error': '无法读取'}
-
- elif filename.endswith('.pdf'):
- content, file_type = read_pdf_file(filepath)
- if file_type:
- results[filename] = {
- 'type': file_type,
- 'length': len(content),
- 'preview': content[:500],
- 'first_3000': content[:3000]
- }
- print(f" 类型: PDF, 长度: {len(content)}")
- else:
- results[filename] = {'error': content}
- print(f" {content}")
-
- elif filename.endswith('.docx'):
- content, file_type = read_docx_file(filepath)
- if file_type:
- results[filename] = {
- 'type': file_type,
- 'length': len(content),
- 'preview': content[:500],
- 'first_3000': content[:3000]
- }
- print(f" 类型: DOCX, 长度: {len(content)}")
- else:
- results[filename] = {'error': content}
- print(f" {content}")
-
- # 保存结果
- with open('examples/analyze_story/samples_data.json', 'w', encoding='utf-8') as f:
- json.dump(results, f, ensure_ascii=False, indent=2)
-
- print(f"\n\n读取完成,共处理 {len(results)} 个文件")
- print("结果已保存到 examples/analyze_story/samples_data.json")
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|