| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- import os
- import json
def read_text_file(filepath):
    """Read a text file, trying several encodings in turn.

    Each candidate encoding is tried with strict decoding, and the first one
    that decodes the whole file without error wins. Short files are accepted
    too: rejecting them would push a short, validly decoded non-UTF-8 file
    into the lossy fallback below and silently drop characters.

    Args:
        filepath: Path of the file to read.

    Returns:
        A ``(content, encoding)`` tuple. ``encoding`` is the name of the
        codec that succeeded, or ``'utf-8-ignore'`` when no candidate worked
        and the bytes were decoded as UTF-8 with undecodable bytes dropped.
        ``(None, None)`` is returned when the file cannot be read at all.
    """
    encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5']

    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                return f.read(), encoding
        except UnicodeError:
            # Wrong guess for this file: move on to the next candidate.
            continue
        except (OSError, ValueError):
            # The file itself is unreadable (missing, permission denied,
            # invalid path); no other encoding can help.
            return None, None

    # Last resort: keep whatever is valid UTF-8 and drop the rest.
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()
    except (OSError, ValueError):
        return None, None
    return raw_data.decode('utf-8', errors='ignore'), 'utf-8-ignore'
def _extract_pdf_text(pdf_module, filepath):
    """Return the text of every page of *filepath*, one newline after each page.

    ``pdf_module`` is a module exposing a ``PdfReader`` class (``pypdf`` or
    ``PyPDF2``); both backends are driven through the same calls.
    """
    with open(filepath, 'rb') as f:
        pdf_reader = pdf_module.PdfReader(f)
        # Build the result with a single join instead of repeated ``+=``.
        return "".join(page.extract_text() + "\n" for page in pdf_reader.pages)


def read_pdf_file(filepath):
    """Extract the text of a PDF file.

    ``pypdf`` is tried first. If importing it or extracting with it fails for
    any reason, ``PyPDF2`` is tried as a fallback.

    Args:
        filepath: Path of the PDF file.

    Returns:
        ``(text, 'PDF')`` on success. On failure the tuple is
        ``(error_message, None)``, where ``error_message`` is a string that
        starts with ``"Error: "`` and contains the messages of both failed
        attempts. Callers must check the second element, not the first.
    """
    try:
        import pypdf
        return _extract_pdf_text(pypdf, filepath), 'PDF'
    except Exception as e:
        # Best-effort fallback to the older package name.
        try:
            import PyPDF2
            return _extract_pdf_text(PyPDF2, filepath), 'PDF'
        except Exception as e2:
            return f"Error: {str(e)}, {str(e2)}", None
def read_docx_file(filepath):
    """Extract the paragraph text of a DOCX file.

    The ``docx`` package is imported lazily inside the function, so a missing
    package is reported through the error return rather than at import time.

    Args:
        filepath: Path of the DOCX file.

    Returns:
        ``(text, 'DOCX')`` on success, where ``text`` is the text of the
        document's paragraphs joined with newlines. On any failure the tuple
        is ``(error_message, None)`` with ``error_message`` starting with
        ``"Error: "``.
    """
    try:
        import docx
        document = docx.Document(filepath)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        joined = "\n".join(paragraph_texts)
    except Exception as exc:
        return f"Error: {str(exc)}", None
    else:
        return joined, 'DOCX'
# Driver: read every regular file in ``input_dir`` with the reader matching
# its extension and write a per-file summary (format, length, first 3000
# characters, or an error message) to ``samples_data.json``.
input_dir = "input"
results = {}

# Readers that share the ``(content, file_type)`` contract, where a falsy
# ``file_type`` means ``content`` holds an error message.
# Each entry is (extension, label, reader).
document_readers = (
    ('.pdf', 'PDF', read_pdf_file),
    ('.docx', 'DOCX', read_docx_file),
)

# os.listdir returns entries in arbitrary order; sorting keeps the console
# log and the JSON key order stable between runs.
for filename in sorted(os.listdir(input_dir)):
    filepath = os.path.join(input_dir, filename)
    if not os.path.isfile(filepath):
        continue

    print(f"Processing: {filename}")

    # Match extensions case-insensitively so e.g. "REPORT.PDF" is not skipped.
    lowered_name = filename.lower()

    if lowered_name.endswith('.txt'):
        content, encoding = read_text_file(filepath)
        if content:
            results[filename] = {
                'format': 'TXT',
                'encoding': encoding,
                'length': len(content),
                'first_3000': content[:3000],
            }
            print(f" TXT - Encoding: {encoding}, Length: {len(content)}")
        else:
            # Empty and unreadable files both end up here.
            results[filename] = {'error': 'Failed to read'}
            print(" TXT Error: Failed to read")
        continue

    for extension, label, reader in document_readers:
        if not lowered_name.endswith(extension):
            continue
        content, file_type = reader(filepath)
        if file_type:
            results[filename] = {
                'format': label,
                'length': len(content),
                'first_3000': content[:3000],
            }
            print(f" {label} - Length: {len(content)}")
        else:
            results[filename] = {'error': content}
            print(f" {label} Error: {content}")
        break

# Save the results.
with open('samples_data.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print(f"\nTotal: {len(results)} files processed")
|