read_all_files.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. import os
  2. import json
  3. def read_text_file(filepath):
  4. """尝试多种编码读取文本文件"""
  5. encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5']
  6. for encoding in encodings:
  7. try:
  8. with open(filepath, 'r', encoding=encoding) as f:
  9. content = f.read()
  10. if content and len(content) > 100:
  11. return content, encoding
  12. except:
  13. continue
  14. try:
  15. with open(filepath, 'rb') as f:
  16. raw_data = f.read()
  17. content = raw_data.decode('utf-8', errors='ignore')
  18. return content, 'utf-8-ignore'
  19. except:
  20. return None, None
  21. def read_pdf_file(filepath):
  22. """读取PDF文件"""
  23. try:
  24. import pypdf
  25. with open(filepath, 'rb') as f:
  26. pdf_reader = pypdf.PdfReader(f)
  27. text = ""
  28. for page in pdf_reader.pages:
  29. text += page.extract_text() + "\n"
  30. return text, 'PDF'
  31. except Exception as e:
  32. try:
  33. import PyPDF2
  34. with open(filepath, 'rb') as f:
  35. pdf_reader = PyPDF2.PdfReader(f)
  36. text = ""
  37. for page in pdf_reader.pages:
  38. text += page.extract_text() + "\n"
  39. return text, 'PDF'
  40. except Exception as e2:
  41. return f"Error: {str(e)}, {str(e2)}", None
  42. def read_docx_file(filepath):
  43. """读取DOCX文件"""
  44. try:
  45. import docx
  46. doc = docx.Document(filepath)
  47. text = "\n".join([para.text for para in doc.paragraphs])
  48. return text, 'DOCX'
  49. except Exception as e:
  50. return f"Error: {str(e)}", None
  51. input_dir = "input"
  52. results = {}
  53. for filename in os.listdir(input_dir):
  54. filepath = os.path.join(input_dir, filename)
  55. if not os.path.isfile(filepath):
  56. continue
  57. print(f"Processing: {filename}")
  58. if filename.endswith('.txt'):
  59. content, encoding = read_text_file(filepath)
  60. if content:
  61. results[filename] = {
  62. 'format': 'TXT',
  63. 'encoding': encoding,
  64. 'length': len(content),
  65. 'first_3000': content[:3000]
  66. }
  67. print(f" TXT - Encoding: {encoding}, Length: {len(content)}")
  68. else:
  69. results[filename] = {'error': 'Failed to read'}
  70. elif filename.endswith('.pdf'):
  71. content, file_type = read_pdf_file(filepath)
  72. if file_type:
  73. results[filename] = {
  74. 'format': 'PDF',
  75. 'length': len(content),
  76. 'first_3000': content[:3000]
  77. }
  78. print(f" PDF - Length: {len(content)}")
  79. else:
  80. results[filename] = {'error': content}
  81. print(f" PDF Error: {content}")
  82. elif filename.endswith('.docx'):
  83. content, file_type = read_docx_file(filepath)
  84. if file_type:
  85. results[filename] = {
  86. 'format': 'DOCX',
  87. 'length': len(content),
  88. 'first_3000': content[:3000]
  89. }
  90. print(f" DOCX - Length: {len(content)}")
  91. else:
  92. results[filename] = {'error': content}
  93. print(f" DOCX Error: {content}")
  94. # 保存结果
  95. with open('samples_data.json', 'w', encoding='utf-8') as f:
  96. json.dump(results, f, ensure_ascii=False, indent=2)
  97. print(f"\nTotal: {len(results)} files processed")