read_samples.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. import os
  4. import json
  5. def read_text_file(filepath):
  6. """尝试多种编码读取文本文件"""
  7. encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5', 'latin1']
  8. for encoding in encodings:
  9. try:
  10. with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
  11. content = f.read()
  12. if content and len(content) > 100: # 确保读取到有效内容
  13. return content, encoding
  14. except Exception as e:
  15. continue
  16. # 如果都失败,使用二进制模式读取并尝试解码
  17. try:
  18. with open(filepath, 'rb') as f:
  19. raw_data = f.read()
  20. content = raw_data.decode('utf-8', errors='ignore')
  21. return content, 'utf-8 (with errors ignored)'
  22. except:
  23. return None, None
  24. def read_pdf_file(filepath):
  25. """读取PDF文件"""
  26. try:
  27. import PyPDF2
  28. with open(filepath, 'rb') as f:
  29. pdf_reader = PyPDF2.PdfReader(f)
  30. text = ""
  31. for page in pdf_reader.pages[:50]: # 读取前50页
  32. text += page.extract_text()
  33. return text, 'PDF'
  34. except ImportError:
  35. return "需要安装PyPDF2库", None
  36. except Exception as e:
  37. return f"PDF读取错误: {str(e)}", None
  38. def read_docx_file(filepath):
  39. """读取DOCX文件"""
  40. try:
  41. import docx
  42. doc = docx.Document(filepath)
  43. text = "\n".join([para.text for para in doc.paragraphs])
  44. return text, 'DOCX'
  45. except ImportError:
  46. return "需要安装python-docx库", None
  47. except Exception as e:
  48. return f"DOCX读取错误: {str(e)}", None
  49. def main():
  50. input_dir = "examples/analyze_story/input"
  51. files = os.listdir(input_dir)
  52. results = {}
  53. for filename in files:
  54. filepath = os.path.join(input_dir, filename)
  55. if not os.path.isfile(filepath):
  56. continue
  57. print(f"\n处理文件: {filename}")
  58. if filename.endswith('.txt'):
  59. content, encoding = read_text_file(filepath)
  60. if content:
  61. results[filename] = {
  62. 'encoding': encoding,
  63. 'length': len(content),
  64. 'preview': content[:500],
  65. 'first_3000': content[:3000]
  66. }
  67. print(f" 编码: {encoding}, 长度: {len(content)}")
  68. else:
  69. print(f" 读取失败")
  70. results[filename] = {'error': '无法读取'}
  71. elif filename.endswith('.pdf'):
  72. content, file_type = read_pdf_file(filepath)
  73. if file_type:
  74. results[filename] = {
  75. 'type': file_type,
  76. 'length': len(content),
  77. 'preview': content[:500],
  78. 'first_3000': content[:3000]
  79. }
  80. print(f" 类型: PDF, 长度: {len(content)}")
  81. else:
  82. results[filename] = {'error': content}
  83. print(f" {content}")
  84. elif filename.endswith('.docx'):
  85. content, file_type = read_docx_file(filepath)
  86. if file_type:
  87. results[filename] = {
  88. 'type': file_type,
  89. 'length': len(content),
  90. 'preview': content[:500],
  91. 'first_3000': content[:3000]
  92. }
  93. print(f" 类型: DOCX, 长度: {len(content)}")
  94. else:
  95. results[filename] = {'error': content}
  96. print(f" {content}")
  97. # 保存结果
  98. with open('examples/analyze_story/samples_data.json', 'w', encoding='utf-8') as f:
  99. json.dump(results, f, ensure_ascii=False, indent=2)
  100. print(f"\n\n读取完成,共处理 {len(results)} 个文件")
  101. print("结果已保存到 examples/analyze_story/samples_data.json")
  102. if __name__ == '__main__':
  103. main()