#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import json

def read_text_file(filepath):
    """Read a text file, trying several encodings until one decodes cleanly.

    Each candidate encoding is tried with strict error handling, so an
    encoding that does not match the file's bytes is rejected instead of
    silently dropping undecodable bytes.

    Args:
        filepath: Path of the file to read.

    Returns:
        A ``(content, encoding)`` tuple, where ``encoding`` is the name of the
        codec that decoded the file. ``(None, None)`` is returned when the
        file cannot be opened or read.
    """
    # Order matters: strict codecs first, 'latin1' last because it accepts
    # every byte sequence and would otherwise mask the real encoding.
    encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'utf-16', 'big5', 'latin1']

    for encoding in encodings:
        try:
            # Default errors='strict': a mismatching encoding raises instead
            # of returning text with bytes silently removed.
            with open(filepath, 'r', encoding=encoding) as f:
                return f.read(), encoding
        except UnicodeError:
            continue
        except OSError:
            # The file itself is unreadable; no other encoding will help.
            return None, None

    # Defensive last resort (normally unreachable because 'latin1' accepts
    # any input): decode as UTF-8 and drop whatever cannot be decoded.
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()
        return raw_data.decode('utf-8', errors='ignore'), 'utf-8 (with errors ignored)'
    except OSError:
        return None, None

def read_pdf_file(filepath):
    """Extract the text of the first 50 pages of a PDF file.

    PyPDF2 is imported lazily so the rest of the script still works when the
    library is not installed.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        ``(text, 'PDF')`` on success. On failure the first element is a
        human-readable error message and the second element is ``None``.
    """
    try:
        import PyPDF2
        with open(filepath, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # Extraction must happen while the file is still open.
            # `or ""` keeps a page that yields no text from failing the
            # whole document.
            page_texts = [
                page.extract_text() or ""
                for page in pdf_reader.pages[:50]  # only the first 50 pages
            ]
        return "".join(page_texts), 'PDF'
    except ImportError:
        return "需要安装PyPDF2库", None
    except Exception as e:
        # PyPDF2 raises many different error types for malformed files;
        # report them to the caller instead of aborting the whole run.
        return f"PDF读取错误: {str(e)}", None

def read_docx_file(filepath):
    """Return the paragraph text of a DOCX file joined by newlines.

    The ``docx`` module is imported lazily inside the function.

    Returns:
        ``(text, 'DOCX')`` on success; otherwise ``(error message, None)``.
    """
    try:
        import docx
        document = docx.Document(filepath)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        joined = "\n".join(paragraph_texts)
        return joined, 'DOCX'
    except ImportError:
        return "需要安装python-docx库", None
    except Exception as exc:
        return f"DOCX读取错误: {str(exc)}", None

def _build_entry(label_key, label, content):
    """Build the summary record stored for one successfully read file."""
    return {
        label_key: label,
        'length': len(content),
        'preview': content[:500],
        'first_3000': content[:3000],
    }


def _record_typed_result(results, filename, content, file_type):
    """Store and print the outcome of a PDF/DOCX read.

    ``file_type`` is ``None`` when the read failed, in which case ``content``
    holds the error message.
    """
    if file_type:
        results[filename] = _build_entry('type', file_type, content)
        print(f"  类型: {file_type}, 长度: {len(content)}")
    else:
        results[filename] = {'error': content}
        print(f"  {content}")


def main():
    """Summarize every .txt/.pdf/.docx file in the input directory as JSON.

    For each regular file in ``examples/analyze_story/input`` the matching
    reader is called and a record (encoding or type, length, a 500-character
    preview and the first 3000 characters) is stored under the file name.
    Files that cannot be read get an ``{'error': ...}`` record. The result is
    written to ``examples/analyze_story/samples_data.json``.
    """
    input_dir = "examples/analyze_story/input"
    results = {}

    # Sorted so console output and JSON key order are reproducible;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(input_dir)):
        filepath = os.path.join(input_dir, filename)
        if not os.path.isfile(filepath):
            continue

        print(f"\n处理文件: {filename}")

        # Match extensions case-insensitively so "STORY.TXT" is not skipped.
        lowered = filename.lower()

        if lowered.endswith('.txt'):
            content, encoding = read_text_file(filepath)
            if content:
                results[filename] = _build_entry('encoding', encoding, content)
                print(f"  编码: {encoding}, 长度: {len(content)}")
            else:
                # Covers both unreadable and empty files.
                print("  读取失败")
                results[filename] = {'error': '无法读取'}

        elif lowered.endswith('.pdf'):
            content, file_type = read_pdf_file(filepath)
            _record_typed_result(results, filename, content, file_type)

        elif lowered.endswith('.docx'):
            content, file_type = read_docx_file(filepath)
            _record_typed_result(results, filename, content, file_type)

    # Save the results.
    with open('examples/analyze_story/samples_data.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n\n读取完成，共处理 {len(results)} 个文件")
    print("结果已保存到 examples/analyze_story/samples_data.json")

if __name__ == '__main__':
    main()