import json
import re


def analyze_story_type(filename, content):
    """Classify *content* as a web novel, film script, or short-drama script.

    Scores the first 3000 characters against script-specific and
    novel-specific regex patterns and returns a Chinese type label:
    '短剧剧本' (short-drama script), '电影剧本' (film script),
    '网络小说' (web novel), or '未知类型' (unknown).
    """
    # Script markers: episode numbers, scene headers (e.g. "1-2 日 内"),
    # cast lists, stage directions, voice-over, fight choreography, etc.
    script_patterns = [
        r'第\d+集',
        r'\d+-\d+\s+(日|夜)\s+(内|外)',
        r'人物[::]',
        r'▲',
        r'画外音',
        r'打戏设计',
        r'剧本',
        r'编剧[::]',
    ]
    # Web-novel markers: chapter/volume headings, synopsis, author line,
    # sharing/download boilerplate commonly found in TXT novel dumps.
    novel_patterns = [
        r'第\d+章',
        r'第\d+卷',
        r'内容简介',
        r'作者[::]',
        r'本文由.*分享',
        r'TXT.*下载',
    ]

    script_score = 0
    novel_score = 0

    # Only the first 3000 characters are inspected; each pattern counts
    # at most once regardless of how often it matches.
    preview = content[:3000]
    for pattern in script_patterns:
        if re.search(pattern, preview):
            script_score += 1
    for pattern in novel_patterns:
        if re.search(pattern, preview):
            novel_score += 1

    # Decide the type: script wins only on a strictly higher score; a
    # novel needs at least one hit; otherwise the type is unknown.
    if script_score > novel_score:
        # "短剧" in the filename or per-episode headings mark a short drama.
        if '短剧' in filename or re.search(r'第\d+集', preview):
            return '短剧剧本'
        else:
            return '电影剧本'
    elif novel_score > 0:
        return '网络小说'
    else:
        return '未知类型'


def extract_structure_info(filename, content, story_type):
    """Extract key structural metadata for *content*.

    Returns a dict with the filename, detected type, length, a 3000-char
    preview, and type-specific fields (scene/character info for scripts;
    chapter, author, and synopsis info for novels).
    """
    info = {
        'filename': filename,
        'type': story_type,
        'length': len(content),
        'first_3000': content[:3000],
    }

    if '剧本' in story_type:
        # Script: count scene numbers (e.g. "1-2") in the first 10k chars
        # and pull the cast list from the first 5k chars.
        info['scenes'] = len(re.findall(r'\d+-\d+', content[:10000]))
        info['characters'] = extract_characters_from_script(content[:5000])
        info['structure_notes'] = '剧本格式,包含场景编号、人物、对话和动作描述'
    elif story_type == '网络小说':
        # Novel: collect chapter headings (Chinese-numeral or digit) from
        # the first 20k chars and estimate the total chapter count.
        chapters = re.findall(
            r'第[零一二三四五六七八九十百千万\d]+[章回].*', content[:20000]
        )
        info['chapters_preview'] = chapters[:10] if chapters else []
        info['chapter_count_estimate'] = len(re.findall(r'第\d+章', content))

        # Author line ("作者:...") near the top of the file.
        author_match = re.search(r'作者[::](.*)', content[:2000])
        if author_match:
            info['author'] = author_match.group(1).strip()

        # Synopsis ("内容简介:..."), cut off at the next structural marker
        # and capped at 200 characters.
        intro_match = re.search(
            r'内容简介[::](.*?)(?=第|作者|PS|内容标签)',
            content[:3000],
            re.DOTALL,
        )
        if intro_match:
            info['intro'] = intro_match.group(1).strip()[:200]

        info['structure_notes'] = '网文格式,分章节叙事'

    return info


def extract_characters_from_script(text):
    """Return the character names listed after "人物:" in a script.

    The cast list is taken up to the first blank line or "▲" marker and
    split on Chinese/ASCII commas and the enumeration comma. Returns an
    empty list when no cast list is found.
    """
    char_match = re.search(r'人物[::](.*?)(?=\n\n|▲)', text, re.DOTALL)
    if char_match:
        chars = char_match.group(1).strip()
        return [c.strip() for c in re.split(r'[,,、]', chars) if c.strip()]
    return []


def main():
    """Analyze every sample in samples_data.json and save the results.

    Reads pre-extracted previews, classifies each file, extracts
    structural metadata, prints a progress report, and writes the
    combined results to analysis_results.json.
    """
    # Load the pre-extracted sample previews.
    with open('samples_data.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    analysis_results = []

    for filename, file_data in data.items():
        if 'error' in file_data:
            # BUGFIX: was a literal "(unknown)" placeholder; interpolate
            # the actual filename being skipped.
            print(f"跳过错误文件: {filename}")
            continue

        content = file_data.get('first_3000', '')
        if not content:
            continue

        # Only the 3000-char preview is available here; a full analysis
        # would re-read the complete file from disk.
        # BUGFIX: was a literal "(unknown)" placeholder; interpolate the
        # filename under analysis.
        print(f"\n分析文件: {filename}")

        story_type = analyze_story_type(filename, content)
        print(f" 类型: {story_type}")

        info = extract_structure_info(filename, content, story_type)
        analysis_results.append(info)

        print(f" 长度: {info['length']} 字符")
        if 'author' in info:
            print(f" 作者: {info['author']}")
        if 'chapters_preview' in info and info['chapters_preview']:
            print(f" 章节预览: {len(info['chapters_preview'])} 个")

    # Persist the analysis results (keep CJK text readable in the JSON).
    with open('analysis_results.json', 'w', encoding='utf-8') as f:
        json.dump(analysis_results, f, ensure_ascii=False, indent=2)

    print(f"\n\n分析完成,共分析 {len(analysis_results)} 个文件")
    print("结果已保存到 analysis_results.json")

    return analysis_results


if __name__ == '__main__':
    main()