import json
import re


def analyze_story_type(filename, content):
    """Classify *content* as a web novel, film script, or short-drama script.

    Scores the first 3000 characters against script-specific and
    novel-specific regex patterns and returns a Chinese type label:
    '短剧剧本' (short-drama script), '电影剧本' (film script),
    '网络小说' (web novel), or '未知类型' (unknown).
    """
    # Script markers: episode numbers, scene headers (e.g. "1-2 日 内"),
    # cast lists, stage directions, voice-over, fight choreography, etc.
    script_patterns = [
        r'第\d+集',
        r'\d+-\d+\s+(日|夜)\s+(内|外)',
        r'人物[::]',
        r'▲',
        r'画外音',
        r'打戏设计',
        r'剧本',
        r'编剧[::]',
    ]
    # Web-novel markers: chapter/volume headings, synopsis, author line,
    # sharing/download boilerplate commonly found in TXT novel dumps.
    novel_patterns = [
        r'第\d+章',
        r'第\d+卷',
        r'内容简介',
        r'作者[::]',
        r'本文由.*分享',
        r'TXT.*下载',
    ]

    script_score = 0
    novel_score = 0

    # Only the first 3000 characters are inspected; each pattern counts
    # at most once regardless of how often it matches.
    preview = content[:3000]
    for pattern in script_patterns:
        if re.search(pattern, preview):
            script_score += 1
    for pattern in novel_patterns:
        if re.search(pattern, preview):
            novel_score += 1

    # Decide the type: script wins only on a strictly higher score; a
    # novel needs at least one hit; otherwise the type is unknown.
    if script_score > novel_score:
        # "短剧" in the filename or per-episode headings mark a short drama.
        if '短剧' in filename or re.search(r'第\d+集', preview):
            return '短剧剧本'
        else:
            return '电影剧本'
    elif novel_score > 0:
        return '网络小说'
    else:
        return '未知类型'


def extract_structure_info(filename, content, story_type):
    """Extract key structural metadata for *content*.

    Returns a dict with the filename, detected type, length, a 3000-char
    preview, and type-specific fields (scene/character info for scripts;
    chapter, author, and synopsis info for novels).
    """
    info = {
        'filename': filename,
        'type': story_type,
        'length': len(content),
        'first_3000': content[:3000],
    }

    if '剧本' in story_type:
        # Script: count scene numbers (e.g. "1-2") in the first 10k chars
        # and pull the cast list from the first 5k chars.
        info['scenes'] = len(re.findall(r'\d+-\d+', content[:10000]))
        info['characters'] = extract_characters_from_script(content[:5000])
        info['structure_notes'] = '剧本格式,包含场景编号、人物、对话和动作描述'
    elif story_type == '网络小说':
        # Novel: collect chapter headings (Chinese-numeral or digit) from
        # the first 20k chars and estimate the total chapter count.
        chapters = re.findall(
            r'第[零一二三四五六七八九十百千万\d]+[章回].*', content[:20000]
        )
        info['chapters_preview'] = chapters[:10] if chapters else []
        info['chapter_count_estimate'] = len(re.findall(r'第\d+章', content))

        # Author line ("作者:...") near the top of the file.
        author_match = re.search(r'作者[::](.*)', content[:2000])
        if author_match:
            info['author'] = author_match.group(1).strip()

        # Synopsis ("内容简介:..."), cut off at the next structural marker
        # and capped at 200 characters.
        intro_match = re.search(
            r'内容简介[::](.*?)(?=第|作者|PS|内容标签)',
            content[:3000],
            re.DOTALL,
        )
        if intro_match:
            info['intro'] = intro_match.group(1).strip()[:200]

        info['structure_notes'] = '网文格式,分章节叙事'

    return info


def extract_characters_from_script(text):
    """Return the character names listed after "人物:" in a script.

    The cast list is taken up to the first blank line or "▲" marker and
    split on Chinese/ASCII commas and the enumeration comma. Returns an
    empty list when no cast list is found.
    """
    char_match = re.search(r'人物[::](.*?)(?=\n\n|▲)', text, re.DOTALL)
    if char_match:
        chars = char_match.group(1).strip()
        return [c.strip() for c in re.split(r'[,,、]', chars) if c.strip()]
    return []


def main():
    """Analyze every sample in samples_data.json and save the results.

    Reads pre-extracted previews, classifies each file, extracts
    structural metadata, prints a progress report, and writes the
    combined results to analysis_results.json.
    """
    # Load the pre-extracted sample previews.
    with open('samples_data.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    analysis_results = []

    for filename, file_data in data.items():
        if 'error' in file_data:
            # BUGFIX: was a literal "(unknown)" placeholder; interpolate
            # the actual filename being skipped.
            print(f"跳过错误文件: {filename}")
            continue

        content = file_data.get('first_3000', '')
        if not content:
            continue

        # Only the 3000-char preview is available here; a full analysis
        # would re-read the complete file from disk.
        # BUGFIX: was a literal "(unknown)" placeholder; interpolate the
        # filename under analysis.
        print(f"\n分析文件: {filename}")

        story_type = analyze_story_type(filename, content)
        print(f" 类型: {story_type}")

        info = extract_structure_info(filename, content, story_type)
        analysis_results.append(info)

        print(f" 长度: {info['length']} 字符")
        if 'author' in info:
            print(f" 作者: {info['author']}")
        if 'chapters_preview' in info and info['chapters_preview']:
            print(f" 章节预览: {len(info['chapters_preview'])} 个")

    # Persist the analysis results (keep CJK text readable in the JSON).
    with open('analysis_results.json', 'w', encoding='utf-8') as f:
        json.dump(analysis_results, f, ensure_ascii=False, indent=2)

    print(f"\n\n分析完成,共分析 {len(analysis_results)} 个文件")
    print("结果已保存到 analysis_results.json")

    return analysis_results


if __name__ == '__main__':
    main()