| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139 |
- import json
- import re
def analyze_story_type(filename, content):
    """Classify a text as a short-drama script, film script, or web novel.

    Scores the first 3000 characters of *content* against two regex marker
    sets and returns one of: '短剧剧本', '电影剧本', '网络小说', '未知类型'.
    """
    # Markers typical of screenplays: episode headers, scene slugs, cast lists.
    script_patterns = [
        r'第\d+集',
        r'\d+-\d+\s+(日|夜)\s+(内|外)',
        r'人物[::]',
        r'▲',
        r'画外音',
        r'打戏设计',
        r'剧本',
        r'编剧[::]',
    ]

    # Markers typical of serialized web novels: chapter/volume headers, byline.
    novel_patterns = [
        r'第\d+章',
        r'第\d+卷',
        r'内容简介',
        r'作者[::]',
        r'本文由.*分享',
        r'TXT.*下载',
    ]

    # Only the opening 3000 characters are examined for classification.
    preview = content[:3000]
    script_score = sum(1 for pat in script_patterns if re.search(pat, preview))
    novel_score = sum(1 for pat in novel_patterns if re.search(pat, preview))

    if script_score > novel_score:
        # Episode numbering (or the filename itself) distinguishes short dramas.
        if '短剧' in filename or re.search(r'第\d+集', preview):
            return '短剧剧本'
        return '电影剧本'
    if novel_score > 0:
        return '网络小说'
    return '未知类型'
def extract_structure_info(filename, content, story_type):
    """Collect structural metadata for a text already classified by type.

    Always returns a dict with filename/type/length/first_3000; adds
    script-specific fields when *story_type* contains '剧本', or
    novel-specific fields when it equals '网络小说'.
    """
    info = {
        'filename': filename,
        'type': story_type,
        'length': len(content),
        'first_3000': content[:3000],
    }

    if '剧本' in story_type:
        # Scene slugs look like "12-3"; count them within the first 10k chars.
        info['scenes'] = len(re.findall(r'\d+-\d+', content[:10000]))
        info['characters'] = extract_characters_from_script(content[:5000])
        info['structure_notes'] = '剧本格式,包含场景编号、人物、对话和动作描述'
    elif story_type == '网络小说':
        # Chapter headings may use Chinese numerals or Arabic digits.
        chapter_heads = re.findall(r'第[零一二三四五六七八九十百千万\d]+[章回].*', content[:20000])
        info['chapters_preview'] = chapter_heads[:10] if chapter_heads else []
        info['chapter_count_estimate'] = len(re.findall(r'第\d+章', content))

        # Byline, searched near the top of the file only.
        author = re.search(r'作者[::](.*)', content[:2000])
        if author:
            info['author'] = author.group(1).strip()

        # Synopsis runs until the next structural label; capped at 200 chars.
        intro = re.search(r'内容简介[::](.*?)(?=第|作者|PS|内容标签)', content[:3000], re.DOTALL)
        if intro:
            info['intro'] = intro.group(1).strip()[:200]

        info['structure_notes'] = '网文格式,分章节叙事'

    return info
def extract_characters_from_script(text):
    """Extract the character-name list that follows a "人物:" label.

    Names are split on Chinese/ASCII commas and 、; the capture stops at a
    blank line or a ▲ marker. Returns [] when no label is present.
    """
    match = re.search(r'人物[::](.*?)(?=\n\n|▲)', text, re.DOTALL)
    if not match:
        return []
    raw = match.group(1).strip()
    return [name.strip() for name in re.split(r'[,,、]', raw) if name.strip()]
def main():
    """Analyze every sampled file and save results to analysis_results.json.

    Reads samples_data.json (a mapping of filename -> record with a
    'first_3000' preview), classifies each entry, extracts structural
    metadata, prints a per-file summary, and dumps the collected records
    as a JSON list. Returns the list of result dicts.
    """
    with open('samples_data.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    analysis_results = []

    for filename, file_data in data.items():
        if 'error' in file_data:
            # BUG FIX: these messages previously printed the literal text
            # "(unknown)" instead of interpolating the filename.
            print(f"跳过错误文件: {filename}")
            continue

        content = file_data.get('first_3000', '')
        if not content:
            continue

        # Only the stored first_3000 preview is analyzed here; a complete
        # analysis would re-read the full file from disk.
        print(f"\n分析文件: {filename}")

        story_type = analyze_story_type(filename, content)
        print(f" 类型: {story_type}")

        info = extract_structure_info(filename, content, story_type)
        analysis_results.append(info)

        print(f" 长度: {info['length']} 字符")
        if 'author' in info:
            print(f" 作者: {info['author']}")
        if info.get('chapters_preview'):
            print(f" 章节预览: {len(info['chapters_preview'])} 个")

    with open('analysis_results.json', 'w', encoding='utf-8') as f:
        json.dump(analysis_results, f, ensure_ascii=False, indent=2)

    print(f"\n\n分析完成,共分析 {len(analysis_results)} 个文件")
    print("结果已保存到 analysis_results.json")

    return analysis_results
# Script entry point: run the full analysis when executed directly.
if __name__ == '__main__':
    main()
|