analyze_samples.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. import json
  2. import re
  3. def analyze_story_type(filename, content):
  4. """分析文件类型:网文/剧本/短剧"""
  5. # 剧本特征
  6. script_patterns = [
  7. r'第\d+集',
  8. r'\d+-\d+\s+(日|夜)\s+(内|外)',
  9. r'人物[::]',
  10. r'▲',
  11. r'画外音',
  12. r'打戏设计',
  13. r'剧本',
  14. r'编剧[::]'
  15. ]
  16. # 网文特征
  17. novel_patterns = [
  18. r'第\d+章',
  19. r'第\d+卷',
  20. r'内容简介',
  21. r'作者[::]',
  22. r'本文由.*分享',
  23. r'TXT.*下载'
  24. ]
  25. script_score = 0
  26. novel_score = 0
  27. # 检查前3000字
  28. preview = content[:3000]
  29. for pattern in script_patterns:
  30. if re.search(pattern, preview):
  31. script_score += 1
  32. for pattern in novel_patterns:
  33. if re.search(pattern, preview):
  34. novel_score += 1
  35. # 判断类型
  36. if script_score > novel_score:
  37. if '短剧' in filename or re.search(r'第\d+集', preview):
  38. return '短剧剧本'
  39. else:
  40. return '电影剧本'
  41. elif novel_score > 0:
  42. return '网络小说'
  43. else:
  44. return '未知类型'
  45. def extract_structure_info(filename, content, story_type):
  46. """提取关键结构信息"""
  47. info = {
  48. 'filename': filename,
  49. 'type': story_type,
  50. 'length': len(content),
  51. 'first_3000': content[:3000]
  52. }
  53. if '剧本' in story_type:
  54. # 提取剧本结构信息
  55. info['scenes'] = len(re.findall(r'\d+-\d+', content[:10000]))
  56. info['characters'] = extract_characters_from_script(content[:5000])
  57. info['structure_notes'] = '剧本格式,包含场景编号、人物、对话和动作描述'
  58. elif story_type == '网络小说':
  59. # 提取小说结构信息
  60. chapters = re.findall(r'第[零一二三四五六七八九十百千万\d]+[章回].*', content[:20000])
  61. info['chapters_preview'] = chapters[:10] if chapters else []
  62. info['chapter_count_estimate'] = len(re.findall(r'第\d+章', content))
  63. # 提取作者和简介
  64. author_match = re.search(r'作者[::](.*)', content[:2000])
  65. if author_match:
  66. info['author'] = author_match.group(1).strip()
  67. intro_match = re.search(r'内容简介[::](.*?)(?=第|作者|PS|内容标签)', content[:3000], re.DOTALL)
  68. if intro_match:
  69. info['intro'] = intro_match.group(1).strip()[:200]
  70. info['structure_notes'] = '网文格式,分章节叙事'
  71. return info
  72. def extract_characters_from_script(text):
  73. """从剧本中提取人物"""
  74. # 查找"人物:"后的内容
  75. char_match = re.search(r'人物[::](.*?)(?=\n\n|▲)', text, re.DOTALL)
  76. if char_match:
  77. chars = char_match.group(1).strip()
  78. return [c.strip() for c in re.split(r'[,,、]', chars) if c.strip()]
  79. return []
  80. def main():
  81. # 读取数据
  82. with open('samples_data.json', 'r', encoding='utf-8') as f:
  83. data = json.load(f)
  84. analysis_results = []
  85. for filename, file_data in data.items():
  86. if 'error' in file_data:
  87. print(f"跳过错误文件: {filename}")
  88. continue
  89. content = file_data.get('first_3000', '')
  90. if not content:
  91. continue
  92. # 使用完整内容进行分析(如果有的话)
  93. # 这里我们只用first_3000,实际应该重新读取完整文件
  94. print(f"\n分析文件: {filename}")
  95. story_type = analyze_story_type(filename, content)
  96. print(f" 类型: {story_type}")
  97. info = extract_structure_info(filename, content, story_type)
  98. analysis_results.append(info)
  99. print(f" 长度: {info['length']} 字符")
  100. if 'author' in info:
  101. print(f" 作者: {info['author']}")
  102. if 'chapters_preview' in info and info['chapters_preview']:
  103. print(f" 章节预览: {len(info['chapters_preview'])} 个")
  104. # 保存分析结果
  105. with open('analysis_results.json', 'w', encoding='utf-8') as f:
  106. json.dump(analysis_results, f, ensure_ascii=False, indent=2)
  107. print(f"\n\n分析完成,共分析 {len(analysis_results)} 个文件")
  108. print("结果已保存到 analysis_results.json")
  109. return analysis_results
  110. if __name__ == '__main__':
  111. main()