split_text_into_sentences.py

import re
from typing import List

import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter


class SplitTextIntoSentences:
    """Helpers for splitting raw text into sentences or fixed-size chunks."""

    @staticmethod
    def nltk_sent_tokenize(text: str) -> List[str]:
        """Sentence-boundary split via NLTK; works best for English text."""
        # Drop empty fragments and strip surrounding whitespace.
        return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

    @staticmethod
    def lang_chain_tokenize(text: str) -> List[str]:
        """Character-count chunking: 64-character chunks with a 16-character overlap."""
        splitter = RecursiveCharacterTextSplitter(chunk_size=64, chunk_overlap=16)
        return splitter.split_text(text)
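
A minimal usage sketch, assuming the NLTK tokenizer data has been downloaded, the langchain package is installed, and the class is importable from this module; the sample string is illustrative only:

import nltk

from split_text_into_sentences import SplitTextIntoSentences

# One-time download of the sentence tokenizer model
# (newer NLTK releases may ask for "punkt_tab" instead).
nltk.download("punkt")

sample = (
    "NLTK splits on sentence boundaries. "
    "LangChain splits on character counts with overlap. "
    "Both helpers return a list of strings."
)

# One string per detected sentence.
print(SplitTextIntoSentences.nltk_sent_tokenize(sample))

# Chunks of up to 64 characters, overlapping by 16 characters.
print(SplitTextIntoSentences.lang_chain_tokenize(sample))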