123456789101112131415161718 |
- import re
- import nltk
- from typing import List
- from langchain.text_splitter import RecursiveCharacterTextSplitter
class SplitTextIntoSentences:
    """Namespace collecting text-splitting strategies for downstream chunking.

    Each method is a stateless ``@staticmethod`` taking raw text and returning
    a list of string pieces; strategies differ in granularity (sentences vs.
    fixed-size overlapping character chunks).
    """

    @staticmethod
    def nltk_sent_tokenize(text: str) -> List[str]:
        """Split *text* into sentences with NLTK's Punkt tokenizer.

        Works best for English (requires the NLTK ``punkt`` data to be
        downloaded). Each sentence is whitespace-stripped, and empty or
        whitespace-only sentences are dropped.

        Args:
            text: Raw input text.

        Returns:
            Non-empty, stripped sentence strings in original order.
        """
        return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

    @staticmethod
    def lang_chain_tokenize(
        text: str, chunk_size: int = 64, chunk_overlap: int = 16
    ) -> List[str]:
        """Split *text* into overlapping character chunks via LangChain.

        Uses ``RecursiveCharacterTextSplitter``, which tries successively
        finer separators (paragraphs, lines, words, characters) to keep
        chunks under the size limit.

        Args:
            text: Raw input text.
            chunk_size: Maximum characters per chunk. Defaults to 64,
                matching the previous hard-coded value.
            chunk_overlap: Characters shared between consecutive chunks.
                Defaults to 16, matching the previous hard-coded value.

        Returns:
            List of chunk strings.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        return splitter.split_text(text)
|