```python
import re
from typing import List

import jieba
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter


class SplitTextIntoSentences:
    @staticmethod
    def nltk_sent_tokenize(text: str) -> List[str]:
        """Sentence splitting for English text using NLTK's sent_tokenize."""
        return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

    @staticmethod
    def jieba_sent_tokenize(text: str) -> List[str]:
        """Sentence splitting for Chinese text: segment with jieba, then break on sentence-ending punctuation."""
        words = list(jieba.cut(text))
        sentence_list: List[str] = []
        buf = ""
        for w in words:
            buf += w
            if re.match(r"[。！？!?;；…]", w):  # break the sentence when punctuation is reached
                sentence_list.append(buf.strip())
                buf = ""
        if buf.strip():  # keep any trailing text that has no ending punctuation
            sentence_list.append(buf.strip())
        return sentence_list

    @staticmethod
    def lang_chain_tokenize(text: str) -> List[str]:
        """Length-based splitting with LangChain's RecursiveCharacterTextSplitter (chunks, not strict sentences)."""
        splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
        return splitter.split_text(text)
```
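As a quick sanity check, here is a minimal usage sketch. It assumes NLTK's `punkt` sentence models have already been downloaded (e.g. via `nltk.download("punkt")`) and that the `SplitTextIntoSentences` class above is importable; the sample sentences and expected outputs are illustrative only.

```python
import nltk

nltk.download("punkt")  # assumption: punkt data is needed once for nltk_sent_tokenize

en_text = "This is the first sentence. Here is the second one."
zh_text = "今天天气很好。我们出去散步吧！"

print(SplitTextIntoSentences.nltk_sent_tokenize(en_text))
# expected: ['This is the first sentence.', 'Here is the second one.']

print(SplitTextIntoSentences.jieba_sent_tokenize(zh_text))
# expected: ['今天天气很好。', '我们出去散步吧！']

print(SplitTextIntoSentences.lang_chain_tokenize(en_text))
# expected: length-based chunks of at most 100 characters with a 10-character overlap
```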