import re
from typing import List

import jieba
import nltk


class SplitTextIntoSentences:
    @staticmethod
    def nltk_sent_tokenize(text: str) -> List[str]:
        """Split text into sentences; best suited for English."""
        return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

    @staticmethod
    def jieba_sent_tokenize(text: str) -> List[str]:
        """Split text into sentences; best suited for Chinese."""
        words = list(jieba.cut(text))
        sentence_list: List[str] = []
        buf = ""
        for w in words:
            buf += w
            # Break the sentence whenever a sentence-ending punctuation mark is reached
            if re.match(r"[。!?!?;;…]", w):
                sentence_list.append(buf.strip())
                buf = ""
        # Keep any trailing text that has no terminal punctuation
        if buf.strip():
            sentence_list.append(buf.strip())
        return sentence_list
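
# Illustrative usage sketch (not part of the original module): the sample
# strings below are hypothetical, and nltk_sent_tokenize assumes the NLTK
# "punkt" sentence tokenizer models have been downloaded beforehand,
# e.g. via nltk.download("punkt").
if __name__ == "__main__":
    english = "Hello world. How are you today? This is a test."
    print(SplitTextIntoSentences.nltk_sent_tokenize(english))
    # Expected: ['Hello world.', 'How are you today?', 'This is a test.']

    chinese = "今天天气很好。我们去公园吧!好的,出发。"
    print(SplitTextIntoSentences.jieba_sent_tokenize(chinese))
    # Splits only on the sentence-ending marks in the regex, so the
    # fullwidth comma does not break the last sentence.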