import re
from typing import List

import jieba
import nltk


class SplitTextIntoSentences:
    """Utilities for splitting raw text into sentences."""

    @staticmethod
    def nltk_sent_tokenize(text: str) -> List[str]:
        """Sentence splitting for English text (uses NLTK's sent_tokenize, which needs the punkt data)."""
        return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

    @staticmethod
    def jieba_sent_tokenize(text: str) -> List[str]:
        """Sentence splitting for Chinese text."""
        words = list(jieba.cut(text))
        sentence_list: List[str] = []
        buf = ""
        for w in words:
            buf += w
            # End the current sentence when a terminal punctuation token is reached.
            if re.match(r"[。!?!?;;…]", w):
                sentence_list.append(buf.strip())
                buf = ""
        # Keep any trailing text that has no closing punctuation.
        if buf.strip():
            sentence_list.append(buf.strip())
        return sentence_list
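A minimal usage sketch, assuming NLTK's punkt tokenizer data has already been fetched (e.g. with nltk.download("punkt")) and that jieba is installed; the sample sentences are illustrative only:

if __name__ == "__main__":
    english = "Hello world. This is a test."
    print(SplitTextIntoSentences.nltk_sent_tokenize(english))
    # expected (roughly): ['Hello world.', 'This is a test.']

    chinese = "今天天气很好。我们去公园散步吧!"
    print(SplitTextIntoSentences.jieba_sent_tokenize(chinese))
    # expected (roughly): ['今天天气很好。', '我们去公园散步吧!']

Because both methods are static, they can be called directly on the class without creating an instance.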
|