# split_text_into_sentences.py

import re
from typing import List

import jieba
import nltk


class SplitTextIntoSentences:
    @staticmethod
    def nltk_sent_tokenize(text: str) -> List[str]:
        """Sentence splitting, especially for English (uses NLTK's punkt tokenizer)."""
        return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

    @staticmethod
    def jieba_sent_tokenize(text: str) -> List[str]:
        """Sentence splitting, especially for Chinese."""
        words = list(jieba.cut(text))
        sentence_list: List[str] = []
        buf = ""
        for w in words:
            buf += w
            # Split whenever we hit sentence-ending punctuation.
            if re.match(r"[。!?!?;;…]", w):
                sentence_list.append(buf.strip())
                buf = ""
        # Keep any trailing text that has no final punctuation.
        if buf.strip():
            sentence_list.append(buf.strip())
        return sentence_list
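

# A minimal usage sketch (the input strings below are hypothetical; assumes the
# NLTK "punkt" tokenizer data has been downloaded, e.g. via nltk.download("punkt")).
if __name__ == "__main__":
    english = "Dr. Smith went to Washington. He arrived late! Was the meeting over?"
    print(SplitTextIntoSentences.nltk_sent_tokenize(english))

    chinese = "今天天气很好。我们去公园散步吧!你觉得怎么样?"
    print(SplitTextIntoSentences.jieba_sent_tokenize(chinese))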