split_text_into_sentences.py

import re
from typing import List

import jieba
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter


class SplitTextIntoSentences:
    @staticmethod
    def nltk_sent_tokenize(text: str) -> List[str]:
        """Sentence segmentation for English text.

        Requires the NLTK "punkt" model: nltk.download("punkt")
        """
        return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]

    @staticmethod
    def jieba_sent_tokenize(text: str) -> List[str]:
        """Sentence segmentation for Chinese text."""
        words = list(jieba.cut(text))
        sentence_list: List[str] = []
        buf = ""
        for w in words:
            buf += w
            # End the sentence when a sentence-final punctuation token is reached
            if re.match(r"[。!?!?;;…]", w):
                sentence_list.append(buf.strip())
                buf = ""
        # Keep any trailing text that lacks closing punctuation
        if buf.strip():
            sentence_list.append(buf.strip())
        return sentence_list

    @staticmethod
    def lang_chain_tokenize(text: str) -> List[str]:
        """Length-based splitting via LangChain (fixed-size chunks, not true sentences)."""
        splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
        return splitter.split_text(text)
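
# Minimal usage sketch for the three splitters above. This demo block is an
# addition, not part of the original file, and the sample strings are
# illustrative only; nltk.download("punkt") must have been run beforehand.
if __name__ == "__main__":
    english = "This is the first sentence. Here is a second one!"
    chinese = "这是第一句话。这是第二句话!还有第三句吗?"

    # NLTK: punctuation- and abbreviation-aware English segmentation
    print(SplitTextIntoSentences.nltk_sent_tokenize(english))
    # jieba: tokenize, then cut at Chinese sentence-final punctuation
    print(SplitTextIntoSentences.jieba_sent_tokenize(chinese))
    # LangChain: fixed-size chunks with overlap, independent of sentence boundaries
    print(SplitTextIntoSentences.lang_chain_tokenize(english + " " + chinese))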