@@ -1,6 +1,5 @@
 import re
 import nltk
-import jieba
 
 from typing import List
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -12,22 +11,6 @@ class SplitTextIntoSentences:
         """especially for English"""
         return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]
 
-    @staticmethod
-    def jieba_sent_tokenize(text: str) -> List[str]:
-        """especially for Chinese"""
-        words = list(jieba.cut(text))
-        sentence_list: List = []
-        buf = ""
-        for w in words:
-            buf += w
-            if re.match(r"[。!?!?;;…]", w):  # break the sentence when punctuation is reached
-                sentence_list.append(buf.strip())
-                buf = ""
-
-        if buf.strip():
-            sentence_list.append(buf.strip())
-        return sentence_list
-
     @staticmethod
     def lang_chain_tokenize(text: str) -> List[str]:
         splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)