пре 1 недеља · 79bd8d779c
--- a/applications/async_task/chunk_task.py
+++ b/applications/async_task/chunk_task.py
@@ -6,6 +6,7 @@ from applications.utils.async_utils import run_tasks_with_asyncio_task_group
 
				 from applications.utils.chunks import LLMClassifier, TopicAwarePackerV2
			
 
				 from applications.utils.milvus import async_insert_chunk
			
 
				 from applications.utils.mysql import ContentChunks, Contents
			
 
				+from applications.utils.nlp import num_tokens
			
 
				 from applications.config import Chunk, DEFAULT_MODEL
			
 
				 from applications.config import ELASTIC_SEARCH_INDEX
			
 
				 
			
@@ -28,15 +29,12 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
 
				         self.content_manager = Contents(self.mysql_client)
			
 
				         self.chunk_manager = ContentChunks(self.mysql_client)
			
 
				 
			
 
				-    async def _chunk_each_content(
			
 
				-        self,
			
 
				-        doc_id: str,
			
 
				-        text: str,
			
 
				-        text_type: int,
			
 
				-        title: str,
			
 
				-        dataset_id: int,
			
 
				-        re_chunk: bool,
			
 
				-    ) -> List[Chunk]:
			
 
				+    async def _chunk_each_content(self, doc_id: str, data: dict) -> List[Chunk]:
			
 
				+        title, text = data.get("title", "").strip(), data["text"].strip()
			
 
				+        text_type = data.get("text_type", 1)
			
 
				+        dataset_id = data.get("dataset_id", 0)  # 默认知识库 id 为 0
			
 
				+        re_chunk = data.get("re_chunk", False)
			
 
				+        dont_chunk = data.get("dont_chunk", False)
			
 
				         if re_chunk:
			
 
				             await self.content_manager.update_content_info(
			
 
				                 doc_id=doc_id,
			
@@ -53,7 +51,7 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
 
				         if not flag:
			
 
				             return []
			
 
				         else:
			
 
				-            raw_chunks = await self.chunk(text, text_type, dataset_id)
			
 
				+            raw_chunks = await self.chunk(text, text_type, dataset_id, dont_chunk)
			
 
				             if not raw_chunks:
			
 
				                 await self.content_manager.update_content_status(
			
 
				                     doc_id=doc_id,
			
@@ -215,26 +213,20 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
 
				 
			
 
				     async def deal(self, data):
			
 
				         text = data.get("text", "")
			
 
				-        title = data.get("title", "")
			
 
				-        text, title = text.strip(), title.strip()
			
 
				-        text_type = data.get("text_type", 1)
			
 
				-        dataset_id = data.get("dataset_id", 0)  # 默认知识库 id 为 0
			
 
				-        re_chunk = data.get("re_chunk", False)
			
 
				-
			
 
				-        if not text:
			
 
				-            return None
			
 
				+        dont_chunk = data.get("dont_chunk", False)
			
 
				+        # 如果无需分块，判断text 长度
			
 
				+        if dont_chunk and num_tokens(text) >= self.max_tokens:
			
 
				+            return {"error": "文档超多模型支持的最大吞吐量"}
			
 
				 
			
 
				         self.init_processer()
			
 
				 
			
 
				         async def _process():
			
 
				-            chunks = await self._chunk_each_content(
			
 
				-                self.doc_id, text, text_type, title, dataset_id, re_chunk
			
 
				-            )
			
 
				+            chunks = await self._chunk_each_content(self.doc_id, data)
			
 
				             if not chunks:
			
 
				                 return
			
 
				 
			
 
				             # # dev
			
 
				-            # for chunk in chunks:
			
 
				+            # for chunk in tqdm(chunks):
			
 
				             #     await self.save_each_chunk(chunk)
			
 
				 
			
 
				             await run_tasks_with_asyncio_task_group(
			
--- a/applications/config/base_chunk.py
+++ b/applications/config/base_chunk.py
@@ -25,7 +25,7 @@ class Chunk:
 
				 @dataclass
			
 
				 class ChunkerConfig:
			
 
				     target_tokens: int = 256
			
 
				-    max_tokens: int = 512
			
 
				+    max_tokens: int = 2048
			
 
				     min_tokens: int = 64
			
 
				     boundary_threshold: float = 0.8
			
 
				     min_sent_per_chunk: int = 3
			
--- a/applications/utils/chunks/topic_aware_chunking.py
+++ b/applications/utils/chunks/topic_aware_chunking.py
@@ -31,8 +31,15 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
 
				             embs.append(np.array(e, dtype=np.float32))
			
 
				         return np.stack(embs)
			
 
				 
			
 
				-    async def _raw_chunk(self, text: str) -> Dict[str, Any]:
			
 
				+    async def _raw_chunk(self, text: str, dont_chunk: bool) -> Dict[str, Any]:
			
 
				         # sentence_list = self.jieba_sent_tokenize(text)
			
 
				+        if dont_chunk:
			
 
				+            return {
			
 
				+                "sentence_list": [text],
			
 
				+                "boundaries": [],
			
 
				+                "embeddings": await self._encode_batch([text]),
			
 
				+            }
			
 
				+
			
 
				         sentence_list = self.lang_chain_tokenize(text)
			
 
				         if not sentence_list:
			
 
				             return {}
			
@@ -154,8 +161,10 @@ class TopicAwarePackerV2(TopicAwareChunker):
 
				 
			
 
				         return chunks
			
 
				 
			
 
				-    async def chunk(self, text: str, text_type: int, dataset_id: int) -> List[Chunk]:
			
 
				-        raw_info = await self._raw_chunk(text)
			
 
				+    async def chunk(
			
 
				+        self, text: str, text_type: int, dataset_id: int, dont_chunk: bool
			
 
				+    ) -> List[Chunk]:
			
 
				+        raw_info = await self._raw_chunk(text, dont_chunk)
			
 
				         if not raw_info:
			
 
				             return []
			
 
				 
			
--- a/applications/utils/nlp/split_text_into_sentences.py
+++ b/applications/utils/nlp/split_text_into_sentences.py
@@ -13,6 +13,6 @@ class SplitTextIntoSentences:
 
				 
			
 
				     @staticmethod
			
 
				     def lang_chain_tokenize(text: str) -> List[str]:
			
 
				-        splitter = RecursiveCharacterTextSplitter(chunk_size=64, chunk_overlap=16)
			
 
				+        splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=16)
			
 
				         docs = splitter.split_text(text)
			
 
				         return docs
			
--- a/routes/buleprint.py
+++ b/routes/buleprint.py
@@ -56,8 +56,8 @@ async def delete():
 
				     if not level or not params:
			
 
				         return jsonify({"error": "error  level or params"})
			
 
				     resource = get_resource_manager()
			
 
				-    delete_task = DeleteTask(resource)
			
 
				-    response = await delete_task.deal(level, params)
			
 
				+    del_task = DeleteTask(resource)
			
 
				+    response = await del_task.deal(level, params)
			
 
				     return jsonify(response)