Преглед изворног кода

Merge branch 'feature/luojunhui/2025-09-28-chunk-improve' of Server/rag_server into master

luojunhui пре 1 недеља
родитељ
комит
79bd8d779c

+ 14 - 22
applications/async_task/chunk_task.py

@@ -6,6 +6,7 @@ from applications.utils.async_utils import run_tasks_with_asyncio_task_group
 from applications.utils.chunks import LLMClassifier, TopicAwarePackerV2
 from applications.utils.milvus import async_insert_chunk
 from applications.utils.mysql import ContentChunks, Contents
+from applications.utils.nlp import num_tokens
 from applications.config import Chunk, DEFAULT_MODEL
 from applications.config import ELASTIC_SEARCH_INDEX
 
@@ -28,15 +29,12 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
         self.content_manager = Contents(self.mysql_client)
         self.chunk_manager = ContentChunks(self.mysql_client)
 
-    async def _chunk_each_content(
-        self,
-        doc_id: str,
-        text: str,
-        text_type: int,
-        title: str,
-        dataset_id: int,
-        re_chunk: bool,
-    ) -> List[Chunk]:
+    async def _chunk_each_content(self, doc_id: str, data: dict) -> List[Chunk]:
+        title, text = data.get("title", "").strip(), data["text"].strip()
+        text_type = data.get("text_type", 1)
+        dataset_id = data.get("dataset_id", 0)  # 默认知识库 id 为 0
+        re_chunk = data.get("re_chunk", False)
+        dont_chunk = data.get("dont_chunk", False)
         if re_chunk:
             await self.content_manager.update_content_info(
                 doc_id=doc_id,
@@ -53,7 +51,7 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
         if not flag:
             return []
         else:
-            raw_chunks = await self.chunk(text, text_type, dataset_id)
+            raw_chunks = await self.chunk(text, text_type, dataset_id, dont_chunk)
             if not raw_chunks:
                 await self.content_manager.update_content_status(
                     doc_id=doc_id,
@@ -215,26 +213,20 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
 
     async def deal(self, data):
         text = data.get("text", "")
-        title = data.get("title", "")
-        text, title = text.strip(), title.strip()
-        text_type = data.get("text_type", 1)
-        dataset_id = data.get("dataset_id", 0)  # 默认知识库 id 为 0
-        re_chunk = data.get("re_chunk", False)
-
-        if not text:
-            return None
+        dont_chunk = data.get("dont_chunk", False)
+        # 如果无需分块,判断text 长度
+        if dont_chunk and num_tokens(text) >= self.max_tokens:
+            return {"error": "文档超多模型支持的最大吞吐量"}
 
         self.init_processer()
 
         async def _process():
-            chunks = await self._chunk_each_content(
-                self.doc_id, text, text_type, title, dataset_id, re_chunk
-            )
+            chunks = await self._chunk_each_content(self.doc_id, data)
             if not chunks:
                 return
 
             # # dev
-            # for chunk in chunks:
+            # for chunk in tqdm(chunks):
             #     await self.save_each_chunk(chunk)
 
             await run_tasks_with_asyncio_task_group(

+ 1 - 1
applications/config/base_chunk.py

@@ -25,7 +25,7 @@ class Chunk:
 @dataclass
 class ChunkerConfig:
     target_tokens: int = 256
-    max_tokens: int = 512
+    max_tokens: int = 2048
     min_tokens: int = 64
     boundary_threshold: float = 0.8
     min_sent_per_chunk: int = 3

+ 12 - 3
applications/utils/chunks/topic_aware_chunking.py

@@ -31,8 +31,15 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
             embs.append(np.array(e, dtype=np.float32))
         return np.stack(embs)
 
-    async def _raw_chunk(self, text: str) -> Dict[str, Any]:
+    async def _raw_chunk(self, text: str, dont_chunk: bool) -> Dict[str, Any]:
         # sentence_list = self.jieba_sent_tokenize(text)
+        if dont_chunk:
+            return {
+                "sentence_list": [text],
+                "boundaries": [],
+                "embeddings": await self._encode_batch([text]),
+            }
+
         sentence_list = self.lang_chain_tokenize(text)
         if not sentence_list:
             return {}
@@ -154,8 +161,10 @@ class TopicAwarePackerV2(TopicAwareChunker):
 
         return chunks
 
-    async def chunk(self, text: str, text_type: int, dataset_id: int) -> List[Chunk]:
-        raw_info = await self._raw_chunk(text)
+    async def chunk(
+        self, text: str, text_type: int, dataset_id: int, dont_chunk: bool
+    ) -> List[Chunk]:
+        raw_info = await self._raw_chunk(text, dont_chunk)
         if not raw_info:
             return []
 

+ 1 - 1
applications/utils/nlp/split_text_into_sentences.py

@@ -13,6 +13,6 @@ class SplitTextIntoSentences:
 
     @staticmethod
     def lang_chain_tokenize(text: str) -> List[str]:
-        splitter = RecursiveCharacterTextSplitter(chunk_size=64, chunk_overlap=16)
+        splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=16)
         docs = splitter.split_text(text)
         return docs

+ 2 - 2
routes/buleprint.py

@@ -56,8 +56,8 @@ async def delete():
     if not level or not params:
         return jsonify({"error": "error  level or params"})
     resource = get_resource_manager()
-    delete_task = DeleteTask(resource)
-    response = await delete_task.deal(level, params)
+    del_task = DeleteTask(resource)
+    response = await del_task.deal(level, params)
     return jsonify(response)