Jelajahi Sumber

新增 rechunk 功能

luojunhui 2 minggu lalu
induk
melakukan
74add9a108

+ 5 - 1
applications/async_task/chunk_task.py

@@ -38,21 +38,24 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
         re_chunk: bool,
     ) -> List[Chunk]:
         if re_chunk:
-            flag = await self.content_manager.update_content_info(
+            await self.content_manager.update_content_info(
                 doc_id=doc_id,
                 text=text,
                 text_type=text_type,
                 title=title,
                 dataset_id=dataset_id,
             )
+            flag = True
         else:
             flag = await self.content_manager.insert_content(
                 doc_id, text, text_type, title, dataset_id
             )
+        print(flag)
         if not flag:
             return []
         else:
             raw_chunks = await self.chunk(text, text_type, dataset_id)
+            print(raw_chunks)
             if not raw_chunks:
                 await self.content_manager.update_content_status(
                     doc_id=doc_id,
@@ -229,6 +232,7 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
             chunks = await self._chunk_each_content(
                 self.doc_id, text, text_type, title, dataset_id, re_chunk
             )
+            print(chunks)
             if not chunks:
                 return
 

+ 1 - 1
applications/utils/chunks/topic_aware_chunking.py

@@ -37,7 +37,7 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
             return {}
 
         sentences_embeddings = await self._encode_batch(sentence_list)
-        boundaries = self.detect_boundaries(sentence_list, sentences_embeddings)
+        boundaries = self.detect_boundaries(sentence_list, sentences_embeddings, True)
         return {
             "sentence_list": sentence_list,
             "boundaries": boundaries,