|
@@ -29,11 +29,26 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
|
|
self.chunk_manager = ContentChunks(self.mysql_client)
|
|
self.chunk_manager = ContentChunks(self.mysql_client)
|
|
|
|
|
|
async def _chunk_each_content(
|
|
async def _chunk_each_content(
|
|
- self, doc_id: str, text: str, text_type: int, title: str, dataset_id: int
|
|
|
|
|
|
+ self,
|
|
|
|
+ doc_id: str,
|
|
|
|
+ text: str,
|
|
|
|
+ text_type: int,
|
|
|
|
+ title: str,
|
|
|
|
+ dataset_id: int,
|
|
|
|
+ re_chunk: bool,
|
|
) -> List[Chunk]:
|
|
) -> List[Chunk]:
|
|
- flag = await self.content_manager.insert_content(
|
|
|
|
- doc_id, text, text_type, title, dataset_id
|
|
|
|
- )
|
|
|
|
|
|
+ if re_chunk:
|
|
|
|
+ flag = await self.content_manager.update_content_info(
|
|
|
|
+ doc_id=doc_id,
|
|
|
|
+ text=text,
|
|
|
|
+ text_type=text_type,
|
|
|
|
+ title=title,
|
|
|
|
+ dataset_id=dataset_id,
|
|
|
|
+ )
|
|
|
|
+ else:
|
|
|
|
+ flag = await self.content_manager.insert_content(
|
|
|
|
+ doc_id, text, text_type, title, dataset_id
|
|
|
|
+ )
|
|
if not flag:
|
|
if not flag:
|
|
return []
|
|
return []
|
|
else:
|
|
else:
|
|
@@ -203,6 +218,8 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
|
|
text, title = text.strip(), title.strip()
|
|
text, title = text.strip(), title.strip()
|
|
text_type = data.get("text_type", 1)
|
|
text_type = data.get("text_type", 1)
|
|
dataset_id = data.get("dataset_id", 0) # 默认知识库 id 为 0
|
|
dataset_id = data.get("dataset_id", 0) # 默认知识库 id 为 0
|
|
|
|
+ re_chunk = data.get("re_chunk", False)
|
|
|
|
+
|
|
if not text:
|
|
if not text:
|
|
return None
|
|
return None
|
|
|
|
|
|
@@ -210,7 +227,7 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
|
|
|
|
|
|
async def _process():
|
|
async def _process():
|
|
chunks = await self._chunk_each_content(
|
|
chunks = await self._chunk_each_content(
|
|
- self.doc_id, text, text_type, title, dataset_id
|
|
|
|
|
|
+ self.doc_id, text, text_type, title, dataset_id, re_chunk
|
|
)
|
|
)
|
|
if not chunks:
|
|
if not chunks:
|
|
return
|
|
return
|