|
@@ -273,43 +273,28 @@ class ChunkBooksTask(ChunkEmbeddingTask):
|
|
|
async def _process_each_book(self, book_id):
|
|
async def _process_each_book(self, book_id):
|
|
|
result = await self.book_manager.get_book_extract_detail(book_id=book_id)
|
|
result = await self.book_manager.get_book_extract_detail(book_id=book_id)
|
|
|
extract_result = result[0]["extract_result"]
|
|
extract_result = result[0]["extract_result"]
|
|
|
- book_name = result[0]["book_name"]
|
|
|
|
|
- book_oss_path = result[0]["book_oss_path"]
|
|
|
|
|
book_texts = [
|
|
book_texts = [
|
|
|
i["text"] for i in json.loads(extract_result) if i["type"] == "text"
|
|
i["text"] for i in json.loads(extract_result) if i["type"] == "text"
|
|
|
]
|
|
]
|
|
|
-
|
|
|
|
|
- # first insert into contents
|
|
|
|
|
- flag = await self.content_manager.insert_content(
|
|
|
|
|
- self.doc_id,
|
|
|
|
|
- book_oss_path,
|
|
|
|
|
- self.BOOK_PDF_TYPE,
|
|
|
|
|
- book_name,
|
|
|
|
|
- self.BOOK_PDF_DATASET_ID,
|
|
|
|
|
- ext=None,
|
|
|
|
|
|
|
+ raw_chunks = await self.chunk_books(
|
|
|
|
|
+ sentence_list=book_texts,
|
|
|
|
|
+ text_type=self.BOOK_PDF_TYPE,
|
|
|
|
|
+ dataset_id=self.BOOK_PDF_DATASET_ID,
|
|
|
)
|
|
)
|
|
|
- if not flag:
|
|
|
|
|
- return []
|
|
|
|
|
- else:
|
|
|
|
|
- raw_chunks = await self.chunk_books(
|
|
|
|
|
- sentence_list=book_texts,
|
|
|
|
|
- text_type=self.BOOK_PDF_TYPE,
|
|
|
|
|
- dataset_id=self.BOOK_PDF_DATASET_ID,
|
|
|
|
|
- )
|
|
|
|
|
- if not raw_chunks:
|
|
|
|
|
- await self.content_manager.update_content_status(
|
|
|
|
|
- doc_id=self.doc_id,
|
|
|
|
|
- ori_status=self.INIT_STATUS,
|
|
|
|
|
- new_status=self.FAILED_STATUS,
|
|
|
|
|
- )
|
|
|
|
|
- return []
|
|
|
|
|
-
|
|
|
|
|
|
|
+ if not raw_chunks:
|
|
|
await self.content_manager.update_content_status(
|
|
await self.content_manager.update_content_status(
|
|
|
doc_id=self.doc_id,
|
|
doc_id=self.doc_id,
|
|
|
ori_status=self.INIT_STATUS,
|
|
ori_status=self.INIT_STATUS,
|
|
|
- new_status=self.PROCESSING_STATUS,
|
|
|
|
|
|
|
+ new_status=self.FAILED_STATUS,
|
|
|
)
|
|
)
|
|
|
- return raw_chunks
|
|
|
|
|
|
|
+ return []
|
|
|
|
|
+
|
|
|
|
|
+ await self.content_manager.update_content_status(
|
|
|
|
|
+ doc_id=self.doc_id,
|
|
|
|
|
+ ori_status=self.INIT_STATUS,
|
|
|
|
|
+ new_status=self.PROCESSING_STATUS,
|
|
|
|
|
+ )
|
|
|
|
|
+ return raw_chunks
|
|
|
|
|
|
|
|
async def deal(self, data):
|
|
async def deal(self, data):
|
|
|
book_id = data.get("book_id", None)
|
|
book_id = data.get("book_id", None)
|