瀏覽代碼

修改上传书籍

xueyiming 1 月之前
父節點
當前提交
2d00c0b1d2
共有 4 個文件被更改,包括 38 次插入和 37 次删除
  1. + 14 - 29
      applications/async_task/chunk_task.py
  2. + 5 - 5
      applications/utils/mysql/books.py
  3. + 6 - 2
      applications/utils/task/async_task.py
  4. + 13 - 1
      routes/blueprint.py

+ 14 - 29
applications/async_task/chunk_task.py

@@ -273,43 +273,28 @@ class ChunkBooksTask(ChunkEmbeddingTask):
     async def _process_each_book(self, book_id):
         result = await self.book_manager.get_book_extract_detail(book_id=book_id)
         extract_result = result[0]["extract_result"]
-        book_name = result[0]["book_name"]
-        book_oss_path = result[0]["book_oss_path"]
         book_texts = [
             i["text"] for i in json.loads(extract_result) if i["type"] == "text"
         ]
-
-        # first insert into contents
-        flag = await self.content_manager.insert_content(
-            self.doc_id,
-            book_oss_path,
-            self.BOOK_PDF_TYPE,
-            book_name,
-            self.BOOK_PDF_DATASET_ID,
-            ext=None,
+        raw_chunks = await self.chunk_books(
+            sentence_list=book_texts,
+            text_type=self.BOOK_PDF_TYPE,
+            dataset_id=self.BOOK_PDF_DATASET_ID,
         )
-        if not flag:
-            return []
-        else:
-            raw_chunks = await self.chunk_books(
-                sentence_list=book_texts,
-                text_type=self.BOOK_PDF_TYPE,
-                dataset_id=self.BOOK_PDF_DATASET_ID,
-            )
-            if not raw_chunks:
-                await self.content_manager.update_content_status(
-                    doc_id=self.doc_id,
-                    ori_status=self.INIT_STATUS,
-                    new_status=self.FAILED_STATUS,
-                )
-                return []
-
+        if not raw_chunks:
             await self.content_manager.update_content_status(
                 doc_id=self.doc_id,
                 ori_status=self.INIT_STATUS,
-                new_status=self.PROCESSING_STATUS,
+                new_status=self.FAILED_STATUS,
             )
-            return raw_chunks
+            return []
+
+        await self.content_manager.update_content_status(
+            doc_id=self.doc_id,
+            ori_status=self.INIT_STATUS,
+            new_status=self.PROCESSING_STATUS,
+        )
+        return raw_chunks
 
     async def deal(self, data):
         book_id = data.get("book_id", None)

+ 5 - 5
applications/utils/mysql/books.py

@@ -24,18 +24,18 @@ class Books(BaseMySQLClient):
             query=query, params=(new_status, book_id, ori_status)
         )
 
-    async def insert_book(self, book_id, book_name, book_oss_path):
+    async def insert_book(self, book_id, book_name, book_oss_path, doc_id):
         query = """
-            INSERT INTO books (book_id, book_name, book_oss_path)
-             VALUES (%s, %s, %s);
+            INSERT INTO books (book_id, book_name, book_oss_path, doc_id)
+             VALUES (%s, %s, %s, %s);
         """
         return await self.pool.async_save(
-            query=query, params=(book_id, book_name, book_oss_path)
+            query=query, params=(book_id, book_name, book_oss_path, doc_id)
         )
 
     async def select_init_books(self):
         query = """
-            SELECT book_id, book_name, book_oss_path, extract_status
+            SELECT book_id, book_name, book_oss_path, extract_status, doc_id
             FROM books
             WHERE extract_status = 0;
         """

+ 6 - 2
applications/utils/task/async_task.py

@@ -26,6 +26,7 @@ async def handle_books():
 
         for book in books:
             book_id = book.get("book_id")
+            doc_id = book.get("doc_id")
             # 获取提取状态
             extract_status = (await books_mapper.select_book_extract_status(book_id))[
                 0
@@ -56,7 +57,10 @@ async def handle_books():
                         if content_list:
                             # 更新提取结果
                             await books_mapper.update_book_extract_result(
-                                book_id, content_list
+                                book_id,
+                                json.dumps(
+                                    json.loads(content_list), ensure_ascii=False
+                                ),
                             )
 
                 except Exception as e:
@@ -64,7 +68,6 @@ async def handle_books():
                     continue  # 如果提取过程失败,跳过该书籍
 
                 # 创建文档 ID
-                doc_id = f"doc-{uuid.uuid4()}"
                 chunk_task = ChunkBooksTask(doc_id=doc_id, resource=resource)
 
                 # 处理分片任务
@@ -185,6 +188,7 @@ async def query_search(
     resource = get_resource_manager()
     content_chunk_mapper = ContentChunks(resource.mysql_client)
     res = []
+    print(json.dumps(response["results"], ensure_ascii=False, indent=2))
     for result in response["results"]:
         content_chunks = await content_chunk_mapper.select_chunk_content(
             doc_id=result["doc_id"], chunk_id=result["chunk_id"]

+ 13 - 1
routes/blueprint.py

@@ -74,6 +74,11 @@ async def chunk():
     body = await request.get_json()
     text = body.get("text", "")
     ori_doc_id = body.get("doc_id")
+    is_web = body.get("is_web")
+    if is_web:
+        dataset_id = body.get("dataset_id", 0)
+        if dataset_id == 12 or dataset_id == 11:
+            return jsonify({"error": "系统知识库不支持手动添加"})
     text = text.strip()
     if not text:
         return jsonify({"error": "error  text"})
@@ -546,13 +551,20 @@ async def upload_pdf():
         await file.save(file_path)
         resource = get_resource_manager()
         books = Books(resource.mysql_client)
+        content_manager = Contents(resource.mysql_client)
         # 上传到 OSS
         try:
             oss_client = OSSClient()
             # 上传文件到 OSS
             oss_path = f"rag/pdfs/{book_id}.pdf"
             oss_client.upload_file(file_path, oss_path)
-            await books.insert_book(book_id, filename, oss_path)
+            doc_id = f"doc-{uuid.uuid4()}"
+            BOOK_PDF_TYPE = 3
+            BOOK_PDF_DATASET_ID = 21
+            await content_manager.insert_content(
+                doc_id, oss_path, BOOK_PDF_TYPE, filename, BOOK_PDF_DATASET_ID, None
+            )
+            await books.insert_book(book_id, filename, oss_path, doc_id)
             return jsonify({"status_code": 200, "detail": "success"})
         except Exception as e:
             return jsonify({"status_code": 500, "detail": str(e)})