3 週間前 · 6e820112bc
--- a/applications/async_task/chunk_task.py
+++ b/applications/async_task/chunk_task.py
@@ -273,43 +273,28 @@ class ChunkBooksTask(ChunkEmbeddingTask):
 
				     async def _process_each_book(self, book_id):
			
 
				         result = await self.book_manager.get_book_extract_detail(book_id=book_id)
			
 
				         extract_result = result[0]["extract_result"]
			
 
				-        book_name = result[0]["book_name"]
			
 
				-        book_oss_path = result[0]["book_oss_path"]
			
 
				         book_texts = [
			
 
				             i["text"] for i in json.loads(extract_result) if i["type"] == "text"
			
 
				         ]
			
 
				-
			
 
				-        # first insert into contents
			
 
				-        flag = await self.content_manager.insert_content(
			
 
				-            self.doc_id,
			
 
				-            book_oss_path,
			
 
				-            self.BOOK_PDF_TYPE,
			
 
				-            book_name,
			
 
				-            self.BOOK_PDF_DATASET_ID,
			
 
				-            ext=None,
			
 
				+        raw_chunks = await self.chunk_books(
			
 
				+            sentence_list=book_texts,
			
 
				+            text_type=self.BOOK_PDF_TYPE,
			
 
				+            dataset_id=self.BOOK_PDF_DATASET_ID,
			
 
				         )
			
 
				-        if not flag:
			
 
				-            return []
			
 
				-        else:
			
 
				-            raw_chunks = await self.chunk_books(
			
 
				-                sentence_list=book_texts,
			
 
				-                text_type=self.BOOK_PDF_TYPE,
			
 
				-                dataset_id=self.BOOK_PDF_DATASET_ID,
			
 
				-            )
			
 
				-            if not raw_chunks:
			
 
				-                await self.content_manager.update_content_status(
			
 
				-                    doc_id=self.doc_id,
			
 
				-                    ori_status=self.INIT_STATUS,
			
 
				-                    new_status=self.FAILED_STATUS,
			
 
				-                )
			
 
				-                return []
			
 
				-
			
 
				+        if not raw_chunks:
			
 
				             await self.content_manager.update_content_status(
			
 
				                 doc_id=self.doc_id,
			
 
				                 ori_status=self.INIT_STATUS,
			
 
				-                new_status=self.PROCESSING_STATUS,
			
 
				+                new_status=self.FAILED_STATUS,
			
 
				             )
			
 
				-            return raw_chunks
			
 
				+            return []
			
 
				+
			
 
				+        await self.content_manager.update_content_status(
			
 
				+            doc_id=self.doc_id,
			
 
				+            ori_status=self.INIT_STATUS,
			
 
				+            new_status=self.PROCESSING_STATUS,
			
 
				+        )
			
 
				+        return raw_chunks
			
 
				 
			
 
				     async def deal(self, data):
			
 
				         book_id = data.get("book_id", None)
			
--- a/applications/utils/mysql/books.py
+++ b/applications/utils/mysql/books.py
@@ -24,18 +24,18 @@ class Books(BaseMySQLClient):
 
				             query=query, params=(new_status, book_id, ori_status)
			
 
				         )
			
 
				 
			
 
				-    async def insert_book(self, book_id, book_name, book_oss_path):
			
 
				+    async def insert_book(self, book_id, book_name, book_oss_path, doc_id):
			
 
				         query = """
			
 
				-            INSERT INTO books (book_id, book_name, book_oss_path)
			
 
				-             VALUES (%s, %s, %s);
			
 
				+            INSERT INTO books (book_id, book_name, book_oss_path, doc_id)
			
 
				+             VALUES (%s, %s, %s, %s);
			
 
				         """
			
 
				         return await self.pool.async_save(
			
 
				-            query=query, params=(book_id, book_name, book_oss_path)
			
 
				+            query=query, params=(book_id, book_name, book_oss_path, doc_id)
			
 
				         )
			
 
				 
			
 
				     async def select_init_books(self):
			
 
				         query = """
			
 
				-            SELECT book_id, book_name, book_oss_path, extract_status
			
 
				+            SELECT book_id, book_name, book_oss_path, extract_status, doc_id
			
 
				             FROM books
			
 
				             WHERE extract_status = 0;
			
 
				         """
			
--- a/applications/utils/oss/oss_client.py
+++ b/applications/utils/oss/oss_client.py
@@ -88,3 +88,21 @@ class OSSClient:
 
				             return False
			
 
				         except Exception as e:
			
 
				             raise Exception(f"Error checking file existence on OSS: {str(e)}")
			
 
				+
			
 
				+    def generate_url(self, oss_file_path, expire=3600, inline=True):
			
 
				+        if not oss_file_path:
			
 
				+            return ""
			
 
				+        try:
			
 
				+            headers = {}
			
 
				+            if inline:
			
 
				+                headers["response-content-disposition"] = "inline"
			
 
				+            url = self.bucket.sign_url("GET", oss_file_path, expire, headers=headers)
			
 
				+            return url
			
 
				+        except Exception as e:
			
 
				+            raise Exception(f"Error generating URL for OSS file: {str(e)}")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    oss = OSSClient()
			
 
				+    url = oss.generate_url("rag/pdfs/book-9e1babe8-c8fb-4740-9008-c495837cfbae.pdf")
			
 
				+    print(url)
			
--- a/applications/utils/task/async_task.py
+++ b/applications/utils/task/async_task.py
@@ -26,6 +26,7 @@ async def handle_books():
 
				 
			
 
				         for book in books:
			
 
				             book_id = book.get("book_id")
			
 
				+            doc_id = book.get("doc_id")
			
 
				             # 获取提取状态
			
 
				             extract_status = (await books_mapper.select_book_extract_status(book_id))[
			
 
				                 0
			
@@ -34,10 +35,10 @@ async def handle_books():
 
				             if extract_status == 0:
			
 
				                 # 更新提取状态为处理中
			
 
				                 await books_mapper.update_book_extract_status(book_id, 1)
			
 
				-                book_path = os.path.join("/tmp", book_id)
			
 
				+                book_path = os.path.join("/tmp", book_id + ".pdf")
			
 
				 
			
 
				                 if not os.path.exists(book_path):
			
 
				-                    oss_path = f"rag/pdfs/{book_id}"
			
 
				+                    oss_path = f"rag/pdfs/{book_id}.pdf"
			
 
				                     try:
			
 
				                         # 下载书籍文件
			
 
				                         await oss_client.download_file(oss_path, book_path)
			
@@ -56,7 +57,10 @@ async def handle_books():
 
				                         if content_list:
			
 
				                             # 更新提取结果
			
 
				                             await books_mapper.update_book_extract_result(
			
 
				-                                book_id, content_list
			
 
				+                                book_id,
			
 
				+                                json.dumps(
			
 
				+                                    json.loads(content_list), ensure_ascii=False
			
 
				+                                ),
			
 
				                             )
			
 
				 
			
 
				                 except Exception as e:
			
@@ -64,7 +68,6 @@ async def handle_books():
 
				                     continue  # 如果提取过程失败，跳过该书籍
			
 
				 
			
 
				                 # 创建文档 ID
			
 
				-                doc_id = f"doc-{uuid.uuid4()}"
			
 
				                 chunk_task = ChunkBooksTask(doc_id=doc_id, resource=resource)
			
 
				 
			
 
				                 # 处理分片任务
			
@@ -185,6 +188,7 @@ async def query_search(
 
				     resource = get_resource_manager()
			
 
				     content_chunk_mapper = ContentChunks(resource.mysql_client)
			
 
				     res = []
			
 
				+    print(json.dumps(response["results"], ensure_ascii=False, indent=2))
			
 
				     for result in response["results"]:
			
 
				         content_chunks = await content_chunk_mapper.select_chunk_content(
			
 
				             doc_id=result["doc_id"], chunk_id=result["chunk_id"]
			
--- a/routes/blueprint.py
+++ b/routes/blueprint.py
@@ -74,6 +74,11 @@ async def chunk():
 
				     body = await request.get_json()
			
 
				     text = body.get("text", "")
			
 
				     ori_doc_id = body.get("doc_id")
			
 
				+    is_web = body.get("is_web")
			
 
				+    if is_web:
			
 
				+        dataset_id = body.get("dataset_id", 0)
			
 
				+        if dataset_id == 12 or dataset_id == 11:
			
 
				+            return jsonify({"error": "系统知识库不支持手动添加"})
			
 
				     text = text.strip()
			
 
				     if not text:
			
 
				         return jsonify({"error": "error  text"})
			
@@ -249,7 +254,7 @@ async def get_content():
 
				         return jsonify({"status_code": 404, "detail": "content not found", "data": {}})
			
 
				 
			
 
				     row = rows[0]
			
 
				-
			
 
				+    oss_client = OSSClient()
			
 
				     return jsonify(
			
 
				         {
			
 
				             "status_code": 200,
			
@@ -258,6 +263,10 @@ async def get_content():
 
				                 "title": row.get("title", ""),
			
 
				                 "text": row.get("text", ""),
			
 
				                 "doc_id": row.get("doc_id", ""),
			
 
				+                "textType": row.get("text_type"),
			
 
				+                "url": oss_client.generate_url(row.get("text"))
			
 
				+                if row.get("text_type") == 3
			
 
				+                else "",
			
 
				             },
			
 
				         }
			
 
				     )
			
@@ -291,7 +300,7 @@ async def content_list():
 
				         doc_status=doc_status,
			
 
				         order_by=order_by,
			
 
				     )
			
 
				-
			
 
				+    oss_client = OSSClient()
			
 
				     # 格式化 entities，只保留必要字段
			
 
				     entities = [
			
 
				         {
			
@@ -299,6 +308,10 @@ async def content_list():
 
				             "title": row.get("title") or "",
			
 
				             "text": row.get("text") or "",
			
 
				             "statusDesc": "可用" if row.get("status") == 2 else "不可用",
			
 
				+            "textType": row.get("text_type"),
			
 
				+            "url": oss_client.generate_url(row.get("text"))
			
 
				+            if row.get("text_type") == 3
			
 
				+            else "",
			
 
				         }
			
 
				         for row in result["entities"]
			
 
				     ]
			
@@ -520,11 +533,8 @@ async def upload_pdf():
 
				         # 检查文件扩展名是否是 .pdf
			
 
				         if not file.filename.lower().endswith(".pdf"):
			
 
				             return jsonify(
			
 
				-                {
			
 
				-                    "status": "error",
			
 
				-                    "message": "Invalid file format. Only PDF files are allowed.",
			
 
				-                }
			
 
				-            ), 400
			
 
				+                {"status_code": 400, "detail": "Only PDF files are allowed."}
			
 
				+            )
			
 
				 
			
 
				         # 获取文件名
			
 
				         filename = file.filename
			
@@ -533,24 +543,28 @@ async def upload_pdf():
 
				         # 检查文件的 MIME 类型是否是 application/pdf
			
 
				         if file.content_type != "application/pdf":
			
 
				             return jsonify(
			
 
				-                {
			
 
				-                    "status": "error",
			
 
				-                    "message": "Invalid MIME type. Only PDF files are allowed.",
			
 
				-                }
			
 
				-            ), 400
			
 
				+                {"status_code": 400, "detail": "Only PDF files are allowed."}
			
 
				+            )
			
 
				 
			
 
				         # 保存到本地（可选，视需要）
			
 
				-        file_path = os.path.join("/tmp", book_id)  # 临时存储路径
			
 
				+        file_path = os.path.join("/tmp", book_id + ".pdf")  # 临时存储路径
			
 
				         await file.save(file_path)
			
 
				         resource = get_resource_manager()
			
 
				         books = Books(resource.mysql_client)
			
 
				+        content_manager = Contents(resource.mysql_client)
			
 
				         # 上传到 OSS
			
 
				         try:
			
 
				             oss_client = OSSClient()
			
 
				             # 上传文件到 OSS
			
 
				-            oss_path = f"rag/pdfs/{book_id}"
			
 
				+            oss_path = f"rag/pdfs/{book_id}.pdf"
			
 
				             oss_client.upload_file(file_path, oss_path)
			
 
				-            await books.insert_book(book_id, filename, oss_path)
			
 
				+            doc_id = f"doc-{uuid.uuid4()}"
			
 
				+            BOOK_PDF_TYPE = 3
			
 
				+            BOOK_PDF_DATASET_ID = 21
			
 
				+            await content_manager.insert_content(
			
 
				+                doc_id, oss_path, BOOK_PDF_TYPE, filename, BOOK_PDF_DATASET_ID, None
			
 
				+            )
			
 
				+            await books.insert_book(book_id, filename, oss_path, doc_id)
			
 
				             return jsonify({"status_code": 200, "detail": "success"})
			
 
				         except Exception as e:
			
 
				             return jsonify({"status_code": 500, "detail": str(e)})