
pdf-chunking method

luojunhui · 1 day ago
Commit 325726dff0

+ 8 - 1
applications/async_task/__init__.py

@@ -1,7 +1,14 @@
 from .chunk_task import ChunkEmbeddingTask
+from .chunk_task import ChunkBooksTask
 from .delete_task import DeleteTask
 from .auto_rechunk_task import AutoRechunkTask
 from .build_graph import BuildGraph
 
 
-__all__ = ["ChunkEmbeddingTask", "DeleteTask", "AutoRechunkTask", "BuildGraph"]
+__all__ = [
+    "ChunkEmbeddingTask",
+    "DeleteTask",
+    "AutoRechunkTask",
+    "BuildGraph",
+    "ChunkBooksTask",
+]

+ 102 - 1
applications/async_task/chunk_task.py

@@ -6,7 +6,7 @@ from applications.api import get_basic_embedding
 from applications.utils.async_utils import run_tasks_with_asyncio_task_group
 from applications.utils.chunks import LLMClassifier, TopicAwarePackerV2
 from applications.utils.milvus import async_insert_chunk
-from applications.utils.mysql import ContentChunks, Contents
+from applications.utils.mysql import Books, ContentChunks, Contents
 from applications.utils.nlp import num_tokens
 from applications.config import Chunk, DEFAULT_MODEL
 from applications.config import ELASTIC_SEARCH_INDEX
@@ -17,6 +17,7 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
         super().__init__(doc_id)
         self.chunk_manager = None
         self.content_manager = None
+        self.book_manager = None
         self.mysql_client = resource.mysql_client
         self.milvus_client = resource.milvus_client
         self.es_client = resource.es_client
@@ -29,6 +30,7 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
     def init_processer(self):
         self.content_manager = Contents(self.mysql_client)
         self.chunk_manager = ContentChunks(self.mysql_client)
+        self.book_manager = Books(self.mysql_client)
 
     async def _chunk_each_content(self, doc_id: str, data: dict) -> List[Chunk]:
         title, text = data.get("title", "").strip(), data["text"].strip()
@@ -260,3 +262,102 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
 
         asyncio.create_task(_process())
         return self.doc_id
+
+
+class ChunkBooksTask(ChunkEmbeddingTask):
+    """图书类型分块任务"""
+
+    BOOK_PDF_DATASET_ID = 17
+    BOOK_PDF_TYPE = 3
+
+    async def _process_each_book(self, book_id):
+        result = await self.book_manager.get_book_extract_detail(book_id=book_id)
+        if not result:
+            return []
+        extract_result = result[0]["extract_result"]
+        book_name = result[0]["book_name"]
+        book_oss_path = result[0]["book_oss_path"]
+        book_texts = [
+            i["text"] for i in json.loads(extract_result) if i["type"] == "text"
+        ]
+
+        # first insert into contents
+        flag = await self.content_manager.insert_content(
+            self.doc_id,
+            book_oss_path,
+            self.BOOK_PDF_TYPE,
+            book_name,
+            self.BOOK_PDF_DATASET_ID,
+            ext=None,
+        )
+        if not flag:
+            return []
+        else:
+            raw_chunks = await self.chunk_books(
+                sentence_list=book_texts,
+                text_type=self.BOOK_PDF_TYPE,
+                dataset_id=self.BOOK_PDF_DATASET_ID,
+            )
+            if not raw_chunks:
+                await self.content_manager.update_content_status(
+                    doc_id=self.doc_id,
+                    ori_status=self.INIT_STATUS,
+                    new_status=self.FAILED_STATUS,
+                )
+                return []
+
+            await self.content_manager.update_content_status(
+                doc_id=self.doc_id,
+                ori_status=self.INIT_STATUS,
+                new_status=self.PROCESSING_STATUS,
+            )
+            return raw_chunks
+
+    async def deal(self, data):
+        # Tolerate a missing or non-JSON request body.
+        book_id = (data or {}).get("book_id")
+        if not book_id:
+            return {"error": "Book id should not be None"}
+
+        self.init_processer()
+
+        # LOCK
+        acquire_lock = await self.book_manager.update_book_chunk_status(
+            book_id=book_id,
+            ori_status=self.INIT_STATUS,
+            new_status=self.PROCESSING_STATUS,
+        )
+
+        if not acquire_lock:
+            return {"info": "book is already being processed or has finished"}
+
+        async def _process():
+            chunks = await self._process_each_book(book_id)
+            if not chunks:
+                return
+
+            await run_tasks_with_asyncio_task_group(
+                task_list=chunks,
+                handler=self.save_each_chunk,
+                description="process book chunks",
+                unit="chunk",
+                max_concurrency=10,
+            )
+
+            await self.content_manager.update_content_status(
+                doc_id=self.doc_id,
+                ori_status=self.PROCESSING_STATUS,
+                new_status=self.FINISHED_STATUS,
+            )
+
+            await self.book_manager.update_book_chunk_status(
+                book_id=book_id,
+                ori_status=self.PROCESSING_STATUS,
+                new_status=self.FINISHED_STATUS
+            )
+
+        asyncio.create_task(_process())
+        return self.doc_id
+
+
+__all__ = ["ChunkEmbeddingTask", "ChunkBooksTask"]
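
For reference, `ChunkBooksTask.deal()` is fire-and-forget: it claims the book row, schedules `_process()` with `asyncio.create_task`, and returns the `doc_id` immediately. Below is a minimal driver sketch; the `get_resource_manager` import path and the `book_id` value are assumptions, mirroring how `routes/blueprint.py` wires the task up.

```python
import asyncio
import uuid

from applications.async_task import ChunkBooksTask
# Import path is an assumption; routes/blueprint.py calls get_resource_manager().
from applications.resource import get_resource_manager


async def main():
    resource = get_resource_manager()
    task = ChunkBooksTask(doc_id=f"doc-{uuid.uuid4()}", resource=resource)
    # deal() returns at once; chunking continues on the event loop, so the
    # loop must stay alive (in production the Quart app keeps it running).
    doc_id = await task.deal({"book_id": 42})  # illustrative book_id
    print(doc_id)


asyncio.run(main())
```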

+ 1 - 0
applications/prompts/__init__.py

@@ -0,0 +1 @@
+from .build_graph import extract_entity_and_graph

+ 63 - 0
applications/prompts/build_graph.py

@@ -0,0 +1,63 @@
+import json
+
+
+def extract_entity_and_graph(text: str) -> str:
+    """
+    Generic knowledge-extraction prompt builder.
+    Extracts entities, relations, and concepts from arbitrary input text.
+    The prompt requests JSON output and JSON-escapes the input, so LLM calls
+    hit no format conflicts or escaping issues.
+    """
+    safe_text = json.dumps(text, ensure_ascii=False)
+    prompt = f"""
+### Role
+You are a professional knowledge-extraction assistant. Identify the key information in the input text and return a structured result.
+
+### Input text:
+{safe_text}
+
+### Extraction targets
+Extract the following three kinds of information:
+1. **entities**
+   - Concrete objects appearing in the text: people, organizations, places, technologies, products, events, and so on.
+   - Each entity must include: name, type, aliases, description.
+   - Extract entities only from the main, essential content; skip supplementary material such as examples and samples.
+
+2. **relations**
+   - Semantic links between entities.
+   - Common relation types include (but are not limited to): belongs-to, affiliated-with, depends-on, controls, cooperates-with, located-in, applied-to, develops, influences, defined-by, used-for.
+   - Each relation must include: source, target, relation_type, evidence, confidence.
+3. **concepts**
+   - Topics, core technologies, ideas, academic fields, and topical keywords the text covers.
+
+### Output format
+Output exactly the JSON structure below. Do not output any explanatory text, comments, or Markdown code blocks.
+
+{{
+  "entities": [
+    {{
+      "name": "string", "type": "string", "aliases": ["string"], "description": "string"
+    }}
+  ],
+  "relations": [
+    {{
+    "source": "string", "target": "string", "relation_type": "string", "evidence": "string", "confidence": 0.0
+    }}
+  ],
+  "concepts": ["string"]
+}}
+
+### Output rules
+1. Output strictly valid JSON that can be parsed directly;
+2. Every field must be present, even if it is an empty array;
+3. If nothing is detected, output:
+   {{
+     "entities": [],
+     "relations": [],
+     "concepts": []
+   }};
+4. `confidence` is a decimal between 0.0 and 1.0;
+5. Do not infer entities or relations that are absent from the text;
+6. When information is ambiguous, keep descriptions neutral;
+7. The output must contain no explanatory text, comments, examples, or Markdown.
+"""
+    return prompt
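
Because the prompt demands bare JSON, callers still need to defend against models that wrap replies in code fences or drop keys. A minimal parsing sketch, assuming the LLM call itself happens elsewhere (`llm_client.chat` is a placeholder, not a real API):

```python
import json

from applications.prompts import extract_entity_and_graph

EMPTY_GRAPH = {"entities": [], "relations": [], "concepts": []}
FENCE = "`" * 3  # literal triple backtick, built indirectly to keep this sample fence-safe


def parse_graph_reply(raw: str) -> dict:
    """Best-effort parse of the model's reply into the expected schema."""
    # Strip a Markdown fence in case the model ignored the no-Markdown rule.
    cleaned = raw.strip().removeprefix(FENCE + "json").removesuffix(FENCE).strip()
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return dict(EMPTY_GRAPH)
    for key in EMPTY_GRAPH:  # guarantee all three keys exist
        data.setdefault(key, [])
    return data


prompt = extract_entity_and_graph("PostgreSQL is an open-source relational database.")
# reply = await llm_client.chat(prompt)  # placeholder LLM call
# graph = parse_graph_reply(reply)
```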

+ 23 - 0
applications/utils/chunks/topic_aware_chunking.py

@@ -52,6 +52,15 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
             "embeddings": sentences_embeddings,
         }
 
+    async def _book_chunk(self, sentence_list: List[str]) -> Dict[str, Any]:
+        """Boundary detection over pre-split sentences (no text splitting)."""
+        sentences_embeddings = await self._encode_batch(sentence_list)
+        boundaries = self.detect_boundaries(sentence_list, sentences_embeddings)
+        return {
+            "sentence_list": sentence_list,
+            "boundaries": boundaries,
+            "embeddings": sentences_embeddings,
+        }
+
 
 class TopicAwarePackerV1(TopicAwareChunker):
     def _pack_v1(
@@ -175,3 +184,17 @@ class TopicAwarePackerV2(TopicAwareChunker):
             text_type=text_type,
             dataset_id=dataset_id,
         )
+
+    async def chunk_books(
+        self, sentence_list: List[str], text_type: int, dataset_id: int
+    ) -> List[Chunk]:
+        raw_info = await self._book_chunk(sentence_list=sentence_list)
+        if not raw_info:
+            return []
+
+        return self._pack_v2(
+            sentence_list=raw_info["sentence_list"],
+            boundaries=raw_info["boundaries"],
+            embeddings=raw_info["embeddings"],
+            text_type=text_type,
+            dataset_id=dataset_id,
+        )
+
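
`chunk_books` skips the sentence-splitting step of the regular pipeline because the PDF extractor already yields discrete text blocks. A direct-call sketch, assuming the `TopicAwarePackerV2(doc_id)` constructor seen in `chunk_task.py` and a reachable embedding backend:

```python
import asyncio

from applications.utils.chunks import TopicAwarePackerV2


async def demo():
    packer = TopicAwarePackerV2("doc-demo")  # illustrative doc_id
    return await packer.chunk_books(
        sentence_list=["Chapter 1 introduces ...", "It then defines ..."],
        text_type=3,    # ChunkBooksTask.BOOK_PDF_TYPE
        dataset_id=17,  # ChunkBooksTask.BOOK_PDF_DATASET_ID
    )


# asyncio.run(demo())  # needs the embedding service to be reachable
```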

+ 9 - 1
applications/utils/mysql/__init__.py

@@ -1,7 +1,15 @@
+from .books import Books
 from .pool import DatabaseManager
 from .mapper import Dataset, ChatResult
 from .content_chunks import ContentChunks
 from .contents import Contents
 
 
-__all__ = ["Contents", "ContentChunks", "DatabaseManager", "Dataset", "ChatResult"]
+__all__ = [
+    "Contents",
+    "ContentChunks",
+    "DatabaseManager",
+    "Dataset",
+    "ChatResult",
+    "Books",
+]

+ 23 - 0
applications/utils/mysql/books.py

@@ -0,0 +1,23 @@
+from .base import BaseMySQLClient
+
+
+class Books(BaseMySQLClient):
+    async def get_books(self):
+        query = """
+            SELECT book_id, book_name, book_oss_path, extract_status
+            FROM books
+            WHERE status = 1;
+        """
+        return await self.pool.async_fetch(query=query)
+
+    async def get_book_extract_detail(self, book_id):
+        query = """
+            SELECT book_name, book_oss_path, extract_result
+            FROM books
+            WHERE book_id = %s;
+        """
+        return await self.pool.async_fetch(query=query, params=(book_id,))
+
+    async def update_book_chunk_status(self, book_id, ori_status, new_status):
+        query = """
+            UPDATE books SET chunk_status = %s
+            WHERE book_id = %s AND chunk_status = %s;
+        """
+        return await self.pool.async_save(
+            query=query, params=(new_status, book_id, ori_status)
+        )
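
The status UPDATE doubles as an optimistic lock: it only matches rows still in `ori_status`, so when several workers race on one book, a single UPDATE succeeds. This relies on `async_save` returning the affected-row count, an assumption consistent with `ChunkBooksTask.deal()` treating the result as truthy. A sketch:

```python
# Status codes are assumptions; the real constants (INIT_STATUS,
# PROCESSING_STATUS, ...) live on the task classes.
INIT, PROCESSING = 0, 1


async def try_claim(books, book_id: int) -> bool:
    """Compare-and-swap claim: True only for the worker that flips the row."""
    affected = await books.update_book_chunk_status(
        book_id=book_id, ori_status=INIT, new_status=PROCESSING
    )
    return bool(affected)  # 0 rows updated => already claimed or finished
```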

+ 11 - 1
routes/blueprint.py

@@ -10,7 +10,7 @@ from quart_cors import cors
 from applications.api import get_basic_embedding
 from applications.api import get_img_embedding
 from applications.async_task import AutoRechunkTask, BuildGraph
-from applications.async_task import ChunkEmbeddingTask, DeleteTask
+from applications.async_task import ChunkEmbeddingTask, DeleteTask, ChunkBooksTask
 from applications.config import (
     DEFAULT_MODEL,
     LOCAL_MODEL_CONFIG,
@@ -84,6 +84,16 @@ async def chunk():
     return jsonify({"doc_id": doc_id})
 
 
+@server_bp.route("/chunk_book", methods=["POST"])
+async def chunk_book():
+    body = await request.get_json()
+    resource = get_resource_manager()
+    doc_id = f"doc-{uuid.uuid4()}"
+    chunk_task = ChunkBooksTask(doc_id=doc_id, resource=resource)
+    doc_id = await chunk_task.deal(body)
+    return jsonify({"doc_id": doc_id})
+
+
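
A client-side sketch of the new endpoint; the host and port are assumptions about where the Quart app is served:

```python
import requests  # synchronous client purely for illustration

resp = requests.post(
    "http://127.0.0.1:8001/chunk_book",  # assumed host:port
    json={"book_id": 42},                # illustrative book_id
)
print(resp.json())  # {"doc_id": "doc-..."} once the background task is queued
```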
 @server_bp.route("/search", methods=["POST"])
 async def search():
     """