
pdf-chunking method

luojunhui · 1 day ago
Commit 325726dff0

+ 8 - 1
applications/async_task/__init__.py

@@ -1,7 +1,14 @@
 from .chunk_task import ChunkEmbeddingTask
+from .chunk_task import ChunkBooksTask
 from .delete_task import DeleteTask
 from .auto_rechunk_task import AutoRechunkTask
 from .build_graph import BuildGraph
 
 
-__all__ = ["ChunkEmbeddingTask", "DeleteTask", "AutoRechunkTask", "BuildGraph"]
+__all__ = [
+    "ChunkEmbeddingTask",
+    "DeleteTask",
+    "AutoRechunkTask",
+    "BuildGraph",
+    "ChunkBooksTask",
+]

+ 102 - 1
applications/async_task/chunk_task.py

@@ -6,7 +6,7 @@ from applications.api import get_basic_embedding
 from applications.utils.async_utils import run_tasks_with_asyncio_task_group
 from applications.utils.chunks import LLMClassifier, TopicAwarePackerV2
 from applications.utils.milvus import async_insert_chunk
-from applications.utils.mysql import ContentChunks, Contents
+from applications.utils.mysql import Books, ContentChunks, Contents
 from applications.utils.nlp import num_tokens
 from applications.config import Chunk, DEFAULT_MODEL
 from applications.config import ELASTIC_SEARCH_INDEX
@@ -17,6 +17,7 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
         super().__init__(doc_id)
         self.chunk_manager = None
         self.content_manager = None
+        self.book_manager = None
         self.mysql_client = resource.mysql_client
         self.milvus_client = resource.milvus_client
         self.es_client = resource.es_client
@@ -29,6 +30,7 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
     def init_processer(self):
         self.content_manager = Contents(self.mysql_client)
         self.chunk_manager = ContentChunks(self.mysql_client)
+        self.book_manager = Books(self.mysql_client)
 
     async def _chunk_each_content(self, doc_id: str, data: dict) -> List[Chunk]:
         title, text = data.get("title", "").strip(), data["text"].strip()
@@ -260,3 +262,102 @@ class ChunkEmbeddingTask(TopicAwarePackerV2):
 
         asyncio.create_task(_process())
         return self.doc_id
+
+
+class ChunkBooksTask(ChunkEmbeddingTask):
+    """图书类型分块任务"""
+
+    BOOK_PDF_DATASET_ID = 17
+    BOOK_PDF_TYPE = 3
+
+    async def _process_each_book(self, book_id):
+        result = await self.book_manager.get_book_extract_detail(book_id=book_id)
+        if not result:
+            return []
+        extract_result = result[0]["extract_result"]
+        book_name = result[0]["book_name"]
+        book_oss_path = result[0]["book_oss_path"]
+        book_texts = [
+            i["text"] for i in json.loads(extract_result) if i["type"] == "text"
+        ]
+
+        # first insert into contents
+        flag = await self.content_manager.insert_content(
+            self.doc_id,
+            book_oss_path,
+            self.BOOK_PDF_TYPE,
+            book_name,
+            self.BOOK_PDF_DATASET_ID,
+            ext=None,
+        )
+        if not flag:
+            return []
+        else:
+            raw_chunks = await self.chunk_books(
+                sentence_list=book_texts,
+                text_type=self.BOOK_PDF_TYPE,
+                dataset_id=self.BOOK_PDF_DATASET_ID,
+            )
+            if not raw_chunks:
+                await self.content_manager.update_content_status(
+                    doc_id=self.doc_id,
+                    ori_status=self.INIT_STATUS,
+                    new_status=self.FAILED_STATUS,
+                )
+                return []
+
+            await self.content_manager.update_content_status(
+                doc_id=self.doc_id,
+                ori_status=self.INIT_STATUS,
+                new_status=self.PROCESSING_STATUS,
+            )
+            return raw_chunks
+
+    async def deal(self, data):
+        # Tolerate a missing or non-JSON request body.
+        book_id = (data or {}).get("book_id")
+        if not book_id:
+            return {"error": "Book id should not be None"}
+
+        self.init_processer()
+
+        # LOCK
+        acquire_lock = await self.book_manager.update_book_chunk_status(
+            book_id=book_id,
+            ori_status=self.INIT_STATUS,
+            new_status=self.PROCESSING_STATUS,
+        )
+
+        if not acquire_lock:
+            return {"info": "book is already being processed or has finished"}
+
+        async def _process():
+            chunks = await self._process_each_book(book_id)
+            if not chunks:
+                return
+
+            await run_tasks_with_asyncio_task_group(
+                task_list=chunks,
+                handler=self.save_each_chunk,
+                description="process book chunks",
+                unit="chunk",
+                max_concurrency=10,
+            )
+
+            await self.content_manager.update_content_status(
+                doc_id=self.doc_id,
+                ori_status=self.PROCESSING_STATUS,
+                new_status=self.FINISHED_STATUS,
+            )
+
+            await self.book_manager.update_book_chunk_status(
+                book_id=book_id,
+                ori_status=self.PROCESSING_STATUS,
+                new_status=self.FINISHED_STATUS
+            )
+
+        asyncio.create_task(_process())
+        return self.doc_id
+
+
+__all__ = ["ChunkEmbeddingTask", "ChunkBooksTask"]
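
For reference, `ChunkBooksTask.deal()` is fire-and-forget: it claims the book row, schedules `_process()` with `asyncio.create_task`, and returns the `doc_id` immediately. Below is a minimal driver sketch; the `get_resource_manager` import path and the `book_id` value are assumptions, mirroring how `routes/blueprint.py` wires the task up.

```python
import asyncio
import uuid

from applications.async_task import ChunkBooksTask
# Import path is an assumption; routes/blueprint.py calls get_resource_manager().
from applications.resource import get_resource_manager


async def main():
    resource = get_resource_manager()
    task = ChunkBooksTask(doc_id=f"doc-{uuid.uuid4()}", resource=resource)
    # deal() returns at once; chunking continues on the event loop, so the
    # loop must stay alive (in production the Quart app keeps it running).
    doc_id = await task.deal({"book_id": 42})  # illustrative book_id
    print(doc_id)


asyncio.run(main())
```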

+ 1 - 0
applications/prompts/__init__.py

@@ -0,0 +1 @@
+from .build_graph import extract_entity_and_graph

+ 63 - 0
applications/prompts/build_graph.py

@@ -0,0 +1,63 @@
+import json
+
+
+def extract_entity_and_graph(text: str) -> str:
+    """
+    Generic knowledge-extraction prompt builder.
+    Extracts entities, relations, and concepts from arbitrary input text.
+    The prompt requests JSON output and JSON-escapes the input, so LLM calls
+    hit no format conflicts or escaping issues.
+    """
+    safe_text = json.dumps(text, ensure_ascii=False)
+    prompt = f"""
+### Role
+You are a professional knowledge-extraction assistant. Identify the key information in the input text and return a structured result.
+
+### Input text:
+{safe_text}
+
+### Extraction targets
+Extract the following three kinds of information:
+1. **entities**
+   - Concrete objects appearing in the text: people, organizations, places, technologies, products, events, and so on.
+   - Each entity must include: name, type, aliases, description.
+   - Extract entities only from the main, essential content; skip supplementary material such as examples and samples.
+
+2. **relations**
+   - Semantic links between entities.
+   - Common relation types include (but are not limited to): belongs-to, affiliated-with, depends-on, controls, cooperates-with, located-in, applied-to, develops, influences, defined-by, used-for.
+   - Each relation must include: source, target, relation_type, evidence, confidence.
+3. **concepts**
+   - Topics, core technologies, ideas, academic fields, and topical keywords the text covers.
+
+### Output format
+Output exactly the JSON structure below. Do not output any explanatory text, comments, or Markdown code blocks.
+
+{{
+  "entities": [
+    {{
+      "name": "string", "type": "string", "aliases": ["string"], "description": "string"
+    }}
+  ],
+  "relations": [
+    {{
+    "source": "string", "target": "string", "relation_type": "string", "evidence": "string", "confidence": 0.0
+    }}
+  ],
+  "concepts": ["string"]
+}}
+
+### Output rules
+1. Output strictly valid JSON that can be parsed directly;
+2. Every field must be present, even if it is an empty array;
+3. If nothing is detected, output:
+   {{
+     "entities": [],
+     "relations": [],
+     "concepts": []
+   }};
+4. `confidence` is a decimal between 0.0 and 1.0;
+5. Do not infer entities or relations that are absent from the text;
+6. When information is ambiguous, keep descriptions neutral;
+7. The output must contain no explanatory text, comments, examples, or Markdown.
+"""
+    return prompt
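
Because the prompt demands bare JSON, callers still need to defend against models that wrap replies in code fences or drop keys. A minimal parsing sketch, assuming the LLM call itself happens elsewhere (`llm_client.chat` is a placeholder, not a real API):

```python
import json

from applications.prompts import extract_entity_and_graph

EMPTY_GRAPH = {"entities": [], "relations": [], "concepts": []}
FENCE = "`" * 3  # literal triple backtick, built indirectly to keep this sample fence-safe


def parse_graph_reply(raw: str) -> dict:
    """Best-effort parse of the model's reply into the expected schema."""
    # Strip a Markdown fence in case the model ignored the no-Markdown rule.
    cleaned = raw.strip().removeprefix(FENCE + "json").removesuffix(FENCE).strip()
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return dict(EMPTY_GRAPH)
    for key in EMPTY_GRAPH:  # guarantee all three keys exist
        data.setdefault(key, [])
    return data


prompt = extract_entity_and_graph("PostgreSQL is an open-source relational database.")
# reply = await llm_client.chat(prompt)  # placeholder LLM call
# graph = parse_graph_reply(reply)
```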

+ 23 - 0
applications/utils/chunks/topic_aware_chunking.py

@@ -52,6 +52,15 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
             "embeddings": sentences_embeddings,
         }
 
+    async def _book_chunk(self, sentence_list: List[str]) -> Dict[str, Any]:
+        """Boundary detection over pre-split sentences (no text splitting)."""
+        sentences_embeddings = await self._encode_batch(sentence_list)
+        boundaries = self.detect_boundaries(sentence_list, sentences_embeddings)
+        return {
+            "sentence_list": sentence_list,
+            "boundaries": boundaries,
+            "embeddings": sentences_embeddings,
+        }
+
 
 class TopicAwarePackerV1(TopicAwareChunker):
     def _pack_v1(
@@ -175,3 +184,17 @@ class TopicAwarePackerV2(TopicAwareChunker):
             text_type=text_type,
             dataset_id=dataset_id,
         )
+
+    async def chunk_books(
+        self, sentence_list: List[str], text_type: int, dataset_id: int
+    ) -> List[Chunk]:
+        raw_info = await self._book_chunk(sentence_list=sentence_list)
+        if not raw_info:
+            return []
+
+        return self._pack_v2(
+            sentence_list=raw_info["sentence_list"],
+            boundaries=raw_info["boundaries"],
+            embeddings=raw_info["embeddings"],
+            text_type=text_type,
+            dataset_id=dataset_id,
+        )
+
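
`chunk_books` skips the sentence-splitting step of the regular pipeline because the PDF extractor already yields discrete text blocks. A direct-call sketch, assuming the `TopicAwarePackerV2(doc_id)` constructor seen in `chunk_task.py` and a reachable embedding backend:

```python
import asyncio

from applications.utils.chunks import TopicAwarePackerV2


async def demo():
    packer = TopicAwarePackerV2("doc-demo")  # illustrative doc_id
    return await packer.chunk_books(
        sentence_list=["Chapter 1 introduces ...", "It then defines ..."],
        text_type=3,    # ChunkBooksTask.BOOK_PDF_TYPE
        dataset_id=17,  # ChunkBooksTask.BOOK_PDF_DATASET_ID
    )


# asyncio.run(demo())  # needs the embedding service to be reachable
```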

+ 9 - 1
applications/utils/mysql/__init__.py

@@ -1,7 +1,15 @@
+from .books import Books
 from .pool import DatabaseManager
 from .mapper import Dataset, ChatResult
 from .content_chunks import ContentChunks
 from .contents import Contents
 
 
-__all__ = ["Contents", "ContentChunks", "DatabaseManager", "Dataset", "ChatResult"]
+__all__ = [
+    "Contents",
+    "ContentChunks",
+    "DatabaseManager",
+    "Dataset",
+    "ChatResult",
+    "Books",
+]

+ 23 - 0
applications/utils/mysql/books.py

@@ -0,0 +1,23 @@
+from .base import BaseMySQLClient
+
+
+class Books(BaseMySQLClient):
+    async def get_books(self):
+        query = """
+            SELECT book_id, book_name, book_oss_path, extract_status
+            FROM books
+            WHERE status = 1;
+        """
+        return await self.pool.async_fetch(query=query)
+
+    async def get_book_extract_detail(self, book_id):
+        query = """
+            SELECT book_name, book_oss_path, extract_result
+            FROM books
+            WHERE book_id = %s;
+        """
+        return await self.pool.async_fetch(query=query, params=(book_id,))
+
+    async def update_book_chunk_status(self, book_id, ori_status, new_status):
+        query = """
+            UPDATE books SET chunk_status = %s
+            WHERE book_id = %s AND chunk_status = %s;
+        """
+        return await self.pool.async_save(
+            query=query, params=(new_status, book_id, ori_status)
+        )
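
The status UPDATE doubles as an optimistic lock: it only matches rows still in `ori_status`, so when several workers race on one book, a single UPDATE succeeds. This relies on `async_save` returning the affected-row count, an assumption consistent with `ChunkBooksTask.deal()` treating the result as truthy. A sketch:

```python
# Status codes are assumptions; the real constants (INIT_STATUS,
# PROCESSING_STATUS, ...) live on the task classes.
INIT, PROCESSING = 0, 1


async def try_claim(books, book_id: int) -> bool:
    """Compare-and-swap claim: True only for the worker that flips the row."""
    affected = await books.update_book_chunk_status(
        book_id=book_id, ori_status=INIT, new_status=PROCESSING
    )
    return bool(affected)  # 0 rows updated => already claimed or finished
```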

+ 11 - 1
routes/blueprint.py

@@ -10,7 +10,7 @@ from quart_cors import cors
 from applications.api import get_basic_embedding
 from applications.api import get_img_embedding
 from applications.async_task import AutoRechunkTask, BuildGraph
-from applications.async_task import ChunkEmbeddingTask, DeleteTask
+from applications.async_task import ChunkEmbeddingTask, DeleteTask, ChunkBooksTask
 from applications.config import (
     DEFAULT_MODEL,
     LOCAL_MODEL_CONFIG,
@@ -84,6 +84,16 @@ async def chunk():
     return jsonify({"doc_id": doc_id})
 
 
+@server_bp.route("/chunk_book", methods=["POST"])
+async def chunk_book():
+    body = await request.get_json()
+    resource = get_resource_manager()
+    doc_id = f"doc-{uuid.uuid4()}"
+    chunk_task = ChunkBooksTask(doc_id=doc_id, resource=resource)
+    doc_id = await chunk_task.deal(body)
+    return jsonify({"doc_id": doc_id})
+
+
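
A client-side sketch of the new endpoint; the host and port are assumptions about where the Quart app is served:

```python
import requests  # synchronous client purely for illustration

resp = requests.post(
    "http://127.0.0.1:8001/chunk_book",  # assumed host:port
    json={"book_id": 42},                # illustrative book_id
)
print(resp.json())  # {"doc_id": "doc-..."} once the background task is queued
```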
 @server_bp.route("/search", methods=["POST"])
 async def search():
     """