Prechádzať zdrojové kódy

pdf-chunking-方法

luojunhui pred 1 dňom
rodič
commit
192a0c397d

+ 11 - 9
applications/async_task/chunk_task.py

@@ -291,7 +291,11 @@ class ChunkBooksTask(ChunkEmbeddingTask):
         if not flag:
             return []
         else:
-            raw_chunks = await self.chunk_books(sentence_list=book_texts, text_type=self.BOOK_PDF_TYPE, dataset_id=self.BOOK_PDF_DATASET_ID)
+            raw_chunks = await self.chunk_books(
+                sentence_list=book_texts,
+                text_type=self.BOOK_PDF_TYPE,
+                dataset_id=self.BOOK_PDF_DATASET_ID,
+            )
             if not raw_chunks:
                 await self.content_manager.update_content_status(
                     doc_id=self.doc_id,
@@ -315,14 +319,12 @@ class ChunkBooksTask(ChunkEmbeddingTask):
         self.init_processer()
         # LOCK
         acquire_lock = await self.book_manager.update_book_chunk_status(
-                book_id=book_id,
-                ori_status=self.INIT_STATUS,
-                new_status=self.PROCESSING_STATUS
-            )
+            book_id=book_id,
+            ori_status=self.INIT_STATUS,
+            new_status=self.PROCESSING_STATUS,
+        )
         if not acquire_lock:
-            return {
-                "info": "book is processing or processed"
-            }
+            return {"info": "book is processing or processed"}
 
         async def _process():
             chunks = await self._process_each_book(book_id)
@@ -350,7 +352,7 @@ class ChunkBooksTask(ChunkEmbeddingTask):
             await self.book_manager.update_book_chunk_status(
                 book_id=book_id,
                 ori_status=self.PROCESSING_STATUS,
-                new_status=self.FINISHED_STATUS
+                new_status=self.FINISHED_STATUS,
             )
 
         asyncio.create_task(_process())

+ 0 - 1
applications/config/base_chunk.py

@@ -34,4 +34,3 @@ class ChunkerConfig:
     enable_kg: bool = True
     topic_purity_floor: float = 0.8
     kg_topk: int = 3
-

+ 6 - 2
applications/utils/chunks/topic_aware_chunking.py

@@ -185,7 +185,9 @@ class TopicAwarePackerV2(TopicAwareChunker):
             dataset_id=dataset_id,
         )
 
-    async def chunk_books(self, sentence_list: List[str], text_type: int, dataset_id: int) -> List[Chunk]:
+    async def chunk_books(
+        self, sentence_list: List[str], text_type: int, dataset_id: int
+    ) -> List[Chunk]:
         raw_info = await self._book_chunk(sentence_list=sentence_list)
         if not raw_info:
             return []
@@ -198,7 +200,9 @@ class TopicAwarePackerV2(TopicAwareChunker):
             dataset_id=dataset_id,
         )
 
-    async def chunk_books_raw(self, sentence_list: List[str], text_type: int, dataset_id: int):
+    async def chunk_books_raw(
+        self, sentence_list: List[str], text_type: int, dataset_id: int
+    ):
         chunks = []
         for index, text in enumerate(sentence_list, 1):
             chunks.append(

+ 3 - 1
applications/utils/mysql/books.py

@@ -20,4 +20,6 @@ class Books(BaseMySQLClient):
         query = """
             UPDATE books SET chunk_status = %s WHERE book_id = %s and chunk_status = %s;
         """
-        return await self.pool.async_save(query=query, params=(new_status, book_id, ori_status))
+        return await self.pool.async_save(
+            query=query, params=(new_status, book_id, ori_status)
+        )

+ 9 - 8
applications/utils/nlp/boundary_detector.py

@@ -70,7 +70,7 @@ class BoundaryDetector(ChunkerConfig):
         return boundaries
 
     def detect_boundaries_v2(
-            self, sentence_list: List[str], embs: np.ndarray, debug: bool = False
+        self, sentence_list: List[str], embs: np.ndarray, debug: bool = False
     ) -> List[int]:
         """
         约束:相邻 boundary(含开头到第一个 boundary)之间的句子数 ∈ [3, 10]
@@ -89,11 +89,11 @@ class BoundaryDetector(ChunkerConfig):
         adj_scores = np.zeros_like(cut_scores)
         for i in range(len(cut_scores)):
             sent_to_check = sentence_list[i] if i < n else sentence_list[-1]
-            snippet = (sent_to_check[-20:] if sent_to_check else "")
+            snippet = sent_to_check[-20:] if sent_to_check else ""
             adj_scores[i] = (
-                    cut_scores[i]
-                    + self.turn_signal(snippet)
-                    + self.figure_signal(sent_to_check)
+                cut_scores[i]
+                + self.turn_signal(snippet)
+                + self.figure_signal(sent_to_check)
             )
 
         # --- 3-10 句强约束切分 ---
@@ -158,12 +158,14 @@ class BoundaryDetector(ChunkerConfig):
 
             if lower <= upper:
                 # 在允许区间里找 adj_scores 最高的位置
-                window = adj_scores[lower: upper + 1]
+                window = adj_scores[lower : upper + 1]
                 j = int(np.argmax(window)) + lower
                 if j != boundaries[-1]:
                     boundaries[-1] = j
                     if debug:
-                        print(f"[fix-tail] move last boundary -> {j}, tail_len={n - 1 - j}")
+                        print(
+                            f"[fix-tail] move last boundary -> {j}, tail_len={n - 1 - j}"
+                        )
             else:
                 # 没有可行区间:退化为合并尾段(删掉最后一个 boundary)
                 dropped = boundaries.pop()
@@ -171,4 +173,3 @@ class BoundaryDetector(ChunkerConfig):
                     print(f"[fix-tail] drop last boundary {dropped} to avoid tiny tail")
 
         return boundaries
-