1 deň pred · 192a0c397d
--- a/applications/async_task/chunk_task.py
+++ b/applications/async_task/chunk_task.py
@@ -291,7 +291,11 @@ class ChunkBooksTask(ChunkEmbeddingTask):
 
				         if not flag:
			
 
				             return []
			
 
				         else:
			
 
				-            raw_chunks = await self.chunk_books(sentence_list=book_texts, text_type=self.BOOK_PDF_TYPE, dataset_id=self.BOOK_PDF_DATASET_ID)
			
 
				+            raw_chunks = await self.chunk_books(
			
 
				+                sentence_list=book_texts,
			
 
				+                text_type=self.BOOK_PDF_TYPE,
			
 
				+                dataset_id=self.BOOK_PDF_DATASET_ID,
			
 
				+            )
			
 
				             if not raw_chunks:
			
 
				                 await self.content_manager.update_content_status(
			
 
				                     doc_id=self.doc_id,
			
@@ -315,14 +319,12 @@ class ChunkBooksTask(ChunkEmbeddingTask):
 
				         self.init_processer()
			
 
				         # LOCK
			
 
				         acquire_lock = await self.book_manager.update_book_chunk_status(
			
 
				-                book_id=book_id,
			
 
				-                ori_status=self.INIT_STATUS,
			
 
				-                new_status=self.PROCESSING_STATUS
			
 
				-            )
			
 
				+            book_id=book_id,
			
 
				+            ori_status=self.INIT_STATUS,
			
 
				+            new_status=self.PROCESSING_STATUS,
			
 
				+        )
			
 
				         if not acquire_lock:
			
 
				-            return {
			
 
				-                "info": "book is processing or processed"
			
 
				-            }
			
 
				+            return {"info": "book is processing or processed"}
			
 
				 
			
 
				         async def _process():
			
 
				             chunks = await self._process_each_book(book_id)
			
@@ -350,7 +352,7 @@ class ChunkBooksTask(ChunkEmbeddingTask):
 
				             await self.book_manager.update_book_chunk_status(
			
 
				                 book_id=book_id,
			
 
				                 ori_status=self.PROCESSING_STATUS,
			
 
				-                new_status=self.FINISHED_STATUS
			
 
				+                new_status=self.FINISHED_STATUS,
			
 
				             )
			
 
				 
			
 
				         asyncio.create_task(_process())
			
--- a/applications/config/base_chunk.py
+++ b/applications/config/base_chunk.py
@@ -34,4 +34,3 @@ class ChunkerConfig:
 
				     enable_kg: bool = True
			
 
				     topic_purity_floor: float = 0.8
			
 
				     kg_topk: int = 3
			
 
				-
			
--- a/applications/utils/chunks/topic_aware_chunking.py
+++ b/applications/utils/chunks/topic_aware_chunking.py
@@ -185,7 +185,9 @@ class TopicAwarePackerV2(TopicAwareChunker):
 
				             dataset_id=dataset_id,
			
 
				         )
			
 
				 
			
 
				-    async def chunk_books(self, sentence_list: List[str], text_type: int, dataset_id: int) -> List[Chunk]:
			
 
				+    async def chunk_books(
			
 
				+        self, sentence_list: List[str], text_type: int, dataset_id: int
			
 
				+    ) -> List[Chunk]:
			
 
				         raw_info = await self._book_chunk(sentence_list=sentence_list)
			
 
				         if not raw_info:
			
 
				             return []
			
@@ -198,7 +200,9 @@ class TopicAwarePackerV2(TopicAwareChunker):
 
				             dataset_id=dataset_id,
			
 
				         )
			
 
				 
			
 
				-    async def chunk_books_raw(self, sentence_list: List[str], text_type: int, dataset_id: int):
			
 
				+    async def chunk_books_raw(
			
 
				+        self, sentence_list: List[str], text_type: int, dataset_id: int
			
 
				+    ):
			
 
				         chunks = []
			
 
				         for index, text in enumerate(sentence_list, 1):
			
 
				             chunks.append(
			
--- a/applications/utils/mysql/books.py
+++ b/applications/utils/mysql/books.py
@@ -20,4 +20,6 @@ class Books(BaseMySQLClient):
 
				         query = """
			
 
				             UPDATE books SET chunk_status = %s WHERE book_id = %s and chunk_status = %s;
			
 
				         """
			
 
				-        return await self.pool.async_save(query=query, params=(new_status, book_id, ori_status))
			
 
				+        return await self.pool.async_save(
			
 
				+            query=query, params=(new_status, book_id, ori_status)
			
 
				+        )
			
--- a/applications/utils/nlp/boundary_detector.py
+++ b/applications/utils/nlp/boundary_detector.py
@@ -70,7 +70,7 @@ class BoundaryDetector(ChunkerConfig):
 
				         return boundaries
			
 
				 
			
 
				     def detect_boundaries_v2(
			
 
				-            self, sentence_list: List[str], embs: np.ndarray, debug: bool = False
			
 
				+        self, sentence_list: List[str], embs: np.ndarray, debug: bool = False
			
 
				     ) -> List[int]:
			
 
				         """
			
 
				         约束：相邻 boundary（含开头到第一个 boundary）之间的句子数 ∈ [3, 10]
			
@@ -89,11 +89,11 @@ class BoundaryDetector(ChunkerConfig):
 
				         adj_scores = np.zeros_like(cut_scores)
			
 
				         for i in range(len(cut_scores)):
			
 
				             sent_to_check = sentence_list[i] if i < n else sentence_list[-1]
			
 
				-            snippet = (sent_to_check[-20:] if sent_to_check else "")
			
 
				+            snippet = sent_to_check[-20:] if sent_to_check else ""
			
 
				             adj_scores[i] = (
			
 
				-                    cut_scores[i]
			
 
				-                    + self.turn_signal(snippet)
			
 
				-                    + self.figure_signal(sent_to_check)
			
 
				+                cut_scores[i]
			
 
				+                + self.turn_signal(snippet)
			
 
				+                + self.figure_signal(sent_to_check)
			
 
				             )
			
 
				 
			
 
				         # --- 3-10 句强约束切分 ---
			
@@ -158,12 +158,14 @@ class BoundaryDetector(ChunkerConfig):
 
				 
			
 
				             if lower <= upper:
			
 
				                 # 在允许区间里找 adj_scores 最高的位置
			
 
				-                window = adj_scores[lower: upper + 1]
			
 
				+                window = adj_scores[lower : upper + 1]
			
 
				                 j = int(np.argmax(window)) + lower
			
 
				                 if j != boundaries[-1]:
			
 
				                     boundaries[-1] = j
			
 
				                     if debug:
			
 
				-                        print(f"[fix-tail] move last boundary -> {j}, tail_len={n - 1 - j}")
			
 
				+                        print(
			
 
				+                            f"[fix-tail] move last boundary -> {j}, tail_len={n - 1 - j}"
			
 
				+                        )
			
 
				             else:
			
 
				                 # 没有可行区间：退化为合并尾段（删掉最后一个 boundary）
			
 
				                 dropped = boundaries.pop()
			
@@ -171,4 +173,3 @@ class BoundaryDetector(ChunkerConfig):
 
				                     print(f"[fix-tail] drop last boundary {dropped} to avoid tiny tail")
			
 
				 
			
 
				         return boundaries
			
 
				-