Ver Fonte

chunk策略优化

luojunhui há 2 semanas
pai
commit
e17488f8e0

+ 2 - 2
applications/utils/chunks/topic_aware_chunking.py

@@ -39,7 +39,7 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
         sentences_embeddings = await self._encode_batch(sentence_list)
         boundaries = self.detect_boundaries(sentence_list, sentences_embeddings)
         return {
-            "sentences_list": sentence_list,
+            "sentence_list": sentence_list,
             "boundaries": boundaries,
             "embeddings": sentences_embeddings,
         }
@@ -130,7 +130,7 @@ class TopicAwarePackerV2(TopicAwareChunker):
             text = "".join(seg)
             tokens = num_tokens(text)
             # 如果 token 过短,则暂时不用
-            status = 2 if tokens < self.max_tokens else 1
+            status = 2 if tokens < self.min_tokens else 1
             chunks.append(
                 Chunk(
                     doc_id=self.doc_id,

+ 8 - 5
applications/utils/mysql/mapper.py

@@ -8,8 +8,10 @@ class TaskConst:
     FINISHED_STATUS = 2
     FAILED_STATUS = 3
 
+    CHUNK_USEFUL_STATUS = 1
 
-class BaseMySQLClient:
+
+class BaseMySQLClient(TaskConst):
     def __init__(self, pool):
         self.pool = pool
 
@@ -77,8 +79,8 @@ class ContentChunks(BaseMySQLClient):
     async def insert_chunk(self, chunk: Chunk) -> int:
         query = """
             INSERT IGNORE INTO content_chunks
-                (chunk_id, doc_id, text, tokens, topic_purity, text_type, dataset_id) 
-                VALUES (%s, %s, %s, %s, %s, %s, %s);
+                (chunk_id, doc_id, text, tokens, topic_purity, text_type, dataset_id, status) 
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
         """
         return await self.pool.async_save(
             query=query,
@@ -90,6 +92,7 @@ class ContentChunks(BaseMySQLClient):
                 chunk.topic_purity,
                 chunk.text_type,
                 chunk.dataset_id,
+                chunk.status,
             ),
         )
 
@@ -97,10 +100,10 @@ class ContentChunks(BaseMySQLClient):
         query = """
             UPDATE content_chunks
             SET chunk_status = %s 
-            WHERE doc_id = %s AND chunk_id = %s AND chunk_status = %s;
+            WHERE doc_id = %s AND chunk_id = %s AND chunk_status = %s and status = %s;
         """
         return await self.pool.async_save(
-            query=query, params=(new_status, doc_id, chunk_id, ori_status)
+            query=query, params=(new_status, doc_id, chunk_id, ori_status, self.CHUNK_USEFUL_STATUS)
         )
 
     async def update_embedding_status(self, doc_id, chunk_id, ori_status, new_status):