@@ -2,12 +2,15 @@ import asyncio
 import uuid
 from typing import List
 
+from applications.api import get_basic_embedding
+from applications.utils.async_utils import run_tasks_with_asyncio_task_group
 from applications.utils.mysql import ContentChunks, Contents
 from applications.utils.chunks import TopicAwareChunker, LLMClassifier
-from applications.config import DEFAULT_MODEL, Chunk, ChunkerConfig
+from applications.utils.milvus import async_insert_chunk
+from applications.config import Chunk, ChunkerConfig, DEFAULT_MODEL
 
 
-class ChunkTask(TopicAwareChunker):
+class ChunkEmbeddingTask(TopicAwareChunker):
     def __init__(self, mysql_pool, vector_pool, cfg: ChunkerConfig):
         super().__init__(cfg)
         self.content_chunk_processor = None
@@ -16,6 +19,10 @@ class ChunkTask(TopicAwareChunker):
         self.vector_pool = vector_pool
         self.classifier = LLMClassifier()
 
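+    # NOTE: routing every embedding call through this one helper keeps the model
+    # choice (DEFAULT_MODEL) in a single place; dev=True presumably targets the
+    # dev environment of the embedding service.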
+    @staticmethod
+    async def get_embedding_list(text: str) -> List:
+        return await get_basic_embedding(text=text, model=DEFAULT_MODEL, dev=True)
+
     def init_processer(self):
         self.contents_processor = Contents(self.mysql_pool)
         self.content_chunk_processor = ContentChunks(self.mysql_pool)
@@ -28,14 +35,17 @@ class ChunkTask(TopicAwareChunker):
         raw_chunks = await self.chunk(text)
         if not raw_chunks:
             await self.contents_processor.update_content_status(
-                doc_id=doc_id, ori_status=self.INIT_STATUS, new_status=self.FAILED_STATUS
+                doc_id=doc_id,
+                ori_status=self.INIT_STATUS,
+                new_status=self.FAILED_STATUS,
             )
             return []
 
         affected_rows = await self.contents_processor.update_content_status(
-            doc_id=doc_id, ori_status=self.INIT_STATUS, new_status=self.PROCESSING_STATUS
+            doc_id=doc_id,
+            ori_status=self.INIT_STATUS,
+            new_status=self.PROCESSING_STATUS,
         )
-        print(affected_rows)
         return raw_chunks
 
     async def process_each_chunk(self, chunk: Chunk):
@@ -45,7 +55,10 @@ class ChunkTask(TopicAwareChunker):
             return
 
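+        # The conditional status flip below doubles as an optimistic lock: the
+        # UPDATE matches only rows still in INIT_STATUS, so concurrent workers
+        # cannot both claim the same chunk.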
         acquire_lock = await self.content_chunk_processor.update_chunk_status(
-            doc_id=chunk.doc_id, chunk_id=chunk.chunk_id, ori_status=self.INIT_STATUS, new_status=self.PROCESSING_STATUS
+            doc_id=chunk.doc_id,
+            chunk_id=chunk.chunk_id,
+            ori_status=self.INIT_STATUS,
+            new_status=self.PROCESSING_STATUS,
         )
         if not acquire_lock:
             return
@@ -53,17 +66,78 @@ class ChunkTask(TopicAwareChunker):
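+        # classify_chunk is expected to fill in topic, domain, summary, keywords,
+        # concepts and questions; those fields feed the Milvus payload below.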
         completion = await self.classifier.classify_chunk(chunk)
         if not completion:
             await self.content_chunk_processor.update_chunk_status(
-                doc_id=chunk.doc_id, chunk_id=chunk.chunk_id, ori_status=self.PROCESSING_STATUS, new_status=self.FAILED_STATUS
+                doc_id=chunk.doc_id,
+                chunk_id=chunk.chunk_id,
+                ori_status=self.PROCESSING_STATUS,
+                new_status=self.FAILED_STATUS,
             )
+            return
 
         update_flag = await self.content_chunk_processor.set_chunk_result(
-            chunk=completion, new_status=self.FINISHED_STATUS, ori_status=self.PROCESSING_STATUS
+            chunk=completion,
+            ori_status=self.PROCESSING_STATUS,
+            new_status=self.FINISHED_STATUS,
         )
         if not update_flag:
             await self.content_chunk_processor.update_chunk_status(
-                doc_id=chunk.doc_id, chunk_id=chunk.chunk_id, ori_status=self.PROCESSING_STATUS, new_status=self.FAILED_STATUS
+                doc_id=chunk.doc_id,
+                chunk_id=chunk.chunk_id,
+                ori_status=self.PROCESSING_STATUS,
+                new_status=self.FAILED_STATUS,
             )
+            return
+
+        await self.save_to_milvus(completion)
+
+    async def save_to_milvus(self, chunk: Chunk):
+        """
+        Embed a single classified chunk and store it, with its metadata, in Milvus.
+
+        :param chunk: each single chunk
+        :return:
+        """
+        # grab the embedding lock for this chunk via a conditional status flip
+        acquire_lock = await self.content_chunk_processor.update_embedding_status(
+            doc_id=chunk.doc_id,
+            chunk_id=chunk.chunk_id,
+            new_status=self.PROCESSING_STATUS,
+            ori_status=self.INIT_STATUS,
+        )
+        if not acquire_lock:
+            print(f"failed to acquire the embedding lock for chunk {chunk.doc_id}-{chunk.chunk_id}")
+            return
+        try:
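+            # One embedding per retrieval surface (raw text, summary, generated
+            # questions), so Milvus can match a query against any of the three.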
+            data = {
+                "doc_id": chunk.doc_id,
+                "chunk_id": chunk.chunk_id,
+                "vector_text": await self.get_embedding_list(chunk.text),
+                "vector_summary": await self.get_embedding_list(chunk.summary),
+                "vector_questions": await self.get_embedding_list(
+                    ",".join(chunk.questions)
+                ),
+                "topic": chunk.topic,
+                "domain": chunk.domain,
+                "task_type": chunk.task_type,
+                "summary": chunk.summary,
+                "keywords": chunk.keywords,
+                "concepts": chunk.concepts,
+                "questions": chunk.questions,
+                "topic_purity": chunk.topic_purity,
+                "tokens": chunk.tokens,
+            }
+            await async_insert_chunk(self.vector_pool, data)
+            await self.content_chunk_processor.update_embedding_status(
+                doc_id=chunk.doc_id,
+                chunk_id=chunk.chunk_id,
+                ori_status=self.PROCESSING_STATUS,
+                new_status=self.FINISHED_STATUS,
+            )
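+        # Failure path: flip the chunk back to FAILED instead of leaving it in
+        # PROCESSING, so the row stays visible for a later retry.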
+        except Exception as e:
+            await self.content_chunk_processor.update_embedding_status(
+                doc_id=chunk.doc_id,
+                chunk_id=chunk.chunk_id,
+                ori_status=self.PROCESSING_STATUS,
+                new_status=self.FAILED_STATUS,
+            )
+            print(f"failed to insert into the vector database: {e}")
 
     async def deal(self, data):
         text = data.get("text")
@@ -78,20 +152,19 @@ class ChunkTask(TopicAwareChunker):
             if not chunks:
                 return
 
-            # start processing in batches
-            async with asyncio.TaskGroup() as tg:
-                for chunk in chunks:
-                    tg.create_task(self.process_each_chunk(chunk))
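+            # Same fan-out as the removed TaskGroup, but the shared helper is
+            # expected to cap in-flight tasks (max_concurrency=10) so a single
+            # article cannot flood the embedding API.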
+            await run_tasks_with_asyncio_task_group(
+                task_list=chunks,
+                handler=self.process_each_chunk,
+                description="process chunks for a single article",
+                unit="chunk",
+                max_concurrency=10,
+            )
 
             await self.contents_processor.update_content_status(
-                doc_id=doc_id, ori_status=self.PROCESSING_STATUS, new_status=self.FINISHED_STATUS
+                doc_id=doc_id,
+                ori_status=self.PROCESSING_STATUS,
+                new_status=self.FINISHED_STATUS,
             )
 
-        await _process()
-        # asyncio.create_task(_process())
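+        # Fire-and-forget: deal() now returns the doc_id immediately while the
+        # pipeline runs in a background task (asyncio keeps only a weak reference
+        # to tasks, so retaining the create_task handle would be safer).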
+        asyncio.create_task(_process())
         return doc_id
-
-
-
-
-