@@ -1,23 +1,24 @@
 import asyncio
-import uuid
 from typing import List

 from applications.api import get_basic_embedding
 from applications.utils.async_utils import run_tasks_with_asyncio_task_group
-from applications.utils.mysql import ContentChunks, Contents
 from applications.utils.chunks import TopicAwareChunker, LLMClassifier
 from applications.utils.milvus import async_insert_chunk
+from applications.utils.mysql import ContentChunks, Contents
 from applications.config import Chunk, ChunkerConfig, DEFAULT_MODEL
+from applications.config import ELASTIC_SEARCH_INDEX


 class ChunkEmbeddingTask(TopicAwareChunker):
-    def __init__(self, mysql_pool, vector_pool, cfg: ChunkerConfig, doc_id):
+    def __init__(self, mysql_pool, vector_pool, cfg: ChunkerConfig, doc_id, es_pool):
         super().__init__(cfg, doc_id)
         self.content_chunk_processor = None
         self.contents_processor = None
         self.mysql_pool = mysql_pool
         self.vector_pool = vector_pool
         self.classifier = LLMClassifier()
+        self.es_client = es_pool

     @staticmethod
     async def get_embedding_list(text: str) -> List:
@@ -27,14 +28,16 @@ class ChunkEmbeddingTask(TopicAwareChunker):
         self.contents_processor = Contents(self.mysql_pool)
         self.content_chunk_processor = ContentChunks(self.mysql_pool)

-    async def process_content(
-        self, doc_id: str, text: str, text_type: int
+    async def _chunk_each_content(
+        self, doc_id: str, text: str, text_type: int, title: str, dataset_id: int
     ) -> List[Chunk]:
-        flag = await self.contents_processor.insert_content(doc_id, text, text_type)
+        flag = await self.contents_processor.insert_content(
+            doc_id, text, text_type, title, dataset_id
+        )
         if not flag:
             return []
         else:
-            raw_chunks = await self.chunk(text, text_type)
+            raw_chunks = await self.chunk(text, text_type, dataset_id)
             if not raw_chunks:
                 await self.contents_processor.update_content_status(
                     doc_id=doc_id,
@@ -50,7 +53,31 @@ class ChunkEmbeddingTask(TopicAwareChunker):
                 )
             return raw_chunks

-    async def process_each_chunk(self, chunk: Chunk):
+    async def insert_into_es(self, milvus_id, chunk: Chunk) -> int:
+        docs = [
+            {
+                "_index": ELASTIC_SEARCH_INDEX,
+                "_id": milvus_id,
+                "_source": {
+                    "milvus_id": milvus_id,
+                    "doc_id": chunk.doc_id,
+                    "dataset_id": chunk.dataset_id,
+                    "chunk_id": chunk.chunk_id,
+                    "topic": chunk.topic,
+                    "domain": chunk.domain,
+                    "task_type": chunk.task_type,
+                    "text_type": chunk.text_type,
+                    "keywords": chunk.keywords,
+                    "concepts": chunk.concepts,
+                    "entities": chunk.entities,
+                    "status": chunk.status,
+                },
+            }
+        ]
+        resp = await self.es_client.bulk_insert(docs)
+        return resp["success"]
+
+    async def save_each_chunk(self, chunk: Chunk):
         # insert
         flag = await self.content_chunk_processor.insert_chunk(chunk)
         if not flag:
@@ -92,7 +119,30 @@ class ChunkEmbeddingTask(TopicAwareChunker):
             )
             return

-        await self.save_to_milvus(completion)
+        milvus_id = await self.save_to_milvus(completion)
+        if not milvus_id:
+            return
+
+        # 存储到 es 中
+        # acquire_lock
+        acquire_es_lock = await self.content_chunk_processor.update_es_status(
+            doc_id=chunk.doc_id,
+            chunk_id=chunk.chunk_id,
+            ori_status=self.INIT_STATUS,
+            new_status=self.PROCESSING_STATUS,
+        )
+        if not acquire_es_lock:
+            print(f"获取 es Lock Fail: {chunk.doc_id}--{chunk.chunk_id}")
+            return
+
+        insert_rows = await self.insert_into_es(milvus_id, completion)
+        final_status = self.FINISHED_STATUS if insert_rows else self.FAILED_STATUS
+        await self.content_chunk_processor.update_es_status(
+            doc_id=chunk.doc_id,
+            chunk_id=chunk.chunk_id,
+            ori_status=self.PROCESSING_STATUS,
+            new_status=final_status,
+        )

     async def save_to_milvus(self, chunk: Chunk):
         """
@@ -108,7 +158,7 @@ class ChunkEmbeddingTask(TopicAwareChunker):
         )
         if not acquire_lock:
             print(f"抢占-{chunk.doc_id}-{chunk.chunk_id}分块-embedding处理锁失败")
-            return
+            return None
         try:
             data = {
                 "doc_id": chunk.doc_id,
@@ -118,24 +168,25 @@ class ChunkEmbeddingTask(TopicAwareChunker):
                 "vector_questions": await self.get_embedding_list(
                     ",".join(chunk.questions)
                 ),
-                "topic": chunk.topic,
-                "domain": chunk.domain,
-                "task_type": chunk.task_type,
-                "summary": chunk.summary,
-                "keywords": chunk.keywords,
-                "entities": chunk.entities,
-                "concepts": chunk.concepts,
-                "questions": chunk.questions,
-                "topic_purity": chunk.topic_purity,
-                "tokens": chunk.tokens,
             }
-            await async_insert_chunk(self.vector_pool, data)
+            resp = await async_insert_chunk(self.vector_pool, data)
+            if not resp:
+                await self.content_chunk_processor.update_embedding_status(
+                    doc_id=chunk.doc_id,
+                    chunk_id=chunk.chunk_id,
+                    ori_status=self.PROCESSING_STATUS,
+                    new_status=self.FAILED_STATUS,
+                )
+                return None
+
             await self.content_chunk_processor.update_embedding_status(
                 doc_id=chunk.doc_id,
                 chunk_id=chunk.chunk_id,
                 ori_status=self.PROCESSING_STATUS,
                 new_status=self.FINISHED_STATUS,
             )
+            milvus_id = resp[0]
+            return milvus_id
         except Exception as e:
             await self.content_chunk_processor.update_embedding_status(
                 doc_id=chunk.doc_id,
@@ -144,28 +195,33 @@ class ChunkEmbeddingTask(TopicAwareChunker):
                 new_status=self.FAILED_STATUS,
             )
             print(f"存入向量数据库失败", e)
+            return None

     async def deal(self, data):
         text = data.get("text", "")
-        text = text.strip()
+        title = data.get("title", "")
+        text, title = text.strip(), title.strip()
         text_type = data.get("text_type", 1)
+        dataset_id = data.get("dataset_id", 0)  # 默认知识库 id 为 0
         if not text:
             return None

         self.init_processer()

         async def _process():
-            chunks = await self.process_content(self.doc_id, text, text_type)
+            chunks = await self._chunk_each_content(
+                self.doc_id, text, text_type, title, dataset_id
+            )
             if not chunks:
                 return

             # # dev
             # for chunk in chunks:
-            #     await self.process_each_chunk(chunk)
+            #     await self.save_each_chunk(chunk)

             await run_tasks_with_asyncio_task_group(
                 task_list=chunks,
-                handler=self.process_each_chunk,
+                handler=self.save_each_chunk,
                 description="处理单篇文章分块",
                 unit="chunk",
                 max_concurrency=10,