@@ -11,8 +11,8 @@ from applications.config import Chunk, ChunkerConfig, DEFAULT_MODEL


 class ChunkEmbeddingTask(TopicAwareChunker):
-    def __init__(self, mysql_pool, vector_pool, cfg: ChunkerConfig):
-        super().__init__(cfg)
+    def __init__(self, mysql_pool, vector_pool, cfg: ChunkerConfig, doc_id):
+        super().__init__(cfg, doc_id)
         self.content_chunk_processor = None
         self.contents_processor = None
         self.mysql_pool = mysql_pool
@@ -52,6 +52,7 @@ class ChunkEmbeddingTask(TopicAwareChunker):
         # insert
         flag = await self.content_chunk_processor.insert_chunk(chunk)
         if not flag:
+            print("Failed to insert text chunk")
             return

         acquire_lock = await self.content_chunk_processor.update_chunk_status(
@@ -61,6 +62,7 @@ class ChunkEmbeddingTask(TopicAwareChunker):
             new_status=self.PROCESSING_STATUS,
         )
         if not acquire_lock:
+            print("Failed to acquire the text chunk lock")
             return

         completion = await self.classifier.classify_chunk(chunk)
@@ -71,6 +73,7 @@ class ChunkEmbeddingTask(TopicAwareChunker):
                 ori_status=self.PROCESSING_STATUS,
                 new_status=self.FAILED_STATUS,
             )
+            print("Failed to get information from deepseek")
             return

         update_flag = await self.content_chunk_processor.set_chunk_result(
@@ -118,6 +121,7 @@ class ChunkEmbeddingTask(TopicAwareChunker):
             "task_type": chunk.task_type,
             "summary": chunk.summary,
             "keywords": chunk.keywords,
+            "entities": chunk.entities,
             "concepts": chunk.concepts,
             "questions": chunk.questions,
             "topic_purity": chunk.topic_purity,
@@ -140,18 +144,22 @@ class ChunkEmbeddingTask(TopicAwareChunker):
             print("Failed to store in vector database", e)

     async def deal(self, data):
-        text = data.get("text")
+        text = data.get("text", "")
+        text = text.strip()
         if not text:
             return None

         self.init_processer()
-        doc_id = f"doc-{uuid.uuid4()}"

         async def _process():
-            chunks = await self.process_content(doc_id, text)
+            chunks = await self.process_content(self.doc_id, text)
             if not chunks:
                 return

+            # # dev
+            # for chunk in chunks:
+            #     await self.process_each_chunk(chunk)
+
             await run_tasks_with_asyncio_task_group(
                 task_list=chunks,
                 handler=self.process_each_chunk,
@@ -161,10 +169,10 @@ class ChunkEmbeddingTask(TopicAwareChunker):
             )

             await self.contents_processor.update_content_status(
-                doc_id=doc_id,
+                doc_id=self.doc_id,
                 ori_status=self.PROCESSING_STATUS,
                 new_status=self.FINISHED_STATUS,
             )

         asyncio.create_task(_process())
-        return doc_id
+        return self.doc_id
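
Reviewer note: a minimal usage sketch of the new call pattern, where the caller owns doc_id and passes it to the constructor instead of deal() generating one. The ChunkEmbeddingTask import path, the no-argument ChunkerConfig(), and the pool placeholders below are illustrative assumptions, not taken from this change.

# Hypothetical usage sketch; module path and pool objects are placeholders.
import asyncio
import uuid

from applications.config import ChunkerConfig  # import shown in the diff header
from applications.tasks.chunk_task import ChunkEmbeddingTask  # assumed module path

async def main():
    mysql_pool, vector_pool = ..., ...  # real pools come from the service wiring

    doc_id = f"doc-{uuid.uuid4()}"  # same id format deal() previously generated
    task = ChunkEmbeddingTask(mysql_pool, vector_pool, ChunkerConfig(), doc_id)  # ChunkerConfig() with defaults is an assumption

    # deal() strips the text, schedules _process() in the background,
    # and immediately returns self.doc_id (None when the text is empty).
    returned_id = await task.deal({"text": "raw document text"})
    print(returned_id)

asyncio.run(main())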