Browse Source

Merge branch 'feature/luojunhui/2025-09-25-improve-nlp-task-prompt' of Server/rag_server into master

luojunhui 2 weeks ago
parent
commit
970e3777a3

+ 10 - 8
applications/async_task/build_graph.py

@@ -33,7 +33,7 @@ class BuildGraph(AsyncNeo4jRepository):
         acquire_lock = await self.chunk_manager.update_graph_status(
             doc_id, chunk_id, self.INIT_STATUS, self.PROCESSING_STATUS
         )
-        if acquire_lock:
+        if not acquire_lock:
             print(f"while building graph, acquire lock for chunk {chunk_id}")
             return
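The corrected check treats `update_graph_status` as a compare-and-set lock: the chunk's status is flipped from INIT to PROCESSING only if it is still in INIT, and the return value says whether this worker won that transition. A minimal sketch of that pattern, assuming an aiomysql-style pool; the table and column names are illustrative, not the project's actual schema:

```python
async def update_graph_status(pool, doc_id: str, chunk_id: str,
                              expected: int, target: int) -> bool:
    """Compare-and-set style lock: True only if this caller performed
    the expected -> target transition (e.g. INIT -> PROCESSING)."""
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            await cursor.execute(
                "UPDATE content_chunks SET graph_status = %s "
                "WHERE doc_id = %s AND chunk_id = %s AND graph_status = %s",
                (target, doc_id, chunk_id, expected),
            )
            await conn.commit()
            affected = cursor.rowcount
    # 0 affected rows means another worker already holds this chunk
    return affected > 0
```

With that contract, `if not acquire_lock: return` correctly skips chunks that another worker is already processing.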
 
@@ -68,10 +68,12 @@ class BuildGraph(AsyncNeo4jRepository):
     async def deal(self, doc_id):
         """async process single chunk"""
         chunk_list = await self.get_chunk_list_from_es(doc_id)
-        await run_tasks_with_asyncio_task_group(
-            task_list=chunk_list,
-            handler=self.add_single_chunk,
-            description="build graph",
-            unit="chunk",
-            max_concurrency=10,
-        )
+        for chunk in chunk_list:
+            await self.add_single_chunk(chunk)
+        # await run_tasks_with_asyncio_task_group(
+        #     task_list=chunk_list,
+        #     handler=self.add_single_chunk,
+        #     description="build graph",
+        #     unit="chunk",
+        #     max_concurrency=10,
+        # )
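The project's `run_tasks_with_asyncio_task_group` helper is commented out in favour of a plain sequential loop. If bounded concurrency is wanted again without that helper, a generic stand-in using `asyncio.Semaphore` could look like the sketch below (my own sketch, not code from this repository; `asyncio.TaskGroup` requires Python 3.11+):

```python
import asyncio
from typing import Awaitable, Callable, Iterable, TypeVar

T = TypeVar("T")


async def run_with_bounded_concurrency(
    items: Iterable[T],
    handler: Callable[[T], Awaitable[None]],
    max_concurrency: int = 10,
) -> None:
    """Run handler(item) for every item, at most max_concurrency at a time."""
    semaphore = asyncio.Semaphore(max_concurrency)

    async def guarded(item: T) -> None:
        async with semaphore:
            await handler(item)

    async with asyncio.TaskGroup() as tg:
        for item in items:
            tg.create_task(guarded(item))
```

Inside `deal` this would be invoked as `await run_with_bounded_concurrency(chunk_list, self.add_single_chunk, max_concurrency=10)`.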

+ 26 - 19
applications/utils/chunks/llm_classifier.py

@@ -9,26 +9,34 @@ class LLMClassifier:
     def generate_prompt(chunk_text: str) -> str:
         raw_prompt = """
 你是一个文本分析助手。  
-我会给你一段文本,请你输出以下信息:  
-1. **主题标签 (topic)**:一句话概括文本主题  
-2. **关键词 (keywords)**:3-5 个,便于检索  
-3. **摘要 (summary)**:50字以内简要说明  
-4. **领域 (domain)**:该文本所属领域(如:AI 技术、体育、金融)
-5. **任务类型 (task_type)**:文本主要任务类型(如:解释、教学、动作描述、方法提出)  
-6. **核心知识点 (concepts)**:涉及的核心知识点或概念  
-7. **显示/隐式问题 (questions)**:文本中隐含或显式的问题
-8. **实体(entities)**: 文本中的提到的命名实体
+请严格按照以下要求分析我提供的文本,并输出 **JSON 格式**结果:
 
-请用 JSON 格式输出,例如:
+### 输出字段说明
+1. **topic**:一句话概括文本主题  
+2. **summary**:50字以内简要说明文本内容  
+3. **domain**:从下列枚举表中选择一个最合适的领域(必须严格选取一个,不能生成新词)  
+   - ["AI 技术","机器学习","自然语言处理","计算机视觉","知识图谱","数据科学","软件工程","数据库","云计算","网络安全","区块链","量子计算",
+      "数学","物理","化学","生物","医学","心理学","教育",
+      "金融","会计","经济学","管理学","市场营销","投资/基金",
+      "法律","政治","社会学","历史","哲学","语言学","文学","艺术",
+      "体育","娱乐","军事","环境科学","地理","其他"]
+4. **task_type**:文本主要任务类型(如:解释、教学、动作描述、方法提出)  
+5. **keywords**:不超过 3 个,偏向外部检索用标签(概括性强,利于搜索)  
+6. **concepts**:不超过 3 个,偏向内部知识点(技术/学术内涵,和 keywords 明显区分)  
+7. **questions**:文本中显式或隐含的问题(无则返回空数组)  
+8. **entities**:文本中出现的命名实体(如人名、地名、机构名、系统名、模型名等,无则返回空数组)
+
+### 输出格式示例
+```json
 {
-    "topic": "RAG 技术与分块策略",
-    "summary": "介绍RAG技术并提出主题感知的分块方法。", 
-    "domain": "AI 技术",
-    "task_type": "方法提出",
-    "keywords": ["RAG", "检索增强", "文本分块", "知识图谱"],
-    "concepts": ["RAG", "文本分块", "知识图谱"],
-    "questions": ["如何提升RAG的检索效果?"]
-    "entities": ["entity1"]
+  "topic": "RAG 技术与主题感知分块",
+  "summary": "介绍RAG在复杂问答中的应用,并提出分块方法。",
+  "domain": "自然语言处理",
+  "task_type": "方法提出",
+  "keywords": ["RAG", "文本分块", "问答系统"],
+  "concepts": ["检索增强生成", "语义边界检测", "主题感知分块"],
+  "questions": ["如何优化RAG在问答场景中的效果?"],
+  "entities": ["RAG"]
 }
 
 下面是文本:
@@ -41,7 +49,6 @@ class LLMClassifier:
         response = await fetch_deepseek_completion(
             model="DeepSeek-V3", prompt=prompt, output_type="json"
         )
-        print(response)
         return Chunk(
             chunk_id=chunk.chunk_id,
             doc_id=chunk.doc_id,
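Because the new prompt requires `domain` to be chosen strictly from the enumerated list and caps `keywords`/`concepts` at three items, the parsed JSON can be normalized before it is packed into the `Chunk`. A minimal sketch of such a check, assuming the response has already been parsed into a dict; this helper is not part of the commit, and "其他" is used as the fallback domain:

```python
ALLOWED_DOMAINS = {
    "AI 技术", "机器学习", "自然语言处理", "计算机视觉", "知识图谱", "数据科学",
    "软件工程", "数据库", "云计算", "网络安全", "区块链", "量子计算",
    "数学", "物理", "化学", "生物", "医学", "心理学", "教育",
    "金融", "会计", "经济学", "管理学", "市场营销", "投资/基金",
    "法律", "政治", "社会学", "历史", "哲学", "语言学", "文学", "艺术",
    "体育", "娱乐", "军事", "环境科学", "地理", "其他",
}


def normalize_classification(response: dict) -> dict:
    """Clamp the fields the prompt constrains: enum domain, list lengths."""
    cleaned = dict(response)
    if cleaned.get("domain") not in ALLOWED_DOMAINS:
        cleaned["domain"] = "其他"  # fall back instead of trusting free-form output
    cleaned["keywords"] = list(cleaned.get("keywords") or [])[:3]
    cleaned["concepts"] = list(cleaned.get("concepts") or [])[:3]
    cleaned.setdefault("questions", [])
    cleaned.setdefault("entities", [])
    return cleaned
```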

+ 2 - 1
applications/utils/chunks/topic_aware_chunking.py

@@ -32,7 +32,8 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
         return np.stack(embs)
 
     async def _raw_chunk(self, text: str) -> Dict[str, Any]:
-        sentence_list = self.jieba_sent_tokenize(text)
+        # sentence_list = self.jieba_sent_tokenize(text)
+        sentence_list = self.lang_chain_tokenize(text)
         if not sentence_list:
             return {}
 

+ 1 - 1
applications/utils/mysql/content_chunks.py

@@ -187,4 +187,4 @@ class ContentChunks(BaseMySQLClient):
             "page": page_num,
             "page_size": page_size,
             "total_pages": total_pages,
-        }
+        }

+ 2 - 1
applications/utils/mysql/contents.py

@@ -1,5 +1,6 @@
 from .base import BaseMySQLClient
 
+
 class Contents(BaseMySQLClient):
     async def insert_content(self, doc_id, text, text_type, title, dataset_id):
         query = """
@@ -128,4 +129,4 @@ class Contents(BaseMySQLClient):
             "page": page_num,
             "page_size": page_size,
             "total_pages": total_pages,
-        }
+        }

+ 7 - 0
applications/utils/nlp/split_text_into_sentences.py

@@ -3,6 +3,7 @@ import nltk
 import jieba
 
 from typing import List
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 class SplitTextIntoSentences:
@@ -26,3 +27,9 @@ class SplitTextIntoSentences:
         if buf.strip():
             sentence_list.append(buf.strip())
         return sentence_list
+
+    @staticmethod
+    def lang_chain_tokenize(text: str) -> List[str]:
+        splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
+        docs = splitter.split_text(text)
+        return docs
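Unlike `jieba_sent_tokenize`, the new `lang_chain_tokenize` splits by a character budget rather than sentence boundaries, so neighbouring pieces may overlap by up to 10 characters. A quick usage sketch with the same splitter settings (the sample text is illustrative):

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
pieces = splitter.split_text("RAG 将检索与生成结合,用于复杂问答。" * 20)

# Each piece stays within roughly 100 characters, and consecutive pieces
# can share up to 10 characters of overlap, instead of being cut strictly
# at sentence boundaries as jieba_sent_tokenize does.
for piece in pieces[:3]:
    print(len(piece), piece)
```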

+ 5 - 1
routes/buleprint.py

@@ -560,6 +560,10 @@ async def delete_task():
         return jsonify({"status_code": 500, "detail": "docId not found", "data": {}})
 
     resource = get_resource_manager()
-    build_graph_task = BuildGraph(neo4j=resource.graph_client, es_client=resource.es_client)
+    build_graph_task = BuildGraph(
+        neo4j=resource.graph_client,
+        es_client=resource.es_client,
+        mysql_client=resource.mysql_client,
+    )
     await build_graph_task.deal(doc_id)
     return jsonify({"status_code": 200, "detail": "success", "data": {}})