Procházet zdrojové kódy

Merge branch 'feature/luojunhui/2025-09-18-search-engine-improve' of Server/llm_vector_server into master

luojunhui před 2 týdny
rodič
revize
863c02bc2e

+ 81 - 25
applications/async_task/chunk_task.py

@@ -1,23 +1,24 @@
 import asyncio
-import uuid
 from typing import List
 
 from applications.api import get_basic_embedding
 from applications.utils.async_utils import run_tasks_with_asyncio_task_group
-from applications.utils.mysql import ContentChunks, Contents
 from applications.utils.chunks import TopicAwareChunker, LLMClassifier
 from applications.utils.milvus import async_insert_chunk
+from applications.utils.mysql import ContentChunks, Contents
 from applications.config import Chunk, ChunkerConfig, DEFAULT_MODEL
+from applications.config import ELASTIC_SEARCH_INDEX
 
 
 class ChunkEmbeddingTask(TopicAwareChunker):
-    def __init__(self, mysql_pool, vector_pool, cfg: ChunkerConfig, doc_id):
+    def __init__(self, mysql_pool, vector_pool, cfg: ChunkerConfig, doc_id, es_pool):
         super().__init__(cfg, doc_id)
         self.content_chunk_processor = None
         self.contents_processor = None
         self.mysql_pool = mysql_pool
         self.vector_pool = vector_pool
         self.classifier = LLMClassifier()
+        self.es_client = es_pool
 
     @staticmethod
     async def get_embedding_list(text: str) -> List:
@@ -27,14 +28,16 @@ class ChunkEmbeddingTask(TopicAwareChunker):
         self.contents_processor = Contents(self.mysql_pool)
         self.content_chunk_processor = ContentChunks(self.mysql_pool)
 
-    async def process_content(
-        self, doc_id: str, text: str, text_type: int
+    async def _chunk_each_content(
+        self, doc_id: str, text: str, text_type: int, title: str, dataset_id: int
     ) -> List[Chunk]:
-        flag = await self.contents_processor.insert_content(doc_id, text, text_type)
+        flag = await self.contents_processor.insert_content(
+            doc_id, text, text_type, title, dataset_id
+        )
         if not flag:
             return []
         else:
-            raw_chunks = await self.chunk(text, text_type)
+            raw_chunks = await self.chunk(text, text_type, dataset_id)
             if not raw_chunks:
                 await self.contents_processor.update_content_status(
                     doc_id=doc_id,
@@ -50,7 +53,31 @@ class ChunkEmbeddingTask(TopicAwareChunker):
             )
             return raw_chunks
 
-    async def process_each_chunk(self, chunk: Chunk):
+    async def insert_into_es(self, milvus_id, chunk: Chunk) -> int:
+        docs = [
+            {
+                "_index": ELASTIC_SEARCH_INDEX,
+                "_id": milvus_id,
+                "_source": {
+                    "milvus_id": milvus_id,
+                    "doc_id": chunk.doc_id,
+                    "dataset_id": chunk.dataset_id,
+                    "chunk_id": chunk.chunk_id,
+                    "topic": chunk.topic,
+                    "domain": chunk.domain,
+                    "task_type": chunk.task_type,
+                    "text_type": chunk.text_type,
+                    "keywords": chunk.keywords,
+                    "concepts": chunk.concepts,
+                    "entities": chunk.entities,
+                    "status": chunk.status,
+                },
+            }
+        ]
+        resp = await self.es_client.bulk_insert(docs)
+        return resp["success"]
+
+    async def save_each_chunk(self, chunk: Chunk):
         # insert
         flag = await self.content_chunk_processor.insert_chunk(chunk)
         if not flag:
@@ -92,7 +119,30 @@ class ChunkEmbeddingTask(TopicAwareChunker):
             )
             return
 
-        await self.save_to_milvus(completion)
+        milvus_id = await self.save_to_milvus(completion)
+        if not milvus_id:
+            return
+
+        # 存储到 es 中
+        # acquire_lock
+        acquire_es_lock = await self.content_chunk_processor.update_es_status(
+            doc_id=chunk.doc_id,
+            chunk_id=chunk.chunk_id,
+            ori_status=self.INIT_STATUS,
+            new_status=self.PROCESSING_STATUS,
+        )
+        if not acquire_es_lock:
+            print(f"获取 es Lock Fail: {chunk.doc_id}--{chunk.chunk_id}")
+            return
+
+        insert_rows = await self.insert_into_es(milvus_id, completion)
+        final_status = self.FINISHED_STATUS if insert_rows else self.FAILED_STATUS
+        await self.content_chunk_processor.update_es_status(
+            doc_id=chunk.doc_id,
+            chunk_id=chunk.chunk_id,
+            ori_status=self.PROCESSING_STATUS,
+            new_status=final_status,
+        )
 
     async def save_to_milvus(self, chunk: Chunk):
         """
@@ -108,7 +158,7 @@ class ChunkEmbeddingTask(TopicAwareChunker):
         )
         if not acquire_lock:
             print(f"抢占-{chunk.doc_id}-{chunk.chunk_id}分块-embedding处理锁失败")
-            return
+            return None
         try:
             data = {
                 "doc_id": chunk.doc_id,
@@ -118,24 +168,25 @@ class ChunkEmbeddingTask(TopicAwareChunker):
                 "vector_questions": await self.get_embedding_list(
                     ",".join(chunk.questions)
                 ),
-                "topic": chunk.topic,
-                "domain": chunk.domain,
-                "task_type": chunk.task_type,
-                "summary": chunk.summary,
-                "keywords": chunk.keywords,
-                "entities": chunk.entities,
-                "concepts": chunk.concepts,
-                "questions": chunk.questions,
-                "topic_purity": chunk.topic_purity,
-                "tokens": chunk.tokens,
             }
-            await async_insert_chunk(self.vector_pool, data)
+            resp = await async_insert_chunk(self.vector_pool, data)
+            if not resp:
+                await self.content_chunk_processor.update_embedding_status(
+                    doc_id=chunk.doc_id,
+                    chunk_id=chunk.chunk_id,
+                    ori_status=self.PROCESSING_STATUS,
+                    new_status=self.FAILED_STATUS,
+                )
+                return None
+
             await self.content_chunk_processor.update_embedding_status(
                 doc_id=chunk.doc_id,
                 chunk_id=chunk.chunk_id,
                 ori_status=self.PROCESSING_STATUS,
                 new_status=self.FINISHED_STATUS,
             )
+            milvus_id = resp[0]
+            return milvus_id
         except Exception as e:
             await self.content_chunk_processor.update_embedding_status(
                 doc_id=chunk.doc_id,
@@ -144,28 +195,33 @@ class ChunkEmbeddingTask(TopicAwareChunker):
                 new_status=self.FAILED_STATUS,
             )
             print(f"存入向量数据库失败", e)
+            return None
 
     async def deal(self, data):
         text = data.get("text", "")
-        text = text.strip()
+        title = data.get("title", "")
+        text, title = text.strip(), title.strip()
         text_type = data.get("text_type", 1)
+        dataset_id = data.get("dataset_id", 0)  # 默认知识库 id 为 0
         if not text:
             return None
 
         self.init_processer()
 
         async def _process():
-            chunks = await self.process_content(self.doc_id, text, text_type)
+            chunks = await self._chunk_each_content(
+                self.doc_id, text, text_type, title, dataset_id
+            )
             if not chunks:
                 return
 
             # # dev
             # for chunk in chunks:
-            #     await self.process_each_chunk(chunk)
+            #     await self.save_each_chunk(chunk)
 
             await run_tasks_with_asyncio_task_group(
                 task_list=chunks,
-                handler=self.process_each_chunk,
+                handler=self.save_each_chunk,
                 description="处理单篇文章分块",
                 unit="chunk",
                 max_concurrency=10,

+ 7 - 1
applications/config/__init__.py

@@ -6,10 +6,12 @@ from .model_config import (
 )
 from .deepseek_config import DEEPSEEK_MODEL, DEEPSEEK_API_KEY
 from .base_chunk import Chunk, ChunkerConfig
-from .milvus_config import MILVUS_CONFIG
+from .elastic_search_config import ELASTIC_SEARCH_INDEX, ES_HOSTS, ES_PASSWORD
+from .milvus_config import MILVUS_CONFIG, BASE_MILVUS_SEARCH_PARAMS
 from .mysql_config import RAG_MYSQL_CONFIG
 from .weight_config import WEIGHT_MAP
 
+
 __all__ = [
     "DEFAULT_MODEL",
     "LOCAL_MODEL_CONFIG",
@@ -22,4 +24,8 @@ __all__ = [
     "MILVUS_CONFIG",
     "RAG_MYSQL_CONFIG",
     "WEIGHT_MAP",
+    "ES_HOSTS",
+    "ES_PASSWORD",
+    "ELASTIC_SEARCH_INDEX",
+    "BASE_MILVUS_SEARCH_PARAMS",
 ]

+ 2 - 0
applications/config/base_chunk.py

@@ -8,12 +8,14 @@ class Chunk:
     doc_id: str
     text: str
     tokens: int
+    dataset_id: int
     topic: str = ""
     domain: str = ""
     task_type: str = ""
     topic_purity: float = 1.0
     text_type: int = 1
     summary: str = ""
+    status: int = 1
     keywords: List[str] = field(default_factory=list)
     concepts: List[str] = field(default_factory=list)
     questions: List[str] = field(default_factory=list)

+ 5 - 0
applications/config/elastic_search_config.py

@@ -0,0 +1,5 @@
"""Elasticsearch connection settings.

NOTE(review): the password was previously hard-coded for a publicly reachable
endpoint and is committed to version control. It can now be overridden via the
ES_PASSWORD environment variable (the old value is kept as the default for
backward compatibility). Rotate this credential and remove the fallback.
"""

import os

# Index that mirrors Milvus chunk metadata for filtered / keyword search.
ELASTIC_SEARCH_INDEX = "milvus_metadata"

# SECURITY: fallback kept only for backward compatibility; prefer the env var.
ES_PASSWORD = os.environ.get("ES_PASSWORD", "elastic123@")

ES_HOSTS = ["http://es-cn-ols4fypjx00020u36.public.elasticsearch.aliyuncs.com:9200"]

+ 31 - 0
applications/config/es_certs.crt

@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFaTCCA1GgAwIBAgIUWHH9T8PVfiSyvT6S6NrAQ9iSLeEwDQYJKoZIhvcNAQEL
+BQAwPDE6MDgGA1UEAxMxRWxhc3RpY3NlYXJjaCBzZWN1cml0eSBhdXRvLWNvbmZp
+Z3VyYXRpb24gSFRUUCBDQTAeFw0yNTA3MDcwNzIwNTRaFw0yODA3MDYwNzIwNTRa
+MDwxOjA4BgNVBAMTMUVsYXN0aWNzZWFyY2ggc2VjdXJpdHkgYXV0by1jb25maWd1
+cmF0aW9uIEhUVFAgQ0EwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCb
+Y8E68+7S+hGKQX6vhyOxuCe3QyBHYlsxiSqGhi+WFx953u4SEMqrbqiyg2QquB9/
+ynjKo3Tvhn0OPjuJRytteKn9OZkVhUT1D5P6PFo0j8x1LIJZm551XRCnQUZ8jC0C
+REHy/JoKdT4YSCRIuXVTM5iM66vQ1t5Du4sb70mTygtc2DyXwgE4LkVnrHcwr2BZ
+3/O69WvF7Zd7WP93yEfUsLsAAQStaCYMeYyaY5K8UwIVcFyWKJ9lnDGbR9KmuXb9
+ipWqGw6aAYhmSs5gL+6xJ5dBpgMOqoBTvZpNniLA/phkelq9W2nAhBLFpRGRof8K
+5iKwjAN8gnBXeSVklBoL23QD5zfoVjz+5eaXWO4qP+90jbwf+vEg/duncDRONGtk
+TQd0Vr9NeO3Aye8PZsmmhKAaciaPWYyQO30omUq9kPsSUzZPu4k+CYb8qwVQCHpn
+Za19NkvERQ8hCQks08/ly5qDM+5lBxJQFQjhjtzSDQ/ybbarMmgaBxpCexiksRmP
+CQqVLW6IaLxUGEkIJqXRx8nmKUfK43vTBitOBFt5UcKob6+ikZLrqZ6xLY/jklE8
+Z1wt9I8ZdQ3L3X9EORgmQ+4KIu/JQfBdfAYtLaS6MYWhiZSaKaIhgfXiZQTO9YuW
+KrI5g+d2Yu2BYgIioLKo9LFWK1eTG2gNAGUI/+rqswIDAQABo2MwYTAdBgNVHQ4E
+FgQUab2kAtPlJHLirQvbThvIwJ7hbLwwHwYDVR0jBBgwFoAUab2kAtPlJHLirQvb
+ThvIwJ7hbLwwDwYDVR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwDQYJKoZI
+hvcNAQELBQADggIBAF+wJ598Krfai5Br6Vq0Z1jj0JsU8Kij4t9D+89QPgI85/Mv
+zwj8xRgxx9RinKYdnzFJWrD9BITG2l3D0zcJhXfYUpq5HLP+c3zMwEMGzTLbgi70
+cpYqkTJ+g/Ah5WRYZRHJIMF6BVK6izCOO0J49eYC6AONNxG2HeeUvEL4cNnxpw8T
+NUe7v0FXe2iPLeE713h99ray0lBgI6J9QZqc/oEM47gHy+ByfWCv6Yw9qLlprppP
+taHz2VWnCAACDLzbDnYhemQDji86yrUTEdCT8at1jAwHSixgkm88nEBgxPHDuq8t
+thmiS6dELvXVUbyeWO7A/7zVde0Kndxe003OuYcX9I2IX7aIpC8sW/yY+alRhklq
+t9vF6g1qvsN69xXfW5yI5G31TYMUw/3ng0aVJfRFaXkEV2SWEZD+4sWoYC/GU7kK
+zlfaF22jTeul5qCKkN1k+i8K2lheEE3ZBC358W0RyvsrDwtXOra3VCpZ7qrez8OA
+/HeY6iISZQ7g0s209KjqOPqVGcI8B0p6KMh00AeWisU6E/wy1LNTxkf2IS9b88n6
+a3rj0TCycwhKOPTPB5pwlfbZNI00tGTFjqqi07SLqO9ZypsVkyR32G16JPJzk8Zw
+kngBZt6y9LtCMRVbyDuIDNq+fjtDjgxMI9bQXtve4bOuq8cZzcMjC6khz/Ja
+-----END CERTIFICATE-----

+ 5 - 0
applications/config/milvus_config.py

@@ -5,3 +5,8 @@ MILVUS_CONFIG = {
     "password": "Piaoquan@2025",
     "port": "19530",
 }
+
# Default vector-search parameters; "ef" is the HNSW search-time candidate-pool
# size — assumes the collection's vector indexes are HNSW (NOTE(review): the
# resource manager currently declares IVF_FLAT; confirm the index type).
BASE_MILVUS_SEARCH_PARAMS = {
    "metric_type": "COSINE",
    "params": {"ef": 64},
}

+ 2 - 2
applications/config/model_config.py

@@ -6,7 +6,7 @@ LOCAL_MODEL_CONFIG = {
 
 DEFAULT_MODEL = "Qwen3-Embedding-4B"
 
-VLLM_SERVER_URL = "http://vllm-qwen:8000/v1/embeddings"
-# VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
+# VLLM_SERVER_URL = "http://vllm-qwen:8000/v1/embeddings"
+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
 
 DEV_VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"

+ 4 - 0
applications/resource/__init__.py

@@ -0,0 +1,4 @@
"""Public entry points for the process-wide resource manager."""

from .resource_manager import get_resource_manager, init_resource_manager

__all__ = ["get_resource_manager", "init_resource_manager"]

+ 87 - 0
applications/resource/resource_manager.py

@@ -0,0 +1,87 @@
from pymilvus import connections, CollectionSchema, Collection

from applications.utils.mysql import DatabaseManager
from applications.utils.milvus.field import fields
from applications.utils.elastic_search import AsyncElasticSearchClient


class ResourceManager:
    """Owns the lifecycle of the shared clients (Elasticsearch, MySQL, Milvus).

    Create once via ``init_resource_manager``; ``startup()`` connects every
    client and ``shutdown()`` releases them.
    """

    def __init__(self, es_index, es_hosts, es_password, milvus_config):
        self.es_index = es_index
        self.es_hosts = es_hosts
        self.es_password = es_password
        self.milvus_config = milvus_config

        # populated by startup(); None until then
        self.es_client: AsyncElasticSearchClient | None = None
        self.milvus_client: Collection | None = None
        self.mysql_client: DatabaseManager | None = None

    async def load_milvus(self):
        """Connect to Milvus and (re)declare the collection, indexes and load it."""
        connections.connect("default", **self.milvus_config)

        schema = CollectionSchema(
            fields, description="Chunk multi-vector embeddings with metadata"
        )
        self.milvus_client = Collection(name="chunk_multi_embeddings_v2", schema=schema)

        # NOTE(review): "M"/"efConstruction" are HNSW build parameters; the
        # original declared index_type "IVF_FLAT" (whose parameter is "nlist"),
        # while the search side (BASE_MILVUS_SEARCH_PARAMS) uses the HNSW "ef"
        # parameter — HNSW is evidently what was intended. Confirm before
        # rollout: changing the index type on an existing collection requires
        # dropping and rebuilding the index.
        vector_index_params = {
            "index_type": "HNSW",
            "metric_type": "COSINE",
            "params": {"M": 16, "efConstruction": 200},
        }
        for vector_field in ("vector_text", "vector_summary", "vector_questions"):
            self.milvus_client.create_index(vector_field, vector_index_params)
        self.milvus_client.load()

    async def startup(self):
        """Initialise all clients; call once at application startup."""
        # Elasticsearch
        self.es_client = AsyncElasticSearchClient(
            index_name=self.es_index, hosts=self.es_hosts, password=self.es_password
        )
        if await self.es_client.es.ping():
            print("✅ Elasticsearch connected")
        else:
            print("❌ Elasticsearch connection failed")

        # MySQL
        self.mysql_client = DatabaseManager()
        await self.mysql_client.init_pools()
        print("✅ MySQL connected")

        # Milvus
        await self.load_milvus()
        print("✅ Milvus loaded")

    async def shutdown(self):
        """Release all clients; safe to call even if startup partially failed."""
        if self.es_client:
            await self.es_client.close()
            print("Elasticsearch closed")

        connections.disconnect("default")
        print("Milvus closed")

        if self.mysql_client:
            await self.mysql_client.close_pools()
            print("Mysql closed")


# process-wide singleton, created lazily by init_resource_manager()
_resource_manager: ResourceManager | None = None


def init_resource_manager(es_index, es_hosts, es_password, milvus_config):
    """Create the singleton ResourceManager on first call and return it.

    Subsequent calls return the existing instance; their arguments are ignored.
    """
    global _resource_manager
    if _resource_manager is None:
        _resource_manager = ResourceManager(
            es_index, es_hosts, es_password, milvus_config
        )

    return _resource_manager


def get_resource_manager() -> ResourceManager:
    """Return the singleton, failing loudly instead of returning None.

    The original silently returned ``None`` before initialisation, deferring
    the failure to an opaque AttributeError at the call site.
    """
    if _resource_manager is None:
        raise RuntimeError(
            "ResourceManager not initialised — call init_resource_manager() first"
        )
    return _resource_manager

+ 3 - 0
applications/search/__init__.py

@@ -0,0 +1,3 @@
"""Search strategies combining Milvus vector search with Elasticsearch filtering."""

from .hybrid_search import HybridSearch

__all__ = ["HybridSearch"]

+ 7 - 0
applications/search/base_search.py

@@ -0,0 +1,7 @@
from applications.utils.milvus import MilvusSearch


class BaseSearch(MilvusSearch):
    """Base class for search strategies that pair Milvus with Elasticsearch.

    Extends MilvusSearch with an Elasticsearch client handle (``es_pool``)
    so subclasses can pre-filter candidates in ES before vector ranking.
    """

    def __init__(self, milvus_pool, es_pool):
        super().__init__(milvus_pool)
        self.es_pool = es_pool

+ 41 - 0
applications/search/hybrid_search.py

@@ -0,0 +1,41 @@
from typing import List, Dict, Optional, Any
from .base_search import BaseSearch

from applications.utils.elastic_search import ElasticSearchStrategy


class HybridSearch(BaseSearch):
    """Two-stage retrieval: ES metadata pre-filter, then Milvus vector ranking."""

    def __init__(self, milvus_pool, es_pool):
        super().__init__(milvus_pool, es_pool)
        self.es_strategy = ElasticSearchStrategy(self.es_pool)

    async def hybrid_search(
        self,
        filters: Dict[str, Any],  # metadata filter conditions
        query_vec: List[float],  # embedded query vector
        anns_field: str = "vector_text",  # which vector space to query
        search_params: Optional[Dict[str, Any]] = None,  # distance/search params
        query_text: Optional[str] = None,  # optional full-text match on topic
        _source: bool = False,  # must stay False: ids are needed for the Milvus expr
        es_size: int = 10000,  # first-stage ES candidate count
        sort_by: Optional[str] = None,  # ES sort order
        milvus_size: int = 10,  # second-stage (coarse-rank) result count
    ):
        """Return ``{"results": [...]}`` ranked by vector similarity within
        the candidate set selected by Elasticsearch.

        NOTE(review): passing ``_source=True`` makes the ES stage return
        ``_source`` dicts instead of ids, which would break the Milvus id
        expression below — keep it False for this strategy.
        """
        candidate_ids = await self.es_strategy.base_search(
            filters=filters,
            text_query=query_text,
            _source=_source,
            size=es_size,
            sort_by=sort_by,
        )
        if not candidate_ids:
            return {"results": []}

        # ES stores the Milvus INT64 primary key as the document _id (string);
        # str() keeps the join safe regardless of the element type.
        id_expr = ", ".join(str(_id) for _id in candidate_ids)
        return await self.base_vector_search(
            query_vec=query_vec,
            anns_field=anns_field,
            limit=milvus_size,
            expr=f"id in [{id_expr}]",
            search_params=search_params,
        )

+ 1 - 0
applications/utils/chunks/llm_classifier.py

@@ -48,6 +48,7 @@ class LLMClassifier:
             text=text,
             tokens=chunk.tokens,
             topic_purity=chunk.topic_purity,
+            dataset_id=chunk.dataset_id,
             summary=response.get("summary"),
             topic=response.get("topic"),
             domain=response.get("domain"),

+ 10 - 3
applications/utils/chunks/topic_aware_chunking.py

@@ -108,7 +108,11 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
         return np.stack(embs)
 
     def _pack_by_boundaries(
-        self, sentence_list: List[str], boundaries: List[int], text_type: int
+        self,
+        sentence_list: List[str],
+        boundaries: List[int],
+        text_type: int,
+        dataset_id: int,
     ) -> List[Chunk]:
         boundary_set = set(boundaries)
         chunks: List[Chunk] = []
@@ -141,6 +145,7 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
                 text=text,
                 tokens=tokens,
                 text_type=text_type,
+                dataset_id=dataset_id,
             )
             chunks.append(chunk)
             start = end + 1
@@ -167,14 +172,16 @@ class TopicAwareChunker(BoundaryDetector, SplitTextIntoSentences):
         finally:
             self.cfg.boundary_threshold = orig
 
-    async def chunk(self, text: str, text_type: int) -> List[Chunk]:
+    async def chunk(self, text: str, text_type: int, dataset_id: int) -> List[Chunk]:
         sentence_list = self.jieba_sent_tokenize(text)
         if not sentence_list:
             return []
 
         sentences_embeddings = await self._encode_batch(sentence_list)
         boundaries = self.detect_boundaries(sentence_list, sentences_embeddings)
-        raw_chunks = self._pack_by_boundaries(sentence_list, boundaries, text_type)
+        raw_chunks = self._pack_by_boundaries(
+            sentence_list, boundaries, text_type, dataset_id
+        )
         return raw_chunks
 
 

+ 7 - 0
applications/utils/elastic_search/__init__.py

@@ -0,0 +1,7 @@
"""Async Elasticsearch client and query-strategy helpers."""

# NOTE(review): these config names are imported but neither used here nor
# listed in __all__ — presumably intended as re-exports for convenience;
# confirm and either add them to __all__ or drop the import.
from applications.config import ELASTIC_SEARCH_INDEX, ES_HOSTS, ES_PASSWORD

from .client import AsyncElasticSearchClient
from .search_strategy import ElasticSearchStrategy


__all__ = ["AsyncElasticSearchClient", "ElasticSearchStrategy"]

+ 77 - 0
applications/utils/elastic_search/client.py

@@ -0,0 +1,77 @@
from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_bulk

from applications.utils.async_utils import run_tasks_with_asyncio_task_group


class AsyncElasticSearchClient:
    """Async wrapper around a single-index Elasticsearch connection."""

    def __init__(self, index_name, hosts, password):
        self.es = AsyncElasticsearch(hosts=hosts, basic_auth=("elastic", password))
        self.index_name = index_name

    async def create_index(self, settings, mappings):
        """(Re)create the index.

        WARNING: destructive — deletes the index (and all documents) if it
        already exists before recreating it.
        """
        if await self.es.ping():
            print("ElasticSearch client is up and running")
        else:
            print("ElasticSearch client is not up and running")

        exists = await self.es.indices.exists(index=self.index_name)
        if exists:
            print("index exists")
            await self.es.indices.delete(index=self.index_name)
            print("already delete index")
        try:
            await self.es.indices.create(
                index=self.index_name, settings=settings, mappings=mappings
            )
            print("Index created successfully")
        except Exception as e:
            print("fail to create index, reason:", e)

    async def search(self, query):
        """Run a raw query body against the configured index."""
        resp = await self.es.search(index=self.index_name, body=query)
        return resp

    async def update(self, obj):
        """Apply a partial update. obj: {"es_id": <_id>, "doc": <update body>}."""
        return await self.es.update(
            index=self.index_name, id=obj["es_id"], body=obj["doc"]
        )

    async def update_by_filed(self, field_name: str, field_value: str, doc: dict):
        """Update every document whose ``field_name`` term-matches ``field_value``.

        Returns the task-group result, or None when nothing matched / on error.
        """
        try:
            # look up the matching document ids first
            query = {"query": {"term": {field_name: field_value}}}
            resp = await self.es.search(index=self.index_name, body=query)
            if not resp["hits"]["hits"]:
                print(f"No document found with {field_name}={field_value}")
                return None

            task_list = [
                {"es_id": hit["_id"], "doc": doc} for hit in resp["hits"]["hits"]
            ]

            # BUGFIX: the handler must be self.update, which unpacks the
            # {"es_id", "doc"} task dicts; the original passed self.es.update,
            # which would receive the raw dict as its first argument and fail.
            return await run_tasks_with_asyncio_task_group(
                task_list=task_list,
                handler=self.update,
                description="update by filed",
                unit="document",
                max_concurrency=10,
            )
        except Exception as e:
            print(f"fail to update by {field_name}={field_value}, reason:", e)
            return None

    # correctly-spelled alias; `update_by_filed` is kept for existing callers
    update_by_field = update_by_filed

    async def bulk_insert(self, docs):
        """Bulk-index docs; returns {"success": n, "failed": n, "errors": [...]}.

        raise_on_error=False so per-document failures are reported through the
        return value (which callers inspect for status handling) instead of
        raising BulkIndexError mid-batch.
        """
        success, errors = await async_bulk(
            self.es, docs, request_timeout=10, raise_on_error=False
        )
        return {"success": success, "failed": len(errors), "errors": errors}

    async def close(self):
        await self.es.close()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.es.close()

+ 30 - 0
applications/utils/elastic_search/create_index.py

@@ -0,0 +1,30 @@
"""
One-off helper holding the settings and mappings used when creating the ES
index — only use when create es index.

NOTE(review): `asyncio`, the config constants and the client import are unused
in this module body — presumably kept for running the index creation manually
from a REPL/script; confirm and prune if not.
"""

import asyncio

from applications.config import ELASTIC_SEARCH_INDEX, ES_HOSTS, ES_PASSWORD
from applications.utils.elastic_search.client import AsyncElasticSearchClient


settings = {"number_of_shards": 3, "number_of_replicas": 1}


mappings = {
    "properties": {
        "milvus_id": {"type": "keyword"},  # Milvus (vector DB) primary-key id
        "doc_id": {"type": "keyword"},  # document ID
        "chunk_id": {"type": "long"},  # chunk ID within the document
        "topic": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},  # topic: full-text + exact match
        "domain": {"type": "keyword"},
        "task_type": {"type": "keyword"},
        "text_type": {"type": "keyword"},
        "dataset_id": {"type": "keyword"},
        "keywords": {"type": "keyword"},
        "concepts": {"type": "keyword"},
        "entities": {"type": "keyword"},
        "status": {"type": "keyword"},
        "created_at": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||epoch_millis"},
    }
}

+ 39 - 0
applications/utils/elastic_search/search_strategy.py

@@ -0,0 +1,39 @@
from typing import Any, Dict, List, Optional


class ElasticSearchStrategy:
    """Thin query-building layer over the async Elasticsearch client."""

    def __init__(self, es):
        # es: AsyncElasticSearchClient (anything exposing `await search(query=...)`)
        self.es = es

    async def base_search(
        self,
        filters: Dict[str, Any],
        text_query: Optional[str] = None,
        _source: bool = False,
        size: int = 10000,
        sort_by: Optional[str] = None,
    ) -> List:
        """Filter documents and return their ids (or full sources).

        :param filters: field -> value(s). Scalar values are wrapped into a
            one-element list so both ``{"f": "x"}`` and ``{"f": ["x"]}`` work
            with the ES ``terms`` query (generalization; list input unchanged).
        :param text_query: optional full-text ``match`` on the ``topic`` field.
        :param _source: when True return each hit's ``_source`` dict instead
            of its ``_id`` string.
        :param size: first-stage filter size (coarse recall window).
        :param sort_by: reserved — currently NOT applied to the query.
            TODO(review): wire this into a ``sort`` clause or remove it.
        :return: list of ids or source dicts; empty list on error (best-effort).
        """
        must_clauses = [
            {"terms": {field: value if isinstance(value, list) else [value]}}
            for field, value in filters.items()
        ]
        if text_query:
            must_clauses.append({"match": {"topic": text_query}})

        query = {
            "query": {"bool": {"must": must_clauses}},
            "size": size,
            "_source": _source,
        }
        try:
            resp = await self.es.search(query=query)
        except Exception as e:
            # callers treat an empty candidate list as "no results"
            print(f"search failed: {e}")
            return []
        return [
            hit["_source"] if _source else hit["_id"]
            for hit in resp["hits"]["hits"]
        ]

    async def search_strategy(self, query):
        # placeholder for future composite strategies
        pass

+ 0 - 2
applications/utils/milvus/__init__.py

@@ -1,10 +1,8 @@
-from .collection import milvus_collection
 from .functions import async_insert_chunk, async_search_chunk
 from .search import MilvusSearch
 
 
 __all__ = [
-    "milvus_collection",
     "async_insert_chunk",
     "async_search_chunk",
     "MilvusSearch",

+ 26 - 26
applications/utils/milvus/collection.py

@@ -1,26 +1,26 @@
-from pymilvus import connections, CollectionSchema, Collection
-from applications.utils.milvus.field import fields
-from applications.config import MILVUS_CONFIG
-
-
-connections.connect("default", **MILVUS_CONFIG)
-
-schema = CollectionSchema(
-    fields, description="Chunk multi-vector embeddings with metadata"
-)
-milvus_collection = Collection(name="chunk_multi_embeddings", schema=schema)
-
-# create index
-vector_index_params = {
-    "index_type": "IVF_FLAT",
-    "metric_type": "COSINE",
-    "params": {"M": 16, "efConstruction": 200},
-}
-
-milvus_collection.create_index("vector_text", vector_index_params)
-milvus_collection.create_index("vector_summary", vector_index_params)
-milvus_collection.create_index("vector_questions", vector_index_params)
-
-milvus_collection.load()
-
-__all__ = ["milvus_collection"]
+# from pymilvus import connections, CollectionSchema, Collection
+# from applications.utils.milvus.field import fields
+# from applications.config import MILVUS_CONFIG
+#
+#
+# connections.connect("default", **MILVUS_CONFIG)
+#
+# schema = CollectionSchema(
+#     fields, description="Chunk multi-vector embeddings with metadata"
+# )
+# milvus_collection = Collection(name="chunk_multi_embeddings_v2", schema=schema)
+#
+# # create index
+# vector_index_params = {
+#     "index_type": "IVF_FLAT",
+#     "metric_type": "COSINE",
+#     "params": {"M": 16, "efConstruction": 200},
+# }
+#
+# milvus_collection.create_index("vector_text", vector_index_params)
+# milvus_collection.create_index("vector_summary", vector_index_params)
+# milvus_collection.create_index("vector_questions", vector_index_params)
+#
+# milvus_collection.load()
+#
+# __all__ = ["milvus_collection"]

+ 14 - 44
applications/utils/milvus/field.py

@@ -2,6 +2,7 @@ from pymilvus import FieldSchema, DataType
 
 # milvus 向量数据库
 fields = [
+    # 主键 ID
     FieldSchema(
         name="id",
         dtype=DataType.INT64,
@@ -9,61 +10,30 @@ fields = [
         auto_id=True,
         description="自增id",
     ),
+    # 文档 id 字段
     FieldSchema(
         name="doc_id", dtype=DataType.VARCHAR, max_length=64, description="文档id"
     ),
     FieldSchema(name="chunk_id", dtype=DataType.INT64, description="文档分块id"),
     # 三种向量字段
-    FieldSchema(name="vector_text", dtype=DataType.FLOAT_VECTOR, dim=2560),
-    FieldSchema(name="vector_summary", dtype=DataType.FLOAT_VECTOR, dim=2560),
-    FieldSchema(name="vector_questions", dtype=DataType.FLOAT_VECTOR, dim=2560),
-    # metadata
     FieldSchema(
-        name="topic", dtype=DataType.VARCHAR, max_length=255, description="主题"
+        name="vector_text",
+        dtype=DataType.FLOAT_VECTOR,
+        dim=2560,
+        description="chunk文本 embedding",
     ),
     FieldSchema(
-        name="domain", dtype=DataType.VARCHAR, max_length=100, description="领域"
+        name="vector_summary",
+        dtype=DataType.FLOAT_VECTOR,
+        dim=2560,
+        description="总结 embedding",
     ),
     FieldSchema(
-        name="task_type", dtype=DataType.VARCHAR, max_length=100, description="任务类型"
+        name="vector_questions",
+        dtype=DataType.FLOAT_VECTOR,
+        dim=2560,
+        description="衍生问题 embedding",
     ),
-    FieldSchema(
-        name="summary", dtype=DataType.VARCHAR, max_length=512, description="总结"
-    ),
-    FieldSchema(
-        name="keywords",
-        dtype=DataType.ARRAY,
-        element_type=DataType.VARCHAR,
-        max_length=100,
-        max_capacity=5,
-        description="关键词",
-    ),
-    FieldSchema(
-        name="concepts",
-        dtype=DataType.ARRAY,
-        element_type=DataType.VARCHAR,
-        max_length=100,
-        max_capacity=5,
-        description="主要知识点",
-    ),
-    FieldSchema(
-        name="questions",
-        dtype=DataType.ARRAY,
-        element_type=DataType.VARCHAR,
-        max_length=200,
-        max_capacity=5,
-        description="隐含问题",
-    ),
-    FieldSchema(
-        name="entities",
-        dtype=DataType.ARRAY,
-        element_type=DataType.VARCHAR,
-        max_length=200,
-        max_capacity=5,
-        description="命名实体",
-    ),
-    FieldSchema(name="topic_purity", dtype=DataType.FLOAT),
-    FieldSchema(name="tokens", dtype=DataType.INT64),
 ]
 
 

+ 4 - 3
applications/utils/milvus/functions.py

@@ -1,16 +1,17 @@
 import asyncio
-from typing import Dict
+from typing import Dict, List
 
 import pymilvus
 
 
-async def async_insert_chunk(collection: pymilvus.Collection, data: Dict):
+async def async_insert_chunk(collection: pymilvus.Collection, data: Dict) -> List[int]:
     """
     :param collection:
     :param data: insert data
     :return:
     """
-    await asyncio.to_thread(collection.insert, [data])
+    result = await asyncio.to_thread(collection.insert, [data])
+    return result.primary_keys
 
 
 async def async_search_chunk(

+ 3 - 73
applications/utils/milvus/search.py

@@ -5,18 +5,9 @@ from typing import List, Optional, Dict, Any, Union
 class MilvusBase:
 
     output_fields = [
+        "id",
         "doc_id",
         "chunk_id",
-        # "summary",
-        # "topic",
-        # "domain",
-        # "task_type",
-        # "keywords",
-        # "concepts",
-        # "questions",
-        # "entities",
-        # "tokens",
-        # "topic_purity",
     ]
 
     def __init__(self, milvus_pool):
@@ -43,8 +34,8 @@ class MilvusBase:
 
 class MilvusSearch(MilvusBase):
 
-    # 通过向量匹配
-    async def vector_search(
+    # 通过向量粗搜索
+    async def base_vector_search(
         self,
         query_vec: List[float],
         anns_field: str = "vector_text",
@@ -67,29 +58,6 @@ class MilvusSearch(MilvusBase):
         )
         return {"results": self.hits_to_json(response)}
 
-    # 混合搜索(向量 + metadata)
-    async def hybrid_search(
-        self,
-        query_vec: List[float],
-        anns_field: str = "vector_text",
-        limit: int = 5,
-        filters: Optional[Dict[str, Union[str, int, float]]] = None,
-    ):
-        expr = None
-        if filters:
-            parts = []
-            for k, v in filters.items():
-                if isinstance(v, str):
-                    parts.append(f'{k} == "{v}"')
-                else:
-                    parts.append(f"{k} == {v}")
-            expr = " and ".join(parts)
-
-        response = await self.vector_search(
-            query_vec=query_vec, anns_field=anns_field, limit=limit, expr=expr
-        )
-        return self.hits_to_json(response)
-
     async def search_by_strategy(
         self,
         query_vec: List[float],
@@ -125,41 +93,3 @@ class MilvusSearch(MilvusBase):
             {"pk": k[0], "doc_id": k[1], "chunk_id": k[2], "score": v}
             for k, v in ranked
         ]
-
-
-class MilvusQuery(MilvusBase):
-    # 通过doc_id + chunk_id 获取数据
-    async def get_by_doc_and_chunk(self, doc_id: str, chunk_id: int):
-        expr = f'doc_id == "{doc_id}" and chunk_id == {chunk_id}'
-        response = await asyncio.to_thread(
-            self.milvus_pool.query,
-            expr=expr,
-            output_fields=self.output_fields,
-        )
-        return self.hits_to_json(response)
-
-    # 只按 metadata 条件查询
-    async def filter_search(self, filters: Dict[str, Union[str, int, float]]):
-        exprs = []
-        for k, v in filters.items():
-            if isinstance(v, str):
-                exprs.append(f'{k} == "{v}"')
-            else:
-                exprs.append(f"{k} == {v}")
-        expr = " and ".join(exprs)
-        response = await asyncio.to_thread(
-            self.milvus_pool.query,
-            expr=expr,
-            output_fields=self.output_fields,
-        )
-        print(response)
-        return self.hits_to_json(response)
-
-    # 通过主键获取milvus数据
-    async def get_by_id(self, pk: int):
-        response = await asyncio.to_thread(
-            self.milvus_pool.query,
-            expr=f"id == {pk}",
-            output_fields=self.output_fields,
-        )
-        return self.hits_to_json(response)

+ 2 - 6
applications/utils/mysql/__init__.py

@@ -2,10 +2,6 @@ from .pool import DatabaseManager
 from .mapper import Contents, ContentChunks
 
 # 全局数据库管理器实例
-mysql_manager = DatabaseManager()
+# mysql_manager = DatabaseManager()
 
-__all__ = [
-    "mysql_manager",
-    "Contents",
-    "ContentChunks",
-]
+__all__ = ["Contents", "ContentChunks", "DatabaseManager"]

+ 18 - 6
applications/utils/mysql/mapper.py

@@ -17,13 +17,15 @@ class BaseMySQLClient:
 
 class Contents(BaseMySQLClient):
 
-    async def insert_content(self, doc_id, text, text_type):
+    async def insert_content(self, doc_id, text, text_type, title, dataset_id):
         query = """
             INSERT IGNORE INTO contents
-                (doc_id, text, text_type)
-            VALUES (%s, %s, %s);
+                (doc_id, text, text_type, title, dataset_id)
+            VALUES (%s, %s, %s, %s, %s);
         """
-        return await self.pool.async_save(query=query, params=(doc_id, text, text_type))
+        return await self.pool.async_save(
+            query=query, params=(doc_id, text, text_type, title, dataset_id)
+        )
 
     async def update_content_status(self, doc_id, ori_status, new_status):
         query = """
@@ -41,8 +43,8 @@ class ContentChunks(BaseMySQLClient):
     async def insert_chunk(self, chunk: Chunk) -> int:
         query = """
             INSERT IGNORE INTO content_chunks
-                (chunk_id, doc_id, text, tokens, topic_purity, text_type) 
-                VALUES (%s, %s, %s, %s, %s, %s);
+                (chunk_id, doc_id, text, tokens, topic_purity, text_type, dataset_id) 
+                VALUES (%s, %s, %s, %s, %s, %s, %s);
         """
         return await self.pool.async_save(
             query=query,
@@ -53,6 +55,7 @@ class ContentChunks(BaseMySQLClient):
                 chunk.tokens,
                 chunk.topic_purity,
                 chunk.text_type,
+                chunk.dataset_id,
             ),
         )
 
@@ -100,3 +103,12 @@ class ContentChunks(BaseMySQLClient):
                 ori_status,
             ),
         )
+
+    async def update_es_status(self, doc_id, chunk_id, ori_status, new_status):
+        query = """
+            UPDATE content_chunks SET es_status = %s
+            WHERE doc_id = %s AND chunk_id = %s AND es_status = %s;
+        """
+        return await self.pool.async_save(
+            query=query, params=(new_status, doc_id, chunk_id, ori_status)
+        )

+ 2 - 1
requirements.txt

@@ -16,6 +16,7 @@ pip-chill==1.0.3
 pymilvus==2.6.1
 pysocks==1.7.1
 quart-cors==0.8.0
-sentence-transformers==5.1.0
 tiktoken==0.11.0
 uvloop==0.21.0
+elasticsearch==8.17.2
+scikit-learn==1.7.2

+ 2 - 2
routes/__init__.py

@@ -1,3 +1,3 @@
-from .buleprint import server_routes
+from .buleprint import server_bp
 
-__all__ = ["server_routes"]
+__all__ = ["server_bp"]

+ 106 - 107
routes/buleprint.py

@@ -1,5 +1,6 @@
 import traceback
 import uuid
+from typing import Dict, Any
 
 from quart import Blueprint, jsonify, request
 
@@ -7,122 +8,120 @@ from applications.config import (
     DEFAULT_MODEL,
     LOCAL_MODEL_CONFIG,
     ChunkerConfig,
-    WEIGHT_MAP,
+    BASE_MILVUS_SEARCH_PARAMS,
 )
+from applications.resource import get_resource_manager
 from applications.api import get_basic_embedding
 from applications.api import get_img_embedding
 from applications.async_task import ChunkEmbeddingTask
-from applications.utils.milvus import MilvusSearch
+from applications.search import HybridSearch
+
 
 server_bp = Blueprint("api", __name__, url_prefix="/api")
 
 
-def server_routes(mysql_db, vector_db):
-
-    @server_bp.route("/embed", methods=["POST"])
-    async def embed():
-        body = await request.get_json()
-        text = body.get("text")
-        model_name = body.get("model", DEFAULT_MODEL)
-        if not LOCAL_MODEL_CONFIG.get(model_name):
-            return jsonify({"error": "error  model"})
-
-        embedding = await get_basic_embedding(text, model_name)
-        return jsonify({"embedding": embedding})
-
-    @server_bp.route("/img_embed", methods=["POST"])
-    async def img_embed():
-        body = await request.get_json()
-        url_list = body.get("url_list")
-        if not url_list:
-            return jsonify({"error": "error  url_list"})
-
-        embedding = await get_img_embedding(url_list)
-        return jsonify(embedding)
-
-    @server_bp.route("/chunk", methods=["POST"])
-    async def chunk():
-        body = await request.get_json()
-        text = body.get("text", "")
-        text = text.strip()
-        if not text:
-            return jsonify({"error": "error  text"})
-        doc_id = f"doc-{uuid.uuid4()}"
-        chunk_task = ChunkEmbeddingTask(
-            mysql_db, vector_db, cfg=ChunkerConfig(), doc_id=doc_id
-        )
-        doc_id = await chunk_task.deal(body)
-        return jsonify({"doc_id": doc_id})
-
-    @server_bp.route("/search", methods=["POST"])
-    async def search():
-        body = await request.get_json()
-        search_type = body.get("search_type")
-        if not search_type:
-            return jsonify({"error": "missing search_type"}), 400
-
-        searcher = MilvusSearch(vector_db)
-
-        try:
-            # 统一参数
-            expr = body.get("expr")
-            search_params = body.get("search_params") or {
-                "metric_type": "COSINE",
-                "params": {"ef": 64},
-            }
-            limit = body.get("limit", 50)
-            query = body.get("query")
-
-            async def by_vector():
-                if not query:
-                    return {"error": "missing query"}
-                field = body.get("field", "vector_text")
-                query_vec = await get_basic_embedding(text=query, model=DEFAULT_MODEL)
-                return await searcher.vector_search(
-                    query_vec=query_vec,
-                    anns_field=field,
-                    expr=expr,
+@server_bp.route("/embed", methods=["POST"])
+async def embed():
+    body = await request.get_json()
+    text = body.get("text")
+    model_name = body.get("model", DEFAULT_MODEL)
+    if not LOCAL_MODEL_CONFIG.get(model_name):
+        return jsonify({"error": "error  model"})
+
+    embedding = await get_basic_embedding(text, model_name)
+    return jsonify({"embedding": embedding})
+
+
+@server_bp.route("/img_embed", methods=["POST"])
+async def img_embed():
+    body = await request.get_json()
+    url_list = body.get("url_list")
+    if not url_list:
+        return jsonify({"error": "error  url_list"})
+
+    embedding = await get_img_embedding(url_list)
+    return jsonify(embedding)
+
+
+@server_bp.route("/chunk", methods=["POST"])
+async def chunk():
+    body = await request.get_json()
+    text = body.get("text", "")
+    text = text.strip()
+    if not text:
+        return jsonify({"error": "error  text"})
+    resource = get_resource_manager()
+    doc_id = f"doc-{uuid.uuid4()}"
+    chunk_task = ChunkEmbeddingTask(
+        resource.mysql_client,
+        resource.milvus_client,
+        cfg=ChunkerConfig(),
+        doc_id=doc_id,
+        es_pool=resource.es_client,
+    )
+    doc_id = await chunk_task.deal(body)
+    return jsonify({"doc_id": doc_id})
+
+
+@server_bp.route("/search", methods=["POST"])
+async def search():
+    """
+    filters: Dict[str, Any], # 条件过滤
+    query_vec: List[float], # query 的向量
+    anns_field: str = "vector_text", # query指定的向量空间
+    search_params: Optional[Dict[str, Any]] = None, # 向量距离方式
+    query_text: str = None, #是否通过 topic 倒排
+    _source=False, # 是否返回元数据
+    es_size: int = 10000, #es 第一层过滤数量
+    sort_by: str = None, # 排序
+    milvus_size: int = 10 # milvus粗排返回数量
+    :return:
+    """
+    body = await request.get_json()
+
+    # 解析数据
+    search_type: str = body.get("search_type")
+    filters: Dict[str, Any] = body.get("filters", {})
+    anns_field: str = body.get("anns_field", "vector_text")
+    search_params: Dict[str, Any] = body.get("search_params", BASE_MILVUS_SEARCH_PARAMS)
+    query_text: str = body.get("query_text")
+    _source: bool = body.get("_source", False)
+    es_size: int = body.get("es_size", 10000)
+    sort_by: str = body.get("sort_by")
+    milvus_size: int = body.get("milvus", 20)
+    limit: int = body.get("limit", 10)
+    if not query_text:
+        return jsonify({"error": "error  query_text"})
+
+    query_vector = await get_basic_embedding(text=query_text, model=DEFAULT_MODEL)
+    resource = get_resource_manager()
+    search_engine = HybridSearch(
+        milvus_pool=resource.milvus_client, es_pool=resource.es_client
+    )
+    try:
+        match search_type:
+            case "base":
+                response = await search_engine.base_vector_search(
+                    query_vec=query_vector,
+                    anns_field=anns_field,
                     search_params=search_params,
                     limit=limit,
                 )
-
-            async def hybrid():
-                if not query:
-                    return {"error": "missing query"}
-                field = body.get("field", "vector_text")
-                query_vec = await get_basic_embedding(text=query, model=DEFAULT_MODEL)
-                return await searcher.hybrid_search(
-                    query_vec=query_vec,
-                    anns_field=field,
-                    filters=body.get("filter_map"),
-                    limit=limit,
-                )
-
-            async def strategy():
-                if not query:
-                    return {"error": "missing query"}
-                query_vec = await get_basic_embedding(text=query, model=DEFAULT_MODEL)
-                return await searcher.search_by_strategy(
-                    query_vec=query_vec,
-                    weight_map=body.get("weight_map", WEIGHT_MAP),
-                    expr=expr,
-                    limit=limit,
+                return jsonify(response), 200
+            case "hybrid":
+                response = await search_engine.hybrid_search(
+                    filters=filters,
+                    query_vec=query_vector,
+                    anns_field=anns_field,
+                    search_params=search_params,
+                    es_size=es_size,
+                    sort_by=sort_by,
+                    milvus_size=milvus_size,
                 )
-
-            # dispatch table
-            handlers = {
-                "by_vector": by_vector,
-                "hybrid": hybrid,
-                "strategy": strategy,
-            }
-
-            if search_type not in handlers:
-                return jsonify({"error": "invalid search_type"}), 400
-
-            result = await handlers[search_type]()
-            return jsonify(result)
-
-        except Exception as e:
-            return jsonify({"error": str(e), "traceback": traceback.format_exc()}), 500
-
-    return server_bp
+                return jsonify(response), 200
+            case "strategy":
+                return jsonify({"error": "strategy not implemented"}), 405
+            case _:
+                return jsonify({"error": "error  search_type"}), 200
+    except Exception as e:
+        return jsonify({"error": str(e), "traceback": traceback.format_exc()}), 500

+ 20 - 13
vector_app.py

@@ -2,31 +2,38 @@ import jieba
 from quart import Quart
 
 from applications.config import LOCAL_MODEL_CONFIG, DEFAULT_MODEL
-from applications.utils.milvus import milvus_collection
-from applications.utils.mysql import mysql_manager
-from routes import server_routes
+from applications.config import ES_HOSTS, ES_PASSWORD, ELASTIC_SEARCH_INDEX
+from applications.config import MILVUS_CONFIG
+from applications.resource import init_resource_manager
 
 app = Quart(__name__)
 
+# 初始化
 MODEL_PATH = LOCAL_MODEL_CONFIG[DEFAULT_MODEL]
 
-# 注册路由
-app_route = server_routes(mysql_manager, milvus_collection)
-app.register_blueprint(app_route)
+resource_manager = init_resource_manager(
+    es_hosts=ES_HOSTS,
+    es_index=ELASTIC_SEARCH_INDEX,
+    es_password=ES_PASSWORD,
+    milvus_config=MILVUS_CONFIG,
+)
 
 
 @app.before_serving
 async def startup():
-    print("Starting application...")
-    await mysql_manager.init_pools()
-    print("Mysql pools init successfully")
-
-    print("Loading jieba dictionary...")
+    await resource_manager.startup()
+    print("Resource manager is ready.")
     jieba.initialize()
     print("Jieba dictionary loaded successfully")
 
 
 @app.after_serving
 async def shutdown():
-    print("Shutting down application...")
-    await mysql_manager.close_pools()
+    await resource_manager.shutdown()
+    print("Resource manager is Down.")
+
+
+# 注册路由
+from routes import server_bp
+
+app.register_blueprint(server_bp)