| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- """
- Milvus Lite 存储封装
- 单一存储架构,存储完整知识数据 + 向量。
- """
- from milvus import default_server
- from pymilvus import (
- connections, Collection, FieldSchema,
- CollectionSchema, DataType, utility
- )
- from typing import List, Dict, Optional
- import json
- import time
class MilvusStore:
    """Wrapper around one Milvus Lite collection holding knowledge records.

    Each record stores the full knowledge payload (scalar and JSON fields)
    together with its embedding vector, so the collection is the single
    source of truth. Constructing an instance starts the embedded Milvus
    server, connects to it, and creates or loads the collection.
    """

    # Name of the collection used by every method of this class.
    COLLECTION_NAME = "knowledge"
    # Dimension of the "embedding" vector field when the collection is created.
    EMBEDDING_DIM = 1536
    # Scalar/JSON fields returned by search/query/get_by_id. The embedding is
    # deliberately excluded so read results stay small.
    OUTPUT_FIELDS = (
        "id", "message_id", "task", "content", "types",
        "tags", "scopes", "owner", "resource_ids",
        "source", "eval", "created_at", "updated_at",
    )

    def __init__(self, data_dir: str = "./milvus_data"):
        """Start the embedded server, connect, and prepare the collection.

        Args:
            data_dir: Directory in which the embedded server keeps its data.
        """
        # Start the embedded server.
        default_server.set_base_dir(data_dir)
        default_server.start()
        # Connect to the port the embedded server is listening on.
        connections.connect(
            host='127.0.0.1',
            port=default_server.listen_port
        )
        self._init_collection()

    @staticmethod
    def _id_expr(knowledge_id: str) -> str:
        """Build the filter expression ``id == "<knowledge_id>"``.

        Backslashes and double quotes are escaped so that an id containing
        them can neither break the expression nor change what it matches.

        Args:
            knowledge_id: Primary-key value to match.

        Returns:
            A filter expression string selecting exactly that id.
        """
        # Escape backslashes first so the quote escapes are not doubled.
        escaped = str(knowledge_id).replace("\\", "\\\\").replace('"', '\\"')
        return f'id == "{escaped}"'

    def _init_collection(self):
        """Open the collection, creating it (with its index) if missing."""
        collection_name = self.COLLECTION_NAME
        if utility.has_collection(collection_name):
            self.collection = Collection(collection_name)
        else:
            # Define the schema.
            fields = [
                FieldSchema(name="id", dtype=DataType.VARCHAR,
                            max_length=100, is_primary=True),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR,
                            dim=self.EMBEDDING_DIM),
                FieldSchema(name="message_id", dtype=DataType.VARCHAR,
                            max_length=100),
                FieldSchema(name="task", dtype=DataType.VARCHAR,
                            max_length=2000),
                FieldSchema(name="content", dtype=DataType.VARCHAR,
                            max_length=50000),
                FieldSchema(name="types", dtype=DataType.JSON),
                FieldSchema(name="tags", dtype=DataType.JSON),
                FieldSchema(name="scopes", dtype=DataType.JSON),
                FieldSchema(name="owner", dtype=DataType.VARCHAR,
                            max_length=200),
                FieldSchema(name="resource_ids", dtype=DataType.JSON),
                FieldSchema(name="source", dtype=DataType.JSON),
                FieldSchema(name="eval", dtype=DataType.JSON),
                FieldSchema(name="created_at", dtype=DataType.INT64),
                FieldSchema(name="updated_at", dtype=DataType.INT64),
            ]
            schema = CollectionSchema(fields, description="KnowHub Knowledge")
            self.collection = Collection(collection_name, schema)
            # Create the vector index.
            index_params = {
                "metric_type": "COSINE",
                "index_type": "HNSW",
                "params": {"M": 16, "efConstruction": 200}
            }
            self.collection.create_index("embedding", index_params)
        self.collection.load()

    def insert(self, knowledge: Dict):
        """Insert a single knowledge record and flush.

        Args:
            knowledge: Record to store, including its ``embedding``.
        """
        self.collection.insert([knowledge])
        self.collection.flush()

    def insert_batch(self, knowledge_list: List[Dict]):
        """Insert several knowledge records in one call and flush.

        Args:
            knowledge_list: Records to store; an empty list is a no-op.
        """
        if not knowledge_list:
            return
        self.collection.insert(knowledge_list)
        self.collection.flush()

    def search(self,
               query_embedding: List[float],
               filters: Optional[str] = None,
               limit: int = 10) -> List[Dict]:
        """Run a vector similarity search with an optional scalar filter.

        Args:
            query_embedding: Query vector.
            filters: Filter expression (e.g. ``'owner == "agent"'``), or
                None for no filtering.
            limit: Maximum number of hits to return.

        Returns:
            One dict per hit, produced by ``hit.entity.to_dict()``; an empty
            list when there are no hits.
            NOTE(review): the exact dict layout comes from pymilvus - confirm
            against the installed version.
        """
        search_params = {"metric_type": "COSINE", "params": {"ef": 100}}
        results = self.collection.search(
            data=[query_embedding],
            anns_field="embedding",
            param=search_params,
            limit=limit,
            expr=filters,
            output_fields=list(self.OUTPUT_FIELDS)
        )
        if not results or not results[0]:
            return []
        return [hit.entity.to_dict() for hit in results[0]]

    def query(self, filters: str, limit: int = 100) -> List[Dict]:
        """Run a scalar-only query (no vector involved).

        Args:
            filters: Filter expression.
            limit: Maximum number of rows to return.

        Returns:
            Matching records as returned by ``Collection.query``.
        """
        results = self.collection.query(
            expr=filters,
            output_fields=list(self.OUTPUT_FIELDS),
            limit=limit
        )
        return results

    def get_by_id(self, knowledge_id: str) -> Optional[Dict]:
        """Fetch one knowledge record by primary key.

        Args:
            knowledge_id: Knowledge id.

        Returns:
            The record (without its embedding), or None if it does not exist.
        """
        results = self.collection.query(
            expr=self._id_expr(knowledge_id),
            output_fields=list(self.OUTPUT_FIELDS)
        )
        return results[0] if results else None

    def update(self, knowledge_id: str, updates: Dict):
        """Update a record by deleting it and re-inserting the merged row.

        The existing row is read together with its embedding so that the
        re-inserted row is complete even when ``updates`` carries no new
        vector.

        Args:
            knowledge_id: Knowledge id.
            updates: Fields to overwrite on the stored record.

        Raises:
            ValueError: If the record does not exist, or if no embedding is
                available for the re-inserted row.
        """
        # 1. Read the current row, including the vector that get_by_id omits.
        rows = self.collection.query(
            expr=self._id_expr(knowledge_id),
            output_fields=list(self.OUTPUT_FIELDS) + ["embedding"]
        )
        if not rows or not rows[0]:
            raise ValueError(f"Knowledge not found: {knowledge_id}")
        original = dict(rows[0])

        # 2. Merge the updates on top of the stored values.
        merged = {**original, **updates}
        merged["updated_at"] = int(time.time())

        # Refuse to delete when the replacement row would have no vector;
        # otherwise the insert below would fail after the old row is gone.
        if merged.get("embedding") is None:
            raise ValueError(
                f"Knowledge {knowledge_id} has no embedding to re-insert"
            )

        # 3. Delete the old row.
        self.delete(knowledge_id)

        # 4. Insert the new row; on failure put the original back so the
        #    record is not lost, then surface the error.
        try:
            self.insert(merged)
        except Exception:
            self.insert(original)
            raise

    def delete(self, knowledge_id: str):
        """Delete a knowledge record by primary key and flush.

        Args:
            knowledge_id: Knowledge id.
        """
        self.collection.delete(self._id_expr(knowledge_id))
        self.collection.flush()

    def count(self) -> int:
        """Return the collection's ``num_entities``.

        NOTE(review): num_entities may still count rows that were deleted but
        not yet compacted - confirm if an exact live count is required.
        """
        return self.collection.num_entities

    def drop_collection(self):
        """Drop the whole collection (destructive)."""
        utility.drop_collection(self.COLLECTION_NAME)
|