|
|
@@ -136,6 +136,31 @@ def decrypt_content(resource_id: str, encrypted_text: str, provided_key: Optiona
|
|
|
return "[ENCRYPTED]"
|
|
|
|
|
|
|
|
|
def serialize_milvus_result(data):
    """Recursively convert a Milvus result into JSON-serializable builtins.

    pymilvus query/search results can contain protobuf containers (e.g.
    RepeatedScalarContainer) that ``json.dumps`` cannot handle; this walks
    the structure and reduces everything to plain
    str/int/float/bool/list/dict.

    Args:
        data: Any value returned by Milvus — a scalar, dict, list/tuple,
            protobuf iterable, or an arbitrary object.

    Returns:
        A JSON-serializable equivalent of ``data``; ``None`` when the value
        is ``None`` or cannot be converted at all.
    """
    if data is None:
        return None
    elif isinstance(data, (str, int, float, bool)):
        return data
    elif isinstance(data, bytes):
        # Decode bytes payloads instead of falling through to str(), which
        # would yield the unhelpful "b'...'" repr.
        return data.decode('utf-8', errors='replace')
    elif isinstance(data, dict):
        return {k: serialize_milvus_result(v) for k, v in data.items()}
    elif isinstance(data, (list, tuple)):
        return [serialize_milvus_result(item) for item in data]
    elif hasattr(data, '__iter__'):
        # Handles protobuf iterables such as RepeatedScalarContainer.
        # str/bytes were already handled above, so no exclusion is needed.
        return [serialize_milvus_result(item) for item in data]
    elif hasattr(data, 'to_dict'):
        return serialize_milvus_result(data.to_dict())
    elif hasattr(data, '__dict__'):
        return serialize_milvus_result(vars(data))
    else:
        # Last resort: stringify. Narrowed from a bare `except:` so
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
            return str(data)
        except Exception:
            return None
|
|
|
+
|
|
|
+
|
|
|
def init_db():
|
|
|
"""初始化 SQLite(仅用于 resources)"""
|
|
|
conn = get_db()
|
|
|
@@ -566,7 +591,7 @@ async def search_knowledge_api(
|
|
|
if types:
|
|
|
type_list = [t.strip() for t in types.split(',') if t.strip()]
|
|
|
for t in type_list:
|
|
|
- filters.append(f'JSON_CONTAINS(types, "{t}")')
|
|
|
+ filters.append(f'ARRAY_CONTAINS(types, "{t}")')
|
|
|
if owner:
|
|
|
filters.append(f'owner == "{owner}"')
|
|
|
|
|
|
@@ -586,18 +611,21 @@ async def search_knowledge_api(
|
|
|
if not candidates:
|
|
|
return {"results": [], "count": 0, "reranked": False}
|
|
|
|
|
|
+ # 转换为可序列化的格式
|
|
|
+ serialized_candidates = [serialize_milvus_result(c) for c in candidates]
|
|
|
+
|
|
|
# 4. LLM 精排
|
|
|
- reranked_ids = await _llm_rerank(q, candidates, top_k)
|
|
|
+ reranked_ids = await _llm_rerank(q, serialized_candidates, top_k)
|
|
|
|
|
|
if reranked_ids:
|
|
|
# 按 LLM 排序返回
|
|
|
- id_to_candidate = {c["id"]: c for c in candidates}
|
|
|
+ id_to_candidate = {c["id"]: c for c in serialized_candidates}
|
|
|
results = [id_to_candidate[id] for id in reranked_ids if id in id_to_candidate]
|
|
|
return {"results": results, "count": len(results), "reranked": True}
|
|
|
else:
|
|
|
# Fallback:直接返回向量召回的 top k
|
|
|
print(f"[Knowledge Search] LLM 精排失败,fallback 到向量 top-{top_k}")
|
|
|
- return {"results": candidates[:top_k], "count": len(candidates[:top_k]), "reranked": False}
|
|
|
+ return {"results": serialized_candidates[:top_k], "count": len(serialized_candidates[:top_k]), "reranked": False}
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"[Knowledge Search] 错误: {e}")
|
|
|
@@ -639,12 +667,11 @@ async def save_knowledge(knowledge: KnowledgeIn):
|
|
|
"harmful_history": []
|
|
|
}
|
|
|
|
|
|
- # 生成向量
|
|
|
- text = f"{knowledge.task}\n{knowledge.content}"
|
|
|
- embedding = await get_embedding(text)
|
|
|
+ # 生成向量(只基于 task,因为搜索时用户描述的是任务场景)
|
|
|
+ embedding = await get_embedding(knowledge.task)
|
|
|
|
|
|
- # 插入 Milvus
|
|
|
- milvus_store.insert({
|
|
|
+ # 准备插入数据
|
|
|
+ insert_data = {
|
|
|
"id": knowledge_id,
|
|
|
"embedding": embedding,
|
|
|
"message_id": knowledge.message_id,
|
|
|
@@ -659,7 +686,12 @@ async def save_knowledge(knowledge: KnowledgeIn):
|
|
|
"eval": eval_data,
|
|
|
"created_at": now,
|
|
|
"updated_at": now,
|
|
|
- })
|
|
|
+ }
|
|
|
+
|
|
|
+ print(f"[Save Knowledge] 插入数据: {json.dumps({k: v for k, v in insert_data.items() if k != 'embedding'}, ensure_ascii=False)}")
|
|
|
+
|
|
|
+ # 插入 Milvus
|
|
|
+ milvus_store.insert(insert_data)
|
|
|
|
|
|
return {"status": "ok", "knowledge_id": knowledge_id}
|
|
|
|
|
|
@@ -685,10 +717,10 @@ def list_knowledge(
|
|
|
if types:
|
|
|
type_list = [t.strip() for t in types.split(',') if t.strip()]
|
|
|
for t in type_list:
|
|
|
- filters.append(f'JSON_CONTAINS(types, "{t}")')
|
|
|
+ filters.append(f'ARRAY_CONTAINS(types, "{t}")')
|
|
|
|
|
|
if scopes:
|
|
|
- filters.append(f'JSON_CONTAINS(scopes, "{scopes}")')
|
|
|
+ filters.append(f'ARRAY_CONTAINS(scopes, "{scopes}")')
|
|
|
|
|
|
if owner:
|
|
|
filters.append(f'owner like "%{owner}%"')
|
|
|
@@ -705,7 +737,10 @@ def list_knowledge(
|
|
|
# 查询 Milvus
|
|
|
results = milvus_store.query(filter_expr, limit=limit)
|
|
|
|
|
|
- return {"results": results, "count": len(results)}
|
|
|
+ # 转换为可序列化的格式
|
|
|
+ serialized_results = [serialize_milvus_result(r) for r in results]
|
|
|
+
|
|
|
+ return {"results": serialized_results, "count": len(serialized_results)}
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"[List Knowledge] 错误: {e}")
|
|
|
@@ -721,7 +756,9 @@ def get_all_tags():
|
|
|
|
|
|
all_tags = set()
|
|
|
for item in results:
|
|
|
- tags_dict = item.get("tags", {})
|
|
|
+ # 转换为标准字典
|
|
|
+ serialized_item = serialize_milvus_result(item)
|
|
|
+ tags_dict = serialized_item.get("tags", {})
|
|
|
if isinstance(tags_dict, dict):
|
|
|
for key in tags_dict.keys():
|
|
|
all_tags.add(key)
|
|
|
@@ -742,7 +779,7 @@ def get_knowledge(knowledge_id: str):
|
|
|
if not result:
|
|
|
raise HTTPException(status_code=404, detail=f"Knowledge not found: {knowledge_id}")
|
|
|
|
|
|
- return result
|
|
|
+ return serialize_milvus_result(result)
|
|
|
|
|
|
except HTTPException:
|
|
|
raise
|
|
|
@@ -827,8 +864,7 @@ async def update_knowledge(knowledge_id: str, update: KnowledgeUpdateIn):
|
|
|
|
|
|
# 如果内容变化,重新生成向量
|
|
|
if need_reembed:
|
|
|
- text = f"{existing['task']}\n{content}"
|
|
|
- embedding = await get_embedding(text)
|
|
|
+ embedding = await get_embedding(existing['task'])
|
|
|
updates["embedding"] = embedding
|
|
|
|
|
|
# 更新 Milvus
|
|
|
@@ -861,7 +897,7 @@ async def patch_knowledge(knowledge_id: str, patch: KnowledgePatchIn):
|
|
|
|
|
|
if patch.content is not None:
|
|
|
updates["content"] = patch.content
|
|
|
- need_reembed = True
|
|
|
+ # content 变化不需要重新生成 embedding(只基于 task)
|
|
|
|
|
|
if patch.types is not None:
|
|
|
updates["types"] = patch.types
|
|
|
@@ -878,12 +914,10 @@ async def patch_knowledge(knowledge_id: str, patch: KnowledgePatchIn):
|
|
|
if not updates:
|
|
|
return {"status": "ok", "knowledge_id": knowledge_id}
|
|
|
|
|
|
- # 如果 task 或 content 变化,重新生成向量
|
|
|
+ # 如果 task 变化,重新生成向量
|
|
|
if need_reembed:
|
|
|
task = updates.get("task", existing["task"])
|
|
|
- content = updates.get("content", existing["content"])
|
|
|
- text = f"{task}\n{content}"
|
|
|
- embedding = await get_embedding(text)
|
|
|
+ embedding = await get_embedding(task)
|
|
|
updates["embedding"] = embedding
|
|
|
|
|
|
# 更新 Milvus
|
|
|
@@ -947,9 +981,8 @@ async def batch_update_knowledge(batch: KnowledgeBatchUpdateIn):
|
|
|
for (knowledge_id, _, _, eval_data, task), evolved_content in zip(evolution_tasks, evolved_results):
|
|
|
eval_data["helpful"] = eval_data.get("helpful", 0) + 1
|
|
|
|
|
|
- # 重新生成向量
|
|
|
- text = f"{task}\n{evolved_content}"
|
|
|
- embedding = await get_embedding(text)
|
|
|
+ # 重新生成向量(只基于 task)
|
|
|
+ embedding = await get_embedding(task)
|
|
|
|
|
|
milvus_store.update(knowledge_id, {
|
|
|
"content": evolved_content,
|
|
|
@@ -970,6 +1003,8 @@ async def slim_knowledge(model: str = "google/gemini-2.5-flash-lite"):
|
|
|
try:
|
|
|
# 获取所有知识
|
|
|
all_knowledge = milvus_store.query('id != ""', limit=10000)
|
|
|
+ # 转换为可序列化的格式
|
|
|
+ all_knowledge = [serialize_milvus_result(item) for item in all_knowledge]
|
|
|
|
|
|
if len(all_knowledge) < 2:
|
|
|
return {"status": "ok", "message": f"知识库仅有 {len(all_knowledge)} 条,无需瘦身"}
|
|
|
@@ -1087,8 +1122,8 @@ REPORT: 原有 X 条,合并后 Y 条,精简了 Z 条。
|
|
|
# 生成向量并重建知识库
|
|
|
print(f"[知识瘦身] 正在为 {len(new_entries)} 条知识生成向量...")
|
|
|
|
|
|
- # 批量生成向量
|
|
|
- texts = [f"{e['task']}\n{e['content']}" for e in new_entries]
|
|
|
+ # 批量生成向量(只基于 task)
|
|
|
+ texts = [e['task'] for e in new_entries]
|
|
|
embeddings = await get_embeddings_batch(texts)
|
|
|
|
|
|
# 清空并重建
|
|
|
@@ -1231,8 +1266,8 @@ async def extract_knowledge_from_messages(extract_req: MessageExtractIn):
|
|
|
if not extracted_knowledge:
|
|
|
return {"status": "ok", "extracted_count": 0, "knowledge_ids": []}
|
|
|
|
|
|
- # 批量生成向量
|
|
|
- texts = [f"{item.get('task', '')}\n{item.get('content', '')}" for item in extracted_knowledge]
|
|
|
+ # 批量生成向量(只基于 task)
|
|
|
+ texts = [item.get('task', '') for item in extracted_knowledge]
|
|
|
embeddings = await get_embeddings_batch(texts)
|
|
|
|
|
|
# 保存提取的知识
|