|
@@ -136,6 +136,50 @@ def decrypt_content(resource_id: str, encrypted_text: str, provided_key: Optiona
|
|
|
return "[ENCRYPTED]"
|
|
return "[ENCRYPTED]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def serialize_milvus_result(data):
    """Recursively convert a Milvus result into plain, JSON-friendly values.

    Conversion rules, tried in this order:

    1. ``None``, ``str``, ``int``, ``float`` and ``bool`` are returned as-is.
    2. ``dict`` values are converted recursively (keys are kept untouched).
    3. ``list`` / ``tuple`` become a ``list`` of converted items.
    4. Objects exposing a callable ``to_dict()`` are converted through it.
    5. Any other mapping is converted to a ``dict`` via ``items()``.
    6. Any other iterable except ``bytes`` (e.g. a protobuf repeated
       container) becomes a ``list`` of converted items.
    7. Objects with a ``__dict__`` are converted from their attributes.

    A strategy in steps 4-7 that raises an ``Exception`` is skipped and the
    next one is tried.

    Args:
        data: Any value returned by the Milvus client.

    Returns:
        The converted value, or ``None`` when no strategy applies (``None``
        is used instead of ``str(data)`` so that an opaque object
        representation never leaks into the response).
    """
    # Function-scope import keeps the block self-contained; the module's
    # import section is not assumed to provide this name.
    from collections.abc import Mapping

    # Primitive values are already serializable.
    if data is None or isinstance(data, (str, int, float, bool)):
        return data

    # Plain dicts: convert values recursively.
    if isinstance(data, dict):
        return {k: serialize_milvus_result(v) for k, v in data.items()}

    # Lists / tuples: convert items recursively (tuples become lists).
    if isinstance(data, (list, tuple)):
        return [serialize_milvus_result(item) for item in data]

    # Objects that know how to describe themselves as a dict.
    to_dict = getattr(data, 'to_dict', None)
    if callable(to_dict):
        try:
            return serialize_milvus_result(to_dict())
        except Exception:
            # Best effort: fall through to the next strategy.  Only
            # ``Exception`` is caught so KeyboardInterrupt / SystemExit
            # still propagate.
            pass

    # Non-dict mappings: keep the key/value pairs.  Plain iteration (the
    # next strategy) would only yield the keys and drop every value.
    if isinstance(data, Mapping):
        try:
            return {k: serialize_milvus_result(v) for k, v in data.items()}
        except Exception:
            pass

    # Other iterables.  ``str`` and ``dict`` were handled above, so only
    # ``bytes`` still has to be excluded here.
    if hasattr(data, '__iter__') and not isinstance(data, bytes):
        try:
            return [serialize_milvus_result(item) for item in data]
        except Exception:
            pass

    # Arbitrary objects: serialize their attribute dictionary.
    if hasattr(data, '__dict__'):
        try:
            return serialize_milvus_result(vars(data))
        except Exception:
            pass

    # Final fallback: unsupported types are dropped rather than stringified.
    return None
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def init_db():
|
|
def init_db():
|
|
|
"""初始化 SQLite(仅用于 resources)"""
|
|
"""初始化 SQLite(仅用于 resources)"""
|
|
|
conn = get_db()
|
|
conn = get_db()
|
|
@@ -566,12 +610,12 @@ async def search_knowledge_api(
|
|
|
if types:
|
|
if types:
|
|
|
type_list = [t.strip() for t in types.split(',') if t.strip()]
|
|
type_list = [t.strip() for t in types.split(',') if t.strip()]
|
|
|
for t in type_list:
|
|
for t in type_list:
|
|
|
- filters.append(f'JSON_CONTAINS(types, "{t}")')
|
|
|
|
|
|
|
+ filters.append(f'array_contains(types, "{t}")')
|
|
|
if owner:
|
|
if owner:
|
|
|
filters.append(f'owner == "{owner}"')
|
|
filters.append(f'owner == "{owner}"')
|
|
|
|
|
|
|
|
# 添加 min_score 过滤
|
|
# 添加 min_score 过滤
|
|
|
- filters.append(f'JSON_EXTRACT(eval, "$.score") >= {min_score}')
|
|
|
|
|
|
|
+ filters.append(f'eval["score"] >= {min_score}')
|
|
|
|
|
|
|
|
filter_expr = ' and '.join(filters) if filters else None
|
|
filter_expr = ' and '.join(filters) if filters else None
|
|
|
|
|
|
|
@@ -586,18 +630,21 @@ async def search_knowledge_api(
|
|
|
if not candidates:
|
|
if not candidates:
|
|
|
return {"results": [], "count": 0, "reranked": False}
|
|
return {"results": [], "count": 0, "reranked": False}
|
|
|
|
|
|
|
|
|
|
+ # 转换为可序列化的格式
|
|
|
|
|
+ serialized_candidates = [serialize_milvus_result(c) for c in candidates]
|
|
|
|
|
+
|
|
|
# 4. LLM 精排
|
|
# 4. LLM 精排
|
|
|
- reranked_ids = await _llm_rerank(q, candidates, top_k)
|
|
|
|
|
|
|
+ reranked_ids = await _llm_rerank(q, serialized_candidates, top_k)
|
|
|
|
|
|
|
|
if reranked_ids:
|
|
if reranked_ids:
|
|
|
# 按 LLM 排序返回
|
|
# 按 LLM 排序返回
|
|
|
- id_to_candidate = {c["id"]: c for c in candidates}
|
|
|
|
|
|
|
+ id_to_candidate = {c["id"]: c for c in serialized_candidates}
|
|
|
results = [id_to_candidate[id] for id in reranked_ids if id in id_to_candidate]
|
|
results = [id_to_candidate[id] for id in reranked_ids if id in id_to_candidate]
|
|
|
return {"results": results, "count": len(results), "reranked": True}
|
|
return {"results": results, "count": len(results), "reranked": True}
|
|
|
else:
|
|
else:
|
|
|
# Fallback:直接返回向量召回的 top k
|
|
# Fallback:直接返回向量召回的 top k
|
|
|
print(f"[Knowledge Search] LLM 精排失败,fallback 到向量 top-{top_k}")
|
|
print(f"[Knowledge Search] LLM 精排失败,fallback 到向量 top-{top_k}")
|
|
|
- return {"results": candidates[:top_k], "count": len(candidates[:top_k]), "reranked": False}
|
|
|
|
|
|
|
+ return {"results": serialized_candidates[:top_k], "count": len(serialized_candidates[:top_k]), "reranked": False}
|
|
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(f"[Knowledge Search] 错误: {e}")
|
|
print(f"[Knowledge Search] 错误: {e}")
|
|
@@ -639,12 +686,14 @@ async def save_knowledge(knowledge: KnowledgeIn):
|
|
|
"harmful_history": []
|
|
"harmful_history": []
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- # 生成向量
|
|
|
|
|
- text = f"{knowledge.task}\n{knowledge.content}"
|
|
|
|
|
- embedding = await get_embedding(text)
|
|
|
|
|
|
|
+ # 生成向量(只基于 task,因为搜索时用户描述的是任务场景)
|
|
|
|
|
+ embedding = await get_embedding(knowledge.task)
|
|
|
|
|
|
|
|
- # 插入 Milvus
|
|
|
|
|
- milvus_store.insert({
|
|
|
|
|
|
|
+ # 提取 tag keys(用于高效筛选)
|
|
|
|
|
+ tag_keys = list(knowledge.tags.keys()) if isinstance(knowledge.tags, dict) else []
|
|
|
|
|
+
|
|
|
|
|
+ # 准备插入数据
|
|
|
|
|
+ insert_data = {
|
|
|
"id": knowledge_id,
|
|
"id": knowledge_id,
|
|
|
"embedding": embedding,
|
|
"embedding": embedding,
|
|
|
"message_id": knowledge.message_id,
|
|
"message_id": knowledge.message_id,
|
|
@@ -652,6 +701,7 @@ async def save_knowledge(knowledge: KnowledgeIn):
|
|
|
"content": knowledge.content,
|
|
"content": knowledge.content,
|
|
|
"types": knowledge.types,
|
|
"types": knowledge.types,
|
|
|
"tags": knowledge.tags,
|
|
"tags": knowledge.tags,
|
|
|
|
|
+ "tag_keys": tag_keys,
|
|
|
"scopes": knowledge.scopes,
|
|
"scopes": knowledge.scopes,
|
|
|
"owner": owner,
|
|
"owner": owner,
|
|
|
"resource_ids": knowledge.resource_ids,
|
|
"resource_ids": knowledge.resource_ids,
|
|
@@ -659,7 +709,12 @@ async def save_knowledge(knowledge: KnowledgeIn):
|
|
|
"eval": eval_data,
|
|
"eval": eval_data,
|
|
|
"created_at": now,
|
|
"created_at": now,
|
|
|
"updated_at": now,
|
|
"updated_at": now,
|
|
|
- })
|
|
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ print(f"[Save Knowledge] 插入数据: {json.dumps({k: v for k, v in insert_data.items() if k != 'embedding'}, ensure_ascii=False)}")
|
|
|
|
|
+
|
|
|
|
|
+ # 插入 Milvus
|
|
|
|
|
+ milvus_store.insert(insert_data)
|
|
|
|
|
|
|
|
return {"status": "ok", "knowledge_id": knowledge_id}
|
|
return {"status": "ok", "knowledge_id": knowledge_id}
|
|
|
|
|
|
|
@@ -685,19 +740,19 @@ def list_knowledge(
|
|
|
if types:
|
|
if types:
|
|
|
type_list = [t.strip() for t in types.split(',') if t.strip()]
|
|
type_list = [t.strip() for t in types.split(',') if t.strip()]
|
|
|
for t in type_list:
|
|
for t in type_list:
|
|
|
- filters.append(f'JSON_CONTAINS(types, "{t}")')
|
|
|
|
|
|
|
+ filters.append(f'array_contains(types, "{t}")')
|
|
|
|
|
|
|
|
if scopes:
|
|
if scopes:
|
|
|
- filters.append(f'JSON_CONTAINS(scopes, "{scopes}")')
|
|
|
|
|
|
|
+ filters.append(f'array_contains(scopes, "{scopes}")')
|
|
|
|
|
|
|
|
if owner:
|
|
if owner:
|
|
|
filters.append(f'owner like "%{owner}%"')
|
|
filters.append(f'owner like "%{owner}%"')
|
|
|
|
|
|
|
|
- # tags 支持多个,用 AND 连接(交集:必须同时包含所有选中的tag)
|
|
|
|
|
|
|
+ # tags 支持多个,用 AND 连接(使用 tag_keys 数组进行高效筛选)
|
|
|
if tags:
|
|
if tags:
|
|
|
tag_list = [t.strip() for t in tags.split(',') if t.strip()]
|
|
tag_list = [t.strip() for t in tags.split(',') if t.strip()]
|
|
|
for t in tag_list:
|
|
for t in tag_list:
|
|
|
- filters.append(f'JSON_CONTAINS_ANY(tags, ["{t}"])')
|
|
|
|
|
|
|
+ filters.append(f'array_contains(tag_keys, "{t}")')
|
|
|
|
|
|
|
|
# 如果没有过滤条件,查询所有
|
|
# 如果没有过滤条件,查询所有
|
|
|
filter_expr = ' and '.join(filters) if filters else 'id != ""'
|
|
filter_expr = ' and '.join(filters) if filters else 'id != ""'
|
|
@@ -705,7 +760,10 @@ def list_knowledge(
|
|
|
# 查询 Milvus
|
|
# 查询 Milvus
|
|
|
results = milvus_store.query(filter_expr, limit=limit)
|
|
results = milvus_store.query(filter_expr, limit=limit)
|
|
|
|
|
|
|
|
- return {"results": results, "count": len(results)}
|
|
|
|
|
|
|
+ # 转换为可序列化的格式
|
|
|
|
|
+ serialized_results = [serialize_milvus_result(r) for r in results]
|
|
|
|
|
+
|
|
|
|
|
+ return {"results": serialized_results, "count": len(serialized_results)}
|
|
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(f"[List Knowledge] 错误: {e}")
|
|
print(f"[List Knowledge] 错误: {e}")
|
|
@@ -721,7 +779,9 @@ def get_all_tags():
|
|
|
|
|
|
|
|
all_tags = set()
|
|
all_tags = set()
|
|
|
for item in results:
|
|
for item in results:
|
|
|
- tags_dict = item.get("tags", {})
|
|
|
|
|
|
|
+ # 转换为标准字典
|
|
|
|
|
+ serialized_item = serialize_milvus_result(item)
|
|
|
|
|
+ tags_dict = serialized_item.get("tags", {})
|
|
|
if isinstance(tags_dict, dict):
|
|
if isinstance(tags_dict, dict):
|
|
|
for key in tags_dict.keys():
|
|
for key in tags_dict.keys():
|
|
|
all_tags.add(key)
|
|
all_tags.add(key)
|
|
@@ -742,7 +802,7 @@ def get_knowledge(knowledge_id: str):
|
|
|
if not result:
|
|
if not result:
|
|
|
raise HTTPException(status_code=404, detail=f"Knowledge not found: {knowledge_id}")
|
|
raise HTTPException(status_code=404, detail=f"Knowledge not found: {knowledge_id}")
|
|
|
|
|
|
|
|
- return result
|
|
|
|
|
|
|
+ return serialize_milvus_result(result)
|
|
|
|
|
|
|
|
except HTTPException:
|
|
except HTTPException:
|
|
|
raise
|
|
raise
|
|
@@ -827,8 +887,7 @@ async def update_knowledge(knowledge_id: str, update: KnowledgeUpdateIn):
|
|
|
|
|
|
|
|
# 如果内容变化,重新生成向量
|
|
# 如果内容变化,重新生成向量
|
|
|
if need_reembed:
|
|
if need_reembed:
|
|
|
- text = f"{existing['task']}\n{content}"
|
|
|
|
|
- embedding = await get_embedding(text)
|
|
|
|
|
|
|
+ embedding = await get_embedding(existing['task'])
|
|
|
updates["embedding"] = embedding
|
|
updates["embedding"] = embedding
|
|
|
|
|
|
|
|
# 更新 Milvus
|
|
# 更新 Milvus
|
|
@@ -861,13 +920,15 @@ async def patch_knowledge(knowledge_id: str, patch: KnowledgePatchIn):
|
|
|
|
|
|
|
|
if patch.content is not None:
|
|
if patch.content is not None:
|
|
|
updates["content"] = patch.content
|
|
updates["content"] = patch.content
|
|
|
- need_reembed = True
|
|
|
|
|
|
|
+ # content 变化不需要重新生成 embedding(只基于 task)
|
|
|
|
|
|
|
|
if patch.types is not None:
|
|
if patch.types is not None:
|
|
|
updates["types"] = patch.types
|
|
updates["types"] = patch.types
|
|
|
|
|
|
|
|
if patch.tags is not None:
|
|
if patch.tags is not None:
|
|
|
updates["tags"] = patch.tags
|
|
updates["tags"] = patch.tags
|
|
|
|
|
+ # 同时更新 tag_keys
|
|
|
|
|
+ updates["tag_keys"] = list(patch.tags.keys()) if isinstance(patch.tags, dict) else []
|
|
|
|
|
|
|
|
if patch.scopes is not None:
|
|
if patch.scopes is not None:
|
|
|
updates["scopes"] = patch.scopes
|
|
updates["scopes"] = patch.scopes
|
|
@@ -878,12 +939,10 @@ async def patch_knowledge(knowledge_id: str, patch: KnowledgePatchIn):
|
|
|
if not updates:
|
|
if not updates:
|
|
|
return {"status": "ok", "knowledge_id": knowledge_id}
|
|
return {"status": "ok", "knowledge_id": knowledge_id}
|
|
|
|
|
|
|
|
- # 如果 task 或 content 变化,重新生成向量
|
|
|
|
|
|
|
+ # 如果 task 变化,重新生成向量
|
|
|
if need_reembed:
|
|
if need_reembed:
|
|
|
task = updates.get("task", existing["task"])
|
|
task = updates.get("task", existing["task"])
|
|
|
- content = updates.get("content", existing["content"])
|
|
|
|
|
- text = f"{task}\n{content}"
|
|
|
|
|
- embedding = await get_embedding(text)
|
|
|
|
|
|
|
+ embedding = await get_embedding(task)
|
|
|
updates["embedding"] = embedding
|
|
updates["embedding"] = embedding
|
|
|
|
|
|
|
|
# 更新 Milvus
|
|
# 更新 Milvus
|
|
@@ -947,9 +1006,8 @@ async def batch_update_knowledge(batch: KnowledgeBatchUpdateIn):
|
|
|
for (knowledge_id, _, _, eval_data, task), evolved_content in zip(evolution_tasks, evolved_results):
|
|
for (knowledge_id, _, _, eval_data, task), evolved_content in zip(evolution_tasks, evolved_results):
|
|
|
eval_data["helpful"] = eval_data.get("helpful", 0) + 1
|
|
eval_data["helpful"] = eval_data.get("helpful", 0) + 1
|
|
|
|
|
|
|
|
- # 重新生成向量
|
|
|
|
|
- text = f"{task}\n{evolved_content}"
|
|
|
|
|
- embedding = await get_embedding(text)
|
|
|
|
|
|
|
+ # 重新生成向量(只基于 task)
|
|
|
|
|
+ embedding = await get_embedding(task)
|
|
|
|
|
|
|
|
milvus_store.update(knowledge_id, {
|
|
milvus_store.update(knowledge_id, {
|
|
|
"content": evolved_content,
|
|
"content": evolved_content,
|
|
@@ -970,6 +1028,8 @@ async def slim_knowledge(model: str = "google/gemini-2.5-flash-lite"):
|
|
|
try:
|
|
try:
|
|
|
# 获取所有知识
|
|
# 获取所有知识
|
|
|
all_knowledge = milvus_store.query('id != ""', limit=10000)
|
|
all_knowledge = milvus_store.query('id != ""', limit=10000)
|
|
|
|
|
+ # 转换为可序列化的格式
|
|
|
|
|
+ all_knowledge = [serialize_milvus_result(item) for item in all_knowledge]
|
|
|
|
|
|
|
|
if len(all_knowledge) < 2:
|
|
if len(all_knowledge) < 2:
|
|
|
return {"status": "ok", "message": f"知识库仅有 {len(all_knowledge)} 条,无需瘦身"}
|
|
return {"status": "ok", "message": f"知识库仅有 {len(all_knowledge)} 条,无需瘦身"}
|
|
@@ -1087,8 +1147,8 @@ REPORT: 原有 X 条,合并后 Y 条,精简了 Z 条。
|
|
|
# 生成向量并重建知识库
|
|
# 生成向量并重建知识库
|
|
|
print(f"[知识瘦身] 正在为 {len(new_entries)} 条知识生成向量...")
|
|
print(f"[知识瘦身] 正在为 {len(new_entries)} 条知识生成向量...")
|
|
|
|
|
|
|
|
- # 批量生成向量
|
|
|
|
|
- texts = [f"{e['task']}\n{e['content']}" for e in new_entries]
|
|
|
|
|
|
|
+ # 批量生成向量(只基于 task)
|
|
|
|
|
+ texts = [e['task'] for e in new_entries]
|
|
|
embeddings = await get_embeddings_batch(texts)
|
|
embeddings = await get_embeddings_batch(texts)
|
|
|
|
|
|
|
|
# 清空并重建
|
|
# 清空并重建
|
|
@@ -1122,6 +1182,7 @@ REPORT: 原有 X 条,合并后 Y 条,精简了 Z 条。
|
|
|
"content": e["content"],
|
|
"content": e["content"],
|
|
|
"types": e["types"],
|
|
"types": e["types"],
|
|
|
"tags": {},
|
|
"tags": {},
|
|
|
|
|
+ "tag_keys": [],
|
|
|
"scopes": ["org:cybertogether"],
|
|
"scopes": ["org:cybertogether"],
|
|
|
"owner": "agent:slim",
|
|
"owner": "agent:slim",
|
|
|
"resource_ids": [],
|
|
"resource_ids": [],
|
|
@@ -1231,8 +1292,8 @@ async def extract_knowledge_from_messages(extract_req: MessageExtractIn):
|
|
|
if not extracted_knowledge:
|
|
if not extracted_knowledge:
|
|
|
return {"status": "ok", "extracted_count": 0, "knowledge_ids": []}
|
|
return {"status": "ok", "extracted_count": 0, "knowledge_ids": []}
|
|
|
|
|
|
|
|
- # 批量生成向量
|
|
|
|
|
- texts = [f"{item.get('task', '')}\n{item.get('content', '')}" for item in extracted_knowledge]
|
|
|
|
|
|
|
+ # 批量生成向量(只基于 task)
|
|
|
|
|
+ texts = [item.get('task', '') for item in extracted_knowledge]
|
|
|
embeddings = await get_embeddings_batch(texts)
|
|
embeddings = await get_embeddings_batch(texts)
|
|
|
|
|
|
|
|
# 保存提取的知识
|
|
# 保存提取的知识
|
|
@@ -1282,6 +1343,7 @@ async def extract_knowledge_from_messages(extract_req: MessageExtractIn):
|
|
|
"content": knowledge_content,
|
|
"content": knowledge_content,
|
|
|
"types": types,
|
|
"types": types,
|
|
|
"tags": {},
|
|
"tags": {},
|
|
|
|
|
+ "tag_keys": [],
|
|
|
"scopes": ["org:cybertogether"],
|
|
"scopes": ["org:cybertogether"],
|
|
|
"owner": extract_req.submitted_by,
|
|
"owner": extract_req.submitted_by,
|
|
|
"resource_ids": [],
|
|
"resource_ids": [],
|
|
@@ -1543,14 +1605,20 @@ def frontend():
|
|
|
|
|
|
|
|
document.getElementById('modalTitle').textContent = '编辑知识';
|
|
document.getElementById('modalTitle').textContent = '编辑知识';
|
|
|
document.getElementById('editId').value = k.id;
|
|
document.getElementById('editId').value = k.id;
|
|
|
- document.getElementById('taskInput').value = k.task;
|
|
|
|
|
- document.getElementById('contentInput').value = k.content;
|
|
|
|
|
- document.getElementById('tagsInput').value = JSON.stringify(k.tags);
|
|
|
|
|
- document.getElementById('scopesInput').value = k.scopes.join(', ');
|
|
|
|
|
- document.getElementById('ownerInput').value = k.owner;
|
|
|
|
|
|
|
+ document.getElementById('taskInput').value = k.task || '';
|
|
|
|
|
+ document.getElementById('contentInput').value = k.content || '';
|
|
|
|
|
+ document.getElementById('tagsInput').value = JSON.stringify(k.tags || {});
|
|
|
|
|
+
|
|
|
|
|
+ // 防御性检查:确保 scopes 是数组
|
|
|
|
|
+ const scopes = Array.isArray(k.scopes) ? k.scopes : [];
|
|
|
|
|
+ document.getElementById('scopesInput').value = scopes.join(', ');
|
|
|
|
|
+
|
|
|
|
|
+ document.getElementById('ownerInput').value = k.owner || '';
|
|
|
|
|
|
|
|
|
|
+ // 防御性检查:确保 types 是数组
|
|
|
|
|
+ const types = Array.isArray(k.types) ? k.types : [];
|
|
|
document.querySelectorAll('.type-checkbox').forEach(el => {
|
|
document.querySelectorAll('.type-checkbox').forEach(el => {
|
|
|
- el.checked = k.types.includes(el.value);
|
|
|
|
|
|
|
+ el.checked = types.includes(el.value);
|
|
|
});
|
|
});
|
|
|
|
|
|
|
|
document.getElementById('modal').classList.remove('hidden');
|
|
document.getElementById('modal').classList.remove('hidden');
|
|
@@ -1625,4 +1693,4 @@ def frontend():
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
|
import uvicorn
|
|
import uvicorn
|
|
|
- uvicorn.run(app, host="0.0.0.0", port=9998)
|
|
|
|
|
|
|
+ uvicorn.run(app, host="0.0.0.0", port=9999)
|