knowledge.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559
  1. """
  2. 原子知识保存工具
  3. 提供便捷的 API 让 Agent 快速保存结构化的原子知识
  4. """
  5. import os
  6. import re
  7. import json
  8. import yaml
  9. import logging
  10. from datetime import datetime
  11. from pathlib import Path
  12. from typing import List, Dict, Optional, Any
  13. from agent.tools import tool, ToolResult, ToolContext
  14. from ...llm.openrouter import openrouter_llm_call
  15. logger = logging.getLogger(__name__)
  16. def _generate_knowledge_id() -> str:
  17. """生成知识原子 ID"""
  18. return f"research-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
  19. def _format_yaml_list(items: List[str], indent: int = 2) -> str:
  20. """格式化 YAML 列表"""
  21. if not items:
  22. return "[]"
  23. indent_str = " " * indent
  24. return "\n" + "\n".join(f"{indent_str}- {item}" for item in items)
  25. @tool()
  26. async def save_knowledge(
  27. scenario: str,
  28. content: str,
  29. tags_type: List[str],
  30. urls: List[str] = None,
  31. agent_id: str = "research_agent",
  32. score: int = 3,
  33. trace_id: str = "",
  34. ) -> ToolResult:
  35. """
  36. 保存原子知识到本地文件(JSON 格式)
  37. Args:
  38. scenario: 任务描述(在什么情景下 + 要完成什么目标 + 得到能达成一个什么结果)
  39. content: 核心内容
  40. tags_type: 知识类型标签,可选:tool, usercase, definition, plan
  41. urls: 参考来源链接列表(论文/GitHub/博客等)
  42. agent_id: 执行此调研的 agent ID
  43. score: 初始评分 1-5(默认 3)
  44. trace_id: 当前 trace ID(可选)
  45. Returns:
  46. 保存结果
  47. """
  48. try:
  49. # 生成 ID
  50. knowledge_id = _generate_knowledge_id()
  51. # 准备目录
  52. knowledge_dir = Path(".cache/knowledge_atoms")
  53. knowledge_dir.mkdir(parents=True, exist_ok=True)
  54. # 构建文件路径(使用 .json 扩展名)
  55. file_path = knowledge_dir / f"{knowledge_id}.json"
  56. # 构建 JSON 数据结构
  57. knowledge_data = {
  58. "id": knowledge_id,
  59. "trace_id": trace_id or "N/A",
  60. "tags": {
  61. "type": tags_type
  62. },
  63. "scenario": scenario,
  64. "content": content,
  65. "trace": {
  66. "urls": urls or [],
  67. "agent_id": agent_id,
  68. "timestamp": datetime.now().isoformat()
  69. },
  70. "eval": {
  71. "score": score,
  72. "helpful": 0,
  73. "harmful": 0,
  74. "helpful_history": [],
  75. "harmful_history": []
  76. },
  77. "metrics": {
  78. "helpful": 1,
  79. "harmful": 0
  80. },
  81. "created_at": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  82. }
  83. # 保存为 JSON 文件
  84. with open(file_path, "w", encoding="utf-8") as f:
  85. json.dump(knowledge_data, f, ensure_ascii=False, indent=2)
  86. return ToolResult(
  87. title="✅ 原子知识已保存",
  88. output=f"知识 ID: {knowledge_id}\n文件路径: {file_path}\n\n场景:\n{scenario[:100]}...",
  89. long_term_memory=f"保存原子知识: {knowledge_id} - {scenario[:50]}",
  90. metadata={"knowledge_id": knowledge_id, "file_path": str(file_path)}
  91. )
  92. except Exception as e:
  93. return ToolResult(
  94. title="❌ 保存失败",
  95. output=f"错误: {str(e)}",
  96. error=str(e)
  97. )
  98. @tool()
  99. async def update_knowledge(
  100. knowledge_id: str,
  101. add_helpful_case: Optional[Dict[str, str]] = None,
  102. add_harmful_case: Optional[Dict[str, str]] = None,
  103. update_score: Optional[int] = None,
  104. ) -> ToolResult:
  105. """
  106. 更新已有的原子知识的评估反馈
  107. Args:
  108. knowledge_id: 知识 ID(如 research-20260302-001)
  109. add_helpful_case: 添加好用的案例 {"case_id": "...", "scenario": "...", "result": "...", "timestamp": "..."}
  110. add_harmful_case: 添加不好用的案例 {"case_id": "...", "scenario": "...", "result": "...", "timestamp": "..."}
  111. update_score: 更新评分(1-5)
  112. Returns:
  113. 更新结果
  114. """
  115. try:
  116. # 查找文件
  117. knowledge_dir = Path(".cache/knowledge_atoms")
  118. file_path = knowledge_dir / f"{knowledge_id}.md"
  119. if not file_path.exists():
  120. return ToolResult(
  121. title="❌ 文件不存在",
  122. output=f"未找到知识文件: {file_path}",
  123. error="文件不存在"
  124. )
  125. # 读取现有内容
  126. with open(file_path, "r", encoding="utf-8") as f:
  127. content = f.read()
  128. # 更新内容
  129. updated = False
  130. import re
  131. if add_helpful_case:
  132. # 增加 helpful 计数
  133. helpful_match = re.search(r"helpful: (\d+)", content)
  134. current_helpful = int(helpful_match.group(1)) if helpful_match else 0
  135. content = re.sub(
  136. r"helpful: \d+",
  137. f"helpful: {current_helpful + 1}",
  138. content
  139. )
  140. # 添加案例到 helpful_history
  141. case_yaml = f""" - case_id: {add_helpful_case.get('case_id', 'unknown')}
  142. scenario: "{add_helpful_case.get('scenario', '')}"
  143. result: "{add_helpful_case.get('result', '')}"
  144. timestamp: {add_helpful_case.get('timestamp', datetime.now().isoformat())}"""
  145. if "helpful_history: []" in content:
  146. content = content.replace(
  147. "helpful_history: []",
  148. f"helpful_history:\n{case_yaml}"
  149. )
  150. else:
  151. # 在 helpful_history 后追加
  152. content = re.sub(
  153. r"(helpful_history:.*?)(\n harmful)",
  154. f"\\1\n{case_yaml}\\2",
  155. content,
  156. flags=re.DOTALL
  157. )
  158. updated = True
  159. if add_harmful_case:
  160. # 增加 harmful 计数
  161. harmful_match = re.search(r"harmful: (\d+)", content)
  162. current_harmful = int(harmful_match.group(1)) if harmful_match else 0
  163. content = re.sub(
  164. r"harmful: \d+",
  165. f"harmful: {current_harmful + 1}",
  166. content
  167. )
  168. # 添加案例到 harmful_history
  169. case_yaml = f""" - case_id: {add_harmful_case.get('case_id', 'unknown')}
  170. scenario: "{add_harmful_case.get('scenario', '')}"
  171. result: "{add_harmful_case.get('result', '')}"
  172. timestamp: {add_harmful_case.get('timestamp', datetime.now().isoformat())}"""
  173. if "harmful_history: []" in content:
  174. content = content.replace(
  175. "harmful_history: []",
  176. f"harmful_history:\n{case_yaml}"
  177. )
  178. else:
  179. # 在 harmful_history 后追加
  180. content = re.sub(
  181. r"(harmful_history:.*?)(\nmetrics)",
  182. f"\\1\n{case_yaml}\\2",
  183. content,
  184. flags=re.DOTALL
  185. )
  186. updated = True
  187. if update_score is not None:
  188. content = re.sub(
  189. r"score: \d+",
  190. f"score: {update_score}",
  191. content
  192. )
  193. updated = True
  194. if not updated:
  195. return ToolResult(
  196. title="⚠️ 无更新",
  197. output="未指定任何更新内容",
  198. long_term_memory="尝试更新原子知识但未指定更新内容"
  199. )
  200. # 保存更新
  201. with open(file_path, "w", encoding="utf-8") as f:
  202. f.write(content)
  203. summary = []
  204. if add_helpful_case:
  205. summary.append(f"添加 helpful 案例: {add_helpful_case.get('case_id')}")
  206. if add_harmful_case:
  207. summary.append(f"添加 harmful 案例: {add_harmful_case.get('case_id')}")
  208. if update_score:
  209. summary.append(f"更新评分: {update_score}")
  210. return ToolResult(
  211. title="✅ 原子知识已更新",
  212. output=f"知识 ID: {knowledge_id}\n文件路径: {file_path}\n\n更新内容:\n" + "\n".join(f"- {s}" for s in summary),
  213. long_term_memory=f"更新原子知识: {knowledge_id}"
  214. )
  215. except Exception as e:
  216. return ToolResult(
  217. title="❌ 更新失败",
  218. output=f"错误: {str(e)}",
  219. error=str(e)
  220. )
  221. @tool()
  222. async def list_knowledge(
  223. limit: int = 10,
  224. tags_type: Optional[List[str]] = None,
  225. ) -> ToolResult:
  226. """
  227. 列出已保存的原子知识
  228. Args:
  229. limit: 返回数量限制(默认 10)
  230. tags_type: 按类型过滤(可选)
  231. Returns:
  232. 知识列表
  233. """
  234. try:
  235. knowledge_dir = Path(".cache/knowledge_atoms")
  236. if not knowledge_dir.exists():
  237. return ToolResult(
  238. title="📂 知识库为空",
  239. output="还没有保存任何原子知识",
  240. long_term_memory="知识库为空"
  241. )
  242. # 获取所有文件
  243. files = sorted(knowledge_dir.glob("*.md"), key=lambda x: x.stat().st_mtime, reverse=True)
  244. if not files:
  245. return ToolResult(
  246. title="📂 知识库为空",
  247. output="还没有保存任何原子知识",
  248. long_term_memory="知识库为空"
  249. )
  250. # 读取并过滤
  251. results = []
  252. for file_path in files[:limit]:
  253. with open(file_path, "r", encoding="utf-8") as f:
  254. content = f.read()
  255. # 提取关键信息
  256. import re
  257. id_match = re.search(r"id: (.+)", content)
  258. scenario_match = re.search(r"scenario: \|\n (.+)", content)
  259. score_match = re.search(r"score: (\d+)", content)
  260. knowledge_id = id_match.group(1) if id_match else "unknown"
  261. scenario = scenario_match.group(1) if scenario_match else "N/A"
  262. score = score_match.group(1) if score_match else "N/A"
  263. results.append(f"- [{knowledge_id}] (⭐{score}) {scenario[:60]}...")
  264. output = f"共找到 {len(files)} 条原子知识,显示最近 {len(results)} 条:\n\n" + "\n".join(results)
  265. return ToolResult(
  266. title="📚 原子知识列表",
  267. output=output,
  268. long_term_memory=f"列出 {len(results)} 条原子知识"
  269. )
  270. except Exception as e:
  271. return ToolResult(
  272. title="❌ 列表失败",
  273. output=f"错误: {str(e)}",
  274. error=str(e)
  275. )
  276. # ===== 语义检索功能 =====
  277. async def _route_knowledge_by_llm(query_text: str, metadata_list: List[Dict], k: int = 5) -> List[str]:
  278. """
  279. 第一阶段:语义路由。
  280. 让 LLM 挑选出 2*k 个语义相关的 ID。
  281. """
  282. if not metadata_list:
  283. return []
  284. # 扩大筛选范围到 2*k
  285. routing_k = k * 2
  286. routing_data = [
  287. {
  288. "id": m["id"],
  289. "tags": m["tags"],
  290. "scenario": m["scenario"][:100] # 只取前100字符
  291. } for m in metadata_list
  292. ]
  293. prompt = f"""
  294. 你是一个知识检索专家。根据用户的当前任务需求,从下列原子知识元数据中挑选出最相关的最多 {routing_k} 个知识 ID。
  295. 任务需求:"{query_text}"
  296. 可选知识列表:
  297. {json.dumps(routing_data, ensure_ascii=False, indent=1)}
  298. 请直接输出 ID 列表,用逗号分隔(例如: research-20260302-001, research-20260302-002)。若无相关项请输出 "None"。
  299. """
  300. try:
  301. print(f"\n[Step 1: 知识语义路由] 任务: '{query_text}' | 候选总数: {len(metadata_list)} | 目标提取数: {routing_k}")
  302. response = await openrouter_llm_call(
  303. messages=[{"role": "user", "content": prompt}],
  304. model="google/gemini-2.0-flash-001"
  305. )
  306. content = response.get("content", "").strip()
  307. selected_ids = [idx.strip() for idx in re.split(r'[,\s]+', content) if idx.strip().startswith("research-")]
  308. print(f"[Step 1: 知识语义路由] LLM 初选 ID ({len(selected_ids)}个): {selected_ids}")
  309. return selected_ids
  310. except Exception as e:
  311. logger.error(f"LLM 知识路由失败: {e}")
  312. return []
  313. async def _get_structured_knowledge(query_text: str, top_k: int = 5, min_score: int = 3) -> List[Dict]:
  314. """
  315. 语义检索原子知识
  316. 1. 解析知识库文件(支持 JSON 和 YAML 格式)
  317. 2. 语义路由:提取 2*k 个 ID
  318. 3. 质量精排:基于评分筛选出最终的 k 个
  319. """
  320. knowledge_dir = Path(".cache/knowledge_atoms")
  321. if not knowledge_dir.exists():
  322. print(f"[Knowledge System] 警告: 知识库目录不存在 ({knowledge_dir})")
  323. return []
  324. # 同时支持 .json 和 .md 文件
  325. json_files = list(knowledge_dir.glob("*.json"))
  326. md_files = list(knowledge_dir.glob("*.md"))
  327. files = json_files + md_files
  328. if not files:
  329. print(f"[Knowledge System] 警告: 知识库为空")
  330. return []
  331. # --- 阶段 1: 解析所有知识文件 ---
  332. content_map = {}
  333. metadata_list = []
  334. for file_path in files:
  335. try:
  336. with open(file_path, "r", encoding="utf-8") as f:
  337. content = f.read()
  338. # 根据文件扩展名选择解析方式
  339. if file_path.suffix == ".json":
  340. # 解析 JSON 格式
  341. metadata = json.loads(content)
  342. else:
  343. # 解析 YAML frontmatter(兼容旧格式)
  344. yaml_match = re.search(r'^---\n(.*?)\n---', content, re.DOTALL)
  345. if not yaml_match:
  346. logger.warning(f"跳过无效文件: {file_path}")
  347. continue
  348. metadata = yaml.safe_load(yaml_match.group(1))
  349. if not isinstance(metadata, dict):
  350. logger.warning(f"跳过损坏的知识文件: {file_path}")
  351. continue
  352. kid = metadata.get("id")
  353. if not kid:
  354. logger.warning(f"跳过缺少 id 的知识文件: {file_path}")
  355. continue
  356. # 提取 scenario 和 content
  357. scenario = metadata.get("scenario", "").strip()
  358. content_text = metadata.get("content", "").strip()
  359. meta_item = {
  360. "id": kid,
  361. "tags": metadata.get("tags", {}),
  362. "scenario": scenario,
  363. "score": metadata.get("eval", {}).get("score", 3),
  364. "helpful": metadata.get("metrics", {}).get("helpful", 0),
  365. "harmful": metadata.get("metrics", {}).get("harmful", 0),
  366. }
  367. metadata_list.append(meta_item)
  368. content_map[kid] = {
  369. "scenario": scenario,
  370. "content": content_text,
  371. "score": meta_item["score"],
  372. "helpful": meta_item["helpful"],
  373. "harmful": meta_item["harmful"],
  374. }
  375. except Exception as e:
  376. logger.error(f"解析知识文件失败 {file_path}: {e}")
  377. continue
  378. if not metadata_list:
  379. print(f"[Knowledge System] 警告: 没有有效的知识条目")
  380. return []
  381. # --- 阶段 2: 语义路由 (取 2*k) ---
  382. candidate_ids = await _route_knowledge_by_llm(query_text, metadata_list, k=top_k)
  383. # --- 阶段 3: 质量精排 (根据评分和反馈选出最终的 k) ---
  384. print(f"[Step 2: 知识质量精排] 正在根据评分和反馈进行打分...")
  385. scored_items = []
  386. for kid in candidate_ids:
  387. if kid in content_map:
  388. item = content_map[kid]
  389. score = item["score"]
  390. helpful = item["helpful"]
  391. harmful = item["harmful"]
  392. # 计算综合分:基础分 + helpful - harmful*2
  393. quality_score = score + helpful - (harmful * 2.0)
  394. # 过滤门槛:评分低于 min_score 或质量分过低
  395. if score < min_score or quality_score < 0:
  396. print(f" - 剔除低质量知识: {kid} (Score: {score}, Helpful: {helpful}, Harmful: {harmful})")
  397. continue
  398. scored_items.append({
  399. "id": kid,
  400. "scenario": item["scenario"],
  401. "content": item["content"],
  402. "score": score,
  403. "quality_score": quality_score
  404. })
  405. # 按照质量分排序
  406. final_sorted = sorted(scored_items, key=lambda x: x["quality_score"], reverse=True)
  407. # 截取最终的 top_k
  408. result = final_sorted[:top_k]
  409. print(f"[Step 2: 知识质量精排] 最终选定知识: {[it['id'] for it in result]}")
  410. print(f"[Knowledge System] 检索结束。\n")
  411. return result
  412. @tool()
  413. async def search_knowledge(
  414. query: str,
  415. top_k: int = 5,
  416. min_score: int = 3,
  417. tags_type: Optional[List[str]] = None,
  418. context: Optional[ToolContext] = None,
  419. ) -> ToolResult:
  420. """
  421. 语义检索原子知识库
  422. Args:
  423. query: 搜索查询(任务描述)
  424. top_k: 返回数量(默认 5)
  425. min_score: 最低评分过滤(默认 3)
  426. tags_type: 按类型过滤(tool/usercase/definition/plan)
  427. context: 工具上下文
  428. Returns:
  429. 相关知识列表
  430. """
  431. try:
  432. relevant_items = await _get_structured_knowledge(
  433. query_text=query,
  434. top_k=top_k,
  435. min_score=min_score
  436. )
  437. if not relevant_items:
  438. return ToolResult(
  439. title="🔍 未找到相关知识",
  440. output=f"查询: {query}\n\n知识库中暂无相关的高质量知识。建议进行调研。",
  441. long_term_memory=f"知识检索: 未找到相关知识 - {query[:50]}"
  442. )
  443. # 格式化输出
  444. output_lines = [f"查询: {query}\n", f"找到 {len(relevant_items)} 条相关知识:\n"]
  445. for idx, item in enumerate(relevant_items, 1):
  446. output_lines.append(f"\n### {idx}. [{item['id']}] (⭐ {item['score']})")
  447. output_lines.append(f"**场景**: {item['scenario'][:150]}...")
  448. output_lines.append(f"**内容**: {item['content'][:200]}...")
  449. return ToolResult(
  450. title="✅ 知识检索成功",
  451. output="\n".join(output_lines),
  452. long_term_memory=f"知识检索: 找到 {len(relevant_items)} 条相关知识 - {query[:50]}",
  453. metadata={
  454. "count": len(relevant_items),
  455. "knowledge_ids": [item["id"] for item in relevant_items],
  456. "items": relevant_items
  457. }
  458. )
  459. except Exception as e:
  460. logger.error(f"知识检索失败: {e}")
  461. return ToolResult(
  462. title="❌ 检索失败",
  463. output=f"错误: {str(e)}",
  464. error=str(e)
  465. )