pg_store.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
  1. """
  2. PostgreSQL 存储封装(替代 Milvus)
  3. 使用远程 PostgreSQL + pgvector/fastann 存储知识数据
  4. """
  5. import os
  6. import json
  7. import psycopg2
  8. from psycopg2.extras import RealDictCursor, execute_batch
  9. from typing import List, Dict, Optional
  10. from dotenv import load_dotenv
  11. from knowhub.knowhub_db.cascade import cascade_delete
  12. load_dotenv()
# Subqueries for the relation fields (read from the junction tables).
# For the *_knowledge edges that carry a relation_type, two views are exposed:
# - *_ids   : flat list of IDs (backward compatible, no type)
# - *_links : [{id, relation_type}] (includes the type)
# resource_ids has only the flat view; relations lists the outgoing
# knowledge_relation rows as [{target_id, relation_type}].
_REL_SUBQUERIES = """
(SELECT COALESCE(json_agg(rk.requirement_id), '[]'::json)
FROM requirement_knowledge rk WHERE rk.knowledge_id = knowledge.id) AS requirement_ids,
(SELECT COALESCE(json_agg(json_build_object(
'id', rk2.requirement_id, 'relation_type', rk2.relation_type
)), '[]'::json)
FROM requirement_knowledge rk2 WHERE rk2.knowledge_id = knowledge.id) AS requirement_links,
(SELECT COALESCE(json_agg(ck.capability_id), '[]'::json)
FROM capability_knowledge ck WHERE ck.knowledge_id = knowledge.id) AS capability_ids,
(SELECT COALESCE(json_agg(json_build_object(
'id', ck2.capability_id, 'relation_type', ck2.relation_type
)), '[]'::json)
FROM capability_knowledge ck2 WHERE ck2.knowledge_id = knowledge.id) AS capability_links,
(SELECT COALESCE(json_agg(tk.tool_id), '[]'::json)
FROM tool_knowledge tk WHERE tk.knowledge_id = knowledge.id) AS tool_ids,
(SELECT COALESCE(json_agg(json_build_object(
'id', tk2.tool_id, 'relation_type', tk2.relation_type
)), '[]'::json)
FROM tool_knowledge tk2 WHERE tk2.knowledge_id = knowledge.id) AS tool_links,
(SELECT COALESCE(json_agg(kr.resource_id), '[]'::json)
FROM knowledge_resource kr WHERE kr.knowledge_id = knowledge.id) AS resource_ids,
(SELECT COALESCE(json_agg(json_build_object(
'target_id', krel.target_id, 'relation_type', krel.relation_type
)), '[]'::json)
FROM knowledge_relation krel WHERE krel.source_id = knowledge.id) AS relations
"""
# Base columns of the knowledge table (embeddings excluded).
_BASE_FIELDS = (
"id, message_id, task, content, types, tags, tag_keys, "
"scopes, owner, source, eval, "
"created_at, updated_at, status, version"
)
# Full SELECT list (base columns plus the relation subqueries).
_SELECT_FIELDS = f"{_BASE_FIELDS}, {_REL_SUBQUERIES}"
# SELECT list that additionally returns both embedding columns.
_SELECT_FIELDS_WITH_EMB = f"task_embedding, content_embedding, {_SELECT_FIELDS}"
  53. def _normalize_links(data: Dict, links_key: str, ids_key: str, default_type: str):
  54. """
  55. 统一两种输入格式:
  56. - {links_key: [{id, relation_type}, ...]} → 使用指定 type
  57. - {ids_key: [id1, id2, ...]} → 使用 default_type
  58. 两个 key 都没有返回 None(不更新)
  59. """
  60. if links_key in data and data[links_key] is not None:
  61. out = []
  62. for item in data[links_key]:
  63. if isinstance(item, dict):
  64. out.append((item['id'], item.get('relation_type', default_type)))
  65. else:
  66. out.append((item, default_type))
  67. return out
  68. if ids_key in data and data[ids_key] is not None:
  69. return [(i, default_type) for i in data[ids_key]]
  70. return None
  71. class PostgreSQLStore:
  72. def __init__(self):
  73. """初始化 PostgreSQL 连接"""
  74. self.conn = psycopg2.connect(
  75. host=os.getenv('KNOWHUB_DB'),
  76. port=int(os.getenv('KNOWHUB_PORT', 5432)),
  77. user=os.getenv('KNOWHUB_USER'),
  78. password=os.getenv('KNOWHUB_PASSWORD'),
  79. database=os.getenv('KNOWHUB_DB_NAME')
  80. )
  81. self.conn.autocommit = True
  82. print(f"[PostgreSQL] 已连接到远程数据库: {os.getenv('KNOWHUB_DB')}")
  83. def _reconnect(self):
  84. self.conn = psycopg2.connect(
  85. host=os.getenv('KNOWHUB_DB'),
  86. port=int(os.getenv('KNOWHUB_PORT', 5432)),
  87. user=os.getenv('KNOWHUB_USER'),
  88. password=os.getenv('KNOWHUB_PASSWORD'),
  89. database=os.getenv('KNOWHUB_DB_NAME')
  90. )
  91. self.conn.autocommit = True
  92. def _ensure_connection(self):
  93. if self.conn.closed != 0:
  94. self._reconnect()
  95. else:
  96. try:
  97. c = self.conn.cursor()
  98. c.execute("SELECT 1")
  99. c.close()
  100. except (psycopg2.OperationalError, psycopg2.InterfaceError):
  101. self._reconnect()
  102. def _get_cursor(self):
  103. """获取游标"""
  104. self._ensure_connection()
  105. return self.conn.cursor(cursor_factory=RealDictCursor)
  106. def insert(self, knowledge: Dict):
  107. """插入单条知识。若同 id 已存在会先删再插(AnalyticDB beam 不支持 ON CONFLICT UPDATE)。"""
  108. cursor = self._get_cursor()
  109. try:
  110. cursor.execute("DELETE FROM knowledge WHERE id = %s", (knowledge['id'],))
  111. cursor.execute("""
  112. INSERT INTO knowledge (
  113. id, task_embedding, content_embedding, message_id, task, content, types, tags,
  114. tag_keys, scopes, owner, source, eval,
  115. created_at, updated_at, status, version
  116. ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  117. """, (
  118. knowledge['id'],
  119. knowledge.get('task_embedding') or knowledge.get('embedding'),
  120. knowledge.get('content_embedding'),
  121. knowledge['message_id'],
  122. knowledge['task'],
  123. knowledge['content'],
  124. knowledge.get('types', []),
  125. json.dumps(knowledge.get('tags', {})),
  126. knowledge.get('tag_keys', []),
  127. knowledge.get('scopes', []),
  128. knowledge['owner'],
  129. json.dumps(knowledge.get('source', {})),
  130. json.dumps(knowledge.get('eval', {})),
  131. knowledge['created_at'],
  132. knowledge['updated_at'],
  133. knowledge.get('status', 'approved'),
  134. knowledge.get('version', 'v0'),
  135. ))
  136. # 写入关联表
  137. kid = knowledge['id']
  138. req_links = _normalize_links(knowledge, 'requirement_links', 'requirement_ids', 'related') or []
  139. for req_id, rtype in req_links:
  140. cursor.execute(
  141. "INSERT INTO requirement_knowledge (requirement_id, knowledge_id, relation_type) "
  142. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  143. (req_id, kid, rtype))
  144. cap_links = _normalize_links(knowledge, 'capability_links', 'capability_ids', 'related') or []
  145. for cap_id, rtype in cap_links:
  146. cursor.execute(
  147. "INSERT INTO capability_knowledge (capability_id, knowledge_id, relation_type) "
  148. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  149. (cap_id, kid, rtype))
  150. tool_links = _normalize_links(knowledge, 'tool_links', 'tool_ids', 'related') or []
  151. for tool_id, rtype in tool_links:
  152. cursor.execute(
  153. "INSERT INTO tool_knowledge (tool_id, knowledge_id, relation_type) "
  154. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  155. (tool_id, kid, rtype))
  156. for res_id in knowledge.get('resource_ids', []):
  157. cursor.execute(
  158. "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  159. (kid, res_id))
  160. self.conn.commit()
  161. finally:
  162. cursor.close()
  163. def _apply_relation_filters(self, where_clause: str, relation_filters: Optional[Dict[str, str]], params: list) -> str:
  164. if not relation_filters:
  165. return where_clause
  166. rel_clauses = []
  167. for k, v in relation_filters.items():
  168. if not v: continue
  169. if k == 'requirement_id':
  170. rel_clauses.append("EXISTS (SELECT 1 FROM requirement_knowledge rk WHERE rk.knowledge_id = knowledge.id AND rk.requirement_id = %s)")
  171. params.append(v)
  172. elif k == 'capability_id':
  173. rel_clauses.append("EXISTS (SELECT 1 FROM capability_knowledge ck WHERE ck.knowledge_id = knowledge.id AND ck.capability_id = %s)")
  174. params.append(v)
  175. elif k == 'tool_id':
  176. rel_clauses.append("EXISTS (SELECT 1 FROM tool_knowledge tk WHERE tk.knowledge_id = knowledge.id AND tk.tool_id = %s)")
  177. params.append(v)
  178. if not rel_clauses:
  179. return where_clause
  180. rel_where = " AND ".join(rel_clauses)
  181. if where_clause.strip():
  182. return f"{where_clause} AND {rel_where}"
  183. else:
  184. return f"WHERE {rel_where}"
  185. def search(self, query_embedding: List[float], filters: Optional[str] = None, limit: int = 10, relation_filters: Optional[Dict[str, str]] = None) -> List[Dict]:
  186. """向量检索(使用余弦相似度)"""
  187. cursor = self._get_cursor()
  188. try:
  189. where_clause = self._build_where_clause(filters) if filters else ""
  190. params = []
  191. where_clause = self._apply_relation_filters(where_clause, relation_filters, params)
  192. sql = f"""
  193. SELECT {_SELECT_FIELDS},
  194. 1 - (task_embedding <=> %s::real[]) as score
  195. FROM knowledge
  196. {where_clause}
  197. ORDER BY task_embedding <=> %s::real[]
  198. LIMIT %s
  199. """
  200. final_params = [query_embedding] + params + [query_embedding, limit]
  201. cursor.execute(sql, tuple(final_params))
  202. results = cursor.fetchall()
  203. return [self._format_result(r) for r in results]
  204. finally:
  205. cursor.close()
  206. def query(self, filters: str, limit: int = 100, relation_filters: Optional[Dict[str, str]] = None) -> List[Dict]:
  207. """纯标量查询"""
  208. cursor = self._get_cursor()
  209. try:
  210. where_clause = self._build_where_clause(filters) if filters else ""
  211. params = []
  212. where_clause = self._apply_relation_filters(where_clause, relation_filters, params)
  213. sql = f"""
  214. SELECT {_SELECT_FIELDS}
  215. FROM knowledge
  216. {where_clause}
  217. LIMIT %s
  218. """
  219. final_params = params + [limit]
  220. cursor.execute(sql, tuple(final_params))
  221. results = cursor.fetchall()
  222. return [self._format_result(r) for r in results]
  223. finally:
  224. cursor.close()
  225. def get_by_id(self, knowledge_id: str, include_embedding: bool = False) -> Optional[Dict]:
  226. """根据ID获取知识(默认不返回embedding以提升性能)"""
  227. cursor = self._get_cursor()
  228. try:
  229. fields = _SELECT_FIELDS_WITH_EMB if include_embedding else _SELECT_FIELDS
  230. cursor.execute(f"""
  231. SELECT {fields}
  232. FROM knowledge WHERE id = %s
  233. """, (knowledge_id,))
  234. result = cursor.fetchone()
  235. return self._format_result(result) if result else None
  236. finally:
  237. cursor.close()
  238. def update(self, knowledge_id: str, updates: Dict):
  239. """更新知识"""
  240. cursor = self._get_cursor()
  241. try:
  242. # 分离关联字段和实体字段
  243. rel_keys = ('requirement_ids', 'requirement_links',
  244. 'capability_ids', 'capability_links',
  245. 'tool_ids', 'tool_links', 'resource_ids')
  246. rel_data = {k: updates.pop(k) for k in rel_keys if k in updates}
  247. if updates:
  248. set_parts = []
  249. params = []
  250. for key, value in updates.items():
  251. if key in ('tags', 'source', 'eval'):
  252. set_parts.append(f"{key} = %s")
  253. params.append(json.dumps(value))
  254. else:
  255. set_parts.append(f"{key} = %s")
  256. params.append(value)
  257. params.append(knowledge_id)
  258. sql = f"UPDATE knowledge SET {', '.join(set_parts)} WHERE id = %s"
  259. cursor.execute(sql, params)
  260. # 更新关联表(全量替换)
  261. req_links = _normalize_links(rel_data, 'requirement_links', 'requirement_ids', 'related')
  262. if req_links is not None:
  263. cursor.execute("DELETE FROM requirement_knowledge WHERE knowledge_id = %s", (knowledge_id,))
  264. for req_id, rtype in req_links:
  265. cursor.execute(
  266. "INSERT INTO requirement_knowledge (requirement_id, knowledge_id, relation_type) "
  267. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  268. (req_id, knowledge_id, rtype))
  269. cap_links = _normalize_links(rel_data, 'capability_links', 'capability_ids', 'related')
  270. if cap_links is not None:
  271. cursor.execute("DELETE FROM capability_knowledge WHERE knowledge_id = %s", (knowledge_id,))
  272. for cap_id, rtype in cap_links:
  273. cursor.execute(
  274. "INSERT INTO capability_knowledge (capability_id, knowledge_id, relation_type) "
  275. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  276. (cap_id, knowledge_id, rtype))
  277. tool_links = _normalize_links(rel_data, 'tool_links', 'tool_ids', 'related')
  278. if tool_links is not None:
  279. cursor.execute("DELETE FROM tool_knowledge WHERE knowledge_id = %s", (knowledge_id,))
  280. for tool_id, rtype in tool_links:
  281. cursor.execute(
  282. "INSERT INTO tool_knowledge (tool_id, knowledge_id, relation_type) "
  283. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  284. (tool_id, knowledge_id, rtype))
  285. if 'resource_ids' in rel_data and rel_data['resource_ids'] is not None:
  286. cursor.execute("DELETE FROM knowledge_resource WHERE knowledge_id = %s", (knowledge_id,))
  287. for res_id in rel_data['resource_ids']:
  288. cursor.execute(
  289. "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  290. (knowledge_id, res_id))
  291. self.conn.commit()
  292. finally:
  293. cursor.close()
  294. def delete(self, knowledge_id: str):
  295. """删除知识及其关联表记录"""
  296. cursor = self._get_cursor()
  297. try:
  298. cascade_delete(cursor, 'knowledge', knowledge_id)
  299. self.conn.commit()
  300. finally:
  301. cursor.close()
  302. def add_relation(self, source_id: str, target_id: str, relation_type: str):
  303. """添加一条知识间关系(不删除已有关系)"""
  304. cursor = self._get_cursor()
  305. try:
  306. cursor.execute(
  307. "INSERT INTO knowledge_relation (source_id, target_id, relation_type) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  308. (source_id, target_id, relation_type))
  309. self.conn.commit()
  310. finally:
  311. cursor.close()
  312. def add_resource(self, knowledge_id: str, resource_id: str):
  313. """添加一条知识-资源关联(不删除已有关联)"""
  314. cursor = self._get_cursor()
  315. try:
  316. cursor.execute(
  317. "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  318. (knowledge_id, resource_id))
  319. self.conn.commit()
  320. finally:
  321. cursor.close()
  322. def add_requirement(self, knowledge_id: str, requirement_id: str,
  323. relation_type: str = 'related'):
  324. """增量挂接 requirement-knowledge 边"""
  325. cursor = self._get_cursor()
  326. try:
  327. cursor.execute(
  328. "INSERT INTO requirement_knowledge (requirement_id, knowledge_id, relation_type) "
  329. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  330. (requirement_id, knowledge_id, relation_type))
  331. self.conn.commit()
  332. finally:
  333. cursor.close()
  334. def add_capability(self, knowledge_id: str, capability_id: str,
  335. relation_type: str = 'related'):
  336. """增量挂接 capability-knowledge 边"""
  337. cursor = self._get_cursor()
  338. try:
  339. cursor.execute(
  340. "INSERT INTO capability_knowledge (capability_id, knowledge_id, relation_type) "
  341. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  342. (capability_id, knowledge_id, relation_type))
  343. self.conn.commit()
  344. finally:
  345. cursor.close()
  346. def add_tool(self, knowledge_id: str, tool_id: str,
  347. relation_type: str = 'related'):
  348. """增量挂接 tool-knowledge 边"""
  349. cursor = self._get_cursor()
  350. try:
  351. cursor.execute(
  352. "INSERT INTO tool_knowledge (tool_id, knowledge_id, relation_type) "
  353. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  354. (tool_id, knowledge_id, relation_type))
  355. self.conn.commit()
  356. finally:
  357. cursor.close()
  358. def count(self) -> int:
  359. """返回知识总数"""
  360. cursor = self._get_cursor()
  361. try:
  362. cursor.execute("SELECT COUNT(*) as count FROM knowledge")
  363. return cursor.fetchone()['count']
  364. finally:
  365. cursor.close()
  366. def _build_where_clause(self, filters: str) -> str:
  367. """将Milvus风格的过滤表达式转换为PostgreSQL WHERE子句"""
  368. if not filters:
  369. return ""
  370. where = filters
  371. import re
  372. # 替换操作符
  373. where = where.replace(' == ', ' = ')
  374. where = where.replace(' or ', ' OR ')
  375. where = where.replace(' and ', ' AND ')
  376. # 处理数组包含操作
  377. where = re.sub(r'array_contains\((\w+),\s*"([^"]+)"\)', r"\1 @> ARRAY['\2']", where)
  378. # 处理 eval["score"] 语法
  379. where = where.replace('eval["score"]', "(eval->>'score')::int")
  380. # 把所有剩余的双引号字符串值替换为单引号(PostgreSQL标准)
  381. where = re.sub(r'"([^"]*)"', r"'\1'", where)
  382. return f"WHERE {where}"
  383. def _format_result(self, row: Dict) -> Dict:
  384. """格式化查询结果"""
  385. if not row:
  386. return None
  387. result = dict(row)
  388. if 'tags' in result and isinstance(result['tags'], str):
  389. result['tags'] = json.loads(result['tags'])
  390. if 'source' in result and isinstance(result['source'], str):
  391. result['source'] = json.loads(result['source'])
  392. if 'eval' in result and isinstance(result['eval'], str):
  393. result['eval'] = json.loads(result['eval'])
  394. # 关联字段(来自 junction table 子查询,可能是 JSON 字符串或已解析的列表)
  395. for field in ('requirement_ids', 'capability_ids', 'tool_ids', 'resource_ids',
  396. 'requirement_links', 'capability_links', 'tool_links'):
  397. if field in result and isinstance(result[field], str):
  398. result[field] = json.loads(result[field])
  399. elif field in result and result[field] is None:
  400. result[field] = []
  401. if 'relations' in result and isinstance(result['relations'], str):
  402. result['relations'] = json.loads(result['relations'])
  403. elif 'relations' in result and result['relations'] is None:
  404. result['relations'] = []
  405. if 'created_at' in result and result['created_at']:
  406. result['created_at'] = result['created_at'] * 1000
  407. if 'updated_at' in result and result['updated_at']:
  408. result['updated_at'] = result['updated_at'] * 1000
  409. return result
  410. def close(self):
  411. """关闭连接"""
  412. if self.conn:
  413. self.conn.close()
  414. def insert_batch(self, knowledge_list: List[Dict]):
  415. """批量插入知识"""
  416. if not knowledge_list:
  417. return
  418. cursor = self._get_cursor()
  419. try:
  420. data = []
  421. for k in knowledge_list:
  422. data.append((
  423. k['id'], k.get('task_embedding') or k.get('embedding'),
  424. k.get('content_embedding'),
  425. k['message_id'], k['task'],
  426. k['content'], k.get('types', []),
  427. json.dumps(k.get('tags', {})), k.get('tag_keys', []),
  428. k.get('scopes', []), k['owner'],
  429. json.dumps(k.get('source', {})), json.dumps(k.get('eval', {})),
  430. k['created_at'], k['updated_at'], k.get('status', 'approved'),
  431. k.get('version', 'v0'),
  432. ))
  433. execute_batch(cursor, """
  434. INSERT INTO knowledge (
  435. id, task_embedding, content_embedding, message_id, task, content, types, tags,
  436. tag_keys, scopes, owner, source, eval,
  437. created_at, updated_at, status, version
  438. ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  439. """, data)
  440. # 批量写入关联表
  441. for k in knowledge_list:
  442. kid = k['id']
  443. req_links = _normalize_links(k, 'requirement_links', 'requirement_ids', 'related') or []
  444. for req_id, rtype in req_links:
  445. cursor.execute(
  446. "INSERT INTO requirement_knowledge (requirement_id, knowledge_id, relation_type) "
  447. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  448. (req_id, kid, rtype))
  449. cap_links = _normalize_links(k, 'capability_links', 'capability_ids', 'related') or []
  450. for cap_id, rtype in cap_links:
  451. cursor.execute(
  452. "INSERT INTO capability_knowledge (capability_id, knowledge_id, relation_type) "
  453. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  454. (cap_id, kid, rtype))
  455. tool_links = _normalize_links(k, 'tool_links', 'tool_ids', 'related') or []
  456. for tool_id, rtype in tool_links:
  457. cursor.execute(
  458. "INSERT INTO tool_knowledge (tool_id, knowledge_id, relation_type) "
  459. "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  460. (tool_id, kid, rtype))
  461. for res_id in k.get('resource_ids', []):
  462. cursor.execute(
  463. "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  464. (kid, res_id))
  465. self.conn.commit()
  466. finally:
  467. cursor.close()