pg_store.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. """
  2. PostgreSQL 存储封装(替代 Milvus)
  3. 使用远程 PostgreSQL + pgvector/fastann 存储知识数据
  4. """
  5. import os
  6. import json
  7. import psycopg2
  8. from psycopg2.extras import RealDictCursor, execute_batch
  9. from typing import List, Dict, Optional
  10. from dotenv import load_dotenv
  11. from knowhub.knowhub_db.cascade import cascade_delete
  12. load_dotenv()
  13. # 关联字段的子查询(从 junction table 读取)
  14. # 对于带 relation_type 的 *_knowledge 边,同时暴露两种视图:
  15. # - *_ids : 扁平 ID 列表(向后兼容,不含 type)
  16. # - *_links : [{id, relation_type}](含 type)
  17. _REL_SUBQUERIES = """
  18. (SELECT COALESCE(json_agg(rk.requirement_id), '[]'::json)
  19. FROM requirement_knowledge rk WHERE rk.knowledge_id = knowledge.id) AS requirement_ids,
  20. (SELECT COALESCE(json_agg(json_build_object(
  21. 'id', rk2.requirement_id, 'relation_type', rk2.relation_type
  22. )), '[]'::json)
  23. FROM requirement_knowledge rk2 WHERE rk2.knowledge_id = knowledge.id) AS requirement_links,
  24. (SELECT COALESCE(json_agg(ck.capability_id), '[]'::json)
  25. FROM capability_knowledge ck WHERE ck.knowledge_id = knowledge.id) AS capability_ids,
  26. (SELECT COALESCE(json_agg(json_build_object(
  27. 'id', ck2.capability_id, 'relation_type', ck2.relation_type
  28. )), '[]'::json)
  29. FROM capability_knowledge ck2 WHERE ck2.knowledge_id = knowledge.id) AS capability_links,
  30. (SELECT COALESCE(json_agg(tk.tool_id), '[]'::json)
  31. FROM tool_knowledge tk WHERE tk.knowledge_id = knowledge.id) AS tool_ids,
  32. (SELECT COALESCE(json_agg(json_build_object(
  33. 'id', tk2.tool_id, 'relation_type', tk2.relation_type
  34. )), '[]'::json)
  35. FROM tool_knowledge tk2 WHERE tk2.knowledge_id = knowledge.id) AS tool_links,
  36. (SELECT COALESCE(json_agg(kr.resource_id), '[]'::json)
  37. FROM knowledge_resource kr WHERE kr.knowledge_id = knowledge.id) AS resource_ids,
  38. (SELECT COALESCE(json_agg(json_build_object(
  39. 'target_id', krel.target_id, 'relation_type', krel.relation_type
  40. )), '[]'::json)
  41. FROM knowledge_relation krel WHERE krel.source_id = knowledge.id) AS relations
  42. """
  43. # 基础字段(不含 embedding)
  44. _BASE_FIELDS = (
  45. "id, message_id, task, content, types, tags, tag_keys, "
  46. "scopes, owner, source, eval, "
  47. "created_at, updated_at, status"
  48. )
  49. # 完整 SELECT(含关联子查询)
  50. _SELECT_FIELDS = f"{_BASE_FIELDS}, {_REL_SUBQUERIES}"
  51. # 含 embedding 的 SELECT
  52. _SELECT_FIELDS_WITH_EMB = f"task_embedding, content_embedding, {_SELECT_FIELDS}"
  53. def _normalize_links(data: Dict, links_key: str, ids_key: str, default_type: str):
  54. """
  55. 统一两种输入格式:
  56. - {links_key: [{id, relation_type}, ...]} → 使用指定 type
  57. - {ids_key: [id1, id2, ...]} → 使用 default_type
  58. 两个 key 都没有返回 None(不更新)
  59. """
  60. if links_key in data and data[links_key] is not None:
  61. out = []
  62. for item in data[links_key]:
  63. if isinstance(item, dict):
  64. out.append((item['id'], item.get('relation_type', default_type)))
  65. else:
  66. out.append((item, default_type))
  67. return out
  68. if ids_key in data and data[ids_key] is not None:
  69. return [(i, default_type) for i in data[ids_key]]
  70. return None
  class PostgreSQLStore:
      """Store for `knowledge` rows and their junction-table edges in PostgreSQL.

      The connection is kept in autocommit mode, so single-statement operations
      are committed as soon as they execute. Operations that issue several
      statements (insert, insert_batch, update, delete) temporarily leave
      autocommit mode and run inside one transaction, so a failure part-way
      through rolls everything back instead of leaving partial writes.
      """

      # Columns whose values are serialized with json.dumps before writing.
      _JSON_COLUMNS = ('tags', 'source', 'eval')

      # Keys of an input dict that address junction tables, not `knowledge` columns.
      _REL_KEYS = ('requirement_ids', 'requirement_links',
                   'capability_ids', 'capability_links',
                   'tool_ids', 'tool_links', 'resource_ids')

      # (junction table, foreign-key column, typed-links key, flat-ids key)
      _LINK_TABLES = (
          ('requirement_knowledge', 'requirement_id', 'requirement_links', 'requirement_ids'),
          ('capability_knowledge', 'capability_id', 'capability_links', 'capability_ids'),
          ('tool_knowledge', 'tool_id', 'tool_links', 'tool_ids'),
      )

      # relation_filters key -> EXISTS clause taking one bound parameter.
      _RELATION_FILTER_SQL = {
          'requirement_id': "EXISTS (SELECT 1 FROM requirement_knowledge rk WHERE rk.knowledge_id = knowledge.id AND rk.requirement_id = %s)",
          'capability_id': "EXISTS (SELECT 1 FROM capability_knowledge ck WHERE ck.knowledge_id = knowledge.id AND ck.capability_id = %s)",
          'tool_id': "EXISTS (SELECT 1 FROM tool_knowledge tk WHERE tk.knowledge_id = knowledge.id AND tk.tool_id = %s)",
      }

      # Shared by insert() and insert_batch(); 16 columns, 16 placeholders.
      _INSERT_SQL = """
          INSERT INTO knowledge (
              id, task_embedding, content_embedding, message_id, task, content, types, tags,
              tag_keys, scopes, owner, source, eval,
              created_at, updated_at, status
          ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
      """

      def __init__(self):
          """Connect using the KNOWHUB_* environment variables."""
          self.conn = None
          self._reconnect()
          print(f"[PostgreSQL] 已连接到远程数据库: {os.getenv('KNOWHUB_DB')}")

      # ------------------------------------------------------------------
      # Connection handling
      # ------------------------------------------------------------------

      def _reconnect(self):
          """(Re)open the connection and put it in autocommit mode."""
          self.conn = psycopg2.connect(
              host=os.getenv('KNOWHUB_DB'),
              port=int(os.getenv('KNOWHUB_PORT', 5432)),
              user=os.getenv('KNOWHUB_USER'),
              password=os.getenv('KNOWHUB_PASSWORD'),
              database=os.getenv('KNOWHUB_DB_NAME')
          )
          self.conn.autocommit = True

      def _ensure_connection(self):
          """Reconnect if the connection is closed or fails a ``SELECT 1`` probe."""
          if self.conn.closed != 0:
              self._reconnect()
              return
          try:
              # The context manager closes the probe cursor even when the probe fails.
              with self.conn.cursor() as probe:
                  probe.execute("SELECT 1")
          except (psycopg2.OperationalError, psycopg2.InterfaceError):
              self._reconnect()

      def _get_cursor(self):
          """Return a RealDictCursor on a connection that has just been checked."""
          self._ensure_connection()
          return self.conn.cursor(cursor_factory=RealDictCursor)

      def _run_in_transaction(self, work):
          """Run ``work(cursor)`` atomically.

          Autocommit is switched off for the duration of the call; the
          transaction is committed if ``work`` returns and rolled back if it
          raises (the exception is re-raised). Autocommit is restored afterwards.
          """
          cursor = self._get_cursor()
          try:
              self.conn.autocommit = False
              try:
                  work(cursor)
                  self.conn.commit()
              except BaseException:
                  # Roll back on *any* exit path: autocommit cannot be restored
                  # while a transaction is still open.
                  self.conn.rollback()
                  raise
          finally:
              cursor.close()
              if not self.conn.closed:
                  self.conn.autocommit = True

      def _execute_write(self, statement: str, params: tuple):
          """Execute one write statement (committed immediately under autocommit)."""
          cursor = self._get_cursor()
          try:
              cursor.execute(statement, params)
              self.conn.commit()
          finally:
              cursor.close()

      # ------------------------------------------------------------------
      # Shared write helpers
      # ------------------------------------------------------------------

      @staticmethod
      def _knowledge_row(knowledge: Dict) -> tuple:
          """Build the parameter tuple for ``_INSERT_SQL`` from a knowledge dict.

          ``task_embedding`` falls back to the ``embedding`` key; tags/source/eval
          are JSON-encoded; ``status`` defaults to ``'approved'``.

          Raises:
              KeyError: If id, message_id, task, content, owner, created_at or
                  updated_at is missing.
          """
          return (
              knowledge['id'],
              knowledge.get('task_embedding') or knowledge.get('embedding'),
              knowledge.get('content_embedding'),
              knowledge['message_id'],
              knowledge['task'],
              knowledge['content'],
              knowledge.get('types', []),
              json.dumps(knowledge.get('tags', {})),
              knowledge.get('tag_keys', []),
              knowledge.get('scopes', []),
              knowledge['owner'],
              json.dumps(knowledge.get('source', {})),
              json.dumps(knowledge.get('eval', {})),
              knowledge['created_at'],
              knowledge['updated_at'],
              knowledge.get('status', 'approved'),
          )

      def _sync_links(self, cursor, knowledge_id: str, data: Dict, replace: bool = False):
          """Write the junction-table edges described by ``data``.

          For each edge kind that ``data`` provides (typed ``*_links`` or flat
          ``*_ids``, plus ``resource_ids``), insert the edges with
          ``ON CONFLICT DO NOTHING``. With ``replace=True`` the existing edges of
          that kind are deleted first (full replacement). Edge kinds that are
          absent or ``None`` in ``data`` are left untouched.
          """
          # Table/column names below come from the class constant, never from input.
          for table, fk_column, links_key, ids_key in self._LINK_TABLES:
              links = _normalize_links(data, links_key, ids_key, 'related')
              if links is None:
                  continue
              if replace:
                  cursor.execute(
                      f"DELETE FROM {table} WHERE knowledge_id = %s", (knowledge_id,))
              for target_id, relation_type in links:
                  cursor.execute(
                      f"INSERT INTO {table} ({fk_column}, knowledge_id, relation_type) "
                      "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
                      (target_id, knowledge_id, relation_type))

          resource_ids = data.get('resource_ids')
          if resource_ids is not None:
              if replace:
                  cursor.execute(
                      "DELETE FROM knowledge_resource WHERE knowledge_id = %s",
                      (knowledge_id,))
              for resource_id in resource_ids:
                  cursor.execute(
                      "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
                      (knowledge_id, resource_id))

      def _add_link(self, table: str, fk_column: str, target_id: str,
                    knowledge_id: str, relation_type: str):
          """Insert one typed edge into a ``*_knowledge`` junction table."""
          self._execute_write(
              f"INSERT INTO {table} ({fk_column}, knowledge_id, relation_type) "
              "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
              (target_id, knowledge_id, relation_type))

      # ------------------------------------------------------------------
      # Public API
      # ------------------------------------------------------------------

      def insert(self, knowledge: Dict):
          """Insert one knowledge row together with its edges, atomically."""
          def work(cursor):
              cursor.execute(self._INSERT_SQL, self._knowledge_row(knowledge))
              self._sync_links(cursor, knowledge['id'], knowledge)

          self._run_in_transaction(work)

      def _apply_relation_filters(self, where_clause: str, relation_filters: Optional[Dict[str, str]], params: list) -> str:
          """Extend ``where_clause`` with EXISTS filters on the junction tables.

          Recognized keys are ``requirement_id``, ``capability_id`` and
          ``tool_id``; unknown keys and falsy values are ignored. The value of
          every applied filter is appended to ``params`` (mutated in place).

          Returns:
              The combined clause: the input unchanged when nothing applies,
              otherwise the filters AND-ed onto it (or a fresh ``WHERE ...``
              when the input is blank).
          """
          if not relation_filters:
              return where_clause

          rel_clauses = []
          for key, value in relation_filters.items():
              clause = self._RELATION_FILTER_SQL.get(key)
              if not value or clause is None:
                  continue
              rel_clauses.append(clause)
              params.append(value)

          if not rel_clauses:
              return where_clause

          rel_where = " AND ".join(rel_clauses)
          if where_clause.strip():
              return f"{where_clause} AND {rel_where}"
          return f"WHERE {rel_where}"

      def search(self, query_embedding: List[float], filters: Optional[str] = None, limit: int = 10, relation_filters: Optional[Dict[str, str]] = None) -> List[Dict]:
          """Vector search ordered by ``task_embedding <=> query``.

          Each returned row carries ``score = 1 - distance``.
          """
          cursor = self._get_cursor()
          try:
              where_clause = self._build_where_clause(filters) if filters else ""
              filter_params = []
              where_clause = self._apply_relation_filters(
                  where_clause, relation_filters, filter_params)
              statement = f"""
                  SELECT {_SELECT_FIELDS},
                         1 - (task_embedding <=> %s::real[]) as score
                  FROM knowledge
                  {where_clause}
                  ORDER BY task_embedding <=> %s::real[]
                  LIMIT %s
              """
              # Placeholder order: score expression, WHERE filters, ORDER BY, LIMIT.
              params = [query_embedding, *filter_params, query_embedding, limit]
              cursor.execute(statement, tuple(params))
              return [self._format_result(row) for row in cursor.fetchall()]
          finally:
              cursor.close()

      def query(self, filters: str, limit: int = 100, relation_filters: Optional[Dict[str, str]] = None) -> List[Dict]:
          """Scalar (non-vector) query; returns at most ``limit`` rows."""
          cursor = self._get_cursor()
          try:
              where_clause = self._build_where_clause(filters) if filters else ""
              filter_params = []
              where_clause = self._apply_relation_filters(
                  where_clause, relation_filters, filter_params)
              statement = f"""
                  SELECT {_SELECT_FIELDS}
                  FROM knowledge
                  {where_clause}
                  LIMIT %s
              """
              cursor.execute(statement, tuple(filter_params + [limit]))
              return [self._format_result(row) for row in cursor.fetchall()]
          finally:
              cursor.close()

      def get_by_id(self, knowledge_id: str, include_embedding: bool = False) -> Optional[Dict]:
          """Fetch one row by ID, or ``None`` if absent.

          Embedding columns are only selected when ``include_embedding`` is true.
          """
          cursor = self._get_cursor()
          try:
              fields = _SELECT_FIELDS_WITH_EMB if include_embedding else _SELECT_FIELDS
              cursor.execute(f"""
                  SELECT {fields}
                  FROM knowledge WHERE id = %s
              """, (knowledge_id,))
              row = cursor.fetchone()
              return self._format_result(row) if row else None
          finally:
              cursor.close()

      def update(self, knowledge_id: str, updates: Dict):
          """Update columns and/or edges of one knowledge row, atomically.

          Keys listed in ``_REL_KEYS`` replace the corresponding edges in full;
          every other key is treated as a column of ``knowledge``. Values for
          tags/source/eval are JSON-encoded. The caller's dict is not modified.
          """
          # Function-scope import: only this method needs identifier quoting.
          from psycopg2 import sql

          columns = dict(updates)  # work on a copy; do not mutate the caller's dict
          rel_data = {key: columns.pop(key) for key in self._REL_KEYS if key in columns}

          def work(cursor):
              if columns:
                  # Column names come from the dict keys, so quote them as
                  # identifiers instead of interpolating them into the SQL text.
                  assignments = [
                      sql.SQL("{} = %s").format(sql.Identifier(column))
                      for column in columns
                  ]
                  params = [
                      json.dumps(value) if column in self._JSON_COLUMNS else value
                      for column, value in columns.items()
                  ]
                  params.append(knowledge_id)
                  statement = sql.SQL("UPDATE knowledge SET {} WHERE id = %s").format(
                      sql.SQL(", ").join(assignments))
                  cursor.execute(statement, params)
              # Edges: full replacement for every kind that was provided.
              self._sync_links(cursor, knowledge_id, rel_data, replace=True)

          self._run_in_transaction(work)

      def delete(self, knowledge_id: str):
          """Delete a knowledge row via ``cascade_delete``, inside one transaction."""
          self._run_in_transaction(
              lambda cursor: cascade_delete(cursor, 'knowledge', knowledge_id))

      def add_relation(self, source_id: str, target_id: str, relation_type: str):
          """Add one knowledge-to-knowledge relation (existing ones are kept)."""
          self._execute_write(
              "INSERT INTO knowledge_relation (source_id, target_id, relation_type) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
              (source_id, target_id, relation_type))

      def add_resource(self, knowledge_id: str, resource_id: str):
          """Add one knowledge-resource edge (existing ones are kept)."""
          self._execute_write(
              "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
              (knowledge_id, resource_id))

      def add_requirement(self, knowledge_id: str, requirement_id: str,
                          relation_type: str = 'related'):
          """Incrementally attach a requirement-knowledge edge."""
          self._add_link('requirement_knowledge', 'requirement_id',
                         requirement_id, knowledge_id, relation_type)

      def add_capability(self, knowledge_id: str, capability_id: str,
                         relation_type: str = 'related'):
          """Incrementally attach a capability-knowledge edge."""
          self._add_link('capability_knowledge', 'capability_id',
                         capability_id, knowledge_id, relation_type)

      def add_tool(self, knowledge_id: str, tool_id: str,
                   relation_type: str = 'related'):
          """Incrementally attach a tool-knowledge edge."""
          self._add_link('tool_knowledge', 'tool_id',
                         tool_id, knowledge_id, relation_type)

      def count(self) -> int:
          """Return the number of rows in ``knowledge``."""
          cursor = self._get_cursor()
          try:
              cursor.execute("SELECT COUNT(*) as count FROM knowledge")
              return cursor.fetchone()['count']
          finally:
              cursor.close()

      def _build_where_clause(self, filters: str) -> str:
          """Translate a Milvus-style filter expression into a SQL WHERE clause.

          Rewrites `` == ``/`` and ``/`` or ``, ``array_contains(col, "v")``,
          ``eval["score"]`` and double-quoted string values. Single quotes inside
          double-quoted values are doubled so they cannot terminate the SQL
          string literal.

          NOTE(security): everything outside double-quoted values is passed
          through as raw SQL, so ``filters`` must come from trusted code, never
          directly from end-user input.

          Returns:
              ``"WHERE ..."``, or ``""`` when ``filters`` is empty.
          """
          if not filters:
              return ""
          import re

          def sql_literal(value: str) -> str:
              # Standard SQL escaping: a quote inside a literal is written twice.
              return "'" + value.replace("'", "''") + "'"

          where = filters
          # Operators.
          where = where.replace(' == ', ' = ')
          where = where.replace(' or ', ' OR ')
          where = where.replace(' and ', ' AND ')
          # Array containment.
          where = re.sub(
              r'array_contains\((\w+),\s*"([^"]+)"\)',
              lambda m: f"{m.group(1)} @> ARRAY[{sql_literal(m.group(2))}]",
              where)
          # eval["score"] accessor (must run before the generic quote rewrite).
          where = where.replace('eval["score"]', "(eval->>'score')::int")
          # Any remaining double-quoted value becomes a single-quoted SQL literal.
          where = re.sub(r'"([^"]*)"', lambda m: sql_literal(m.group(1)), where)
          return f"WHERE {where}"

      def _format_result(self, row: Dict) -> Optional[Dict]:
          """Convert a fetched row into a plain dict.

          JSON text in tags/source/eval and in the relation fields is decoded,
          ``None`` relation fields become ``[]``, and truthy created_at /
          updated_at values are multiplied by 1000. Returns ``None`` for an
          empty/None row.
          """
          if not row:
              return None
          result = dict(row)

          for field in self._JSON_COLUMNS:
              if isinstance(result.get(field), str):
                  result[field] = json.loads(result[field])

          # Relation fields come from the junction-table subqueries and may
          # arrive as JSON text or as already-decoded lists.
          for field in ('requirement_ids', 'capability_ids', 'tool_ids', 'resource_ids',
                        'requirement_links', 'capability_links', 'tool_links',
                        'relations'):
              if field not in result:
                  continue
              value = result[field]
              if isinstance(value, str):
                  result[field] = json.loads(value)
              elif value is None:
                  result[field] = []

          for field in ('created_at', 'updated_at'):
              if result.get(field):
                  result[field] = result[field] * 1000
          return result

      def close(self):
          """Close the connection if one is held."""
          if self.conn:
              self.conn.close()

      def insert_batch(self, knowledge_list: List[Dict]):
          """Insert many knowledge rows and their edges in one transaction.

          Does nothing for an empty list.
          """
          if not knowledge_list:
              return
          rows = [self._knowledge_row(item) for item in knowledge_list]

          def work(cursor):
              execute_batch(cursor, self._INSERT_SQL, rows)
              for item in knowledge_list:
                  self._sync_links(cursor, item['id'], item)

          self._run_in_transaction(work)