pg_store.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. """
PostgreSQL storage wrapper (replaces Milvus).
Stores knowledge data in a remote PostgreSQL database using pgvector/fastann.
  4. """
  5. import os
  6. import json
  7. import psycopg2
  8. from psycopg2.extras import RealDictCursor, execute_batch
  9. from typing import List, Dict, Optional
  10. from dotenv import load_dotenv
  11. from knowhub.knowhub_db.cascade import cascade_delete
  12. load_dotenv()
  13. # 关联字段的子查询(从 junction table 读取,返回 JSON 数组)
  14. _REL_SUBQUERIES = """
  15. (SELECT COALESCE(json_agg(rk.requirement_id), '[]'::json)
  16. FROM requirement_knowledge rk WHERE rk.knowledge_id = knowledge.id) AS requirement_ids,
  17. (SELECT COALESCE(json_agg(ck.capability_id), '[]'::json)
  18. FROM capability_knowledge ck WHERE ck.knowledge_id = knowledge.id) AS capability_ids,
  19. (SELECT COALESCE(json_agg(tk.tool_id), '[]'::json)
  20. FROM tool_knowledge tk WHERE tk.knowledge_id = knowledge.id) AS tool_ids,
  21. (SELECT COALESCE(json_agg(kr.resource_id), '[]'::json)
  22. FROM knowledge_resource kr WHERE kr.knowledge_id = knowledge.id) AS resource_ids,
  23. (SELECT COALESCE(json_agg(json_build_object(
  24. 'target_id', krel.target_id, 'relation_type', krel.relation_type
  25. )), '[]'::json)
  26. FROM knowledge_relation krel WHERE krel.source_id = knowledge.id) AS relations
  27. """
  28. # 基础字段(不含 embedding)
  29. _BASE_FIELDS = (
  30. "id, message_id, task, content, types, tags, tag_keys, "
  31. "scopes, owner, source, eval, "
  32. "created_at, updated_at, status"
  33. )
  34. # 完整 SELECT(含关联子查询)
  35. _SELECT_FIELDS = f"{_BASE_FIELDS}, {_REL_SUBQUERIES}"
  36. # 含 embedding 的 SELECT
  37. _SELECT_FIELDS_WITH_EMB = f"task_embedding, content_embedding, {_SELECT_FIELDS}"
  38. class PostgreSQLStore:
  39. def __init__(self):
  40. """初始化 PostgreSQL 连接"""
  41. self.conn = psycopg2.connect(
  42. host=os.getenv('KNOWHUB_DB'),
  43. port=int(os.getenv('KNOWHUB_PORT', 5432)),
  44. user=os.getenv('KNOWHUB_USER'),
  45. password=os.getenv('KNOWHUB_PASSWORD'),
  46. database=os.getenv('KNOWHUB_DB_NAME')
  47. )
  48. self.conn.autocommit = False
  49. print(f"[PostgreSQL] 已连接到远程数据库: {os.getenv('KNOWHUB_DB')}")
  50. def _reconnect(self):
  51. self.conn = psycopg2.connect(
  52. host=os.getenv('KNOWHUB_DB'),
  53. port=int(os.getenv('KNOWHUB_PORT', 5432)),
  54. user=os.getenv('KNOWHUB_USER'),
  55. password=os.getenv('KNOWHUB_PASSWORD'),
  56. database=os.getenv('KNOWHUB_DB_NAME')
  57. )
  58. self.conn.autocommit = False
  59. def _ensure_connection(self):
  60. if self.conn.closed != 0:
  61. self._reconnect()
  62. else:
  63. try:
  64. c = self.conn.cursor()
  65. c.execute("SELECT 1")
  66. c.close()
  67. except (psycopg2.OperationalError, psycopg2.InterfaceError):
  68. self._reconnect()
  69. def _get_cursor(self):
  70. """获取游标"""
  71. self._ensure_connection()
  72. return self.conn.cursor(cursor_factory=RealDictCursor)
  73. def insert(self, knowledge: Dict):
  74. """插入单条知识"""
  75. cursor = self._get_cursor()
  76. try:
  77. cursor.execute("""
  78. INSERT INTO knowledge (
  79. id, task_embedding, content_embedding, message_id, task, content, types, tags,
  80. tag_keys, scopes, owner, source, eval,
  81. created_at, updated_at, status
  82. ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  83. """, (
  84. knowledge['id'],
  85. knowledge.get('task_embedding') or knowledge.get('embedding'),
  86. knowledge.get('content_embedding'),
  87. knowledge['message_id'],
  88. knowledge['task'],
  89. knowledge['content'],
  90. knowledge.get('types', []),
  91. json.dumps(knowledge.get('tags', {})),
  92. knowledge.get('tag_keys', []),
  93. knowledge.get('scopes', []),
  94. knowledge['owner'],
  95. json.dumps(knowledge.get('source', {})),
  96. json.dumps(knowledge.get('eval', {})),
  97. knowledge['created_at'],
  98. knowledge['updated_at'],
  99. knowledge.get('status', 'approved'),
  100. ))
  101. # 写入关联表
  102. kid = knowledge['id']
  103. for req_id in knowledge.get('requirement_ids', []):
  104. cursor.execute(
  105. "INSERT INTO requirement_knowledge (requirement_id, knowledge_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  106. (req_id, kid))
  107. for cap_id in knowledge.get('capability_ids', []):
  108. cursor.execute(
  109. "INSERT INTO capability_knowledge (capability_id, knowledge_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  110. (cap_id, kid))
  111. for tool_id in knowledge.get('tool_ids', []):
  112. cursor.execute(
  113. "INSERT INTO tool_knowledge (tool_id, knowledge_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  114. (tool_id, kid))
  115. for res_id in knowledge.get('resource_ids', []):
  116. cursor.execute(
  117. "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  118. (kid, res_id))
  119. self.conn.commit()
  120. finally:
  121. cursor.close()
  122. def search(self, query_embedding: List[float], filters: Optional[str] = None, limit: int = 10) -> List[Dict]:
  123. """向量检索(使用余弦相似度)"""
  124. cursor = self._get_cursor()
  125. try:
  126. where_clause = self._build_where_clause(filters) if filters else ""
  127. sql = f"""
  128. SELECT {_SELECT_FIELDS},
  129. 1 - (task_embedding <=> %s::real[]) as score
  130. FROM knowledge
  131. {where_clause}
  132. ORDER BY task_embedding <=> %s::real[]
  133. LIMIT %s
  134. """
  135. cursor.execute(sql, (query_embedding, query_embedding, limit))
  136. results = cursor.fetchall()
  137. return [self._format_result(r) for r in results]
  138. finally:
  139. cursor.close()
  140. def query(self, filters: str, limit: int = 100) -> List[Dict]:
  141. """纯标量查询"""
  142. cursor = self._get_cursor()
  143. try:
  144. where_clause = self._build_where_clause(filters)
  145. sql = f"""
  146. SELECT {_SELECT_FIELDS}
  147. FROM knowledge
  148. {where_clause}
  149. LIMIT %s
  150. """
  151. cursor.execute(sql, (limit,))
  152. results = cursor.fetchall()
  153. return [self._format_result(r) for r in results]
  154. finally:
  155. cursor.close()
  156. def get_by_id(self, knowledge_id: str, include_embedding: bool = False) -> Optional[Dict]:
  157. """根据ID获取知识(默认不返回embedding以提升性能)"""
  158. cursor = self._get_cursor()
  159. try:
  160. fields = _SELECT_FIELDS_WITH_EMB if include_embedding else _SELECT_FIELDS
  161. cursor.execute(f"""
  162. SELECT {fields}
  163. FROM knowledge WHERE id = %s
  164. """, (knowledge_id,))
  165. result = cursor.fetchone()
  166. return self._format_result(result) if result else None
  167. finally:
  168. cursor.close()
  169. def update(self, knowledge_id: str, updates: Dict):
  170. """更新知识"""
  171. cursor = self._get_cursor()
  172. try:
  173. # 分离关联字段和实体字段
  174. req_ids = updates.pop('requirement_ids', None)
  175. cap_ids = updates.pop('capability_ids', None)
  176. tool_ids = updates.pop('tool_ids', None)
  177. resource_ids = updates.pop('resource_ids', None)
  178. if updates:
  179. set_parts = []
  180. params = []
  181. for key, value in updates.items():
  182. if key in ('tags', 'source', 'eval'):
  183. set_parts.append(f"{key} = %s")
  184. params.append(json.dumps(value))
  185. else:
  186. set_parts.append(f"{key} = %s")
  187. params.append(value)
  188. params.append(knowledge_id)
  189. sql = f"UPDATE knowledge SET {', '.join(set_parts)} WHERE id = %s"
  190. cursor.execute(sql, params)
  191. # 更新关联表(全量替换)
  192. if req_ids is not None:
  193. cursor.execute("DELETE FROM requirement_knowledge WHERE knowledge_id = %s", (knowledge_id,))
  194. for req_id in req_ids:
  195. cursor.execute(
  196. "INSERT INTO requirement_knowledge (requirement_id, knowledge_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  197. (req_id, knowledge_id))
  198. if cap_ids is not None:
  199. cursor.execute("DELETE FROM capability_knowledge WHERE knowledge_id = %s", (knowledge_id,))
  200. for cap_id in cap_ids:
  201. cursor.execute(
  202. "INSERT INTO capability_knowledge (capability_id, knowledge_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  203. (cap_id, knowledge_id))
  204. if tool_ids is not None:
  205. cursor.execute("DELETE FROM tool_knowledge WHERE knowledge_id = %s", (knowledge_id,))
  206. for tool_id in tool_ids:
  207. cursor.execute(
  208. "INSERT INTO tool_knowledge (tool_id, knowledge_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  209. (tool_id, knowledge_id))
  210. if resource_ids is not None:
  211. cursor.execute("DELETE FROM knowledge_resource WHERE knowledge_id = %s", (knowledge_id,))
  212. for res_id in resource_ids:
  213. cursor.execute(
  214. "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  215. (knowledge_id, res_id))
  216. self.conn.commit()
  217. finally:
  218. cursor.close()
  219. def delete(self, knowledge_id: str):
  220. """删除知识及其关联表记录"""
  221. cursor = self._get_cursor()
  222. try:
  223. cascade_delete(cursor, 'knowledge', knowledge_id)
  224. self.conn.commit()
  225. finally:
  226. cursor.close()
  227. def add_relation(self, source_id: str, target_id: str, relation_type: str):
  228. """添加一条知识间关系(不删除已有关系)"""
  229. cursor = self._get_cursor()
  230. try:
  231. cursor.execute(
  232. "INSERT INTO knowledge_relation (source_id, target_id, relation_type) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
  233. (source_id, target_id, relation_type))
  234. self.conn.commit()
  235. finally:
  236. cursor.close()
  237. def add_resource(self, knowledge_id: str, resource_id: str):
  238. """添加一条知识-资源关联(不删除已有关联)"""
  239. cursor = self._get_cursor()
  240. try:
  241. cursor.execute(
  242. "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  243. (knowledge_id, resource_id))
  244. self.conn.commit()
  245. finally:
  246. cursor.close()
  247. def count(self) -> int:
  248. """返回知识总数"""
  249. cursor = self._get_cursor()
  250. try:
  251. cursor.execute("SELECT COUNT(*) as count FROM knowledge")
  252. return cursor.fetchone()['count']
  253. finally:
  254. cursor.close()
  255. def _build_where_clause(self, filters: str) -> str:
  256. """将Milvus风格的过滤表达式转换为PostgreSQL WHERE子句"""
  257. if not filters:
  258. return ""
  259. where = filters
  260. import re
  261. # 替换操作符
  262. where = where.replace(' == ', ' = ')
  263. where = where.replace(' or ', ' OR ')
  264. where = where.replace(' and ', ' AND ')
  265. # 处理数组包含操作
  266. where = re.sub(r'array_contains\((\w+),\s*"([^"]+)"\)', r"\1 @> ARRAY['\2']", where)
  267. # 处理 eval["score"] 语法
  268. where = where.replace('eval["score"]', "(eval->>'score')::int")
  269. # 把所有剩余的双引号字符串值替换为单引号(PostgreSQL标准)
  270. where = re.sub(r'"([^"]*)"', r"'\1'", where)
  271. return f"WHERE {where}"
  272. def _format_result(self, row: Dict) -> Dict:
  273. """格式化查询结果"""
  274. if not row:
  275. return None
  276. result = dict(row)
  277. if 'tags' in result and isinstance(result['tags'], str):
  278. result['tags'] = json.loads(result['tags'])
  279. if 'source' in result and isinstance(result['source'], str):
  280. result['source'] = json.loads(result['source'])
  281. if 'eval' in result and isinstance(result['eval'], str):
  282. result['eval'] = json.loads(result['eval'])
  283. # 关联字段(来自 junction table 子查询,可能是 JSON 字符串或已解析的列表)
  284. for field in ('requirement_ids', 'capability_ids', 'tool_ids', 'resource_ids'):
  285. if field in result and isinstance(result[field], str):
  286. result[field] = json.loads(result[field])
  287. elif field in result and result[field] is None:
  288. result[field] = []
  289. if 'relations' in result and isinstance(result['relations'], str):
  290. result['relations'] = json.loads(result['relations'])
  291. elif 'relations' in result and result['relations'] is None:
  292. result['relations'] = []
  293. if 'created_at' in result and result['created_at']:
  294. result['created_at'] = result['created_at'] * 1000
  295. if 'updated_at' in result and result['updated_at']:
  296. result['updated_at'] = result['updated_at'] * 1000
  297. return result
  298. def close(self):
  299. """关闭连接"""
  300. if self.conn:
  301. self.conn.close()
  302. def insert_batch(self, knowledge_list: List[Dict]):
  303. """批量插入知识"""
  304. if not knowledge_list:
  305. return
  306. cursor = self._get_cursor()
  307. try:
  308. data = []
  309. for k in knowledge_list:
  310. data.append((
  311. k['id'], k.get('task_embedding') or k.get('embedding'),
  312. k.get('content_embedding'),
  313. k['message_id'], k['task'],
  314. k['content'], k.get('types', []),
  315. json.dumps(k.get('tags', {})), k.get('tag_keys', []),
  316. k.get('scopes', []), k['owner'],
  317. json.dumps(k.get('source', {})), json.dumps(k.get('eval', {})),
  318. k['created_at'], k['updated_at'], k.get('status', 'approved'),
  319. ))
  320. execute_batch(cursor, """
  321. INSERT INTO knowledge (
  322. id, task_embedding, content_embedding, message_id, task, content, types, tags,
  323. tag_keys, scopes, owner, source, eval,
  324. created_at, updated_at, status
  325. ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  326. """, data)
  327. # 批量写入关联表
  328. for k in knowledge_list:
  329. kid = k['id']
  330. for req_id in k.get('requirement_ids', []):
  331. cursor.execute(
  332. "INSERT INTO requirement_knowledge (requirement_id, knowledge_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  333. (req_id, kid))
  334. for cap_id in k.get('capability_ids', []):
  335. cursor.execute(
  336. "INSERT INTO capability_knowledge (capability_id, knowledge_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  337. (cap_id, kid))
  338. for tool_id in k.get('tool_ids', []):
  339. cursor.execute(
  340. "INSERT INTO tool_knowledge (tool_id, knowledge_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  341. (tool_id, kid))
  342. for res_id in k.get('resource_ids', []):
  343. cursor.execute(
  344. "INSERT INTO knowledge_resource (knowledge_id, resource_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
  345. (kid, res_id))
  346. self.conn.commit()
  347. finally:
  348. cursor.close()