|
@@ -0,0 +1,94 @@
|
|
|
|
|
+-- ============================================================================
|
|
|
|
|
+-- 文章解构结果缓存表 (对称 material_deconstruct_result)
|
|
|
|
|
+-- ============================================================================
|
|
|
|
|
+--
|
|
|
|
|
+-- 数据来源: AIGC API task callback detail (taskId=66)
|
|
|
|
|
+-- 写入方: ArticleVectorJob.syncArticleDeconstructJob()
|
|
|
|
|
+-- 读取方: ArticleVectorJob.vectorArticleJob() — 分页扫描 article_id
|
|
|
|
|
+-- VectorRecallTestServiceImpl — 按 article_id 批量查 result JSON
|
|
|
|
|
+-- 去重: UNIQUE (article_id, source),同一 article_id + source 只写入一次
|
|
|
|
|
+-- 清理: 暂无自动清理,需手动处理
|
|
|
|
|
+
|
|
|
|
|
+CREATE TABLE IF NOT EXISTS article_deconstruct_result ( -- cache of article deconstruction results: one row per (article_id, source)
|
|
|
|
|
+ id BIGSERIAL PRIMARY KEY, -- surrogate key
|
|
|
|
|
+ article_id VARCHAR(128) NOT NULL, -- article ID (taken from the AIGC bizUniqueId)
|
|
|
|
|
+ source VARCHAR(64) NOT NULL DEFAULT 'aigc_deconstruct', -- data source tag
|
|
|
|
|
+ result TEXT, -- full deconstruction result JSON (dataContent), stored as text; nullable
|
|
|
|
|
+ create_time TIMESTAMP NOT NULL DEFAULT NOW(), -- first write time (TIMESTAMP without time zone)
|
|
|
|
|
+ update_time TIMESTAMP NOT NULL DEFAULT NOW(), -- last update time (not refreshed when an insert is skipped by ON CONFLICT DO NOTHING)
|
|
|
|
|
+ CONSTRAINT uq_article_deconstruct_article_source UNIQUE (article_id, source) -- dedup key: at most one row per article per source
|
|
|
|
|
+);
|
|
|
|
|
+
|
|
|
|
|
+-- Both the paged scan and the batch lookups filter on source:
|
|
|
|
|
+-- selectArticleIdsBySourcePaged: WHERE source = ? ORDER BY article_id
|
|
|
|
|
+-- selectExistingArticleIds / selectResultsByArticleIds: WHERE source = ? AND article_id IN (...)
|
|
|
|
|
+CREATE INDEX IF NOT EXISTS idx_article_deconstruct_source_article
|
|
|
|
|
+ ON article_deconstruct_result(source, article_id); -- source leads here; the UNIQUE (article_id, source) constraint leads with article_id instead
|
|
|
|
|
+
|
|
|
|
|
+COMMENT ON TABLE article_deconstruct_result IS '文章解构结果缓存 (数据来源: AIGC taskId=66)'; -- EN: article deconstruction result cache (data source: AIGC taskId=66)
|
|
|
|
|
+COMMENT ON COLUMN article_deconstruct_result.article_id IS '文章 ID (对应 AIGC bizUniqueId)'; -- EN: article ID (maps to AIGC bizUniqueId)
|
|
|
|
|
+COMMENT ON COLUMN article_deconstruct_result.source IS '数据来源标识 (默认 aigc_deconstruct)'; -- EN: data source tag (default aigc_deconstruct)
|
|
|
|
|
+COMMENT ON COLUMN article_deconstruct_result.result IS '解构结果 JSON 完整内容 (dataContent)'; -- EN: full deconstruction result JSON (dataContent)
|
|
|
|
|
+COMMENT ON COLUMN article_deconstruct_result.create_time IS '首次写入时间'; -- EN: first write time
|
|
|
|
|
+COMMENT ON COLUMN article_deconstruct_result.update_time IS '最近更新时间'; -- EN: last update time
|
|
|
|
|
+
|
|
|
|
|
+-- ============================================================================
|
|
|
|
|
+-- 文章向量存储表 (对称 material_vectors)
|
|
|
|
|
+-- ============================================================================
|
|
|
|
|
+--
|
|
|
|
|
+-- 向量维度: 1024 (pgvector)
|
|
|
|
|
+-- 写入方: ArticleVectorJob.vectorArticleJob() — upsert 单条/多点的文本向量
|
|
|
|
|
+-- 读取方: VectorRecallTestServiceImpl.recallArticleItems() — 余弦相似度 TopN 检索
|
|
|
|
|
+-- ArticleVectorJob — text_hash 缓存命中检测、existsByIds 跳过已处理
|
|
|
|
|
+-- 去重: UNIQUE (config_code, article_id, point_index)
|
|
|
|
|
+-- 清理: deleteAbovePointIndex 多点模式下清理旧点
|
|
|
|
|
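+-- Vector index: not created by this script; run CREATE INDEX after the initial bulk load (IVFFlat needs existing rows to train its lists; HNSW has no training step).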
|
|
|
|
|
+
|
|
|
|
|
+CREATE TABLE IF NOT EXISTS article_vectors ( -- article embedding store: one row per (config_code, article_id, point_index)
|
|
|
|
|
+ id BIGSERIAL PRIMARY KEY, -- surrogate key
|
|
|
|
|
+ article_id VARCHAR(128) NOT NULL, -- article ID
|
|
|
|
|
+ config_code VARCHAR(64) NOT NULL, -- vector configuration code (e.g. VIDEO_TOPIC)
|
|
|
|
|
+ point_index INTEGER NOT NULL DEFAULT 0, -- vector point index: single-point = 0, multi-point = 0,1,2,...
|
|
|
|
|
+ embedding vector(1024), -- embedding (pgvector, 1024 dimensions, searched by cosine distance); nullable
|
|
|
|
|
+ text TEXT, -- the original text that was embedded
|
|
|
|
|
+ text_hash VARCHAR(64), -- MD5 of the original text, used to reuse embeddings across materials/articles
|
|
|
|
|
+ created_at TIMESTAMP NOT NULL DEFAULT NOW(), -- first write time (TIMESTAMP without time zone)
|
|
|
|
|
+ updated_at TIMESTAMP NOT NULL DEFAULT NOW(), -- last update time
|
|
|
|
|
+ CONSTRAINT uq_article_vectors_cfg_id_point UNIQUE (config_code, article_id, point_index) -- dedup key for vector rows
|
|
|
|
|
+);
|
|
|
|
|
+
|
|
|
|
|
+-- text_hash cache lookup: WHERE text_hash = ? AND config_code = ? LIMIT 1
|
|
|
|
|
+CREATE INDEX IF NOT EXISTS idx_article_vectors_text_hash
|
|
|
|
|
+ ON article_vectors(text_hash, config_code); -- both equality columns of the cache lookup are indexed
|
|
|
|
|
+
|
|
|
|
|
+-- Batch existence checks and full ID listings filter on: WHERE config_code = ? [AND article_id IN (...)]
|
|
|
|
|
+-- Those predicates are a left prefix of uq_article_vectors_cfg_id_point (config_code, article_id, point_index),
|
|
|
|
|
+-- so the index backing that UNIQUE constraint already serves them, including index-only scans.
|
|
|
|
|
+-- A separate (config_code, article_id) index would only add write and storage overhead, so none is created.
|
|
|
|
|
+
|
|
|
|
|
+COMMENT ON TABLE article_vectors IS '文章向量存储表 (pgvector, 余弦相似度检索)'; -- EN: article vector store (pgvector, cosine-similarity retrieval)
|
|
|
|
|
+COMMENT ON COLUMN article_vectors.article_id IS '文章 ID'; -- EN: article ID
|
|
|
|
|
+COMMENT ON COLUMN article_vectors.config_code IS '向量配置编码 (如 VIDEO_TOPIC)'; -- EN: vector configuration code (e.g. VIDEO_TOPIC)
|
|
|
|
|
+COMMENT ON COLUMN article_vectors.point_index IS '向量点索引: 单点=0, 多点模式=0,1,2,...'; -- EN: vector point index: single-point = 0, multi-point mode = 0,1,2,...
|
|
|
|
|
+COMMENT ON COLUMN article_vectors.embedding IS '向量数据 (pgvector vector(1024))'; -- EN: embedding data (pgvector vector(1024))
|
|
|
|
|
+COMMENT ON COLUMN article_vectors.text IS '向量化的原始文本内容'; -- EN: original text that was embedded
|
|
|
|
|
+COMMENT ON COLUMN article_vectors.text_hash IS '原始文本 MD5 (跨素材/文章复用 embedding 缓存)'; -- EN: MD5 of the original text (embedding cache reused across materials/articles)
|
|
|
|
|
+COMMENT ON COLUMN article_vectors.created_at IS '首次写入时间'; -- EN: first write time
|
|
|
|
|
+COMMENT ON COLUMN article_vectors.updated_at IS '最近更新时间'; -- EN: last update time
|
|
|
|
|
+
|
|
|
|
|
+-- ============================================================================
|
|
|
|
|
+-- pgvector 向量索引 (HNSW / IVFFlat)
|
|
|
|
|
+-- ============================================================================
|
|
|
|
|
+--
|
|
|
|
|
+-- 注意: IVFFlat 需要一定量数据 (建议 > 1000 行) 才能有效训练,建表后暂不自动创建。
|
|
|
|
|
+-- 待 articleJob 写入首批数据后,按需执行以下语句:
|
|
|
|
|
+--
|
|
|
|
|
+-- CREATE INDEX IF NOT EXISTS idx_article_vectors_embedding_ivfflat
|
|
|
|
|
+-- ON article_vectors USING ivfflat (embedding vector_cosine_ops)
|
|
|
|
|
+-- WITH (lists = 100);
|
|
|
|
|
+--
|
|
|
|
|
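+-- Alternatively use HNSW (pgvector 0.5.0+): no training step, so it can be created on an empty table; better speed-recall tradeoff than IVFFlat, but slower to build and uses more memory: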
|
|
|
|
|
+--
|
|
|
|
|
+-- CREATE INDEX IF NOT EXISTS idx_article_vectors_embedding_hnsw
|
|
|
|
|
+-- ON article_vectors USING hnsw (embedding vector_cosine_ops)
|
|
|
|
|
+-- WITH (m = 16, ef_construction = 200);
|