luojunhui пре 1 дан
родитељ
комит
cb5637b931

+ 3 - 0
.gitignore

@@ -40,3 +40,6 @@ config-cache/
 
 ### 回归测试生成的快照 ###
 script/recall_test_results/
+
+CLAUDE.md
+docs

+ 94 - 0
core/src/main/resources/sql/article_vector_ddl.sql

@@ -0,0 +1,94 @@
+-- ============================================================================
+-- Article deconstruct result cache table (mirrors material_deconstruct_result)
+-- ============================================================================
+--
+-- Data source: AIGC API task callback detail (taskId=66)
+-- Writer:      ArticleVectorJob.syncArticleDeconstructJob()
+-- Readers:     ArticleVectorJob.vectorArticleJob() — paged scan over article_id
+--              VectorRecallTestServiceImpl — batch lookup of result JSON by article_id
+-- Dedup:       UNIQUE (article_id, source); a given article_id + source is written only once
+-- Cleanup:     no automatic cleanup yet; must be handled manually
+
+CREATE TABLE IF NOT EXISTS article_deconstruct_result (
+    id          BIGSERIAL    PRIMARY KEY,
+    article_id  VARCHAR(128) NOT NULL,                -- article ID (from AIGC bizUniqueId)
+    source      VARCHAR(64)  NOT NULL DEFAULT 'aigc_deconstruct', -- data source identifier
+    result      TEXT,                                  -- full deconstruct result JSON (dataContent)
+    create_time TIMESTAMP    NOT NULL DEFAULT NOW(),   -- time of first write
+    update_time TIMESTAMP    NOT NULL DEFAULT NOW(),   -- time of last update (not touched on ON CONFLICT DO NOTHING)
+    CONSTRAINT uq_article_deconstruct_article_source UNIQUE (article_id, source)
+);
+
+-- Both the paged scan and the batch lookups filter on source
+-- selectArticleIdsBySourcePaged: WHERE source = ? ORDER BY article_id
+-- selectExistingArticleIds / selectResultsByArticleIds: WHERE source = ? AND article_id IN (...)
+CREATE INDEX IF NOT EXISTS idx_article_deconstruct_source_article
+    ON article_deconstruct_result(source, article_id);
+
+COMMENT ON TABLE  article_deconstruct_result IS '文章解构结果缓存 (数据来源: AIGC taskId=66)';
+COMMENT ON COLUMN article_deconstruct_result.article_id  IS '文章 ID (对应 AIGC bizUniqueId)';
+COMMENT ON COLUMN article_deconstruct_result.source      IS '数据来源标识 (默认 aigc_deconstruct)';
+COMMENT ON COLUMN article_deconstruct_result.result      IS '解构结果 JSON 完整内容 (dataContent)';
+COMMENT ON COLUMN article_deconstruct_result.create_time IS '首次写入时间';
+COMMENT ON COLUMN article_deconstruct_result.update_time IS '最近更新时间';
+
+-- ============================================================================
+-- Article vector storage table (mirrors material_vectors)
+-- ============================================================================
+--
+-- Vector dimension: 1024 (pgvector)
+-- Writer:       ArticleVectorJob.vectorArticleJob() — upserts single-point / multi-point text vectors
+-- Readers:      VectorRecallTestServiceImpl.recallArticleItems() — cosine-similarity top-N retrieval
+--               ArticleVectorJob — text_hash cache-hit detection, existsByIds to skip processed rows
+-- Dedup:        UNIQUE (config_code, article_id, point_index)
+-- Cleanup:      deleteAbovePointIndex removes stale points in multi-point mode
+-- Vector index: run CREATE INDEX only after a batch of rows has been inserted (ivfflat needs training data)
+
+-- The vector(1024) column type is provided by the pgvector extension. Enable it here so the
+-- script also works on a database where it has not been installed yet; the statement is a
+-- no-op (NOTICE only) when the extension already exists.
+CREATE EXTENSION IF NOT EXISTS vector;
+
+CREATE TABLE IF NOT EXISTS article_vectors (
+    id          BIGSERIAL    PRIMARY KEY,
+    article_id  VARCHAR(128) NOT NULL,                       -- article ID
+    config_code VARCHAR(64)  NOT NULL,                       -- vector config code (e.g. VIDEO_TOPIC)
+    point_index INTEGER      NOT NULL DEFAULT 0,             -- vector point index: single-point = 0, multi-point = 0,1,2,...
+    embedding   vector(1024),                                -- vector data (pgvector, cosine-distance retrieval)
+    text        TEXT,                                        -- original text that was vectorized
+    text_hash   VARCHAR(64),                                 -- MD5 of the original text, used to reuse embeddings across materials/articles
+    created_at  TIMESTAMP    NOT NULL DEFAULT NOW(),         -- time of first write
+    updated_at  TIMESTAMP    NOT NULL DEFAULT NOW(),         -- time of last update
+    CONSTRAINT uq_article_vectors_cfg_id_point UNIQUE (config_code, article_id, point_index)
+);
+
+-- text_hash cache lookup: WHERE text_hash = ? AND config_code = ? LIMIT 1
+CREATE INDEX IF NOT EXISTS idx_article_vectors_text_hash
+    ON article_vectors(text_hash, config_code);
+
+-- Batch existence checks and full ID listing: WHERE config_code = ? [AND article_id IN (...)]
+-- No dedicated (config_code, article_id) index is created for these queries: the B-tree index
+-- backing uq_article_vectors_cfg_id_point (config_code, article_id, point_index) already has both
+-- columns as its leading prefix, so a second index would only add write and storage overhead.
+
+COMMENT ON TABLE  article_vectors IS '文章向量存储表 (pgvector, 余弦相似度检索)';
+COMMENT ON COLUMN article_vectors.article_id  IS '文章 ID';
+COMMENT ON COLUMN article_vectors.config_code IS '向量配置编码 (如 VIDEO_TOPIC)';
+COMMENT ON COLUMN article_vectors.point_index IS '向量点索引: 单点=0, 多点模式=0,1,2,...';
+COMMENT ON COLUMN article_vectors.embedding   IS '向量数据 (pgvector vector(1024))';
+COMMENT ON COLUMN article_vectors.text        IS '向量化的原始文本内容';
+COMMENT ON COLUMN article_vectors.text_hash   IS '原始文本 MD5 (跨素材/文章复用 embedding 缓存)';
+COMMENT ON COLUMN article_vectors.created_at  IS '首次写入时间';
+COMMENT ON COLUMN article_vectors.updated_at  IS '最近更新时间';
+
+-- ============================================================================
+-- pgvector 向量索引 (HNSW / IVFFlat)
+-- ============================================================================
+--
+-- 注意: IVFFlat 需要一定量数据 (建议 > 1000 行) 才能有效训练,建表后暂不自动创建。
+-- 待 articleJob 写入首批数据后,按需执行以下语句:
+--
+--   CREATE INDEX IF NOT EXISTS idx_article_vectors_embedding_ivfflat
+--       ON article_vectors USING ivfflat (embedding vector_cosine_ops)
+--       WITH (lists = 100);
+--
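+-- Alternatively use HNSW (pgvector 0.5.0+): slower to build and more memory-hungry than IVFFlat,
+-- but usually a better speed/recall trade-off at query time, and it needs no training data: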
+--
+--   CREATE INDEX IF NOT EXISTS idx_article_vectors_embedding_hnsw
+--       ON article_vectors USING hnsw (embedding vector_cosine_ops)
+--       WITH (m = 16, ef_construction = 200);

+ 137 - 0
server/src/test/resources/application-test-local.yml

@@ -0,0 +1,137 @@
+# Offline test profile: for running MaterialEmbeddingTestRunner manually on a local machine
+# - disables the Eureka / Apollo / XXL-Job admin connections
+# - reuses the dev database, Redis and DashScope settings (reachable over the public internet)
+# - does not start a web container (main already sets spring.main.web-application-type=none)
+
+server:
+  port: 0  # do not occupy a port
+
+# Spring core settings for the offline profile: no web container, two JDBC data sources, Redis,
+# and Spring Cloud discovery switched off.
+# SECURITY: passwords must never be committed. They are read through Spring property
+# placeholders, so export VIDEO_VECTOR_DB_PASSWORD, PG_VECTOR_DB_PASSWORD and REDIS_PASSWORD
+# (or pass them as -D system properties) before running. The values that were previously
+# committed in this file should be treated as leaked and rotated.
+spring:
+  main:
+    web-application-type: none
+    allow-bean-definition-overriding: true
+  datasource:
+    video-vector:
+      driver-class-name: com.mysql.jdbc.Driver
+      jdbc-url: jdbc:mysql://rm-bp17q95335a99272b.mysql.rds.aliyuncs.com:3306/deconstruct-vector?useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull&useSSL=false
+      username: crawler
+      password: ${VIDEO_VECTOR_DB_PASSWORD}
+      type: com.zaxxer.hikari.HikariDataSource
+      hikari:
+        minimum-idle: 2
+        maximum-pool-size: 5
+        connection-test-query: SELECT 1
+    pg-vector:
+      driver-class-name: org.postgresql.Driver
+      jdbc-url: jdbc:postgresql://pgm-bp1x72iry10srsc2jo.pg.rds.aliyuncs.com/vector?currentSchema=public
+      username: vector
+      password: ${PG_VECTOR_DB_PASSWORD}
+      type: com.zaxxer.hikari.HikariDataSource
+      hikari:
+        minimum-idle: 2
+        maximum-pool-size: 5
+        connection-test-query: SELECT 1
+        # Session-level pgvector setting applied to each new connection
+        connection-init-sql: SET hnsw.ef_search = 100
+
+  redis:
+    host: r-bp1zg8fw8db0vxdo2mpd.redis.rds.aliyuncs.com
+    port: 6379
+    username: r-bp1zg8fw8db0vxdo2m
+    password: ${REDIS_PASSWORD}
+
+  cloud:
+    discovery:
+      enabled: false   # turn off Spring Cloud service discovery
+
+# Disable the Eureka client; offline tests do not register with the service registry
+eureka:
+  client:
+    enabled: false
+    register-with-eureka: false
+    fetch-registry: false
+
+# Disable Apollo bootstrap loading
+apollo:
+  bootstrap:
+    enabled: false
+  autoUpdateInjectedSpringProperties: false
+
+# XXL-Job placeholders (XxlJobConfig is excluded from ComponentScan, but @Value still needs default values; even if it were not excluded, no real connection would be made)
+xxl:
+  job:
+    accessToken:
+    admin:
+      addresses: ""
+    executor:
+      appname: video-vector-server
+      address: ""
+      ip: ""
+      port: 0
+      logpath: /tmp/xxl-job-test
+      logretentiondays: 1
+
+# DashScope embedding settings (endpoint is reachable over the public internet)
+embedding:
+  mode: dashscope
+  cache:
+    expire: 3600
+
+# SECURITY: the API key must never be committed. It is read through a Spring property
+# placeholder, so export DASHSCOPE_API_KEY (or pass -DDASHSCOPE_API_KEY=...) before running.
+# The key that was previously committed in this file should be treated as leaked and revoked.
+dashscope:
+  embedding:
+    dashscope:
+      api-url: https://dashscope.aliyuncs.com/api/v1/services/embeddings/multimodal-embedding/multimodal-embedding
+      api-key: ${DASHSCOPE_API_KEY}
+      timeout: 60
+
+# Fallback values for fields normally supplied by Apollo
+video:
+  detail:
+    metrics:
+      days: 7
+  api:
+    timeout: 30
+
+# AIGC material task → sourceType mapping (defaults to an empty map; for offline tests inject it via -D or edit it here)
+# Example: {"67": 1, "69": 2}  — materials from task 67 get sourceType=1 (external partner), materials from task 69 get sourceType=2 (internal material)
+aigc:
+  api:
+    timeout: 30
+  material:
+    task:
+      source:
+        map: "{}"
+    source:
+      type:
+        default: 2
+
+# Per-package log levels for the offline run: project packages at DEBUG, third-party libraries quieter
+logging:
+  level:
+    com.tzld.videoVector: DEBUG
+    com.tzld.videoVector.dao.mapper: DEBUG
+    com.zaxxer.hikari: INFO
+    com.netflix: WARN
+    com.ctrip.framework.apollo: WARN
+
+# Placeholder settings for unrelated beans (the tests never actually call them; they only prevent startup errors)
+oss:
+  videoVector:
+    accessKey: dummy
+    secretKey: dummy
+    ossEndPoint: dummy
+    priEndPoint: dummy
+    internal:
+      endPoint: dummy
+    projectName: dummy
+    cdnDomain: dummy
+    imgDomain: dummy
+    pubBucket: dummy
+    priBucket: dummy
+    needPress: false
+    videoDomain: dummy
+    videoEndPoint: dummy
+    expiration: 3600
+    lvvideoDomain: dummy
+
+cdn:
+  upload:
+    domain: dummy