|
|
@@ -0,0 +1,239 @@
|
|
|
+package com.tzld.videoVector.job;
|
|
|
+
|
|
|
+import com.tzld.videoVector.api.VideoApiService;
|
|
|
+import com.tzld.videoVector.common.constant.VectorConstants;
|
|
|
+import com.tzld.videoVector.dao.mapper.pgVector.DeconstructVectorConfigMapper;
|
|
|
+import com.tzld.videoVector.dao.mapper.pgVector.ext.VideoVectorMapperExt;
|
|
|
+import com.tzld.videoVector.model.entity.VideoDetail;
|
|
|
+import com.tzld.videoVector.model.po.pgVector.DeconstructVectorConfig;
|
|
|
+import com.tzld.videoVector.model.po.pgVector.DeconstructVectorConfigExample;
|
|
|
+import com.tzld.videoVector.service.EmbeddingService;
|
|
|
+import com.tzld.videoVector.service.VectorStoreService;
|
|
|
+import com.tzld.videoVector.util.Md5Util;
|
|
|
+import com.xxl.job.core.biz.model.ReturnT;
|
|
|
+import com.xxl.job.core.handler.annotation.XxlJob;
|
|
|
+import lombok.extern.slf4j.Slf4j;
|
|
|
+import org.springframework.stereotype.Component;
|
|
|
+import org.springframework.util.CollectionUtils;
|
|
|
+import org.springframework.util.StringUtils;
|
|
|
+
|
|
|
+import javax.annotation.Resource;
|
|
|
+import java.util.*;
|
|
|
+import java.util.concurrent.atomic.AtomicInteger;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+/**
|
|
|
+ * 视频标题向量化定时任务
|
|
|
+ * <p>
|
|
|
+ * 1. 查询 video_vectors 表中排除 VIDEO_TITLE 本身的所有不重复 video_id(即其他 configCode 下的视频)
|
|
|
+ * 2. 对比已有 VIDEO_TITLE 向量的 video_id,删除不再存在于其他 configCode 中的历史标题向量
|
|
|
+ * 3. 对未向量化的 video_id 批量获取标题并向量化存储
|
|
|
+ */
|
|
|
+@Slf4j
|
|
|
+@Component
|
|
|
+public class VideoTitleVectorJob {
|
|
|
+
|
|
|
+ @Resource
|
|
|
+ private VideoVectorMapperExt videoVectorMapperExt;
|
|
|
+
|
|
|
+ @Resource
|
|
|
+ private DeconstructVectorConfigMapper vectorConfigMapper;
|
|
|
+
|
|
|
+ @Resource
|
|
|
+ private VideoApiService videoApiService;
|
|
|
+
|
|
|
+ @Resource
|
|
|
+ private EmbeddingService embeddingService;
|
|
|
+
|
|
|
+ @Resource
|
|
|
+ private VectorStoreService vectorStoreService;
|
|
|
+
|
|
|
+ /** 每批获取视频详情的数量 */
|
|
|
+ private static final int DETAIL_BATCH_SIZE = 100;
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 视频标题向量化任务
|
|
|
+ * 读取 video_vectors 中所有 video_id,获取视频标题,进行向量化写入
|
|
|
+ *
|
|
|
+ * @param param 参数(暂未使用)
|
|
|
+ * @return 执行结果
|
|
|
+ */
|
|
|
+ @XxlJob("videoTitleVectorJob")
|
|
|
+ public ReturnT<String> videoTitleVectorJob(String param) {
|
|
|
+ log.info("开始执行视频标题向量化任务, param: {}", param);
|
|
|
+
|
|
|
+ try {
|
|
|
+ // 1. 获取 VIDEO_TITLE 配置
|
|
|
+ DeconstructVectorConfig config = getVideoTitleConfig();
|
|
|
+ if (config == null) {
|
|
|
+ log.error("未找到 VIDEO_TITLE 向量化配置,任务终止");
|
|
|
+ return new ReturnT<>(ReturnT.FAIL_CODE, "未找到 VIDEO_TITLE 配置");
|
|
|
+ }
|
|
|
+ log.info("加载 VIDEO_TITLE 配置成功: model={}, dimension={}, maxLength={}",
|
|
|
+ config.getEmbeddingModel(), config.getDimension(), config.getMaxLength());
|
|
|
+
|
|
|
+ // 2. 查询其他 configCode 下的所有 video_id(排除 VIDEO_TITLE 自身)
|
|
|
+ List<Long> otherCodeVideoIds = videoVectorMapperExt.selectDistinctVideoIdsExcludeConfigCode(
|
|
|
+ VectorConstants.VIDEO_TITLE_CONFIG_CODE);
|
|
|
+ if (CollectionUtils.isEmpty(otherCodeVideoIds)) {
|
|
|
+ log.info("其他 configCode 下无视频数据,跳过");
|
|
|
+ return ReturnT.SUCCESS;
|
|
|
+ }
|
|
|
+ Set<Long> otherCodeVideoIdSet = new HashSet<>(otherCodeVideoIds);
|
|
|
+ log.info("其他 configCode 下的 video_id 数量: {}", otherCodeVideoIdSet.size());
|
|
|
+
|
|
|
+ // 3. 查询已有 VIDEO_TITLE 向量的 video_id
|
|
|
+ Set<Long> existingTitleIds = getExistingVideoTitleIds();
|
|
|
+ log.info("已有 VIDEO_TITLE 向量的 video_id 数量: {}", existingTitleIds.size());
|
|
|
+
|
|
|
+ // 4. 清理:删除已有 VIDEO_TITLE 向量但不再存在于其他 configCode 中的记录
|
|
|
+ List<Long> toDeleteIds = existingTitleIds.stream()
|
|
|
+ .filter(id -> !otherCodeVideoIdSet.contains(id))
|
|
|
+ .collect(Collectors.toList());
|
|
|
+ if (!toDeleteIds.isEmpty()) {
|
|
|
+ log.info("需要清理的 VIDEO_TITLE 向量数量: {}", toDeleteIds.size());
|
|
|
+ // 分批删除
|
|
|
+ for (int i = 0; i < toDeleteIds.size(); i += 1000) {
|
|
|
+ int end = Math.min(i + 1000, toDeleteIds.size());
|
|
|
+ List<Long> batch = toDeleteIds.subList(i, end);
|
|
|
+ videoVectorMapperExt.deleteBatchByVideoIdsAndConfigCode(batch, VectorConstants.VIDEO_TITLE_CONFIG_CODE);
|
|
|
+ }
|
|
|
+ log.info("清理完成,删除 {} 条 VIDEO_TITLE 向量", toDeleteIds.size());
|
|
|
+ }
|
|
|
+
|
|
|
+ // 5. 过滤出需要新增向量化的 video_id(在其他 code 中存在但无 VIDEO_TITLE 向量)
|
|
|
+ List<Long> needProcessIds = otherCodeVideoIds.stream()
|
|
|
+ .filter(id -> !existingTitleIds.contains(id))
|
|
|
+ .collect(Collectors.toList());
|
|
|
+ if (needProcessIds.isEmpty()) {
|
|
|
+ log.info("所有 video_id 已完成标题向量化,无需处理");
|
|
|
+ return ReturnT.SUCCESS;
|
|
|
+ }
|
|
|
+ log.info("需要新增标题向量化的 video_id 数量: {}", needProcessIds.size());
|
|
|
+
|
|
|
+ // 6. 分批获取视频详情并向量化
|
|
|
+ AtomicInteger totalSuccess = new AtomicInteger(0);
|
|
|
+ AtomicInteger totalFail = new AtomicInteger(0);
|
|
|
+ AtomicInteger totalSkip = new AtomicInteger(0);
|
|
|
+
|
|
|
+ for (int i = 0; i < needProcessIds.size(); i += DETAIL_BATCH_SIZE) {
|
|
|
+ int end = Math.min(i + DETAIL_BATCH_SIZE, needProcessIds.size());
|
|
|
+ List<Long> batchIds = needProcessIds.subList(i, end);
|
|
|
+
|
|
|
+ try {
|
|
|
+ processBatch(batchIds, config, totalSuccess, totalFail, totalSkip);
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("批次 {}-{} 处理异常: {}", i, end, e.getMessage(), e);
|
|
|
+ totalFail.addAndGet(batchIds.size());
|
|
|
+ }
|
|
|
+
|
|
|
+ if ((i / DETAIL_BATCH_SIZE + 1) % 10 == 0) {
|
|
|
+ log.info("进度: 已处理 {}/{}, 成功: {}, 失败: {}, 跳过: {}",
|
|
|
+ end, needProcessIds.size(), totalSuccess.get(), totalFail.get(), totalSkip.get());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ log.info("视频标题向量化任务完成,新增成功: {}, 失败: {}, 跳过: {}, 清理: {}",
|
|
|
+ totalSuccess.get(), totalFail.get(), totalSkip.get(), toDeleteIds.size());
|
|
|
+ return ReturnT.SUCCESS;
|
|
|
+
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("视频标题向量化任务执行失败: {}", e.getMessage(), e);
|
|
|
+ return new ReturnT<>(ReturnT.FAIL_CODE, "任务执行失败: " + e.getMessage());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 处理一批视频的标题向量化
|
|
|
+ */
|
|
|
+ private void processBatch(List<Long> videoIds, DeconstructVectorConfig config,
|
|
|
+ AtomicInteger totalSuccess, AtomicInteger totalFail, AtomicInteger totalSkip) {
|
|
|
+ // 批量获取视频详情
|
|
|
+ Map<Long, VideoDetail> detailMap = videoApiService.getVideoDetail(new HashSet<>(videoIds));
|
|
|
+ if (detailMap == null || detailMap.isEmpty()) {
|
|
|
+ log.warn("批量获取视频详情返回空,videoIds数量: {}", videoIds.size());
|
|
|
+ totalSkip.addAndGet(videoIds.size());
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ String configCode = config.getConfigCode();
|
|
|
+ Integer maxLength = config.getMaxLength();
|
|
|
+
|
|
|
+ for (Long videoId : videoIds) {
|
|
|
+ try {
|
|
|
+ VideoDetail detail = detailMap.get(videoId);
|
|
|
+ if (detail == null || !StringUtils.hasText(detail.getTitle())) {
|
|
|
+ log.debug("videoId={} 无视频详情或标题为空,跳过", videoId);
|
|
|
+ totalSkip.incrementAndGet();
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 审核过滤:未通过审核的视频跳过
|
|
|
+ if (!detail.isAuditPassed()) {
|
|
|
+ log.debug("videoId={} 审核未通过,跳过", videoId);
|
|
|
+ totalSkip.incrementAndGet();
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ String title = detail.getTitle();
|
|
|
+ // 截断标题
|
|
|
+ if (maxLength != null && maxLength > 0 && title.length() > maxLength) {
|
|
|
+ title = title.substring(0, maxLength);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 优先通过 text_hash 复用已有 embedding
|
|
|
+ List<Float> vector = getOrEmbed(title, config);
|
|
|
+ if (vector == null || vector.isEmpty()) {
|
|
|
+ log.warn("videoId={} 标题向量化失败,title={}", videoId, title);
|
|
|
+ totalFail.incrementAndGet();
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 存储向量
|
|
|
+ vectorStoreService.save(configCode, videoId, vector, title);
|
|
|
+ totalSuccess.incrementAndGet();
|
|
|
+ log.debug("videoId={} 标题向量化存储成功", videoId);
|
|
|
+
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("videoId={} 标题向量化异常: {}", videoId, e.getMessage(), e);
|
|
|
+ totalFail.incrementAndGet();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 优先通过 text_hash 复用已有 embedding,未命中则调用 embedding API
|
|
|
+ */
|
|
|
+ private List<Float> getOrEmbed(String text, DeconstructVectorConfig config) {
|
|
|
+ String configCode = config.getConfigCode();
|
|
|
+ String textHash = Md5Util.encoderByMd5(text);
|
|
|
+ if (StringUtils.hasText(textHash)) {
|
|
|
+ List<Float> cached = vectorStoreService.getVectorByTextHash(textHash, configCode);
|
|
|
+ if (cached != null && !cached.isEmpty()) {
|
|
|
+ log.debug("命中 text_hash 缓存,复用 embedding,hash={}", textHash);
|
|
|
+ return cached;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return embeddingService.embed(text, config);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 获取 VIDEO_TITLE 向量化配置
|
|
|
+ */
|
|
|
+ private DeconstructVectorConfig getVideoTitleConfig() {
|
|
|
+ DeconstructVectorConfigExample example = new DeconstructVectorConfigExample();
|
|
|
+ example.createCriteria()
|
|
|
+ .andEnabledEqualTo((short) 1)
|
|
|
+ .andConfigCodeEqualTo(VectorConstants.VIDEO_TITLE_CONFIG_CODE);
|
|
|
+ List<DeconstructVectorConfig> configs = vectorConfigMapper.selectByExample(example);
|
|
|
+ return CollectionUtils.isEmpty(configs) ? null : configs.get(0);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 查询已有 VIDEO_TITLE 向量的 video_id 集合
|
|
|
+ */
|
|
|
+ private Set<Long> getExistingVideoTitleIds() {
|
|
|
+ List<Long> ids = videoVectorMapperExt.selectAllVideoIdsByConfigCode(VectorConstants.VIDEO_TITLE_CONFIG_CODE);
|
|
|
+ return ids != null ? new HashSet<>(ids) : Collections.emptySet();
|
|
|
+ }
|
|
|
+}
|