|
|
@@ -1,15 +1,46 @@
|
|
|
package com.tzld.videoVector.job;
|
|
|
|
|
|
+import com.alibaba.fastjson.JSONObject;
|
|
|
+import com.aliyun.odps.data.Record;
|
|
|
+import com.tzld.videoVector.service.EmbeddingService;
|
|
|
+import com.tzld.videoVector.service.MilvusService;
|
|
|
+import com.tzld.videoVector.util.MilvusUtil;
|
|
|
+import com.tzld.videoVector.util.OdpsUtil;
|
|
|
import com.xxl.job.core.biz.model.ReturnT;
|
|
|
import com.xxl.job.core.handler.annotation.XxlJob;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.springframework.stereotype.Component;
|
|
|
|
|
|
+import javax.annotation.Resource;
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.List;
|
|
|
+import java.util.Objects;
|
|
|
+import java.util.Set;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
|
|
|
@Slf4j
|
|
|
@Component
|
|
|
public class VideoVectorJob {
|
|
|
|
|
|
+ @Resource
|
|
|
+ private MilvusService milvusService;
|
|
|
+
|
|
|
+ @Resource
|
|
|
+ private MilvusUtil milvusUtil;
|
|
|
+
|
|
|
+ @Resource
|
|
|
+ private EmbeddingService embeddingService;
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 集合名称
|
|
|
+ */
|
|
|
+ private static final String COLLECTION_NAME = "video_vector";
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 每页查询数量
|
|
|
+ */
|
|
|
+ private static final int PAGE_SIZE = 1000;
|
|
|
|
|
|
/**
|
|
|
* 视频向量化
|
|
|
@@ -18,8 +49,163 @@ public class VideoVectorJob {
|
|
|
*/
|
|
|
@XxlJob("vectorVideoJob")
|
|
|
public ReturnT<String> vectorVideoJob(String param) {
|
|
|
+ log.info("开始执行视频向量化任务, param: {}", param);
|
|
|
+
|
|
|
+ int totalSuccessCount = 0;
|
|
|
+ int totalFailCount = 0;
|
|
|
+ int pageNum = 0;
|
|
|
+
|
|
|
+ try {
|
|
|
+ while (true) {
|
|
|
+ // 1. 分页查询 videoId 列表
|
|
|
+ List<Long> videoIds = queryVideoIdsByPage(pageNum, PAGE_SIZE);
|
|
|
+ if (videoIds == null || videoIds.isEmpty()) {
|
|
|
+ log.info("第 {} 页没有查询到数据,分页查询结束", pageNum);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ log.info("第 {} 页查询到 {} 个 videoId", pageNum, videoIds.size());
|
|
|
|
|
|
- return ReturnT.SUCCESS;
|
|
|
+ // 2. 查询哪些 videoId 在 Milvus 中已存在
|
|
|
+ Set<Long> existingIds = milvusService.existsByIds(COLLECTION_NAME, videoIds);
|
|
|
+ log.info("已存在 {} 个 videoId,将跳过", existingIds.size());
|
|
|
+
|
|
|
+ // 3. 过滤出不存在的 videoId
|
|
|
+ List<Long> newVideoIds = videoIds.stream()
|
|
|
+ .filter(id -> !existingIds.contains(id))
|
|
|
+ .collect(Collectors.toList());
|
|
|
+
|
|
|
+ if (!newVideoIds.isEmpty()) {
|
|
|
+ log.info("第 {} 页需要处理 {} 个新的 videoId", pageNum, newVideoIds.size());
|
|
|
+
|
|
|
+ // 4. 逐个处理新的 videoId
|
|
|
+ for (Long videoId : newVideoIds) {
|
|
|
+ try {
|
|
|
+ // 4.1 查询视频详情
|
|
|
+ JSONObject videoDetail = queryVideoDetail(videoId);
|
|
|
+ if (videoDetail == null) {
|
|
|
+ log.warn("videoId={} 详情查询为空,跳过", videoId);
|
|
|
+ totalFailCount++;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 4.2 提取字段并向量化
|
|
|
+ List<Float> vector = extractAndVectorize(videoDetail);
|
|
|
+ if (vector == null || vector.isEmpty()) {
|
|
|
+ log.warn("videoId={} 向量化失败,跳过", videoId);
|
|
|
+ totalFailCount++;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 4.3 存储到 Milvus
|
|
|
+ insertToMilvus(videoId, vector, videoDetail);
|
|
|
+ totalSuccessCount++;
|
|
|
+ log.debug("videoId={} 处理成功", videoId);
|
|
|
+
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("处理 videoId={} 时发生异常: {}", videoId, e.getMessage(), e);
|
|
|
+ totalFailCount++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 如果查询到的数据少于 PAGE_SIZE,说明已经是最后一页
|
|
|
+ if (videoIds.size() < PAGE_SIZE) {
|
|
|
+ log.info("第 {} 页数据量 {} 小于 PAGE_SIZE {},分页查询结束", pageNum, videoIds.size(), PAGE_SIZE);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ pageNum++;
|
|
|
+ }
|
|
|
+
|
|
|
+ log.info("视频向量化任务完成,总成功: {}, 总失败: {}, 总页数: {}", totalSuccessCount, totalFailCount, pageNum + 1);
|
|
|
+ return ReturnT.SUCCESS;
|
|
|
+
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("视频向量化任务执行失败: {}", e.getMessage(), e);
|
|
|
+ return new ReturnT<>(ReturnT.FAIL_CODE, "任务执行失败: " + e.getMessage());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 分页查询 videoId 列表
|
|
|
+ * @param pageNum 页码(从0开始)
|
|
|
+ * @param pageSize 每页数量
|
|
|
+ * @return videoId 列表
|
|
|
+ */
|
|
|
+ private List<Long> queryVideoIdsByPage(int pageNum, int pageSize) {
|
|
|
+ int offset = pageNum * pageSize;
|
|
|
+ String sql = String.format(
|
|
|
+ "SELECT video_id FROM your_table WHERE status = 1 ORDER BY video_id LIMIT %d, %d",
|
|
|
+ offset, pageSize);
|
|
|
+ List<Record> records = OdpsUtil.getOdpsData(sql);
|
|
|
+ if (records == null || records.isEmpty()) {
|
|
|
+ return new ArrayList<>();
|
|
|
+ }
|
|
|
+ return records.stream()
|
|
|
+ .map(record -> record.getBigint("video_id"))
|
|
|
+ .filter(Objects::nonNull)
|
|
|
+ .collect(Collectors.toList());
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 查询视频详情
|
|
|
+ */
|
|
|
+ private JSONObject queryVideoDetail(Long videoId) {
|
|
|
+ String sql = String.format(
|
|
|
+ "SELECT video_id, title, description, tags, category FROM your_detail_table WHERE video_id = %d",
|
|
|
+ videoId);
|
|
|
+ List<Record> records = OdpsUtil.getOdpsData(sql);
|
|
|
+ if (records == null || records.isEmpty()) {
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+ Record record = records.get(0);
|
|
|
+ JSONObject result = new JSONObject();
|
|
|
+ result.put("video_id", record.getBigint("video_id"));
|
|
|
+ result.put("title", record.getString("title"));
|
|
|
+ result.put("description", record.getString("description"));
|
|
|
+ result.put("tags", record.getString("tags"));
|
|
|
+ result.put("category", record.getString("category"));
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 提取字段并向量化
|
|
|
+ */
|
|
|
+ private List<Float> extractAndVectorize(JSONObject videoDetail) {
|
|
|
+ // 提取用于向量化的文本字段
|
|
|
+ String title = videoDetail.getString("title");
|
|
|
+ String description = videoDetail.getString("description");
|
|
|
+ String tags = videoDetail.getString("tags");
|
|
|
+
|
|
|
+ // 拼接文本
|
|
|
+ StringBuilder textBuilder = new StringBuilder();
|
|
|
+ if (title != null && !title.isEmpty()) {
|
|
|
+ textBuilder.append(title).append(" ");
|
|
|
+ }
|
|
|
+ if (description != null && !description.isEmpty()) {
|
|
|
+ textBuilder.append(description).append(" ");
|
|
|
+ }
|
|
|
+ if (tags != null && !tags.isEmpty()) {
|
|
|
+ textBuilder.append(tags);
|
|
|
+ }
|
|
|
+ String text = textBuilder.toString().trim();
|
|
|
+ if (text.isEmpty()) {
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 使用 EmbeddingService 进行向量化
|
|
|
+ return embeddingService.embed(text);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 将向量数据存储到 Milvus
|
|
|
+ */
|
|
|
+ private void insertToMilvus(Long videoId, List<Float> vector, JSONObject videoDetail) {
|
|
|
+ // 使用 MilvusUtil 进行插入操作
|
|
|
+ // 注意:需要确保 Milvus 集合已创建,且包含 video_id 和 vector 字段
|
|
|
+ List<List<Float>> vectors = new ArrayList<>();
|
|
|
+ vectors.add(vector);
|
|
|
+ milvusUtil.insertVectors(COLLECTION_NAME, vectors);
|
|
|
}
|
|
|
|
|
|
}
|