1 месяц назад · 215b96d232
--- a/core/src/main/java/com/tzld/videoVector/job/VideoVectorJob.java
+++ b/core/src/main/java/com/tzld/videoVector/job/VideoVectorJob.java
@@ -28,6 +28,7 @@ import org.springframework.util.StringUtils;
 
				 
			
 
				 import javax.annotation.Resource;
			
 
				 import java.util.*;
			
 
				+import java.util.concurrent.*;
			
 
				 import java.util.stream.Collectors;
			
 
				 
			
 
				 
			
@@ -96,15 +97,26 @@ public class VideoVectorJob {
 
				                 }
			
 
				                 log.info("第 {} 页查询到 {} 个 videoId", pageNum, videoIds.size());
			
 
				 
			
 
				-                // 3. 对每个配置进行处理
			
 
				+                // 3. 先进行审核过滤（每页只过滤一次，避免在 config 循环内重复调用）
			
 
				+                List<Long> auditPassedIds = filterAuditPassedIds(videoIds);
			
 
				+                if (auditPassedIds.isEmpty()) {
			
 
				+                    log.info("第 {} 页所有视频均未通过审核，跳过", pageNum);
			
 
				+                    if (videoIds.size() < VectorConstants.PAGE_SIZE) {
			
 
				+                        break;
			
 
				+                    }
			
 
				+                    pageNum++;
			
 
				+                    continue;
			
 
				+                }
			
 
				+                log.info("第 {} 页审核通过 {} 个视频", pageNum, auditPassedIds.size());
			
 
				+
			
 
				+                // 4. 对每个配置进行处理
			
 
				                 for (DeconstructVectorConfig config : configs) {
			
 
				                     String configCode = config.getConfigCode();
			
 
				 
			
 
				-                    // 3.0 审核清理已移至分页外，此处仅进行向量存在性检查
			
 
				-                    // 3.1 查询哪些 videoId 在该配置下已有向量（数据库层已做 DISTINCT video_id）
			
 
				-                    Set<Long> existingVideoIds = vectorStoreService.existsByIds(configCode, videoIds);
			
 
				-                    // 3.2 过滤出需要处理的 videoId（排除已有向量的）
			
 
				-                    List<Long> needProcessIds = videoIds.stream()
			
 
				+                    // 4.1 查询哪些 videoId 在该配置下已有向量（数据库层已做 DISTINCT video_id）
			
 
				+                    Set<Long> existingVideoIds = vectorStoreService.existsByIds(configCode, auditPassedIds);
			
 
				+                    // 4.2 过滤出需要处理的 videoId（排除已有向量的）
			
 
				+                    List<Long> needProcessIds = auditPassedIds.stream()
			
 
				                             .filter(id -> !existingVideoIds.contains(id))
			
 
				                             .collect(Collectors.toList());
			
 
				                     
			
@@ -114,15 +126,7 @@ public class VideoVectorJob {
 
				                     }
			
 
				                     log.info("配置 {} 需要处理 {} 个视频", configCode, needProcessIds.size());
			
 
				 
			
 
				-                    // 3.3 审核状态过滤：排除审核未通过的视频
			
 
				-                    needProcessIds = filterAuditPassedIds(needProcessIds);
			
 
				-                    if (needProcessIds.isEmpty()) {
			
 
				-                        log.info("配置 {} 待处理视频均未通过审核，跳过", configCode);
			
 
				-                        continue;
			
 
				-                    }
			
 
				-                    log.info("配置 {} 审核通过后需处理 {} 个视频", configCode, needProcessIds.size());
			
 
				-
			
 
				-                    // 3.4 批量查询需要处理的视频 raw_result
			
 
				+                    // 4.3 批量查询需要处理的视频 raw_result
			
 
				                     for (List<Long> partition : Lists.partition(needProcessIds, 50)) {
			
 
				                         Map<Long, String> videoRawResults = batchQueryVideoRawResults(partition);
			
 
				                         if (videoRawResults.isEmpty()) {
			
@@ -130,7 +134,7 @@ public class VideoVectorJob {
 
				                             continue;
			
 
				                         }
			
 
				 
			
 
				-                        // 3.5 逐个处理
			
 
				+                        // 4.4 逐个处理
			
 
				                         for (Long videoId : partition) {
			
 
				                             try {
			
 
				                                 String rawResult = videoRawResults.get(videoId);
			
@@ -495,16 +499,24 @@ public class VideoVectorJob {
 
				             List<Long> allVideoIds = new ArrayList<>(videoIdToTaskInstanceId.keySet());
			
 
				             log.info("共 {} 个有效 videoId", allVideoIds.size());
			
 
				 
			
 
				+            // 4. 先进行审核过滤（只过滤一次，避免在 config 循环内重复调用）
			
 
				+            List<Long> auditPassedIds = filterAuditPassedIds(allVideoIds);
			
 
				+            if (auditPassedIds.isEmpty()) {
			
 
				+                log.info("所有视频均未通过审核，任务结束");
			
 
				+                return ReturnT.SUCCESS;
			
 
				+            }
			
 
				+            log.info("审核通过 {} 个视频", auditPassedIds.size());
			
 
				+
			
 
				             int totalSuccessCount = 0;
			
 
				             int totalFailCount = 0;
			
 
				 
			
 
				-            // 4. 对每个配置进行处理
			
 
				+            // 5. 对每个配置进行处理
			
 
				             for (DeconstructVectorConfig config : configs) {
			
 
				                 String configCode = config.getConfigCode();
			
 
				 
			
 
				-                // 4.1 查询该配置下已有向量的 videoId，排除已处理过的（数据库层已做 DISTINCT video_id）
			
 
				-                Set<Long> existingVideoIds = vectorStoreService.existsByIds(configCode, allVideoIds);
			
 
				-                List<Long> needProcessIds = allVideoIds.stream()
			
 
				+                // 5.1 查询该配置下已有向量的 videoId，排除已处理过的（数据库层已做 DISTINCT video_id）
			
 
				+                Set<Long> existingVideoIds = vectorStoreService.existsByIds(configCode, auditPassedIds);
			
 
				+                List<Long> needProcessIds = auditPassedIds.stream()
			
 
				                         .filter(id -> !existingVideoIds.contains(id))
			
 
				                         .collect(Collectors.toList());
			
 
				                 if (needProcessIds.isEmpty()) {
			
@@ -513,15 +525,7 @@ public class VideoVectorJob {
 
				                 }
			
 
				                 log.info("配置 {} 需要处理 {} 个视频", configCode, needProcessIds.size());
			
 
				 
			
 
				-                // 4.2 审核状态过滤：排除审核未通过的视频
			
 
				-                needProcessIds = filterAuditPassedIds(needProcessIds);
			
 
				-                if (needProcessIds.isEmpty()) {
			
 
				-                    log.info("配置 {} 待处理视频均未通过审核，跳过", configCode);
			
 
				-                    continue;
			
 
				-                }
			
 
				-                log.info("配置 {} 审核通过后需处理 {} 个视频", configCode, needProcessIds.size());
			
 
				-
			
 
				-                // 4.3 逐个调用 detail 接口，提取选题并向量化存储
			
 
				+                // 5.2 逐个调用 detail 接口，提取选题并向量化存储
			
 
				                 for (Long videoId : needProcessIds) {
			
 
				                     try {
			
 
				                         Long taskInstanceId = videoIdToTaskInstanceId.get(videoId);
			
@@ -751,15 +755,37 @@ public class VideoVectorJob {
 
				         if (CollectionUtils.isEmpty(videoIds)) {
			
 
				             return Collections.emptyList();
			
 
				         }
			
 
				-        Set<Long> notPassedIds = new HashSet<>();
			
 
				-        for (List<Long> batch : Lists.partition(videoIds, VectorConstants.AUDIT_CHECK_BATCH_SIZE)) {
			
 
				-            try {
			
 
				-                Set<Long> batchNotPassed = videoApiService.getNotAuditPassedVideoIds(new HashSet<>(batch));
			
 
				-                notPassedIds.addAll(batchNotPassed);
			
 
				-            } catch (Exception e) {
			
 
				-                log.error("审核状态查询失败，batch={}, error={}", batch, e.getMessage(), e);
			
 
				+
			
 
				+        List<List<Long>> batches = Lists.partition(videoIds, VectorConstants.AUDIT_CHECK_BATCH_SIZE);
			
 
				+        // 并发提交所有批次的审核查询（API 限制每批 20 条，需用更多线程提升吞吐）
			
 
				+        int parallelism = Math.min(batches.size(), 10);
			
 
				+        ExecutorService executor = Executors.newFixedThreadPool(parallelism);
			
 
				+        Set<Long> notPassedIds = ConcurrentHashMap.newKeySet();
			
 
				+
			
 
				+        try {
			
 
				+            List<Future<?>> futures = new ArrayList<>();
			
 
				+            for (List<Long> batch : batches) {
			
 
				+                futures.add(executor.submit(() -> {
			
 
				+                    try {
			
 
				+                        Set<Long> batchNotPassed = videoApiService.getNotAuditPassedVideoIds(new HashSet<>(batch));
			
 
				+                        notPassedIds.addAll(batchNotPassed);
			
 
				+                    } catch (Exception e) {
			
 
				+                        log.error("审核状态查询失败，batch={}, error={}", batch, e.getMessage(), e);
			
 
				+                    }
			
 
				+                }));
			
 
				             }
			
 
				+            // 等待所有任务完成
			
 
				+            for (Future<?> future : futures) {
			
 
				+                try {
			
 
				+                    future.get(30, TimeUnit.SECONDS);
			
 
				+                } catch (Exception e) {
			
 
				+                    log.error("审核查询任务等待异常: {}", e.getMessage(), e);
			
 
				+                }
			
 
				+            }
			
 
				+        } finally {
			
 
				+            executor.shutdown();
			
 
				         }
			
 
				+
			
 
				         if (notPassedIds.isEmpty()) {
			
 
				             return videoIds;
			
 
				         }
			
@@ -811,13 +837,25 @@ public class VideoVectorJob {
 
				                 }
			
 
				                 log.info("第 {} 页查询到 {} 个 videoId", pageNum, videoIds.size());
			
 
				 
			
 
				-                // 4. 对每个配置进行处理
			
 
				+                // 4. 先进行审核过滤（每页只过滤一次，避免在 config 循环内重复调用）
			
 
				+                List<Long> auditPassedIds = filterAuditPassedIds(videoIds);
			
 
				+                if (auditPassedIds.isEmpty()) {
			
 
				+                    log.info("第 {} 页所有视频均未通过审核，跳过", pageNum);
			
 
				+                    if (videoIds.size() < VectorConstants.PAGE_SIZE) {
			
 
				+                        break;
			
 
				+                    }
			
 
				+                    pageNum++;
			
 
				+                    continue;
			
 
				+                }
			
 
				+                log.info("第 {} 页审核通过 {} 个视频", pageNum, auditPassedIds.size());
			
 
				+
			
 
				+                // 5. 对每个配置进行处理
			
 
				                 for (DeconstructVectorConfig config : configs) {
			
 
				                     String configCode = config.getConfigCode();
			
 
				 
			
 
				-                    // 4.1 已向量化过滤
			
 
				-                    Set<Long> existingVideoIds = vectorStoreService.existsByIds(configCode, videoIds);
			
 
				-                    List<Long> needProcessIds = videoIds.stream()
			
 
				+                    // 5.1 已向量化过滤
			
 
				+                    Set<Long> existingVideoIds = vectorStoreService.existsByIds(configCode, auditPassedIds);
			
 
				+                    List<Long> needProcessIds = auditPassedIds.stream()
			
 
				                             .filter(id -> !existingVideoIds.contains(id))
			
 
				                             .collect(Collectors.toList());
			
 
				                     if (needProcessIds.isEmpty()) {
			
@@ -826,15 +864,7 @@ public class VideoVectorJob {
 
				                     }
			
 
				                     log.info("配置 {} 需要处理 {} 个视频", configCode, needProcessIds.size());
			
 
				 
			
 
				-                    // 4.2 审核状态过滤
			
 
				-                    needProcessIds = filterAuditPassedIds(needProcessIds);
			
 
				-                    if (needProcessIds.isEmpty()) {
			
 
				-                        log.info("配置 {} 待处理视频均未通过审核，跳过", configCode);
			
 
				-                        continue;
			
 
				-                    }
			
 
				-                    log.info("配置 {} 审核通过后需处理 {} 个视频", configCode, needProcessIds.size());
			
 
				-
			
 
				-                    // 4.3 分批查询 result_log 的 data 字段并向量化
			
 
				+                    // 5.2 分批查询 result_log 的 data 字段并向量化
			
 
				                     for (List<Long> partition : Lists.partition(needProcessIds, 50)) {
			
 
				                         Map<Long, String> videoDataMap = batchQueryResultLogData(partition);
			
 
				                         if (videoDataMap.isEmpty()) {