|
|
@@ -76,7 +76,9 @@ public class VideoVectorJob {
|
|
|
@Resource
|
|
|
private VideoDeconstructResultMapperExt videoDeconstructResultMapperExt;
|
|
|
|
|
|
- /** 本次 Job 执行中已缓存 decode 的 videoId,避免多配置下重复写入 */
|
|
|
+ /**
|
|
|
+ * Video IDs whose decode result has already been cached during the current job run;
|
|
|
+ * used to avoid writing the same entry again when several configs are processed.
|
|
|
+ * Backed by a concurrent key set ({@code ConcurrentHashMap.newKeySet()}).
|
|
|
+ */
|
|
|
private final Set<Long> decodeCachedInThisRun = ConcurrentHashMap.newKeySet();
|
|
|
|
|
|
|
|
|
@@ -182,26 +184,17 @@ public class VideoVectorJob {
|
|
|
}
|
|
|
log.info("配置 {} 需要处理 {} 个视频", configCode, needProcessIds.size());
|
|
|
|
|
|
- // 3. 从本地DB批量查询解构结果并并发embedding
|
|
|
- ExecutorService embedExecutor = Executors.newFixedThreadPool(VectorConstants.EMBEDDING_PARALLELISM);
|
|
|
- Semaphore inFlightLimiter = new Semaphore(VectorConstants.MAX_EMBEDDING_IN_FLIGHT);
|
|
|
- List<Future<?>> futures = new ArrayList<>();
|
|
|
-
|
|
|
- try {
|
|
|
- for (List<Long> batchIds : Lists.partition(needProcessIds, VectorConstants.ODPS_IN_BATCH_SIZE)) {
|
|
|
- List<VideoDeconstructResult> results = videoDeconstructResultMapperExt
|
|
|
- .selectResultsByVideoIds("result_json", batchIds);
|
|
|
- for (VideoDeconstructResult r : results) {
|
|
|
- if (!StringUtils.hasText(r.getResult())) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- submitLocalEmbeddingTask(r.getVideoId(), r.getResult(), config,
|
|
|
- embedExecutor, inFlightLimiter, futures,
|
|
|
- totalSuccessCount, totalFailCount, "raw_result");
|
|
|
+ // 3. 从本地DB批量查询解构结果并顺序embedding
|
|
|
+ for (List<Long> batchIds : Lists.partition(needProcessIds, VectorConstants.ODPS_IN_BATCH_SIZE)) {
|
|
|
+ List<VideoDeconstructResult> results = videoDeconstructResultMapperExt
|
|
|
+ .selectResultsByVideoIds("result_json", batchIds);
|
|
|
+ for (VideoDeconstructResult r : results) {
|
|
|
+ if (!StringUtils.hasText(r.getResult())) {
|
|
|
+ continue;
|
|
|
}
|
|
|
+ executeEmbeddingTask(r.getVideoId(), r.getResult(), config,
|
|
|
+ totalSuccessCount, totalFailCount, "raw_result");
|
|
|
}
|
|
|
- } finally {
|
|
|
- awaitAndShutdown(futures, embedExecutor, 30, "embedding");
|
|
|
}
|
|
|
} catch (Exception e) {
|
|
|
log.error("配置 {} 处理异常: {}", configCode, e.getMessage(), e);
|
|
|
@@ -209,46 +202,33 @@ public class VideoVectorJob {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * 提交单条记录的 embedding 任务到线程池(基于本地DB数据)
|
|
|
- * 使用 Semaphore 控制在途并发数
|
|
|
+ * Runs the embedding task for a single record sequentially, on the calling thread
|
|
|
+ * (the record comes from the local DB).
|
|
|
+ * <p>
|
|
|
+ * Returns without touching either counter when {@code videoId} is null or
|
|
|
+ * {@code rawData} has no text. Otherwise texts are extracted with
|
|
|
+ * {@code extractTextsFromResultLogData} when {@code dataType} equals
|
|
|
+ * {@code "result_log"}, and with {@code extractTextsFromRawResult} for any other value,
|
|
|
+ * then passed to {@code vectorizeAndStore}. Any exception is logged and counted as a
|
|
|
+ * failure; it is not rethrown.
|
|
|
+ *
|
|
|
+ * @param videoId      id of the video being processed; null makes the call a no-op
|
|
|
+ * @param rawData      raw payload the texts are extracted from; blank makes the call a no-op
|
|
|
+ * @param config       vector config handed to the extractors and to {@code vectorizeAndStore};
|
|
|
+ *                     its config code is used in log messages
|
|
|
+ * @param successCount incremented when {@code vectorizeAndStore} returns a value greater than 0
|
|
|
+ * @param failCount    incremented when no text is extracted, when the store count is not
|
|
|
+ *                     positive, or when an exception is caught
|
|
|
+ * @param dataType     selects the extractor: {@code "result_log"} or anything else
|
|
|
*/
|
|
|
- private void submitLocalEmbeddingTask(Long videoId, String rawData, DeconstructVectorConfig config,
|
|
|
- ExecutorService executor, Semaphore inFlightLimiter,
|
|
|
- List<Future<?>> futures, AtomicInteger successCount,
|
|
|
- AtomicInteger failCount, String dataType) {
|
|
|
+ private void executeEmbeddingTask(Long videoId, String rawData, DeconstructVectorConfig config,
|
|
|
+ AtomicInteger successCount, AtomicInteger failCount, String dataType) {
|
|
|
if (videoId == null || !StringUtils.hasText(rawData)) {
|
|
|
return;
|
|
|
}
|
|
|
- try {
|
|
|
- inFlightLimiter.acquire();
|
|
|
- } catch (InterruptedException e) {
|
|
|
- Thread.currentThread().interrupt();
|
|
|
- return;
|
|
|
- }
|
|
|
String configCode = config.getConfigCode();
|
|
|
- futures.add(executor.submit(() -> {
|
|
|
- try {
|
|
|
- List<String> texts = "result_log".equals(dataType)
|
|
|
- ? extractTextsFromResultLogData(rawData, config)
|
|
|
- : extractTextsFromRawResult(rawData, config);
|
|
|
- if (CollectionUtils.isEmpty(texts)) {
|
|
|
- log.info("videoId={} 配置 {} 未提取到文本,跳过", videoId, configCode);
|
|
|
- failCount.incrementAndGet();
|
|
|
- return;
|
|
|
- }
|
|
|
- int storeCount = vectorizeAndStore(config, videoId, texts);
|
|
|
- if (storeCount > 0) {
|
|
|
- successCount.incrementAndGet();
|
|
|
- } else {
|
|
|
- failCount.incrementAndGet();
|
|
|
- }
|
|
|
- } catch (Exception e) {
|
|
|
- log.error("处理 videoId={} 配置 {} 时发生异常: {}", videoId, configCode, e.getMessage(), e);
|
|
|
+ try {
|
|
|
+ List<String> texts = "result_log".equals(dataType)
|
|
|
+ ? extractTextsFromResultLogData(rawData, config)
|
|
|
+ : extractTextsFromRawResult(rawData, config);
|
|
|
+ if (CollectionUtils.isEmpty(texts)) {
|
|
|
+ log.info("videoId={} 配置 {} 未提取到文本,跳过", videoId, configCode);
|
|
|
+ failCount.incrementAndGet();
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ int storeCount = vectorizeAndStore(config, videoId, texts);
|
|
|
+ if (storeCount > 0) {
|
|
|
+ successCount.incrementAndGet();
|
|
|
+ } else {
|
|
|
failCount.incrementAndGet();
|
|
|
- } finally {
|
|
|
- inFlightLimiter.release();
|
|
|
}
|
|
|
- }));
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("处理 videoId={} 配置 {} 时发生异常: {}", videoId, configCode, e.getMessage(), e);
|
|
|
+ failCount.incrementAndGet();
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
@@ -322,8 +302,8 @@ public class VideoVectorJob {
|
|
|
* - 数组路径(以 [*] 结尾):从数组中提取满足置信度条件的文本
|
|
|
* - 单对象路径(不以 [*] 结尾):对单个对象进行置信度检查后提取文本
|
|
|
*
|
|
|
- * @param json 原始JSON
|
|
|
- * @param sourcePath 路径(如 $.keypoint_final.最终关键点列表[*] 或 $.最终选题)
|
|
|
+ * @param json 原始JSON
|
|
|
+ * @param sourcePath 路径(如 $.keypoint_final.最终关键点列表[*] 或 $.最终选题)
|
|
|
* @param extractRule 提取规则JSON(如 {"text_field":"关键点","confidence_field":"置信度","confidence_threshold":0.8})
|
|
|
* @return 满足置信度条件的文本列表
|
|
|
*/
|
|
|
@@ -516,7 +496,8 @@ public class VideoVectorJob {
|
|
|
|
|
|
/**
|
|
|
* 分页查询 videoId 列表(从本地解构结果表查询 result_json 来源)
|
|
|
- * @param pageNum 页码(从0开始)
|
|
|
+ *
|
|
|
+ * @param pageNum 页码(从0开始)
|
|
|
* @param pageSize 每页数量
|
|
|
* @return videoId 列表
|
|
|
*/
|
|
|
@@ -617,57 +598,37 @@ public class VideoVectorJob {
|
|
|
}
|
|
|
log.info("配置 {} 需要处理 {} 个视频", configCode, needProcessIds.size());
|
|
|
|
|
|
- // 3. 从本地DB批量查询解构结果并并发embedding
|
|
|
- ExecutorService embedExecutor = Executors.newFixedThreadPool(VectorConstants.EMBEDDING_PARALLELISM);
|
|
|
- Semaphore inFlightLimiter = new Semaphore(VectorConstants.MAX_EMBEDDING_IN_FLIGHT);
|
|
|
- List<Future<?>> futures = new ArrayList<>();
|
|
|
-
|
|
|
- try {
|
|
|
- for (List<Long> batchIds : Lists.partition(needProcessIds, VectorConstants.ODPS_IN_BATCH_SIZE)) {
|
|
|
- List<VideoDeconstructResult> results = videoDeconstructResultMapperExt
|
|
|
- .selectResultsByVideoIds("aigc_deconstruct", batchIds);
|
|
|
- for (VideoDeconstructResult r : results) {
|
|
|
- if (!StringUtils.hasText(r.getResult())) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- Long videoId = r.getVideoId();
|
|
|
- JSONObject dataContent = JSON.parseObject(r.getResult());
|
|
|
- if (dataContent == null) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- tryCacheDecodeResult(videoId);
|
|
|
+ // 3. 从本地DB批量查询解构结果并顺序embedding
|
|
|
+ // Query in chunks of ODPS_IN_BATCH_SIZE so the ID list sent to the mapper stays bounded,
|
|
|
+ // matching the "result_json" and "result_log" paths; passing needProcessIds in one call
|
|
|
+ // hands the mapper an arbitrarily large ID list.
|
|
|
+ List<VideoDeconstructResult> results = new ArrayList<>();
|
|
|
+ for (List<Long> batchIds : Lists.partition(needProcessIds, VectorConstants.ODPS_IN_BATCH_SIZE)) {
|
|
|
+ results.addAll(videoDeconstructResultMapperExt
|
|
|
+ .selectResultsByVideoIds("aigc_deconstruct", batchIds));
|
|
|
+ }
|
|
|
+ for (VideoDeconstructResult r : results) {
|
|
|
+ if (!StringUtils.hasText(r.getResult())) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ Long videoId = r.getVideoId();
|
|
|
+ JSONObject dataContent = JSON.parseObject(r.getResult());
|
|
|
+ if (dataContent == null) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ tryCacheDecodeResult(videoId);
|
|
|
|
|
|
- try {
|
|
|
- inFlightLimiter.acquire();
|
|
|
- } catch (InterruptedException e) {
|
|
|
- Thread.currentThread().interrupt();
|
|
|
- return;
|
|
|
- }
|
|
|
- futures.add(embedExecutor.submit(() -> {
|
|
|
- try {
|
|
|
- List<String> texts = extractTextsFromDataContent(dataContent, config);
|
|
|
- if (CollectionUtils.isEmpty(texts)) {
|
|
|
- log.info("videoId={} 配置 {} 未提取到选题文本,跳过", videoId, configCode);
|
|
|
- totalFailCount.incrementAndGet();
|
|
|
- return;
|
|
|
- }
|
|
|
- int storeCount = vectorizeAndStore(config, videoId, texts);
|
|
|
- if (storeCount > 0) {
|
|
|
- totalSuccessCount.incrementAndGet();
|
|
|
- } else {
|
|
|
- totalFailCount.incrementAndGet();
|
|
|
- }
|
|
|
- } catch (Exception e) {
|
|
|
- log.error("处理 videoId={} 配置 {} 时发生异常: {}", videoId, configCode, e.getMessage(), e);
|
|
|
- totalFailCount.incrementAndGet();
|
|
|
- } finally {
|
|
|
- inFlightLimiter.release();
|
|
|
- }
|
|
|
- }));
|
|
|
+ try {
|
|
|
+ List<String> texts = extractTextsFromDataContent(dataContent, config);
|
|
|
+ if (CollectionUtils.isEmpty(texts)) {
|
|
|
+ log.info("videoId={} 配置 {} 未提取到选题文本,跳过", videoId, configCode);
|
|
|
+ totalFailCount.incrementAndGet();
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ int storeCount = vectorizeAndStore(config, videoId, texts);
|
|
|
+ if (storeCount > 0) {
|
|
|
+ totalSuccessCount.incrementAndGet();
|
|
|
+ } else {
|
|
|
+ totalFailCount.incrementAndGet();
|
|
|
}
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("处理 videoId={} 配置 {} 时发生异常: {}", videoId, configCode, e.getMessage(), e);
|
|
|
+ totalFailCount.incrementAndGet();
|
|
|
}
|
|
|
- } finally {
|
|
|
- awaitAndShutdown(futures, embedExecutor, 30, "AIGC embedding");
|
|
|
}
|
|
|
} catch (Exception e) {
|
|
|
log.error("配置 {} 处理异常: {}", configCode, e.getMessage(), e);
|
|
|
@@ -709,7 +670,7 @@ public class VideoVectorJob {
|
|
|
|
|
|
/**
|
|
|
* 从解构数据中按 点类型 + 置信度 + 实质/形式 + 贡献度 提取向量化文本
|
|
|
- *
|
|
|
+ * <p>
|
|
|
* 提取流程:
|
|
|
* 1. 从 final_result_path 获取最终点列表,按 confidence_field >= confidence_threshold 过滤
|
|
|
* 2. 对通过的点,从主数组 point_array_path 中匹配对应点的详细解构
|
|
|
@@ -895,6 +856,7 @@ public class VideoVectorJob {
|
|
|
|
|
|
/**
|
|
|
* 从 dataContent 中提取选题文本(向后兼容)
|
|
|
+ *
|
|
|
* @deprecated 请使用 extractTextsFromDataContent(dataContent, config)
|
|
|
*/
|
|
|
@Deprecated
|
|
|
@@ -1212,26 +1174,17 @@ public class VideoVectorJob {
|
|
|
}
|
|
|
log.info("配置 {} 需要处理 {} 个视频", configCode, needProcessIds.size());
|
|
|
|
|
|
- // 3. 从本地DB批量查询解构结果并并发embedding
|
|
|
- ExecutorService embedExecutor = Executors.newFixedThreadPool(VectorConstants.EMBEDDING_PARALLELISM);
|
|
|
- Semaphore inFlightLimiter = new Semaphore(VectorConstants.MAX_EMBEDDING_IN_FLIGHT);
|
|
|
- List<Future<?>> futures = new ArrayList<>();
|
|
|
-
|
|
|
- try {
|
|
|
- for (List<Long> batchIds : Lists.partition(needProcessIds, VectorConstants.ODPS_IN_BATCH_SIZE)) {
|
|
|
- List<VideoDeconstructResult> results = videoDeconstructResultMapperExt
|
|
|
- .selectResultsByVideoIds("result_log", batchIds);
|
|
|
- for (VideoDeconstructResult r : results) {
|
|
|
- if (!StringUtils.hasText(r.getResult())) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- submitLocalEmbeddingTask(r.getVideoId(), r.getResult(), config,
|
|
|
- embedExecutor, inFlightLimiter, futures,
|
|
|
- totalSuccessCount, totalFailCount, "result_log");
|
|
|
+ // 3. 从本地DB批量查询解构结果并顺序embedding
|
|
|
+ for (List<Long> batchIds : Lists.partition(needProcessIds, VectorConstants.ODPS_IN_BATCH_SIZE)) {
|
|
|
+ List<VideoDeconstructResult> results = videoDeconstructResultMapperExt
|
|
|
+ .selectResultsByVideoIds("result_log", batchIds);
|
|
|
+ for (VideoDeconstructResult r : results) {
|
|
|
+ if (!StringUtils.hasText(r.getResult())) {
|
|
|
+ continue;
|
|
|
}
|
|
|
+ executeEmbeddingTask(r.getVideoId(), r.getResult(), config,
|
|
|
+ totalSuccessCount, totalFailCount, "result_log");
|
|
|
}
|
|
|
- } finally {
|
|
|
- awaitAndShutdown(futures, embedExecutor, 30, "embedding");
|
|
|
}
|
|
|
} catch (Exception e) {
|
|
|
log.error("配置 {} 处理异常: {}", configCode, e.getMessage(), e);
|