|
@@ -2,6 +2,8 @@ package com.tzld.videoVector.service.recall.impl;
|
|
|
|
|
|
|
|
import com.alibaba.fastjson.JSON;
|
|
import com.alibaba.fastjson.JSON;
|
|
|
import com.alibaba.fastjson.JSONArray;
|
|
import com.alibaba.fastjson.JSONArray;
|
|
|
|
|
+import java.util.function.Function;
|
|
|
|
|
+import java.util.function.ToDoubleFunction;
|
|
|
import com.alibaba.fastjson.JSONObject;
|
|
import com.alibaba.fastjson.JSONObject;
|
|
|
import com.tzld.videoVector.api.VideoApiService;
|
|
import com.tzld.videoVector.api.VideoApiService;
|
|
|
import com.tzld.videoVector.common.constant.VectorConstants;
|
|
import com.tzld.videoVector.common.constant.VectorConstants;
|
|
@@ -420,16 +422,19 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
|
|
|
// ============================================================
|
|
// ============================================================
|
|
|
|
|
|
|
|
// 视频去重
|
|
// 视频去重
|
|
|
- List<VideoMatchResult> dedupedVideo = deduplicateRawVideo(allRawVideo);
|
|
|
|
|
|
|
+ List<VideoMatchResult> dedupedVideo = deduplicateRaw(allRawVideo,
|
|
|
|
|
+ VideoMatchResult::getVideoId, m -> m.getScore() != null ? m.getScore() : 0.0);
|
|
|
// 素材去重 + 批量查 quality 做轻量预打分(不等 full enrich)
|
|
// 素材去重 + 批量查 quality 做轻量预打分(不等 full enrich)
|
|
|
- List<MaterialMatch> dedupedMaterial = deduplicateRawMaterial(allRawMaterial);
|
|
|
|
|
|
|
+ List<MaterialMatch> dedupedMaterial = deduplicateRaw(allRawMaterial,
|
|
|
|
|
+ MaterialMatch::getMaterialId, MaterialMatch::getScore);
|
|
|
List<String> matIds = dedupedMaterial.stream()
|
|
List<String> matIds = dedupedMaterial.stream()
|
|
|
.map(MaterialMatch::getMaterialId).distinct().limit(enrichK * 2)
|
|
.map(MaterialMatch::getMaterialId).distinct().limit(enrichK * 2)
|
|
|
.collect(Collectors.toList());
|
|
.collect(Collectors.toList());
|
|
|
Map<String, MaterialQuality> qualitySnapshot = matIds.isEmpty()
|
|
Map<String, MaterialQuality> qualitySnapshot = matIds.isEmpty()
|
|
|
? Collections.emptyMap() : loadMaterialQualityRows(matIds);
|
|
? Collections.emptyMap() : loadMaterialQualityRows(matIds);
|
|
|
// 文章去重
|
|
// 文章去重
|
|
|
- List<ArticleMatch> dedupedArticle = deduplicateRawArticle(allRawArticle);
|
|
|
|
|
|
|
+ List<ArticleMatch> dedupedArticle = deduplicateRaw(allRawArticle,
|
|
|
|
|
+ ArticleMatch::getArticleId, ArticleMatch::getScore);
|
|
|
|
|
|
|
|
double preAlpha = RankingParams.defaults().getAlpha(); // 0.6
|
|
double preAlpha = RankingParams.defaults().getAlpha(); // 0.6
|
|
|
|
|
|
|
@@ -571,54 +576,26 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /** 视频原始 ANN 去重:同一 videoId 保留最高分 */
|
|
|
|
|
- private List<VideoMatchResult> deduplicateRawVideo(List<VideoMatchResult> all) {
|
|
|
|
|
- if (CollectionUtils.isEmpty(all)) return Collections.emptyList();
|
|
|
|
|
- Map<Long, VideoMatchResult> dedup = new LinkedHashMap<>();
|
|
|
|
|
- for (VideoMatchResult m : all) {
|
|
|
|
|
- if (m == null || m.getVideoId() == null) continue;
|
|
|
|
|
- VideoMatchResult existing = dedup.get(m.getVideoId());
|
|
|
|
|
- if (existing == null || (m.getScore() != null
|
|
|
|
|
- && (existing.getScore() == null || m.getScore() > existing.getScore()))) {
|
|
|
|
|
- dedup.put(m.getVideoId(), m);
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- return new ArrayList<>(dedup.values()).stream()
|
|
|
|
|
- .sorted(Comparator.comparing(VideoMatchResult::getScore,
|
|
|
|
|
- Comparator.nullsLast(Comparator.reverseOrder())))
|
|
|
|
|
- .collect(Collectors.toList());
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- /** 素材原始 ANN 去重:同一 materialId 保留最高分 */
|
|
|
|
|
- private List<MaterialMatch> deduplicateRawMaterial(List<MaterialMatch> all) {
|
|
|
|
|
- if (CollectionUtils.isEmpty(all)) return Collections.emptyList();
|
|
|
|
|
- Map<String, MaterialMatch> dedup = new LinkedHashMap<>();
|
|
|
|
|
- for (MaterialMatch m : all) {
|
|
|
|
|
- if (m == null || m.getMaterialId() == null) continue;
|
|
|
|
|
- MaterialMatch existing = dedup.get(m.getMaterialId());
|
|
|
|
|
- if (existing == null || m.getScore() > existing.getScore()) {
|
|
|
|
|
- dedup.put(m.getMaterialId(), m);
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- return new ArrayList<>(dedup.values()).stream()
|
|
|
|
|
- .sorted(Comparator.comparingDouble(MaterialMatch::getScore).reversed())
|
|
|
|
|
- .collect(Collectors.toList());
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- /** 文章原始 ANN 去重:同一 articleId 保留最高分 */
|
|
|
|
|
- private List<ArticleMatch> deduplicateRawArticle(List<ArticleMatch> all) {
|
|
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 原始 ANN 去重:同一 ID 保留最高分,按分降序排列。
|
|
|
|
|
+ * 三类模态的 idExtractor/scoreExtractor 不同(Video 的 score 可为 null),在此集中处理。
|
|
|
|
|
+ */
|
|
|
|
|
+ private static <T, K> List<T> deduplicateRaw(
|
|
|
|
|
+ List<T> all, Function<T, K> idExtractor, ToDoubleFunction<T> scoreExtractor) {
|
|
|
if (CollectionUtils.isEmpty(all)) return Collections.emptyList();
|
|
if (CollectionUtils.isEmpty(all)) return Collections.emptyList();
|
|
|
- Map<String, ArticleMatch> dedup = new LinkedHashMap<>();
|
|
|
|
|
- for (ArticleMatch m : all) {
|
|
|
|
|
- if (m == null || m.getArticleId() == null) continue;
|
|
|
|
|
- ArticleMatch existing = dedup.get(m.getArticleId());
|
|
|
|
|
- if (existing == null || m.getScore() > existing.getScore()) {
|
|
|
|
|
- dedup.put(m.getArticleId(), m);
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- return new ArrayList<>(dedup.values()).stream()
|
|
|
|
|
- .sorted(Comparator.comparingDouble(ArticleMatch::getScore).reversed())
|
|
|
|
|
- .collect(Collectors.toList());
|
|
|
|
|
|
|
+ Map<K, T> dedup = new LinkedHashMap<>();
|
|
|
|
|
+ for (T item : all) {
|
|
|
|
|
+ if (item == null) continue;
|
|
|
|
|
+ K id = idExtractor.apply(item);
|
|
|
|
|
+ if (id == null) continue;
|
|
|
|
|
+ T existing = dedup.get(id);
|
|
|
|
|
+ if (existing == null || scoreExtractor.applyAsDouble(item) > scoreExtractor.applyAsDouble(existing)) {
|
|
|
|
|
+ dedup.put(id, item);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ List<T> result = new ArrayList<>(dedup.values());
|
|
|
|
|
+ result.sort(Comparator.comparingDouble(scoreExtractor::applyAsDouble).reversed());
|
|
|
|
|
+ return result;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
@@ -1148,7 +1125,7 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
|
|
|
meta.title = targetPost.getString("title");
|
|
meta.title = targetPost.getString("title");
|
|
|
String bodyText = targetPost.getString("body_text");
|
|
String bodyText = targetPost.getString("body_text");
|
|
|
if (StringUtils.hasText(bodyText)) {
|
|
if (StringUtils.hasText(bodyText)) {
|
|
|
- meta.summary = bodyText.length() > 120 ? bodyText.substring(0, 120) : bodyText;
|
|
|
|
|
|
|
+ meta.summary = bodyText.length() > 200 ? bodyText.substring(0, 200) : bodyText;
|
|
|
}
|
|
}
|
|
|
meta.articleId = targetPost.getString("channel_content_id");
|
|
meta.articleId = targetPost.getString("channel_content_id");
|
|
|
|
|
|