소스 검색

优化现有召排--code-review

luojunhui 19 시간 전
부모
커밋
7b51959ffb

+ 4 - 2
.gitignore

@@ -41,8 +41,10 @@ config-cache/
 ### 回归测试生成的快照 ###
 script/recall_test_results/
 
-CLAUDE.md
-docs
+### Claude Code 本地配置(不提交,每个开发者自己的) ###
+.claude
 
 ### 测试配置(含本地密钥,禁止提交)###
 **/application-test-local.yml
+
+CLAUDE.md

+ 0 - 1
core/pom.xml

@@ -17,5 +17,4 @@
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     </properties>
 
-
 </project>

+ 14 - 7
core/src/main/java/com/tzld/videoVector/service/impl/MaterialSearchServiceImpl.java

@@ -20,6 +20,7 @@ import com.tzld.videoVector.model.po.pgVector.DeconstructVectorConfigExample;
 import com.tzld.videoVector.model.po.pgVector.MaterialDeconstructResult;
 import com.tzld.videoVector.model.po.pgVector.MaterialQuality;
 import com.tzld.videoVector.model.po.pgVector.MaterialVector;
+import com.tzld.videoVector.service.rank.RankingParams;
 import com.tzld.videoVector.model.vo.MaterialMatchResult;
 import com.tzld.videoVector.model.vo.MaterialQualityVO;
 import com.tzld.videoVector.model.vo.RecallMaterialScoreVO;
@@ -87,9 +88,6 @@ public class MaterialSearchServiceImpl implements MaterialSearchService {
     @Resource
     private MaterialDeconstructResultMapperExt materialDeconstructResultMapperExt;
 
-    // 默认参数——WP2 收敛:统一使用 RankingParams 和 VectorConstants
-    private static final double DEFAULT_ALPHA = 0.6;        // 对齐 RankingParams.alpha
-    private static final double DEFAULT_SIM_MIN = 0.65;     // 对齐 RankingParams.simThreshold
 
     // ================================================================ 入库
     @Override
@@ -310,8 +308,9 @@ public class MaterialSearchServiceImpl implements MaterialSearchService {
         int topN = param.getTopN() != null && param.getTopN() > 0 ? param.getTopN() : 10;
         int expansionFactor = param.getExpansionFactor() != null && param.getExpansionFactor() > 0
                 ? param.getExpansionFactor() : MULTI_POINT_RECALL_CANDIDATE_FACTOR;
-        double alpha = param.getAlpha() != null ? param.getAlpha() : DEFAULT_ALPHA;
-        double simMin = param.getSimMin() != null ? param.getSimMin() : DEFAULT_SIM_MIN;
+        RankingParams rankDefaults = RankingParams.defaults();
+        double alpha = param.getAlpha() != null ? param.getAlpha() : rankDefaults.getAlpha();
+        double simMin = param.getSimMin() != null ? param.getSimMin() : rankDefaults.getSimThreshold();
         String configCode = param.getConfigCode();
         if (!StringUtils.hasText(configCode)) {
             configCode = DEFAULT_CONFIG_CODE;
@@ -368,17 +367,19 @@ public class MaterialSearchServiceImpl implements MaterialSearchService {
     private <T> List<ScoredMaterial> buildModalityItems(String configCode, String modality,
                                                          List<T> candidates, int topN,
                                                          double simMin) {
+        double denom = 1.0 - simMin;
         List<ScoredMaterial> list = new ArrayList<>();
         for (T m : candidates) {
             double sim = getMatchScore(m);
             if (sim < simMin) continue;
+            double simNorm = denom > 0 ? clip01((sim - simMin) / denom) : 0;
             ScoredMaterial item = new ScoredMaterial();
             item.setConfigCode(configCode);
             item.setModality(modality);
             item.setSim(round4(sim));
             item.setQualityScore(0.5);
             item.setConfidence(0.0);
-            item.setFinalScore(round4(sim)); // 仅用 sim
+            item.setFinalScore(round4(simNorm));
             if (m instanceof VideoMatch) {
                 VideoMatch vm = (VideoMatch) m;
                 item.setVideoId(String.valueOf(vm.getVideoId()));
@@ -404,6 +405,7 @@ public class MaterialSearchServiceImpl implements MaterialSearchService {
         Map<String, MaterialQuality> qualityMap = batchGetMaterialQuality(materialIds);
         Map<String, JSONObject> deconstructMap = batchGetMaterialDeconstruct(materialIds);
 
+        double denom = 1.0 - simMin;
         List<ScoredMaterial> list = new ArrayList<>();
         for (MaterialMatch m : candidates) {
             double sim = m.getScore();
@@ -419,7 +421,8 @@ public class MaterialSearchServiceImpl implements MaterialSearchService {
                 qualityScore = 0.5;
                 confidence = 0;
             }
-            double finalScore = alpha * sim + (1 - alpha) * qualityScore;
+            double simNorm = denom > 0 ? clip01((sim - simMin) / denom) : 0;
+            double finalScore = alpha * simNorm + (1 - alpha) * qualityScore;
 
             ScoredMaterial item = new ScoredMaterial();
             item.setModality("MATERIAL");
@@ -738,6 +741,10 @@ public class MaterialSearchServiceImpl implements MaterialSearchService {
         }
     }
 
+    private static double clip01(double x) { // Clamps x into the closed range [0, 1].
+        return Math.max(0, Math.min(1, x)); // inner min caps the value at 1, outer max floors it at 0
+    }
+
     private static double round4(double v) { // Rounds v to four decimal places.
         return Math.round(v * 10000.0) / 10000.0; // scale up, round to the nearest long, scale back down
     }

+ 2 - 2
core/src/main/java/com/tzld/videoVector/service/rank/RankServiceImpl.java

@@ -151,10 +151,10 @@ public class RankServiceImpl implements RankService {
 
         double maxSim = sims.stream().max(Double::compare).orElse(0.0);
 
-        // 自适应 ROV 分位(从小样本估计 P5/P95
+        // 自适应 ROV 分位(从候选集估计 P5/P95,样本需足够大才有统计意义)
         double rovLow = baseParams.getRovClipLow();
         double rovHigh = baseParams.getRovClipHigh();
-        if (rovs.size() >= 10) {
+        if (rovs.size() >= 30) {
             rovs.sort(Double::compare);
             int p5Idx = Math.max(0, (int) (rovs.size() * 0.05));
             int p95Idx = Math.min(rovs.size() - 1, (int) (rovs.size() * 0.95));

+ 28 - 51
core/src/main/java/com/tzld/videoVector/service/recall/impl/VectorRecallTestServiceImpl.java

@@ -2,6 +2,8 @@ package com.tzld.videoVector.service.recall.impl;
 
 import com.alibaba.fastjson.JSON;
 import com.alibaba.fastjson.JSONArray;
+import java.util.function.Function;
+import java.util.function.ToDoubleFunction;
 import com.alibaba.fastjson.JSONObject;
 import com.tzld.videoVector.api.VideoApiService;
 import com.tzld.videoVector.common.constant.VectorConstants;
@@ -420,16 +422,19 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
         // ============================================================
 
         // 视频去重
-        List<VideoMatchResult> dedupedVideo = deduplicateRawVideo(allRawVideo);
+        List<VideoMatchResult> dedupedVideo = deduplicateRaw(allRawVideo,
+                VideoMatchResult::getVideoId, m -> m.getScore() != null ? m.getScore() : 0.0);
         // 素材去重 + 批量查 quality 做轻量预打分(不等 full enrich)
-        List<MaterialMatch> dedupedMaterial = deduplicateRawMaterial(allRawMaterial);
+        List<MaterialMatch> dedupedMaterial = deduplicateRaw(allRawMaterial,
+                MaterialMatch::getMaterialId, MaterialMatch::getScore);
         List<String> matIds = dedupedMaterial.stream()
                 .map(MaterialMatch::getMaterialId).distinct().limit(enrichK * 2)
                 .collect(Collectors.toList());
         Map<String, MaterialQuality> qualitySnapshot = matIds.isEmpty()
                 ? Collections.emptyMap() : loadMaterialQualityRows(matIds);
         // 文章去重
-        List<ArticleMatch> dedupedArticle = deduplicateRawArticle(allRawArticle);
+        List<ArticleMatch> dedupedArticle = deduplicateRaw(allRawArticle,
+                ArticleMatch::getArticleId, ArticleMatch::getScore);
 
         double preAlpha = RankingParams.defaults().getAlpha(); // 0.6
 
@@ -571,54 +576,26 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
         }
     }
 
-    /** Raw video ANN de-duplication: keeps the highest-scoring entry per videoId. */
-    private List<VideoMatchResult> deduplicateRawVideo(List<VideoMatchResult> all) {
-        if (CollectionUtils.isEmpty(all)) return Collections.emptyList();
-        Map<Long, VideoMatchResult> dedup = new LinkedHashMap<>();
-        for (VideoMatchResult m : all) {
-            if (m == null || m.getVideoId() == null) continue;
-            VideoMatchResult existing = dedup.get(m.getVideoId());
-            if (existing == null || (m.getScore() != null // a null-scored candidate never replaces an existing entry
-                    && (existing.getScore() == null || m.getScore() > existing.getScore()))) { // a scored candidate replaces a null-scored or lower-scored entry
-                dedup.put(m.getVideoId(), m);
-            }
-        }
-        return new ArrayList<>(dedup.values()).stream()
-                .sorted(Comparator.comparing(VideoMatchResult::getScore,
-                        Comparator.nullsLast(Comparator.reverseOrder()))) // descending by score, null scores last
-                .collect(Collectors.toList());
-    }
-
-    /** Raw material ANN de-duplication: keeps the highest-scoring entry per materialId. */
-    private List<MaterialMatch> deduplicateRawMaterial(List<MaterialMatch> all) {
-        if (CollectionUtils.isEmpty(all)) return Collections.emptyList();
-        Map<String, MaterialMatch> dedup = new LinkedHashMap<>();
-        for (MaterialMatch m : all) {
-            if (m == null || m.getMaterialId() == null) continue;
-            MaterialMatch existing = dedup.get(m.getMaterialId());
-            if (existing == null || m.getScore() > existing.getScore()) { // strict '>' keeps the first-seen entry on a tie
-                dedup.put(m.getMaterialId(), m);
-            }
-        }
-        return new ArrayList<>(dedup.values()).stream()
-                .sorted(Comparator.comparingDouble(MaterialMatch::getScore).reversed()) // descending by score
-                .collect(Collectors.toList());
-    }
-
-    /** Raw article ANN de-duplication: keeps the highest-scoring entry per articleId. */
-    private List<ArticleMatch> deduplicateRawArticle(List<ArticleMatch> all) {
+    /**
+     * Raw ANN de-duplication: keeps the highest-scoring item per ID and returns the
+     * survivors sorted by score in descending order.
+     * The three modalities use different idExtractor/scoreExtractor functions (a Video
+     * score may be null); the shared de-duplication logic is centralized here.
+     *
+     * NOTE(review): this method does no null-score handling itself. In the video call
+     * site shown in this change, a null score is mapped to 0.0 by the caller's
+     * scoreExtractor lambda, whereas the removed video-specific version ordered null
+     * scores last. Confirm that treating a missing score as 0.0 is intended.
+     *
+     * @param all            raw candidates; a list judged empty by CollectionUtils.isEmpty
+     *                       yields Collections.emptyList()
+     * @param idExtractor    maps an item to its de-duplication key; items whose key is
+     *                       null are skipped
+     * @param scoreExtractor maps an item to its score as a primitive double
+     * @return a new list with one item per distinct ID, highest score first
+     */
+    private static <T, K> List<T> deduplicateRaw(
+            List<T> all, Function<T, K> idExtractor, ToDoubleFunction<T> scoreExtractor) {
         if (CollectionUtils.isEmpty(all)) return Collections.emptyList(); // nothing to de-duplicate
-        Map<String, ArticleMatch> dedup = new LinkedHashMap<>();
-        for (ArticleMatch m : all) {
-            if (m == null || m.getArticleId() == null) continue;
-            ArticleMatch existing = dedup.get(m.getArticleId());
-            if (existing == null || m.getScore() > existing.getScore()) {
-                dedup.put(m.getArticleId(), m);
-            }
-        }
-        return new ArrayList<>(dedup.values()).stream()
-                .sorted(Comparator.comparingDouble(ArticleMatch::getScore).reversed())
-                .collect(Collectors.toList());
+        Map<K, T> dedup = new LinkedHashMap<>(); // best item seen so far for each ID
+        for (T item : all) {
+            if (item == null) continue; // skip null elements
+            K id = idExtractor.apply(item);
+            if (id == null) continue; // items without an ID are dropped
+            T existing = dedup.get(id);
+            if (existing == null || scoreExtractor.applyAsDouble(item) > scoreExtractor.applyAsDouble(existing)) { // strict '>' keeps the first-seen item on a tie
+                dedup.put(id, item);
+            }
+        }
+        List<T> result = new ArrayList<>(dedup.values());
+        result.sort(Comparator.comparingDouble(scoreExtractor::applyAsDouble).reversed()); // highest score first
+        return result;
     }
 
     /**
@@ -1148,7 +1125,7 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
         meta.title = targetPost.getString("title");
         String bodyText = targetPost.getString("body_text");
         if (StringUtils.hasText(bodyText)) {
-            meta.summary = bodyText.length() > 120 ? bodyText.substring(0, 120) : bodyText;
+            meta.summary = bodyText.length() > 200 ? bodyText.substring(0, 200) : bodyText;
         }
         meta.articleId = targetPost.getString("channel_content_id");