فهرست منبع

新增素材解构信息

luojunhui 1 هفته پیش
والد
کامیت
147440420f

+ 3 - 3
core/src/main/java/com/tzld/videoVector/model/param/recall/MatchByTextParam.java

@@ -18,12 +18,12 @@ public class MatchByTextParam {
      */
     private String configCode;
 
-    /** 取前 N 条(视频与素材合并后按 score 截断),缺省 50 */
+    /** 视频与素材各自的默认返回条数;未传 videoTopN / materialTopN 时分别回落到此值,缺省 50 */
     private Integer topN = 50;
 
-    /** 视频召回候选 Top-N(内部召回用,不传则与 topN 相同) */
+    /** 视频返回条数;不传则与 topN 相同 */
     private Integer videoTopN;
 
-    /** 素材召回候选 Top-N(内部召回用,不传则与 topN 相同) */
+    /** 素材返回条数;不传则与 topN 相同 */
     private Integer materialTopN;
 }

+ 11 - 0
core/src/main/java/com/tzld/videoVector/service/MaterialVectorStoreService.java

@@ -33,7 +33,18 @@ public interface MaterialVectorStoreService {
 
     List<Float> getVectorByTextHash(String textHash, String configCode);
 
+    /**
+     * 根据 text_hash 查询缓存的 embedding 原始字符串,不做 Float 解析/序列化,
+     * 直接传给 searchTopN 的 ::vector cast,避免 Java Float 回环精度损失。
+     */
+    String getRawVectorByTextHash(String textHash, String configCode);
+
     List<MaterialMatch> searchTopN(String configCode, List<Float> queryVector, int topN);
 
+    /**
+     * 用原始 embedding 字符串搜索(绕过 Java Float 回环)
+     */
+    List<MaterialMatch> searchTopNByRawVector(String configCode, String rawVector, int topN);
+
     List<MaterialMatch> searchTopNBySource(String configCode, List<Float> queryVector, int topN, Short sourceType);
 }

+ 83 - 4
core/src/main/java/com/tzld/videoVector/service/impl/PgMaterialVectorStoreServiceImpl.java

@@ -129,14 +129,82 @@ public class PgMaterialVectorStoreServiceImpl implements MaterialVectorStoreServ
         if (textHash == null || textHash.isEmpty() || configCode == null || configCode.isEmpty()) return null;
         try {
             MaterialVector mv = materialVectorMapperExt.selectByTextHashAndConfigCode(textHash, configCode);
-            if (mv == null) return null;
-            return VectorUtils.parseVectorString(mv.getEmbedding());
+            if (mv == null) {
+                log.info("getVectorByTextHash MISS: textHash={}, configCode={}", textHash, configCode);
+                return null;
+            }
+            if (mv.getEmbedding() == null) {
+                log.info("getVectorByTextHash HIT but embedding IS NULL: textHash={}, configCode={}, materialId={}",
+                        textHash, configCode, mv.getMaterialId());
+                return null;
+            }
+            List<Float> vector = VectorUtils.parseVectorString(mv.getEmbedding());
+            if (vector == null || vector.isEmpty()) {
+                log.info("getVectorByTextHash HIT but parseVectorString FAILED: textHash={}, configCode={}, embeddingLen={}",
+                        textHash, configCode, mv.getEmbedding().length());
+                return null;
+            }
+            log.info("getVectorByTextHash HIT OK: textHash={}, configCode={}, materialId={}, dim={}",
+                    textHash, configCode, mv.getMaterialId(), vector.size());
+            return vector;
         } catch (Exception e) {
             log.error("根据 text_hash 查询素材向量失败,hash={}, configCode={}, error={}", textHash, configCode, e.getMessage());
             return null;
         }
     }
 
+    /**
+     * Looks up the cached embedding for a text hash and returns it as the string stored on the
+     * matching row, without parsing it into floats.
+     *
+     * @param textHash   hash of the query text; null or empty yields null
+     * @param configCode vector config code; null or empty yields null
+     * @return the trimmed embedding text (a bracketed list), or null when no row matches, the
+     *         embedding is missing or malformed, or the lookup throws
+     */
+    @Override
+    public String getRawVectorByTextHash(String textHash, String configCode) {
+        if (textHash == null || textHash.isEmpty() || configCode == null || configCode.isEmpty()) return null;
+        try {
+            MaterialVector mv = materialVectorMapperExt.selectByTextHashAndConfigCode(textHash, configCode);
+            if (mv == null) {
+                log.info("getRawVectorByTextHash MISS: textHash={}, configCode={}", textHash, configCode);
+                return null;
+            }
+            String embedding = mv.getEmbedding();
+            // Trim once so the blank check, the format check and the returned value all see the same text.
+            String raw = embedding == null ? "" : embedding.trim();
+            if (raw.isEmpty()) {
+                log.info("getRawVectorByTextHash HIT but embedding IS NULL: textHash={}, configCode={}, materialId={}",
+                        textHash, configCode, mv.getMaterialId());
+                return null;
+            }
+            String preview = raw.substring(0, Math.min(80, raw.length()));
+            // Sanity-check the shape: a bracketed list "[...]" of at least 10 characters.
+            // A value that lost its closing bracket is rejected here so the caller can fall back.
+            if (raw.length() < 10 || !raw.startsWith("[") || !raw.endsWith("]")) {
+                log.info("getRawVectorByTextHash HIT but format SUSPECT: textHash={}, configCode={}, len={}, preview={}",
+                        textHash, configCode, raw.length(), preview);
+                return null;
+            }
+            log.info("getRawVectorByTextHash HIT OK: textHash={}, configCode={}, materialId={}, len={}, preview={}",
+                    textHash, configCode, mv.getMaterialId(), raw.length(), preview);
+            return raw;
+        } catch (Exception e) {
+            // Pass the throwable as the trailing argument so the stack trace is logged, not just the message.
+            log.error("getRawVectorByTextHash 异常,hash={}, configCode={}, error={}", textHash, configCode, e.getMessage(), e);
+            return null;
+        }
+    }
+
+    @Override
+    public List<MaterialMatch> searchTopNByRawVector(String configCode, String rawVector, int topN) {
+        if (rawVector == null || rawVector.isEmpty() || topN <= 0) {
+            return Collections.emptyList();
+        }
+        if (configCode == null || configCode.isEmpty()) {
+            log.error("searchTopNByRawVector configCode 不能为空");
+            return Collections.emptyList();
+        }
+        log.info("searchTopNByRawVector raw前100字符: {}, topN={}, configCode={}",
+                rawVector.substring(0, Math.min(100, rawVector.length())), topN, configCode);
+        List<MaterialVector> results = materialVectorMapperExt.searchTopN(configCode, rawVector, topN);
+        if (results == null || results.isEmpty()) {
+            log.info("素材向量库为空或无匹配结果,configCode={}", configCode);
+            return Collections.emptyList();
+        }
+        List<MaterialMatch> matches = convertToMatch(results, configCode);
+        log.info("searchTopNByRawVector DB返回 {} 行, configCode={}", results.size(), configCode);
+        return matches;
+    }
+
     @Override
     public List<MaterialMatch> searchTopN(String configCode, List<Float> queryVector, int topN) {
         if (queryVector == null || queryVector.isEmpty() || topN <= 0) {
@@ -148,13 +216,17 @@ public class PgMaterialVectorStoreServiceImpl implements MaterialVectorStoreServ
         }
 
         String queryVectorStr = vectorToString(queryVector);
+        log.info("searchTopN SQL vector前100字符: {}, topN={}, configCode={}",
+                queryVectorStr.substring(0, Math.min(100, queryVectorStr.length())), topN, configCode);
         List<MaterialVector> results = materialVectorMapperExt.searchTopN(configCode, queryVectorStr, topN);
         if (results == null || results.isEmpty()) {
             log.info("素材向量库为空或无匹配结果,configCode={}", configCode);
             return Collections.emptyList();
         }
 
-        return convertToMatch(results, configCode);
+        List<MaterialMatch> matches = convertToMatch(results, configCode);
+        log.info("searchTopN DB返回 {} 行, configCode={}", results.size(), configCode);
+        return matches;
     }
 
     @Override
@@ -197,7 +269,14 @@ public class PgMaterialVectorStoreServiceImpl implements MaterialVectorStoreServ
         StringBuilder sb = new StringBuilder("[");
         for (int i = 0; i < vector.size(); i++) {
             if (i > 0) sb.append(",");
-            sb.append(vector.get(i));
+            // Float.toString() 对 |v| < 1e-3 的值会输出科学计数法(如 6.399564E-4)
+            // pgvector 的 ::vector 只认标准十进制格式, 必须用 BigDecimal.toPlainString() 兜底
+            float v = vector.get(i);
+            String s = Float.toString(v);
+            if (s.indexOf('E') >= 0 || s.indexOf('e') >= 0) {
+                s = new java.math.BigDecimal(s).toPlainString();
+            }
+            sb.append(s);
         }
         sb.append("]");
         return sb.toString();

+ 6 - 1
core/src/main/java/com/tzld/videoVector/service/impl/PgVectorStoreServiceImpl.java

@@ -286,7 +286,12 @@ public class PgVectorStoreServiceImpl implements VectorStoreService {
         StringBuilder sb = new StringBuilder("[");
         for (int i = 0; i < vector.size(); i++) {
             if (i > 0) sb.append(",");
-            sb.append(vector.get(i));
+            float v = vector.get(i);
+            String s = Float.toString(v);
+            if (s.indexOf('E') >= 0 || s.indexOf('e') >= 0) {
+                s = new java.math.BigDecimal(s).toPlainString();
+            }
+            sb.append(s);
         }
         sb.append("]");
         return sb.toString();

+ 52 - 16
core/src/main/java/com/tzld/videoVector/service/recall/impl/VectorRecallTestServiceImpl.java

@@ -176,10 +176,8 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
             materialItems = Collections.emptyList();
         }
 
-        List<VideoMatchEnrichedVO> videoItems = limitEnrichedItemsByScore(
-                enrichVideoMatches(videoMatches, configCode), videoTopN);
-        materialItems = limitEnrichedItemsByScore(materialItems, materialTopN);
-        return buildResult(videoItems, materialItems, defaultTopN);
+        List<VideoMatchEnrichedVO> videoItems = enrichVideoMatches(videoMatches, configCode);
+        return buildResult(videoItems, materialItems);
     }
 
     private List<VideoMatchResult> limitVideoMatchesByScore(List<VideoMatchResult> matches, int topN) {
@@ -205,25 +203,56 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
     }
 
     /**
-     * 素材文本召回:material_vectors → material_deconstruct_result(已废弃 deconstruct_content)
+     * 素材文本召回:material_vectors → material_deconstruct_result
      */
     private List<VideoMatchEnrichedVO> recallMaterialItems(String queryText, String configCode, int topN) {
         try {
+            int candidate = Math.max(topN * VectorConstants.MULTI_POINT_RECALL_CANDIDATE_FACTOR,
+                    VectorConstants.MULTI_POINT_RECALL_MIN_CANDIDATES);
+
+            // 优先尝试 text_hash 缓存:直接用 PG 返回的原始 embedding 字符串搜索,
+            // 绕过 Java Float.parseFloat/Float.toString 回环的精度损失
+            String textHash = Md5Util.encoderByMd5(queryText);
+            if (StringUtils.hasText(textHash)) {
+                String rawVector = materialVectorStoreService.getRawVectorByTextHash(textHash, configCode);
+                if (rawVector != null && !rawVector.isEmpty()) {
+                    log.info("素材召回 使用缓存的原始向量字符串,跳过 Float 回环, configCode={}", configCode);
+                    List<MaterialMatch> raw = materialVectorStoreService.searchTopNByRawVector(
+                            configCode, rawVector, candidate);
+                    List<MaterialMatch> matches = deduplicateMaterialMatches(raw, topN);
+                    if (!CollectionUtils.isEmpty(matches)) {
+                        List<String> matchSample = new ArrayList<>();
+                        for (MaterialMatch m : matches) {
+                            matchSample.add(m.getMaterialId() + ":" + String.format("%.4f", m.getScore()));
+                        }
+                        log.info("素材召回(rawVector) 去重后({}条): {}, configCode={}",
+                                matches.size(), matchSample, configCode);
+                        return limitEnrichedItemsByScore(enrichMaterialMatches(matches, configCode), topN);
+                    }
+                    log.info("素材召回(rawVector) 无结果, configCode={}", configCode);
+                    return Collections.emptyList();
+                }
+                log.info("素材召回 text_hash 缓存未命中, textHash={}, 降级到 embedding API", textHash);
+            }
+
+            // 降级:embedding API → Float 向量 → 搜索(非缓存路径,容忍精度损失)
             List<Float> queryVector = resolveQueryVectorForMaterial(queryText, configCode);
             if (queryVector == null || queryVector.isEmpty()) {
                 log.info("素材召回: 无法获取查询向量, queryText={}", queryText);
                 return Collections.emptyList();
             }
-            // 多点配置下同一素材会被多个 point 命中,需取 topN * 倍数 候选后在应用层按 materialId 去重
-            int candidate = Math.max(topN * VectorConstants.MULTI_POINT_RECALL_CANDIDATE_FACTOR,
-                    VectorConstants.MULTI_POINT_RECALL_MIN_CANDIDATES);
+            log.info("素材召回 使用 embedding API 向量, dim={}", queryVector.size());
             List<MaterialMatch> raw = materialVectorStoreService.searchTopN(configCode, queryVector, candidate);
             List<MaterialMatch> matches = deduplicateMaterialMatches(raw, topN);
             if (CollectionUtils.isEmpty(matches)) {
                 log.info("素材召回 material_vectors 无结果, configCode={}", configCode);
                 return Collections.emptyList();
             }
-            log.info("素材召回 material_vectors 命中 {} 条, configCode={}", matches.size(), configCode);
+            List<String> matchSample = new ArrayList<>();
+            for (MaterialMatch m : matches) {
+                matchSample.add(m.getMaterialId() + ":" + String.format("%.4f", m.getScore()));
+            }
+            log.info("素材召回(embedding API) 去重后({}条): {}, configCode={}", matches.size(), matchSample, configCode);
             return limitEnrichedItemsByScore(enrichMaterialMatches(matches, configCode), topN);
         } catch (Exception e) {
             log.error("素材召回 material_vectors 异常: {}", e.getMessage(), e);
@@ -243,19 +272,28 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
             config = new DeconstructVectorConfig();
             config.setConfigCode(configCode);
         }
+        log.info("resolveQueryVectorForMaterial: queryText={}, configCode={}, model={}, dim={}",
+                queryText, configCode, config.getEmbeddingModel(), config.getDimension());
 
         // 1. 先查 material_vectors 的 text_hash 缓存
         String textHash = Md5Util.encoderByMd5(queryText);
         if (StringUtils.hasText(textHash)) {
+            log.info("resolveQueryVectorForMaterial textHash={}, 开始查 text_hash 缓存", textHash);
             List<Float> cached = materialVectorStoreService.getVectorByTextHash(textHash, configCode);
             if (cached != null && !cached.isEmpty()) {
+                log.info("resolveQueryVectorForMaterial 命中 text_hash 缓存,dim={}", cached.size());
                 return cached;
             }
+            log.info("resolveQueryVectorForMaterial text_hash 缓存未命中,降级到 embedding API");
         }
 
         // 2. 调用 embedding API(与入库时相同的 model / dimension)
         try {
-            return embeddingService.embed(queryText, config);
+            log.info("resolveQueryVectorForMaterial 调用 embedding API: text={}, model={}, dim={}",
+                    queryText, config.getEmbeddingModel(), config.getDimension());
+            List<Float> result = embeddingService.embed(queryText, config);
+            log.info("resolveQueryVectorForMaterial embedding API 返回, dim={}", result != null ? result.size() : 0);
+            return result;
         } catch (Exception e) {
             log.error("素材召回 embedding 失败: queryText={}, error={}", queryText, e.getMessage());
             return null;
@@ -614,13 +652,11 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
     }
 
     /**
-     * 组装返回结果:视频 + 素材合并为 items
-     * 各模态在前置链路里已按各自 topN 截断,这里仅做拼接 + 计数,
-     * 不再合并截断(否则视频分数普遍较高会把素材全部挤掉)。
+     * 组装返回结果:视频 + 素材合并为 items。
+     * 各模态在前置链路已按 videoTopN / materialTopN 各自截断,此处仅拼接 + 计数,不做合并截断。
      */
     private RecallResultVO buildResult(List<VideoMatchEnrichedVO> videoItems,
-                                       List<VideoMatchEnrichedVO> materialItems,
-                                       int topN) {
+                                       List<VideoMatchEnrichedVO> materialItems) {
         if (videoItems == null) {
             videoItems = Collections.emptyList();
         }
@@ -694,7 +730,7 @@ public class VectorRecallTestServiceImpl implements VectorRecallTestService {
         String configCode = StringUtils.hasText(param.getConfigCode())
                 ? param.getConfigCode() : VectorConstants.DEFAULT_CONFIG_CODE;
         List<VideoMatchEnrichedVO> videoItems = enrichVideoMatches(rawMatches, configCode);
-        return buildResult(videoItems, Collections.emptyList(), matchParam.getTopN());
+        return buildResult(videoItems, Collections.emptyList());
     }
 
     @Override

+ 3 - 3
core/src/main/resources/mapper/pgVector/ext/MaterialVectorMapperExt.xml

@@ -7,7 +7,7 @@
         <id column="id" property="id" jdbcType="BIGINT"/>
         <result column="material_id" jdbcType="VARCHAR" property="materialId"/>
         <result column="config_code" property="configCode" jdbcType="VARCHAR"/>
-        <result column="embedding" property="embedding" jdbcType="OTHER"/>
+        <result column="embedding" property="embedding" jdbcType="VARCHAR"/>
         <result column="created_at" property="createdAt" jdbcType="TIMESTAMP"/>
         <result column="updated_at" property="updatedAt" jdbcType="TIMESTAMP"/>
         <result column="point_index" property="pointIndex" jdbcType="INTEGER"/>
@@ -106,13 +106,13 @@
         LIMIT #{topN}
     </select>
 
-    <!-- 根据 text_hash 查询向量 -->
+    <!-- 根据 text_hash 查询向量,embedding::text 保证 PG 输出高精度文本,避免 JDBC 驱动 PGobject.getValue() 精度损失 -->
     <select id="selectByTextHashAndConfigCode" resultMap="MaterialVectorResultMap">
         SELECT
             id,
             material_id,
             config_code,
-            embedding,
+            embedding::text AS embedding,
             created_at,
             updated_at,
             point_index,

+ 2 - 2
server/src/main/java/com/tzld/videoVector/controller/VectorRecallTestController.java

@@ -41,8 +41,8 @@ public class VectorRecallTestController {
     /**
      * 文本召回 (Tab2)
      * POST /videoVector/recallTest/matchByText
-     * body: { "queryText": "...", "configCode": "VIDEO_TOPIC", "topN": 50 }
-     * 返回 items 扁平列表,前端按 modality 分组展示 Tab
+     * body: { "queryText": "...", "configCode": "VIDEO_TOPIC", "topN": 50, "videoTopN": 50, "materialTopN": 50 }
+     * 视频与素材各自按 videoTopN / materialTopN 返回(未传则与 topN 相同),items 合并后不再截断
      */
     @PostMapping("/matchByText")
     public CommonResponse<RecallResultVO> matchByText(@RequestBody MatchByTextParam param) {