Browse Source

新增六种向量化内容(灵感点-实质、关键点-实质、目的点-实质、灵感点-形式、关键点-形式、目的点-形式)

wangyunpeng cách đây 1 tuần
mục cha
commit
7fe4f49d3b
1 tập tin đã thay đổi với 175 bổ sung và 1 xóa
  1. 175 1
      core/src/main/java/com/tzld/videoVector/job/VideoVectorJob.java

+ 175 - 1
core/src/main/java/com/tzld/videoVector/job/VideoVectorJob.java

@@ -689,6 +689,15 @@ public class VideoVectorJob {
 
         String extractRule = config.getExtractRule();
         if (StringUtils.hasText(extractRule)) {
+            // 检查是否为 point_decomposition 类型
+            try {
+                JSONObject rule = JSON.parseObject(extractRule);
+                if ("point_decomposition".equals(rule.getString("type"))) {
+                    return extractTextsFromPointDecomposition(dataContent, rule);
+                }
+            } catch (Exception e) {
+                // 不是JSON或无type字段,走原有逻辑
+            }
             // 多点模式:使用配置的 sourcePath 中相对路径 + 置信度过滤
             // AIGC dataContent 的结构与 raw_result 中 final_normalization_rebuild 下的子结构一致
             return extractTextsWithConfidence(dataContent, config.getSourcePath(), extractRule);
@@ -698,6 +707,170 @@ public class VideoVectorJob {
         }
     }
 
+    /**
+     * Extracts the texts to vectorize from a decomposition document, filtering by
+     * point confidence, by substance/form target and by contribution score.
+     *
+     * <p>Steps:
+     * <ol>
+     *   <li>Read the final point list at {@code final_result_path} and keep the points that
+     *       pass {@code isConfidenceQualified} for {@code confidence_field} /
+     *       {@code confidence_threshold}.</li>
+     *   <li>Look up each kept point in the array at {@code point_array_path}, matching on the
+     *       detail's "点" field (the first occurrence of a name wins).</li>
+     *   <li>Collect the sub-item names of the matched point: the "实质" names when
+     *       {@code target} is "substance", otherwise the "形式" names.</li>
+     *   <li>Keep the names whose score in the {@code contribution_path} table is
+     *       {@code >= contribution_threshold}.</li>
+     * </ol>
+     *
+     * <p>Any exception raised along the way is logged and the texts collected so far are
+     * returned; this method does not propagate exceptions.
+     *
+     * @param dataContent decomposition JSON data
+     * @param rule        extraction rule (the parsed {@code extractRule} configuration)
+     * @return the texts that satisfy every filter; empty when the rule is incomplete or
+     *         nothing qualifies
+     */
+    private List<String> extractTextsFromPointDecomposition(JSONObject dataContent, JSONObject rule) {
+        List<String> texts = new ArrayList<>();
+
+        try {
+            String pointArrayPath = rule.getString("point_array_path");
+            String finalResultPath = rule.getString("final_result_path");
+            String pointNameField = rule.getString("point_name_field");
+            String confidenceField = rule.getString("confidence_field");
+            double confidenceThreshold = rule.getDoubleValue("confidence_threshold");
+            String target = rule.getString("target"); // "substance" or "form"
+            String contributionPath = rule.getString("contribution_path");
+            double contributionThreshold = rule.getDoubleValue("contribution_threshold");
+
+            // Reject an incomplete rule up front instead of querying a literal "null[*]" path.
+            if (!StringUtils.hasText(finalResultPath) || !StringUtils.hasText(pointArrayPath)
+                    || !StringUtils.hasText(pointNameField)) {
+                log.warn("point_decomposition: 规则缺少必要配置, final_result_path={}, point_array_path={}, point_name_field={}",
+                        finalResultPath, pointArrayPath, pointNameField);
+                return texts;
+            }
+
+            // Anything other than "substance" is handled as "form" (unchanged behaviour);
+            // surface unexpected values so a misconfigured rule is visible in the logs.
+            boolean substanceTarget = "substance".equals(target);
+            if (!substanceTarget && !"form".equals(target)) {
+                log.warn("point_decomposition: 未知 target='{}', 按 form 处理", target);
+            }
+
+            // 1. Load the final result list and keep the points that pass the confidence filter.
+            List<JSONObject> finalPoints = VectorUtils.extractArrayItemsFromJson(dataContent, finalResultPath + "[*]");
+            List<String> qualifiedPointNames = new ArrayList<>();
+            for (JSONObject fp : finalPoints) {
+                if (isConfidenceQualified(fp, confidenceField, confidenceThreshold)) {
+                    String pointName = fp.getString(pointNameField);
+                    if (StringUtils.hasText(pointName)) {
+                        qualifiedPointNames.add(pointName);
+                    }
+                }
+            }
+
+            if (qualifiedPointNames.isEmpty()) {
+                log.info("point_decomposition: 无满足置信度的点, path={}", finalResultPath);
+                return texts;
+            }
+            log.info("point_decomposition: 置信度通过 {} 个点: {}", qualifiedPointNames.size(), qualifiedPointNames);
+
+            // 2. Index the detailed points by their "点" name once, so each qualified point is
+            //    resolved with a map lookup instead of a rescan of the whole array.
+            //    putIfAbsent keeps the first detail seen for a name, like a first-match scan.
+            List<JSONObject> pointDetails = VectorUtils.extractArrayItemsFromJson(dataContent, pointArrayPath + "[*]");
+            Map<String, JSONObject> detailByName = new HashMap<>();
+            for (JSONObject detail : pointDetails) {
+                if (detail == null) {
+                    continue;
+                }
+                String detailName = detail.getString("点");
+                if (detailName != null) {
+                    detailByName.putIfAbsent(detailName, detail);
+                }
+            }
+
+            // 3. Build the word -> contribution lookup table.
+            Map<String, Double> contributionMap = buildContributionMap(dataContent, contributionPath);
+
+            // 4. For every qualified point, collect the target sub-items and apply the
+            //    contribution threshold.
+            for (String pointName : qualifiedPointNames) {
+                JSONObject matchedPoint = detailByName.get(pointName);
+                if (matchedPoint == null) {
+                    log.info("point_decomposition: 未找到点 '{}' 的详情", pointName);
+                    continue;
+                }
+
+                List<String> itemNames = substanceTarget
+                        ? extractSubstanceNames(matchedPoint)
+                        : extractFormNames(matchedPoint);
+
+                for (String name : itemNames) {
+                    Double contribution = contributionMap.get(name);
+                    // Names without a contribution entry are dropped.
+                    if (contribution != null && contribution >= contributionThreshold) {
+                        texts.add(name);
+                        log.info("point_decomposition: 通过贡献度检查, 点='{}', 名称='{}', 贡献度={}",
+                                pointName, name, contribution);
+                    }
+                }
+            }
+
+            log.info("point_decomposition: target={}, 最终提取文本数={}", target, texts.size());
+
+        } catch (Exception e) {
+            log.error("extractTextsFromPointDecomposition 失败: {}", e.getMessage(), e);
+        }
+
+        return texts;
+    }
+
+    /**
+     * Gathers the "名称" values found under a point's "实质" object.
+     * The arrays "具体元素", "具象概念" and "抽象概念" are visited in that order and their
+     * names are appended to a single list.
+     *
+     * @param point detailed point object
+     * @return the collected names; empty when the point has no "实质" object
+     */
+    private List<String> extractSubstanceNames(JSONObject point) {
+        List<String> collected = new ArrayList<>();
+        JSONObject substancePart = point.getJSONObject("实质");
+        if (substancePart != null) {
+            for (String category : new String[] {"具体元素", "具象概念", "抽象概念"}) {
+                collectNamesFromArray(substancePart.getJSONArray(category), collected);
+            }
+        }
+        return collected;
+    }
+
+    /**
+     * Gathers the "名称" values found under a point's "形式" object.
+     * The arrays "具体元素形式", "具象概念形式" and "整体形式" are visited in that order and
+     * their names are appended to a single list.
+     *
+     * @param point detailed point object
+     * @return the collected names; empty when the point has no "形式" object
+     */
+    private List<String> extractFormNames(JSONObject point) {
+        List<String> collected = new ArrayList<>();
+        JSONObject formPart = point.getJSONObject("形式");
+        if (formPart != null) {
+            for (String category : new String[] {"具体元素形式", "具象概念形式", "整体形式"}) {
+                collectNamesFromArray(formPart.getJSONArray(category), collected);
+            }
+        }
+        return collected;
+    }
+
+    /**
+     * Appends the non-blank "名称" value of every element of {@code array} to {@code names}.
+     * A null or empty array, and null elements, contribute nothing.
+     *
+     * @param array JSON array of objects, may be null
+     * @param names output list that receives the names
+     */
+    private void collectNamesFromArray(JSONArray array, List<String> names) {
+        if (array != null && !array.isEmpty()) {
+            int index = 0;
+            while (index < array.size()) {
+                JSONObject entry = array.getJSONObject(index);
+                index++;
+                if (entry == null) {
+                    continue;
+                }
+                String label = entry.getString("名称");
+                if (StringUtils.hasText(label)) {
+                    names.add(label);
+                }
+            }
+        }
+    }
+
+    /**
+     * Builds the contribution lookup table (word -> contribution score).
+     *
+     * <p>Reads the array at {@code contributionPath} and maps each entry's "词" to its
+     * "贡献度". Entries with a blank word or no score are skipped; when a word appears
+     * more than once the last entry wins. Failures are logged (with stack trace) and the
+     * entries gathered so far are returned.
+     *
+     * @param dataContent      decomposition JSON data
+     * @param contributionPath path of the contribution array inside {@code dataContent}
+     * @return the lookup table; empty when the path is blank or nothing could be read
+     */
+    private Map<String, Double> buildContributionMap(JSONObject dataContent, String contributionPath) {
+        Map<String, Double> map = new HashMap<>();
+        // Without a configured path there is nothing to read; avoid querying "null[*]".
+        if (!StringUtils.hasText(contributionPath)) {
+            log.warn("构建贡献度查找表: contribution_path 未配置");
+            return map;
+        }
+        try {
+            List<JSONObject> contributions = VectorUtils.extractArrayItemsFromJson(dataContent, contributionPath + "[*]");
+            for (JSONObject c : contributions) {
+                String word = c.getString("词");
+                Double contribution = c.getDouble("贡献度");
+                if (StringUtils.hasText(word) && contribution != null) {
+                    map.put(word, contribution);
+                }
+            }
+        } catch (Exception e) {
+            // Pass the exception as the last argument so the stack trace is logged as well.
+            log.error("构建贡献度查找表失败: {}", e.getMessage(), e);
+        }
+        return map;
+    }
+
     /**
      * 从 dataContent 中提取选题文本(向后兼容)
      * @deprecated 请使用 extractTextsFromDataContent(dataContent, config)
@@ -978,7 +1151,8 @@ public class VideoVectorJob {
         DeconstructVectorConfigExample example = new DeconstructVectorConfigExample();
         example.createCriteria()
                 .andEnabledEqualTo((short) 1)
-                .andSourceFieldEqualTo(sourceField);
+                .andSourceFieldEqualTo(sourceField)
+                .andIdGreaterThan(14L);
         example.setOrderByClause("priority ASC");
         return vectorConfigMapper.selectByExample(example);
     }