|
@@ -689,6 +689,15 @@ public class VideoVectorJob {
|
|
|
|
|
|
|
|
String extractRule = config.getExtractRule();
|
|
String extractRule = config.getExtractRule();
|
|
|
if (StringUtils.hasText(extractRule)) {
|
|
if (StringUtils.hasText(extractRule)) {
|
|
|
|
|
+ // 检查是否为 point_decomposition 类型
|
|
|
|
|
+ try {
|
|
|
|
|
+ JSONObject rule = JSON.parseObject(extractRule);
|
|
|
|
|
+ if ("point_decomposition".equals(rule.getString("type"))) {
|
|
|
|
|
+ return extractTextsFromPointDecomposition(dataContent, rule);
|
|
|
|
|
+ }
|
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
|
+ // 不是JSON或无type字段,走原有逻辑
|
|
|
|
|
+ }
|
|
|
// 多点模式:使用配置的 sourcePath 中相对路径 + 置信度过滤
|
|
// 多点模式:使用配置的 sourcePath 中相对路径 + 置信度过滤
|
|
|
// AIGC dataContent 的结构与 raw_result 中 final_normalization_rebuild 下的子结构一致
|
|
// AIGC dataContent 的结构与 raw_result 中 final_normalization_rebuild 下的子结构一致
|
|
|
return extractTextsWithConfidence(dataContent, config.getSourcePath(), extractRule);
|
|
return extractTextsWithConfidence(dataContent, config.getSourcePath(), extractRule);
|
|
@@ -698,6 +707,170 @@ public class VideoVectorJob {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 从解构数据中按 点类型 + 置信度 + 实质/形式 + 贡献度 提取向量化文本
|
|
|
|
|
+ *
|
|
|
|
|
+ * 提取流程:
|
|
|
|
|
+ * 1. 从 final_result_path 获取最终点列表,按 confidence_field >= confidence_threshold 过滤
|
|
|
|
|
+ * 2. 对通过的点,从主数组 point_array_path 中匹配对应点的详细解构
|
|
|
|
|
+ * 3. 根据 target (substance/form) 收集子项名称
|
|
|
|
|
+ * 4. 在 contribution_results 中查找对应词的贡献度,贡献度 >= contribution_threshold 的进行向量化
|
|
|
|
|
+ *
|
|
|
|
|
+ * @param dataContent 解构JSON数据
|
|
|
|
|
+ * @param rule 提取规则
|
|
|
|
|
+ * @return 满足条件的文本列表
|
|
|
|
|
+ */
|
|
|
|
|
+ private List<String> extractTextsFromPointDecomposition(JSONObject dataContent, JSONObject rule) {
|
|
|
|
|
+ List<String> texts = new ArrayList<>();
|
|
|
|
|
+
|
|
|
|
|
+ try {
|
|
|
|
|
+ String pointArrayPath = rule.getString("point_array_path");
|
|
|
|
|
+ String finalResultPath = rule.getString("final_result_path");
|
|
|
|
|
+ String pointNameField = rule.getString("point_name_field");
|
|
|
|
|
+ String confidenceField = rule.getString("confidence_field");
|
|
|
|
|
+ double confidenceThreshold = rule.getDoubleValue("confidence_threshold");
|
|
|
|
|
+ String target = rule.getString("target"); // "substance" or "form"
|
|
|
|
|
+ String contributionPath = rule.getString("contribution_path");
|
|
|
|
|
+ double contributionThreshold = rule.getDoubleValue("contribution_threshold");
|
|
|
|
|
+
|
|
|
|
|
+ // 1. 获取最终结果列表,过滤置信度
|
|
|
|
|
+ List<JSONObject> finalPoints = VectorUtils.extractArrayItemsFromJson(dataContent, finalResultPath + "[*]");
|
|
|
|
|
+ List<String> qualifiedPointNames = new ArrayList<>();
|
|
|
|
|
+ for (JSONObject fp : finalPoints) {
|
|
|
|
|
+ if (isConfidenceQualified(fp, confidenceField, confidenceThreshold)) {
|
|
|
|
|
+ String pointName = fp.getString(pointNameField);
|
|
|
|
|
+ if (StringUtils.hasText(pointName)) {
|
|
|
|
|
+ qualifiedPointNames.add(pointName);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if (qualifiedPointNames.isEmpty()) {
|
|
|
|
|
+ log.info("point_decomposition: 无满足置信度的点, path={}", finalResultPath);
|
|
|
|
|
+ return texts;
|
|
|
|
|
+ }
|
|
|
|
|
+ log.info("point_decomposition: 置信度通过 {} 个点: {}", qualifiedPointNames.size(), qualifiedPointNames);
|
|
|
|
|
+
|
|
|
|
|
+ // 2. 获取主数组中对应的点详情
|
|
|
|
|
+ List<JSONObject> pointDetails = VectorUtils.extractArrayItemsFromJson(dataContent, pointArrayPath + "[*]");
|
|
|
|
|
+
|
|
|
|
|
+ // 3. 构建贡献度查找表
|
|
|
|
|
+ Map<String, Double> contributionMap = buildContributionMap(dataContent, contributionPath);
|
|
|
|
|
+
|
|
|
|
|
+ // 4. 对每个通过置信度的点,提取实质/形式子项并检查贡献度
|
|
|
|
|
+ for (String pointName : qualifiedPointNames) {
|
|
|
|
|
+ // 在主数组中匹配点
|
|
|
|
|
+ JSONObject matchedPoint = null;
|
|
|
|
|
+ for (JSONObject detail : pointDetails) {
|
|
|
|
|
+ if (pointName.equals(detail.getString("点"))) {
|
|
|
|
|
+ matchedPoint = detail;
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ if (matchedPoint == null) {
|
|
|
|
|
+ log.info("point_decomposition: 未找到点 '{}' 的详情", pointName);
|
|
|
|
|
+ continue;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 根据 target 提取子项名称
|
|
|
|
|
+ List<String> itemNames;
|
|
|
|
|
+ if ("substance".equals(target)) {
|
|
|
|
|
+ itemNames = extractSubstanceNames(matchedPoint);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ itemNames = extractFormNames(matchedPoint);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 检查贡献度
|
|
|
|
|
+ for (String name : itemNames) {
|
|
|
|
|
+ Double contribution = contributionMap.get(name);
|
|
|
|
|
+ if (contribution != null && contribution >= contributionThreshold) {
|
|
|
|
|
+ texts.add(name);
|
|
|
|
|
+ log.info("point_decomposition: 通过贡献度检查, 点='{}', 名称='{}', 贡献度={}",
|
|
|
|
|
+ pointName, name, contribution);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ log.info("point_decomposition: target={}, 最终提取文本数={}", target, texts.size());
|
|
|
|
|
+
|
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
|
+ log.error("extractTextsFromPointDecomposition 失败: {}", e.getMessage(), e);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return texts;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 从点的 "实质" 中提取所有子项名称
|
|
|
|
|
+ * 合并 具体元素、具象概念、抽象概念 中的 "名称" 字段
|
|
|
|
|
+ */
|
|
|
|
|
+ private List<String> extractSubstanceNames(JSONObject point) {
|
|
|
|
|
+ List<String> names = new ArrayList<>();
|
|
|
|
|
+ JSONObject substance = point.getJSONObject("实质");
|
|
|
|
|
+ if (substance == null) {
|
|
|
|
|
+ return names;
|
|
|
|
|
+ }
|
|
|
|
|
+ collectNamesFromArray(substance.getJSONArray("具体元素"), names);
|
|
|
|
|
+ collectNamesFromArray(substance.getJSONArray("具象概念"), names);
|
|
|
|
|
+ collectNamesFromArray(substance.getJSONArray("抽象概念"), names);
|
|
|
|
|
+ return names;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 从点的 "形式" 中提取所有子项名称
|
|
|
|
|
+ * 合并 具体元素形式、具象概念形式、整体形式 中的 "名称" 字段
|
|
|
|
|
+ */
|
|
|
|
|
+ private List<String> extractFormNames(JSONObject point) {
|
|
|
|
|
+ List<String> names = new ArrayList<>();
|
|
|
|
|
+ JSONObject form = point.getJSONObject("形式");
|
|
|
|
|
+ if (form == null) {
|
|
|
|
|
+ return names;
|
|
|
|
|
+ }
|
|
|
|
|
+ collectNamesFromArray(form.getJSONArray("具体元素形式"), names);
|
|
|
|
|
+ collectNamesFromArray(form.getJSONArray("具象概念形式"), names);
|
|
|
|
|
+ collectNamesFromArray(form.getJSONArray("整体形式"), names);
|
|
|
|
|
+ return names;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 从JSON数组中收集 "名称" 字段值
|
|
|
|
|
+ */
|
|
|
|
|
+ private void collectNamesFromArray(JSONArray array, List<String> names) {
|
|
|
|
|
+ if (array == null || array.isEmpty()) {
|
|
|
|
|
+ return;
|
|
|
|
|
+ }
|
|
|
|
|
+ for (int i = 0; i < array.size(); i++) {
|
|
|
|
|
+ JSONObject item = array.getJSONObject(i);
|
|
|
|
|
+ if (item != null) {
|
|
|
|
|
+ String name = item.getString("名称");
|
|
|
|
|
+ if (StringUtils.hasText(name)) {
|
|
|
|
|
+ names.add(name);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 构建贡献度查找表(词 → 贡献度)
|
|
|
|
|
+ * 从 contribution_results 数组中提取
|
|
|
|
|
+ */
|
|
|
|
|
+ private Map<String, Double> buildContributionMap(JSONObject dataContent, String contributionPath) {
|
|
|
|
|
+ Map<String, Double> map = new HashMap<>();
|
|
|
|
|
+ try {
|
|
|
|
|
+ List<JSONObject> contributions = VectorUtils.extractArrayItemsFromJson(dataContent, contributionPath + "[*]");
|
|
|
|
|
+ for (JSONObject c : contributions) {
|
|
|
|
|
+ String word = c.getString("词");
|
|
|
|
|
+ Double contribution = c.getDouble("贡献度");
|
|
|
|
|
+ if (StringUtils.hasText(word) && contribution != null) {
|
|
|
|
|
+ map.put(word, contribution);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
|
+ log.error("构建贡献度查找表失败: {}", e.getMessage());
|
|
|
|
|
+ }
|
|
|
|
|
+ return map;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* 从 dataContent 中提取选题文本(向后兼容)
|
|
* 从 dataContent 中提取选题文本(向后兼容)
|
|
|
* @deprecated 请使用 extractTextsFromDataContent(dataContent, config)
|
|
* @deprecated 请使用 extractTextsFromDataContent(dataContent, config)
|
|
@@ -978,7 +1151,8 @@ public class VideoVectorJob {
|
|
|
DeconstructVectorConfigExample example = new DeconstructVectorConfigExample();
|
|
DeconstructVectorConfigExample example = new DeconstructVectorConfigExample();
|
|
|
example.createCriteria()
|
|
example.createCriteria()
|
|
|
.andEnabledEqualTo((short) 1)
|
|
.andEnabledEqualTo((short) 1)
|
|
|
- .andSourceFieldEqualTo(sourceField);
|
|
|
|
|
|
|
+ .andSourceFieldEqualTo(sourceField)
|
|
|
|
|
+ .andIdGreaterThan(14L);
|
|
|
example.setOrderByClause("priority ASC");
|
|
example.setOrderByClause("priority ASC");
|
|
|
return vectorConfigMapper.selectByExample(example);
|
|
return vectorConfigMapper.selectByExample(example);
|
|
|
}
|
|
}
|