Просмотр исходного кода

后验自标题过滤补 emoji/空白/全半角归一化

SQL 严格 <> 仅能拦截字节完全一致的同标题;脏数据里大量「前缀 emoji 不同」「尾随空格」「全半角差异」的视觉同标题漏网。
新增 TitleNormalizer.isSelfTitle 在 app 层做二次剔除。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
刘立冬 1 день назад
Родитель
Коммит
b53ac1d1a9

+ 7 - 0
api-module/src/main/java/com/tzld/piaoquan/api/service/contentplatform/impl/ContentPlatformPlanServiceImpl.java

@@ -27,6 +27,7 @@ import com.tzld.piaoquan.api.service.VideoMultiService;
 import com.tzld.piaoquan.api.service.contentplatform.ContentPlatformAccountService;
 import com.tzld.piaoquan.api.service.contentplatform.ContentPlatformCooperateAccountService;
 import com.tzld.piaoquan.api.service.contentplatform.ContentPlatformPlanService;
+import com.tzld.piaoquan.api.util.TitleNormalizer;
 import com.tzld.piaoquan.growth.common.common.enums.GhTypeEnum;
 import com.tzld.piaoquan.growth.common.common.enums.StrategyStatusEnum;
 import com.tzld.piaoquan.growth.common.dao.mapper.ext.CgiReplyBucketDataMapperExt;
@@ -673,6 +674,9 @@ public class ContentPlatformPlanServiceImpl implements ContentPlatformPlanServic
             return result;
         }
         List<ContentPlatformDemandVideo> rows = demandVideoMapperExt.selectForRecommendPaged(dt, crowdSegment, demandStrategy, offset, pageSize, excludeSelfTitle);
+        if (excludeSelfTitle) {
+            rows.removeIf(r -> TitleNormalizer.isSelfTitle(r.getTitle(), r.getDemandContentTitle()));
+        }
         List<VideoContentItemVO> list = buildDemandVideoContentItemVOList(rows);
         for (VideoContentItemVO v : list) {
             v.setSource(source);
@@ -803,6 +807,9 @@ public class ContentPlatformPlanServiceImpl implements ContentPlatformPlanServic
             if (row.getVideoId() == null) {
                 continue;
             }
+            if (excludeSelfTitle && TitleNormalizer.isSelfTitle(row.getTitle(), row.getDemandContentTitle())) {
+                continue;
+            }
             if (!distinct.containsKey(row.getVideoId())) {
                 distinct.put(row.getVideoId(), row);
                 if (distinct.size() >= limit) {

+ 44 - 0
api-module/src/main/java/com/tzld/piaoquan/api/util/TitleNormalizer.java

@@ -0,0 +1,44 @@
+package com.tzld.piaoquan.api.util;
+
+import java.text.Normalizer;
+import java.util.regex.Pattern;
+
+/**
+ * Title normalization for the fuzzy "video title == demand seed title" comparison.
+ *
+ * <p>Removes the differences that make two visually identical titles compare unequal:
+ * emoji and decorative symbols, whitespace and invisible characters, and
+ * full-width/half-width (compatibility) variants.
+ */
+public final class TitleNormalizer {
+
+    /**
+     * Emoji and decorative symbols that are stripped before comparison.
+     *
+     * <p>Ranges are written as regex code point escapes ({@code \x{...}}) because
+     * {@code java.util.regex} matches by code point: a class made of lone surrogates
+     * (for example {@code [\uD83C-\uDBFF]}) does not match one half of a well-formed
+     * surrogate pair, so supplementary-plane emoji have to be listed as code points.
+     */
+    private static final Pattern EMOJI = Pattern.compile(
+            "[\\p{So}\\p{Cn}"                      // "other symbols" and unassigned code points
+                    + "\\x{1F000}-\\x{1FAFF}"      // supplementary emoji blocks, incl. skin-tone modifiers
+                    + "\\x{2600}-\\x{27BF}"        // Miscellaneous Symbols and Dingbats
+                    + "\\x{FE00}-\\x{FE0F}"        // variation selectors
+                    + "\\x{20E3}"                  // combining enclosing keycap
+                    + "\\x{200D}]"                 // zero-width joiner
+    );
+
+    /**
+     * Whitespace and invisible characters that are stripped before comparison.
+     *
+     * <p>{@code \s} alone only covers ASCII whitespace, so Unicode separators
+     * ({@code \p{Z}}), NEL (U+0085) and format characters ({@code \p{Cf}}, e.g.
+     * zero-width space U+200B or BOM U+FEFF) are listed explicitly.
+     */
+    private static final Pattern WHITESPACE_OR_INVISIBLE =
+            Pattern.compile("[\\s\\x{85}\\p{Z}\\p{Cf}]+");
+
+    private TitleNormalizer() {
+        // static utility class, not instantiable
+    }
+
+    /**
+     * Returns the canonical comparison form of a title.
+     *
+     * @param s raw title, may be {@code null}
+     * @return the title after NFKC normalization with emoji/symbols, whitespace and
+     *         invisible characters removed; an empty string for {@code null} or empty input
+     */
+    public static String normalize(String s) {
+        if (s == null || s.isEmpty()) {
+            return "";
+        }
+        // NFKC runs first so that full-width forms and other compatibility characters
+        // are folded to their plain equivalents before symbols are stripped.
+        String n = Normalizer.normalize(s, Normalizer.Form.NFKC);
+        n = EMOJI.matcher(n).replaceAll("");
+        return WHITESPACE_OR_INVISIBLE.matcher(n).replaceAll("");
+    }
+
+    /**
+     * Tells whether a video title is the same as the demand seed title once both are
+     * normalized with {@link #normalize(String)}.
+     *
+     * @param title              video title, may be {@code null}
+     * @param demandContentTitle demand seed title, may be {@code null}
+     * @return {@code true} only when both arguments are non-null, the normalized title is
+     *         non-empty and both normalized forms are equal; titles that normalize to an
+     *         empty string (e.g. emoji only) are never reported as the same title
+     */
+    public static boolean isSelfTitle(String title, String demandContentTitle) {
+        if (title == null || demandContentTitle == null) {
+            return false;
+        }
+        String a = normalize(title);
+        String b = normalize(demandContentTitle);
+        return !a.isEmpty() && a.equals(b);
+    }
+}