Browse Source

Merge branch 'feature/20241217-improve-perf' of Server/long-article-recommend into master

fengzhoutian cách đây 7 tháng
mục cha
commit
48f48e4d31

+ 30 - 23
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/util/TitleSimilarCheckUtil.java

@@ -9,14 +9,19 @@ public class TitleSimilarCheckUtil {
     public static final double SIMILARITY_THRESHOLD = 0.8;
     public static final double ARTICLE_PROMOTION_THRESHOLD = 0.75;
 
+    public static Set<Character> makeCache(String title) { // Builds the set of distinct chars (UTF-16 code units) of a cleaned-up title.
+        title = title.trim().replace("\u200b", ""); // trim the ends, then drop every zero-width space (U+200B). NOTE(review): trim runs first, so whitespace next to a leading/trailing U+200B is kept
+        Set<Character> cacheSet = new HashSet<>(title.length()); // cleaned title length is passed as the initial capacity
+        for (char c : title.toCharArray()) {
+            cacheSet.add(c); // repeated chars collapse into one entry
+        }
+        return cacheSet;
+    }
+
     public static List<Set<Character>> makeCache(List<String> titles) {
         List<Set<Character>> cache = new ArrayList<>(titles.size());
         for (String title : titles) {
-            Set<Character> currentSet = new HashSet<>(title.length());
-            for (char c : title.toCharArray()) {
-                currentSet.add(c);
-            }
-            cache.add(currentSet);
+            cache.add(makeCache(title)); // one char set per title, in list order; the single-title overload also trims and removes U+200B
         }
         return cache;
     }
@@ -25,9 +30,9 @@ public class TitleSimilarCheckUtil {
         if (CollectionUtils.isEmpty(existsContentCache)) {
             return false;
         }
-        title = title.trim().replace("\u200b", "");
+        Set<Character> titleCache = makeCache(title);
         for (Set<Character> existTitleCache : existsContentCache) {
-            if (isSimilar(title, existTitleCache, threshold)) {
+            if (isSimilar(titleCache, existTitleCache, threshold)) {
                 return true;
             }
         }
@@ -49,29 +54,27 @@ public class TitleSimilarCheckUtil {
         return result;
     }
 
-    public static boolean isSimilar(String titleA, Set<Character> titleB, double threshold) {
+    public static boolean isSimilar(Set<Character> titleA, Set<Character> titleB, double threshold) { // true when (shared chars / size of the smaller set) >= threshold
         if (titleA.isEmpty() || titleB.isEmpty()) {
             return false;
         }
-        Set<Character> setA = new HashSet<>(titleA.length());
-        for (char c : titleA.toCharArray()) {
-            setA.add(c);
-        }
-        int minLen = Math.max(Math.min(setA.size(), titleB.size()), 1);
-        setA.retainAll(titleB);
-        double rate = setA.size() / (double) minLen;
+        int minLen = Math.min(titleA.size(), titleB.size()); // at least 1 here: both sets passed the emptiness guard above
+        // since retainAll is an in-place operation, copy it first
+        titleA = new HashSet<>(titleA); // only the local reference is rebound; the caller's set is not modified
+        titleA.retainAll(titleB); // the copy now holds just the chars present in both sets
+        double rate = titleA.size() / (double) minLen; // cast forces floating-point division
         return rate >= threshold;
     }
 
+    public static boolean isSimilar(String titleA, Set<Character> titleB, double threshold) { // compares a raw title against an already-built char set
+        Set<Character> setA = makeCache(titleA); // cleans titleA and collects its distinct chars
+        return isSimilar(setA, titleB, threshold); // delegates to the set-vs-set overload
+    }
+
     public static boolean isSimilar(String titleA, String titleB, double threshold) {
-        if (titleA.isEmpty() || titleB.isEmpty()) {
-            return false;
-        }
-        Set<Character> setB = new HashSet<>(titleB.length());
-        for (char c : titleB.toCharArray()) {
-            setB.add(c);
-        }
-        return isSimilar(titleA, setB, threshold);
+        Set<Character> setA = makeCache(titleA); // both titles go through the same makeCache clean-up
+        Set<Character> setB = makeCache(titleB);
+        return isSimilar(setA, setB, threshold); // the set-vs-set overload returns false when either set is empty
     }
 
     public static void main(String[] args) {
@@ -86,5 +89,9 @@ public class TitleSimilarCheckUtil {
         List<Set<Character>> titlesCache = makeCache(existsContentTitle);
         result = isDuplicateContentByCache(title, titlesCache, SIMILARITY_THRESHOLD);
         System.out.println(result);
+
+        title = "江苏高考文科女状元,遭多所985名校拒绝录取,成为“最惨状元”";
+        result = isDuplicateContentByCache(title, titlesCache, SIMILARITY_THRESHOLD);
+        System.out.println(result);
     }
 }