Browse Source

Merge branch 'feature/20241202-improve-perf' of Server/long-article-recommend into master

fengzhoutian 7 months ago
parent
commit
95075404d5

+ 19 - 52
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/service/recommend/filter/strategy/SensitiveStrategy.java

@@ -15,16 +15,11 @@ import com.tzld.longarticle.recommend.server.util.TitleSimilarCheckUtil;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Component;
-import org.springframework.util.StringUtils;
 
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 
 @Component
@@ -43,63 +38,35 @@ public class SensitiveStrategy implements FilterStrategy {
 
     @Override
     public FilterResult filter(FilterParam param) {
-        long start = System.currentTimeMillis();
         FilterResult filterResult = new FilterResult();
-        List<String> result = new ArrayList<>();
+        List<String> result = new ArrayList<>(param.getContents().size());
         List<Content> filterContents = new ArrayList<>();
 
-        CountDownLatch cdl = new CountDownLatch(param.getContents().size());
-        List<Future<Content>> futures = new ArrayList<>();
-        Map<String, String> titleMd5Map = new HashMap<>();
-        Map<String, ArticleSensitive> articleSensitiveMap = new HashMap<>();
-        List<String> md5List = new ArrayList<>();
-        for (Content content : param.getContents()) {
-            String md5 = Md5Util.encoderByMd5(content.getTitle());
-            md5List.add(md5);
-            titleMd5Map.put(content.getTitle(), md5);
-        }
-        List<ArticleSensitive> articleSensitiveList = getArticleSensitive(md5List);
-        if (CollectionUtil.isNotEmpty(articleSensitiveList)) {
-            articleSensitiveMap = articleSensitiveList.stream().collect(Collectors.toMap(ArticleSensitive::getMd5, o -> o));
-        }
+//        Map<String, String> titleMd5Map = new HashMap<>();
+//        Map<String, ArticleSensitive> articleSensitiveMap = new HashMap<>();
+//        List<String> md5List = new ArrayList<>();
+//        for (Content content : param.getContents()) {
+//            String md5 = Md5Util.encoderByMd5(content.getTitle());
+//            md5List.add(md5);
+//            titleMd5Map.put(content.getTitle(), md5);
+//        }
+//        List<ArticleSensitive> articleSensitiveList = getArticleSensitive(md5List);
+//        if (CollectionUtil.isNotEmpty(articleSensitiveList)) {
+//            articleSensitiveMap = articleSensitiveList.stream().collect(Collectors.toMap(ArticleSensitive::getMd5, o -> o));
+//        }
 
+        List<Set<Character>> unsafeTitleCache = TitleSimilarCheckUtil.makeCache(UnSafeTitles);
         for (Content content : param.getContents()) {
-            Map<String, ArticleSensitive> finalArticleSensitiveMap = articleSensitiveMap;
-            Future<Content> future = pool.submit(() -> {
-                try {
-//                    boolean isSensitive = articleSensitiveRemoteService.articleSensitive(content.getTitle(),
-//                            titleMd5Map,
-//                            finalArticleSensitiveMap);
-//                    if (isSensitive) {
-//                        content.setFilterReason("安全违规");
-//                    } else
-                    if (TitleSimilarCheckUtil.isDuplicateContent(content.getTitle(), UnSafeTitles, TitleSimilarCheckUtil.SIMILARITY_THRESHOLD)) {
-                        content.setFilterReason("安全违规");
-                    }
-                    return content;
-                } finally {
-                    cdl.countDown();
-                }
-            });
-            futures.add(future);
-        }
-        try {
-            cdl.await(5000, TimeUnit.MILLISECONDS);
-        } catch (InterruptedException e) {
-            log.error("filter error", e);
-            return null;
-        }
-
-        for (Future<Content> f : futures) {
             try {
-                Content content = f.get();
-                if (StringUtils.hasText(content.getFilterReason())) {
+                if (TitleSimilarCheckUtil.isDuplicateContentByCache(content.getTitle(), unsafeTitleCache,
+                        TitleSimilarCheckUtil.SIMILARITY_THRESHOLD)) {
+                    content.setFilterReason("安全违规");
                     filterContents.add(content);
                 } else {
                     result.add(content.getId());
                 }
             } catch (Exception e) {
-                log.error("future get error ", e);
+                log.error("similar check error ", e);
             }
         }
         filterResult.setContentIds(result);

+ 44 - 12
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/util/TitleSimilarCheckUtil.java

@@ -2,16 +2,38 @@ package com.tzld.longarticle.recommend.server.util;
 
 import org.springframework.util.CollectionUtils;
 
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.*;
 
 public class TitleSimilarCheckUtil {
 
     public static final double SIMILARITY_THRESHOLD = 0.8;
     public static final double ARTICLE_PROMOTION_THRESHOLD = 0.75;
 
+    public static List<Set<Character>> makeCache(List<String> titles) { // Precomputes, for each title, the set of distinct chars it contains; result order follows input order.
+        List<Set<Character>> cache = new ArrayList<>(titles.size()); // NOTE(review): a null `titles` would throw here - confirm callers never pass null.
+        for (String title : titles) { // titles are used as-is: no trim or zero-width-space removal is applied to them here
+            Set<Character> currentSet = new HashSet<>(title.length()); // length is only the initial capacity; repeated chars collapse into one entry
+            for (char c : title.toCharArray()) { // walks UTF-16 chars (code units), boxing each into the set
+                currentSet.add(c);
+            }
+            cache.add(currentSet);
+        }
+        return cache;
+    }
+
+    public static boolean isDuplicateContentByCache(String title, List<Set<Character>> existsContentCache, double threshold) { // Returns true when `title` is similar (per isSimilar) to at least one cached character set.
+        if (CollectionUtils.isEmpty(existsContentCache)) { // nothing to compare against, so nothing can be a duplicate
+            return false;
+        }
+        title = title.trim().replace("\u200b", ""); // normalize the candidate: trim both ends, then drop every zero-width space (U+200B); NOTE(review): a null title would throw here
+        for (Set<Character> existTitleCache : existsContentCache) {
+            if (isSimilar(title, existTitleCache, threshold)) { // short-circuit: the first similar entry decides the result
+                return true;
+            }
+        }
+        return false; // no cached set reached the threshold
+    }
+
     public static boolean isDuplicateContent(String title, List<String> existsContentTitle, double threshold) {
         boolean result = false;
         if (CollectionUtils.isEmpty(existsContentTitle)) {
@@ -27,23 +49,29 @@ public class TitleSimilarCheckUtil {
         return result;
     }
 
-    public static boolean isSimilar(String titleA, String titleB, double threshold) {
+    public static boolean isSimilar(String titleA, Set<Character> titleB, double threshold) { // Set-based overload: compares titleA's distinct chars against an already-built character set.
         if (titleA.isEmpty() || titleB.isEmpty()) { // an empty side is never considered similar
             return false;
         }
-        Set<Character> setA = new HashSet<>();
+        Set<Character> setA = new HashSet<>(titleA.length()); // local, mutable set of titleA's distinct chars
         for (char c : titleA.toCharArray()) {
             setA.add(c);
         }
-        Set<Character> setB = new HashSet<>();
+        int minLen = Math.max(Math.min(setA.size(), titleB.size()), 1); // divisor = smaller distinct-char count, clamped to at least 1; must be read before retainAll shrinks setA
+        setA.retainAll(titleB); // setA now holds only the chars present in both; titleB itself is not modified
+        double rate = setA.size() / (double) minLen; // shared distinct chars relative to the smaller set
+        return rate >= threshold; // threshold is inclusive
+    }
+
+    public static boolean isSimilar(String titleA, String titleB, double threshold) { // String overload: builds titleB's character set, then defers to the Set-based overload.
+        if (titleA.isEmpty() || titleB.isEmpty()) { // an empty side is never considered similar
+            return false;
+        }
+        Set<Character> setB = new HashSet<>(titleB.length()); // distinct chars of titleB
         for (char c : titleB.toCharArray()) {
             setB.add(c);
         }
-        Set<Character> setCross = new HashSet<>(setA);
-        setCross.retainAll(setB);
-        int minLen = Math.max(Math.min(setA.size(), setB.size()), 1);
-        double rate = (double) setCross.size() / minLen;
-        return rate >= threshold;
+        return isSimilar(titleA, setB, threshold); // same overlap-ratio rule as the Set-based overload
     }
 
     public static void main(String[] args) {
@@ -54,5 +82,9 @@ public class TitleSimilarCheckUtil {
                 "陕西女孩去医院体检后,发现左肾不见了,意外牵出8年前手术疑云");
         boolean result = isDuplicateContent(title, existsContentTitle, SIMILARITY_THRESHOLD);
         System.out.println(result);
+
+        List<Set<Character>> titlesCache = makeCache(existsContentTitle);
+        result = isDuplicateContentByCache(title, titlesCache, SIMILARITY_THRESHOLD);
+        System.out.println(result);
     }
 }