|
@@ -2,16 +2,38 @@ package com.tzld.longarticle.recommend.server.util;
|
|
|
|
|
|
import org.springframework.util.CollectionUtils;
|
|
|
|
|
|
-import java.util.Arrays;
|
|
|
-import java.util.HashSet;
|
|
|
-import java.util.List;
|
|
|
-import java.util.Set;
|
|
|
+import java.util.*;
|
|
|
|
|
|
public class TitleSimilarCheckUtil {
|
|
|
|
|
|
// Overlap-ratio cut-off that main() passes to isDuplicateContent and
// isDuplicateContentByCache; a pair of titles is reported similar when its ratio is >= this value.
public static final double SIMILARITY_THRESHOLD = 0.8;
|
|
|
// NOTE(review): not referenced in the visible part of this file; presumably a looser cut-off
// used by article-promotion callers elsewhere - confirm against those callers.
public static final double ARTICLE_PROMOTION_THRESHOLD = 0.75;
|
|
|
|
|
|
+ public static List<Set<Character>> makeCache(List<String> titles) {
|
|
|
+ List<Set<Character>> cache = new ArrayList<>(titles.size());
|
|
|
+ for (String title : titles) {
|
|
|
+ Set<Character> currentSet = new HashSet<>(title.length());
|
|
|
+ for (char c : title.toCharArray()) {
|
|
|
+ currentSet.add(c);
|
|
|
+ }
|
|
|
+ cache.add(currentSet);
|
|
|
+ }
|
|
|
+ return cache;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static boolean isDuplicateContentByCache(String title, List<Set<Character>> existsContentCache, double threshold) {
|
|
|
+ if (CollectionUtils.isEmpty(existsContentCache)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ title = title.trim().replace("\u200b", "");
|
|
|
+ for (Set<Character> existTitleCache : existsContentCache) {
|
|
|
+ if (isSimilar(title, existTitleCache, threshold)) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
public static boolean isDuplicateContent(String title, List<String> existsContentTitle, double threshold) {
|
|
|
boolean result = false;
|
|
|
if (CollectionUtils.isEmpty(existsContentTitle)) {
|
|
@@ -27,23 +49,29 @@ public class TitleSimilarCheckUtil {
|
|
|
return result;
|
|
|
}
|
|
|
|
|
|
- public static boolean isSimilar(String titleA, String titleB, double threshold) {
|
|
|
+ public static boolean isSimilar(String titleA, Set<Character> titleB, double threshold) {
|
|
|
if (titleA.isEmpty() || titleB.isEmpty()) {
|
|
|
return false;
|
|
|
}
|
|
|
- Set<Character> setA = new HashSet<>();
|
|
|
+ Set<Character> setA = new HashSet<>(titleA.length());
|
|
|
for (char c : titleA.toCharArray()) {
|
|
|
setA.add(c);
|
|
|
}
|
|
|
- Set<Character> setB = new HashSet<>();
|
|
|
+ int minLen = Math.max(Math.min(setA.size(), titleB.size()), 1);
|
|
|
+ setA.retainAll(titleB);
|
|
|
+ double rate = setA.size() / (double) minLen;
|
|
|
+ return rate >= threshold;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static boolean isSimilar(String titleA, String titleB, double threshold) {
|
|
|
+ if (titleA.isEmpty() || titleB.isEmpty()) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ Set<Character> setB = new HashSet<>(titleB.length());
|
|
|
for (char c : titleB.toCharArray()) {
|
|
|
setB.add(c);
|
|
|
}
|
|
|
- Set<Character> setCross = new HashSet<>(setA);
|
|
|
- setCross.retainAll(setB);
|
|
|
- int minLen = Math.max(Math.min(setA.size(), setB.size()), 1);
|
|
|
- double rate = (double) setCross.size() / minLen;
|
|
|
- return rate >= threshold;
|
|
|
+ return isSimilar(titleA, setB, threshold);
|
|
|
}
|
|
|
|
|
|
public static void main(String[] args) {
|
|
@@ -54,5 +82,9 @@ public class TitleSimilarCheckUtil {
|
|
|
"陕西女孩去医院体检后,发现左肾不见了,意外牵出8年前手术疑云");
|
|
|
boolean result = isDuplicateContent(title, existsContentTitle, SIMILARITY_THRESHOLD);
|
|
|
System.out.println(result);
|
|
|
+
|
|
|
+ List<Set<Character>> titlesCache = makeCache(existsContentTitle);
|
|
|
+ result = isDuplicateContentByCache(title, titlesCache, SIMILARITY_THRESHOLD);
|
|
|
+ System.out.println(result);
|
|
|
}
|
|
|
}
|