|
@@ -9,14 +9,19 @@ public class TitleSimilarCheckUtil {
|
|
|
/**
 * Similarity ratio that this class's {@code main} demo passes as the {@code threshold}
 * argument; {@code isSimilar} reports two titles as similar when their character-overlap
 * rate is greater than or equal to the threshold it is given.
 */
public static final double SIMILARITY_THRESHOLD = 0.8;
|
|
|
/**
 * Alternative similarity ratio, lower than {@link #SIMILARITY_THRESHOLD}.
 * NOTE(review): presumably passed as the threshold for article-promotion content;
 * no usage is visible in this part of the file, so confirm against the callers.
 */
public static final double ARTICLE_PROMOTION_THRESHOLD = 0.75;
|
|
|
|
|
|
+ public static Set<Character> makeCache(String title) {
|
|
|
+ title = title.trim().replace("\u200b", "");
|
|
|
+ Set<Character> cacheSet = new HashSet<>(title.length());
|
|
|
+ for (char c : title.toCharArray()) {
|
|
|
+ cacheSet.add(c);
|
|
|
+ }
|
|
|
+ return cacheSet;
|
|
|
+ }
|
|
|
+
|
|
|
public static List<Set<Character>> makeCache(List<String> titles) {
|
|
|
List<Set<Character>> cache = new ArrayList<>(titles.size());
|
|
|
for (String title : titles) {
|
|
|
- Set<Character> currentSet = new HashSet<>(title.length());
|
|
|
- for (char c : title.toCharArray()) {
|
|
|
- currentSet.add(c);
|
|
|
- }
|
|
|
- cache.add(currentSet);
|
|
|
+ cache.add(makeCache(title));
|
|
|
}
|
|
|
return cache;
|
|
|
}
|
|
@@ -25,9 +30,9 @@ public class TitleSimilarCheckUtil {
|
|
|
if (CollectionUtils.isEmpty(existsContentCache)) {
|
|
|
return false;
|
|
|
}
|
|
|
- title = title.trim().replace("\u200b", "");
|
|
|
+ Set<Character> titleCache = makeCache(title);
|
|
|
for (Set<Character> existTitleCache : existsContentCache) {
|
|
|
- if (isSimilar(title, existTitleCache, threshold)) {
|
|
|
+ if (isSimilar(titleCache, existTitleCache, threshold)) {
|
|
|
return true;
|
|
|
}
|
|
|
}
|
|
@@ -49,29 +54,27 @@ public class TitleSimilarCheckUtil {
|
|
|
return result;
|
|
|
}
|
|
|
|
|
|
- public static boolean isSimilar(String titleA, Set<Character> titleB, double threshold) {
|
|
|
+ public static boolean isSimilar(Set<Character> titleA, Set<Character> titleB, double threshold) {
|
|
|
if (titleA.isEmpty() || titleB.isEmpty()) {
|
|
|
return false;
|
|
|
}
|
|
|
- Set<Character> setA = new HashSet<>(titleA.length());
|
|
|
- for (char c : titleA.toCharArray()) {
|
|
|
- setA.add(c);
|
|
|
- }
|
|
|
- int minLen = Math.max(Math.min(setA.size(), titleB.size()), 1);
|
|
|
- setA.retainAll(titleB);
|
|
|
- double rate = setA.size() / (double) minLen;
|
|
|
+ int minLen = Math.min(titleA.size(), titleB.size());
|
|
|
+ // since retainAll is an in-place operation, copy it first
|
|
|
+ titleA = new HashSet<>(titleA);
|
|
|
+ titleA.retainAll(titleB);
|
|
|
+ double rate = titleA.size() / (double) minLen;
|
|
|
return rate >= threshold;
|
|
|
}
|
|
|
|
|
|
+ public static boolean isSimilar(String titleA, Set<Character> titleB, double threshold) {
|
|
|
+ Set<Character> setA = makeCache(titleA);
|
|
|
+ return isSimilar(setA, titleB, threshold);
|
|
|
+ }
|
|
|
+
|
|
|
public static boolean isSimilar(String titleA, String titleB, double threshold) {
|
|
|
- if (titleA.isEmpty() || titleB.isEmpty()) {
|
|
|
- return false;
|
|
|
- }
|
|
|
- Set<Character> setB = new HashSet<>(titleB.length());
|
|
|
- for (char c : titleB.toCharArray()) {
|
|
|
- setB.add(c);
|
|
|
- }
|
|
|
- return isSimilar(titleA, setB, threshold);
|
|
|
+ Set<Character> setA = makeCache(titleA);
|
|
|
+ Set<Character> setB = makeCache(titleB);
|
|
|
+ return isSimilar(setA, setB, threshold);
|
|
|
}
|
|
|
|
|
|
public static void main(String[] args) {
|
|
@@ -86,5 +89,9 @@ public class TitleSimilarCheckUtil {
|
|
|
List<Set<Character>> titlesCache = makeCache(existsContentTitle);
|
|
|
result = isDuplicateContentByCache(title, titlesCache, SIMILARITY_THRESHOLD);
|
|
|
System.out.println(result);
|
|
|
+
|
|
|
+ title = "江苏高考文科女状元,遭多所985名校拒绝录取,成为“最惨状元”";
|
|
|
+ result = isDuplicateContentByCache(title, titlesCache, SIMILARITY_THRESHOLD);
|
|
|
+ System.out.println(result);
|
|
|
}
|
|
|
}
|