@@ -0,0 +1,145 @@
+"""
+Title similarity utilities.
+
+@author: luojunhui
+"""
+import torch
+
+from Levenshtein import distance
+from transformers import BertTokenizer, BertModel
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import jaccard_score
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+class titleSimilarity(object):
+    """
+    Title similarity helpers: character overlap, edit distance, Jaccard,
+    TF-IDF cosine, and BERT-embedding cosine.
+    """
+    bert_tokenizer = None
+    bert_model = None
+
+    @classmethod
+    def loading_model(cls):
+        """
+        Load the BERT tokenizer and model (bert-base-chinese).
+        """
+        cls.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
+        cls.bert_model = BertModel.from_pretrained('bert-base-chinese')
+
+    @classmethod
+    def title_similarity(cls, title_a, title_b, td=0.8):
+        """
+        Character-overlap similarity: the titles count as similar when the
+        overlap coefficient len(A & B) / min(len(A), len(B)) of their
+        character sets reaches the threshold.
+        :param title_a: first title string
+        :param title_b: second title string
+        :param td: similarity threshold in [0, 1]
+        :return: True if the overlap coefficient is >= td, else False
+        """
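+        # Example (hypothetical titles): '深度学习入门' and '深度学习实战' share the
+        # four characters {深, 度, 学, 习}; each set has six characters, so the
+        # overlap coefficient is 4 / 6 ≈ 0.67 < 0.8 and the pair is not similar.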
+        if len(title_a) < 1 or len(title_b) < 1:
+            return False
+        set_a = set(title_a)
+        set_b = set(title_b)
+        set_cross = set_a & set_b
+        # both titles are non-empty here, so the smaller set has at least one element
+        min_len = min(len(set_a), len(set_b))
+        rate = len(set_cross) / min_len
+        return rate >= td
+
+    @classmethod
+    def levenshtein_similarity(cls, title1, title2):
+        """
+        Edit-distance (Levenshtein) similarity, normalized to [0, 1].
+        :param title1: first title string
+        :param title2: second title string
+        :return: 1 - distance / max(len(title1), len(title2))
+        """
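+        # Example (hypothetical): distance('深度学习入门', '深度学习实战') is 2
+        # (two substituted characters) over max length 6, giving 1 - 2/6 ≈ 0.67.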
+        max_len = max(len(title1), len(title2))
+        if max_len == 0:
+            # two empty strings are identical
+            return 1.0
+        dist = distance(title1, title2)
+        return 1 - dist / max_len
+
+    @classmethod
+    def jaccard_similarity(cls, title1, title2):
+        """
+        Jaccard similarity over binary character-occurrence vectors.
+        :param title1: first title string
+        :param title2: second title string
+        :return: Jaccard score in [0, 1]
+        """
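+        # Example (hypothetical): '深度学习入门' and '深度学习实战' share 4 of the
+        # 8 distinct characters in their union, so the Jaccard score is 0.5.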
+        # analyzer='char' so Chinese titles, which contain no whitespace, are
+        # split into characters rather than treated as single tokens
+        vectorizer = CountVectorizer(binary=True, analyzer='char')
+        count_matrix = vectorizer.fit_transform([title1, title2])
+        return jaccard_score(count_matrix[0].toarray()[0], count_matrix[1].toarray()[0])
+
+    @classmethod
+    def cosine_similarity_titles(cls, title1, title2):
+        """
+        TF-IDF cosine similarity between the two titles.
+        :param title1: first title string
+        :param title2: second title string
+        :return: cosine similarity in [0, 1]
+        """
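+        # Sanity check: identical titles score 1.0, while titles sharing no
+        # characters yield orthogonal vectors and score 0.0.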
+        # analyzer='char' for the same reason as in jaccard_similarity
+        vectorizer = TfidfVectorizer(analyzer='char')
+        tfidf_matrix = vectorizer.fit_transform([title1, title2])
+        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
+
+    @classmethod
+    def bert_similarity(cls, title1, title2):
+        """
+        BERT embedding similarity: cosine similarity of mean-pooled
+        last-hidden-state vectors from bert-base-chinese.
+        :param title1: first title string
+        :param title2: second title string
+        :return: cosine similarity of the two sentence embeddings
+        """
+        # lazy-load so callers are not required to call loading_model() first
+        if cls.bert_model is None or cls.bert_tokenizer is None:
+            cls.loading_model()
+
+        def embed_sentences(sentences, model, tokenizer):
+            """
+            Embed sentences by mean-pooling the last hidden state.
+            :param sentences: list of input strings
+            :param model: a loaded BertModel
+            :param tokenizer: the matching BertTokenizer
+            :return: array of shape (len(sentences), hidden_size)
+            """
+            inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
+            with torch.no_grad():
+                outputs = model(**inputs)
+            return outputs.last_hidden_state.mean(dim=1).numpy()
+
+        embeddings = embed_sentences(
+            [title1, title2],
+            cls.bert_model,
+            cls.bert_tokenizer)
+        return cosine_similarity(embeddings)[0][1]
+
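+
+if __name__ == '__main__':
+    # Minimal usage sketch (illustrative; the titles are made up). The BERT
+    # check downloads bert-base-chinese from the Hugging Face hub on first use.
+    a, b = '深度学习入门', '深度学习实战'
+    print(titleSimilarity.title_similarity(a, b))          # overlap-coefficient flag
+    print(titleSimilarity.levenshtein_similarity(a, b))    # edit-distance score
+    print(titleSimilarity.jaccard_similarity(a, b))        # char-level Jaccard
+    print(titleSimilarity.cosine_similarity_titles(a, b))  # TF-IDF cosine
+    print(titleSimilarity.bert_similarity(a, b))           # BERT embedding cosine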