"""
@author: luojunhui
"""
from Levenshtein import distance
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity


class Similarity(object):
    """
    Title-similarity helpers.

    All methods are classmethods, so the class is used as a namespace and is
    never instantiated. The BERT tokenizer and model are cached on the class
    itself and shared by every caller.
    """
    # Populated by loading_model(); None until the model has been loaded.
    bert_tokenizer = None
    bert_model = None

    @classmethod
    def loading_model(cls):
        """
        Load the 'bert-base-chinese' tokenizer and model and cache them on the class.

        NOTE(review): `from_pretrained` presumably fetches the weights on first
        use when they are not cached locally -- confirm for the deployment
        environment.
        """
        cls.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        cls.bert_model = BertModel.from_pretrained('bert-base-chinese')

    @classmethod
    def title_similarity(cls, title_a, title_b, td=0.8):
        """
        Decide whether two titles overlap enough, character by character.

        The score is the number of distinct characters shared by both titles
        divided by the number of distinct characters in the title that has
        fewer of them.

        :param title_a: first title
        :param title_b: second title
        :param td: minimum score (inclusive) for the titles to count as similar
        :return: True when the score is >= td; False otherwise, and always
                 False when either title is empty
        """
        if len(title_a) < 1 or len(title_b) < 1:
            return False
        chars_a = set(title_a)
        chars_b = set(title_b)
        # Both sets are non-empty past the guard above, so the divisor is >= 1.
        smaller = min(len(chars_a), len(chars_b))
        rate = len(chars_a & chars_b) / smaller
        return rate >= td

    @classmethod
    def levenshtein_similarity(cls, title1, title2):
        """
        Edit-distance similarity, normalised by the longer title's length.

        :param title1: first title
        :param title2: second title
        :return: 1 - distance / max(len(title1), len(title2)); 1.0 when both
                 titles are empty
        """
        max_len = max(len(title1), len(title2))
        if max_len == 0:
            # Two empty titles are identical; avoids dividing by zero.
            return 1.0
        return 1 - distance(title1, title2) / max_len

    @classmethod
    def jaccard_similarity(cls, title1, title2):
        """
        Jaccard similarity over the binary token-presence vectors of two titles.

        Tokens come from CountVectorizer's default analyzer.
        NOTE(review): that analyzer splits on word boundaries, so an
        unsegmented Chinese title is presumably treated as a single token --
        confirm that titles are segmented before they reach this method.
        NOTE(review): scikit-learn raises ValueError when neither title yields
        a token (empty vocabulary); that behaviour is left unchanged here.

        :param title1: first title
        :param title2: second title
        :return: Jaccard score of the two binary vectors
        """
        vectorizer = CountVectorizer(binary=True)
        count_matrix = vectorizer.fit_transform([title1, title2])
        presence_1 = count_matrix[0].toarray()[0]
        presence_2 = count_matrix[1].toarray()[0]
        return jaccard_score(presence_1, presence_2)

    @classmethod
    def cosine_similarity_titles(cls, title1, title2):
        """
        Cosine similarity between the TF-IDF vectors of two titles.

        The vectorizer is fitted on just these two titles, using
        TfidfVectorizer's default analyzer.

        :param title1: first title
        :param title2: second title
        :return: cosine similarity of the two TF-IDF rows
        """
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([title1, title2])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    @classmethod
    def bert_similarity(cls, title1, title2):
        """
        Cosine similarity between mean-pooled BERT embeddings of two titles.

        The tokenizer and model are loaded on first use if loading_model() has
        not been called yet.

        :param title1: first title
        :param title2: second title
        :return: cosine similarity of the two sentence embeddings
        """
        def embed_sentences(sentences, model, tokenizer):
            """
            Embed a batch of sentences by averaging the last hidden state.

            :param sentences: list of sentences to embed
            :param model: BERT model producing `last_hidden_state`
            :param tokenizer: tokenizer matching the model
            :return: array with one embedding row per sentence
            """
            inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
            outputs = model(**inputs)
            # NOTE(review): the mean runs over every position, padding included,
            # so the shorter title's vector is affected by the padding length.
            return outputs.last_hidden_state.mean(dim=1).detach().numpy()

        # Load lazily: both attributes stay None until loading_model() runs,
        # and calling None would fail inside embed_sentences.
        if cls.bert_model is None or cls.bert_tokenizer is None:
            cls.loading_model()

        embeddings = embed_sentences(
            [title1, title2],
            cls.bert_model,
            cls.bert_tokenizer)
        return cosine_similarity(embeddings)[0][1]


class KeyWords(object):
    """
    Keywords object (placeholder; no behaviour defined yet).
    """


class Sensitive(object):
    """
    Sensitive-words object (placeholder; no behaviour defined yet).
    """