- """
- @author: luojunhui
- """
- from Levenshtein import distance
- from transformers import BertTokenizer, BertModel
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics import jaccard_score
- from sklearn.metrics.pairwise import cosine_similarity
- class Similarity(object):
- """
- Title similarity utilities.
- """
- bert_tokenizer = None
- bert_model = None
- @classmethod
- def loading_model(cls):
- """
- Load the BERT tokenizer and model (bert-base-chinese).
- """
- cls.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
- cls.bert_model = BertModel.from_pretrained('bert-base-chinese')
- @classmethod
- def title_similarity(cls, title_a, title_b, td=0.8):
- """
- :param title_a:
- :param title_b:
- :param td:
- :return:
- """
- if len(title_a) < 1 or len(title_b) < 1:
- return False
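- # Character-level overlap: measure how much of the smaller title's unique character set also appears in the other title.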
- set_a = set(title_a)
- set_b = set(title_b)
- set_cross = set_a & set_b
- set_union = set_a | set_b
- if not set_union:
- return False
- min_len = max(min(len(set_a), len(set_b)), 1)
- rate = len(set_cross) / min_len
- return rate >= td
- @classmethod
- def levenshtein_similarity(cls, title1, title2):
- """
- Levenshtein (edit distance) similarity, normalized to [0, 1].
- :param title1: first title string
- :param title2: second title string
- :return: 1 - distance / max length, so 1.0 means identical
- """
- dist = distance(title1, title2)
- max_len = max(len(title1), len(title2))
- # Two empty strings are identical; this also avoids division by zero.
- if max_len == 0:
- return 1.0
- return 1 - dist / max_len
- @classmethod
- def jaccard_similarity(cls, title1, title2):
- """
- Jaccard similarity over binary character-level term vectors.
- :param title1: first title string
- :param title2: second title string
- :return: Jaccard score in [0, 1]
- """
- # Character-level analysis so Chinese titles (no whitespace) are still split into tokens.
- vectorizer = CountVectorizer(binary=True, analyzer="char")
- count_matrix = vectorizer.fit_transform([title1, title2])
- return jaccard_score(count_matrix[0].toarray()[0], count_matrix[1].toarray()[0])
- @classmethod
- def cosine_similarity_titles(cls, title1, title2):
- """
- TF-IDF cosine similarity.
- :param title1: first title string
- :param title2: second title string
- :return: cosine similarity in [0, 1]
- """
- # Character-level TF-IDF so Chinese titles (no whitespace) produce usable vectors.
- vectorizer = TfidfVectorizer(analyzer="char")
- tfidf_matrix = vectorizer.fit_transform([title1, title2])
- return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
- @classmethod
- def bert_similarity(cls, title1, title2):
- """
- Cosine similarity between mean-pooled BERT embeddings.
- :param title1: first title string
- :param title2: second title string
- :return: cosine similarity of the two title embeddings
- """
- def embed_sentences(sentences, model, tokenizer):
- """
- Embed sentences as the mean of BERT's last hidden states.
- :param sentences: list of sentence strings
- :param model: loaded BERT model
- :param tokenizer: matching tokenizer
- :return: numpy array of sentence embeddings
- """
- inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
- outputs = model(**inputs)
- return outputs.last_hidden_state.mean(dim=1).detach().numpy()
- # Lazily load the model if loading_model() has not been called yet.
- if cls.bert_model is None or cls.bert_tokenizer is None:
- cls.loading_model()
- embeddings = embed_sentences(
- [title1, title2],
- cls.bert_model,
- cls.bert_tokenizer)
- return cosine_similarity(embeddings)[0][1]
- class KeyWords(object):
- """
- Keywords object (placeholder).
- """
- class Sensitive(object):
- """
- Sensitive-words object (placeholder).
- """