# algorithms.py
"""
@author: luojunhui
"""
from Levenshtein import distance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
  10. class Similarity(object):
  11. """
  12. 标题相似度代码
  13. """
  14. bert_tokenizer = None
  15. bert_model = None
  16. @classmethod
  17. def loading_model(cls):
  18. """
  19. bert 模型加载
  20. """
  21. cls.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
  22. cls.bert_model = BertModel.from_pretrained('bert-base-chinese')
  23. @classmethod
  24. def title_similarity(cls, title_a, title_b, td=0.8):
  25. """
  26. :param title_a:
  27. :param title_b:
  28. :param td:
  29. :return:
  30. """
  31. if len(title_a) < 1 or len(title_b) < 1:
  32. return False
  33. set_a = set(title_a)
  34. set_b = set(title_b)
  35. set_cross = set_a & set_b
  36. set_union = set_a | set_b
  37. if not set_union:
  38. return False
  39. min_len = max(min(len(set_a), len(set_b)), 1)
  40. rate = len(set_cross) / min_len
  41. if rate >= td:
  42. return True
  43. else:
  44. return False
  45. @classmethod
  46. def levenshtein_similarity(cls, title1, title2):
  47. """
  48. 编辑距离相似度
  49. :param title1:
  50. :param title2:
  51. :return:
  52. """
  53. dist = distance(title1, title2)
  54. max_len = max(len(title1), len(title2))
  55. return 1 - dist / max_len
  56. @classmethod
  57. def jaccard_similarity(cls, title1, title2):
  58. """
  59. jaccard 相似度
  60. :param title1:
  61. :param title2:
  62. :return:
  63. """
  64. vectorizer = CountVectorizer(binary=True)
  65. count_matrix = vectorizer.fit_transform([title1, title2])
  66. return jaccard_score(count_matrix[0].toarray()[0], count_matrix[1].toarray()[0])
  67. @classmethod
  68. def cosine_similarity_titles(cls, title1, title2):
  69. """
  70. cosine 相似度
  71. :param title1:
  72. :param title2:
  73. :return:
  74. """
  75. vectorizer = TfidfVectorizer()
  76. tfidf_matrix = vectorizer.fit_transform([title1, title2])
  77. return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
  78. @classmethod
  79. def bert_similarity(cls, title1, title2):
  80. """
  81. bert相似度
  82. :param title1:
  83. :param title2:
  84. :return:
  85. """
  86. def embed_sentences(sentences, model, tokenizer):
  87. """
  88. 嵌入句子
  89. :param sentences:
  90. :param model:
  91. :param tokenizer:
  92. :return:
  93. """
  94. inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
  95. outputs = model(**inputs)
  96. return outputs.last_hidden_state.mean(dim=1).detach().numpy()
  97. embeddings = embed_sentences(
  98. [title1, title2],
  99. cls.bert_model,
  100. cls.bert_tokenizer)
  101. return cosine_similarity(embeddings)[0][1]
  102. class KeyWords(object):
  103. """
  104. keywords对象
  105. """
  106. class Sensitive(object):
  107. """
  108. 敏感词对象
  109. """