title_like.py 1.1 KB

123456789101112131415161718192021222324252627282930
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/7/26
  4. import jieba
  5. from sklearn.feature_extraction.text import TfidfVectorizer
  6. from sklearn.metrics.pairwise import cosine_similarity
  7. class TitleLike:
  8. @classmethod
  9. def similarity(cls, title1, title2):
  10. # 分词
  11. seg1 = jieba.lcut(title1)
  12. seg2 = jieba.lcut(title2)
  13. # 构建TF-IDF向量
  14. tfidf_vectorizer = TfidfVectorizer()
  15. # tfidf_matrix = tfidf_vectorizer.fit_transform([title1, title2])
  16. tfidf_matrix = tfidf_vectorizer.fit_transform(["".join(seg1), "".join(seg2)])
  17. # 计算余弦相似度
  18. similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
  19. return similarity
  20. if __name__ == "__main__":
  21. t1 = """#发现未来 7月18日(发布)广东(发布)男生满心欢喜准备迎接喜欢的女孩 下一秒"""
  22. t2 = "...7月18日(发布)广东(发布)男生满心欢喜准备迎接喜欢的女孩 下一秒其他出"
  23. # t2 = "2月23日,广东。男子地铁口挥拳重击抱娃女子。网友:对于家暴零容忍"
  24. print(TitleLike.similarity(t1, t2))