123456789101112131415161718192021222324252627282930 |
- # -*- coding: utf-8 -*-
- # @Author: wangkun
- # @Time: 2023/7/26
- import jieba
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
class TitleLike:
    """Score how similar two titles are with TF-IDF and cosine similarity."""

    @classmethod
    def similarity(cls, title1: str, title2: str) -> float:
        """Return the cosine similarity of the TF-IDF vectors of two titles.

        Each title is segmented with ``jieba.lcut`` and the segments are
        re-joined with single spaces, so that ``TfidfVectorizer`` tokenizes
        along the segmentation boundaries.

        Args:
            title1: First title.
            title2: Second title.

        Returns:
            The cosine similarity of the two TF-IDF vectors, or ``0.0`` when
            the vectorizer cannot build a vocabulary from the two titles.
        """
        # Segment both titles into words.
        words1 = jieba.lcut(title1)
        words2 = jieba.lcut(title2)

        # Join with a space, not an empty string: an empty separator simply
        # rebuilds the unsegmented title, so the vectorizer would treat each
        # unbroken run of text as one token and the segmentation would have
        # no effect on the score.
        # NOTE: TfidfVectorizer's default token_pattern only keeps tokens of
        # two or more word characters, so single-character segments are
        # still ignored.
        documents = [" ".join(words1), " ".join(words2)]

        # Build the TF-IDF vectors.
        tfidf_vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
        except ValueError:
            # Raised for an empty vocabulary (e.g. both titles are empty or
            # contain only punctuation); there is nothing to compare.
            return 0.0

        # Cosine similarity between the two document rows.
        score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
        return float(score)
if __name__ == "__main__":
    # Demo: compare two near-duplicate titles and print their similarity.
    first_title = """#发现未来 7月18日(发布)广东(发布)男生满心欢喜准备迎接喜欢的女孩 下一秒"""
    second_title = "...7月18日(发布)广东(发布)男生满心欢喜准备迎接喜欢的女孩 下一秒其他出"
    # Alternative, unrelated second title for a low-similarity comparison:
    # second_title = "2月23日,广东。男子地铁口挥拳重击抱娃女子。网友:对于家暴零容忍"
    score = TitleLike.similarity(first_title, second_title)
    print(score)
|