# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/7/26
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class TitleLike:
    """Score how alike two titles are using TF-IDF and cosine similarity."""

    @classmethod
    def similarity(cls, title1, title2):
        """Return the cosine similarity between the TF-IDF vectors of two titles.

        Both titles are segmented with jieba, and the resulting words are used
        as the TF-IDF vocabulary (fitted on just these two titles).

        Args:
            title1: First title string.
            title2: Second title string.

        Returns:
            The similarity as a float. 0.0 is returned when either title is
            empty or when neither title yields any word token.
        """
        if not title1 or not title2:
            return 0.0

        # Join the jieba tokens with spaces so the vectorizer sees each
        # segmented word as its own token. Joining with "" would simply
        # rebuild the original string and discard the segmentation.
        doc1 = " ".join(jieba.lcut(title1))
        doc2 = " ".join(jieba.lcut(title2))

        # The default token_pattern requires at least two word characters,
        # which drops one-character words; accept single characters as well.
        vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
        try:
            tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
        except ValueError:
            # fit_transform raises ValueError when the vocabulary is empty,
            # e.g. when both titles consist only of punctuation.
            return 0.0

        return float(cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0])


if __name__ == "__main__":
    t1 = """#发现未来 7月18日(发布)广东(发布)男生满心欢喜准备迎接喜欢的女孩 下一秒"""
    t2 = "...7月18日(发布)广东(发布)男生满心欢喜准备迎接喜欢的女孩 下一秒其他出"
    # Alternative, unrelated second title for comparison:
    # t2 = "2月23日,广东。男子地铁口挥拳重击抱娃女子。网友:对于家暴零容忍"
    print(TitleLike.similarity(t1, t2))