@@ -0,0 +1,145 @@
+"""
+Title similarity utilities.
+
+@author: luojunhui
+"""
+import torch
+
+from Levenshtein import distance
+from transformers import BertTokenizer, BertModel
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import jaccard_score
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+class titleSimilarity(object):
+    """
+    Title similarity helpers: character overlap, edit distance, Jaccard,
+    TF-IDF cosine, and BERT-embedding cosine.
+    """
+    bert_tokenizer = None
+    bert_model = None
+
+    @classmethod
+    def loading_model(cls):
+        """
+        Load the BERT tokenizer and model (bert-base-chinese).
+        """
+        cls.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
+        cls.bert_model = BertModel.from_pretrained('bert-base-chinese')
+
+    @classmethod
+    def title_similarity(cls, title_a, title_b, td=0.8):
+        """
+        Character-overlap similarity: the titles count as similar when the
+        overlap coefficient len(A & B) / min(len(A), len(B)) of their
+        character sets reaches the threshold.
+        :param title_a: first title string
+        :param title_b: second title string
+        :param td: similarity threshold in [0, 1]
+        :return: True if the overlap coefficient is >= td, else False
+        """
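+        # Example (hypothetical titles): '深度学习入门' and '深度学习实战' share the
+        # four characters {深, 度, 学, 习}; each set has six characters, so the
+        # overlap coefficient is 4 / 6 ≈ 0.67 < 0.8 and the pair is not similar.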
+        if len(title_a) < 1 or len(title_b) < 1:
+            return False
+        set_a = set(title_a)
+        set_b = set(title_b)
+        set_cross = set_a & set_b
+        # both titles are non-empty here, so the smaller set has at least one element
+        min_len = min(len(set_a), len(set_b))
+        rate = len(set_cross) / min_len
+        return rate >= td
+
+    @classmethod
+    def levenshtein_similarity(cls, title1, title2):
+        """
+        Edit-distance (Levenshtein) similarity, normalized to [0, 1].
+        :param title1: first title string
+        :param title2: second title string
+        :return: 1 - distance / max(len(title1), len(title2))
+        """
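+        # Example (hypothetical): distance('深度学习入门', '深度学习实战') is 2
+        # (two substituted characters) over max length 6, giving 1 - 2/6 ≈ 0.67.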
+        max_len = max(len(title1), len(title2))
+        if max_len == 0:
+            # two empty strings are identical
+            return 1.0
+        dist = distance(title1, title2)
+        return 1 - dist / max_len
+
+    @classmethod
+    def jaccard_similarity(cls, title1, title2):
+        """
+        Jaccard similarity over binary character-occurrence vectors.
+        :param title1: first title string
+        :param title2: second title string
+        :return: Jaccard score in [0, 1]
+        """
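+        # Example (hypothetical): '深度学习入门' and '深度学习实战' share 4 of the
+        # 8 distinct characters in their union, so the Jaccard score is 0.5.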
+        # analyzer='char' so Chinese titles, which contain no whitespace, are
+        # split into characters rather than treated as single tokens
+        vectorizer = CountVectorizer(binary=True, analyzer='char')
+        count_matrix = vectorizer.fit_transform([title1, title2])
+        return jaccard_score(count_matrix[0].toarray()[0], count_matrix[1].toarray()[0])
+
+    @classmethod
+    def cosine_similarity_titles(cls, title1, title2):
+        """
+        TF-IDF cosine similarity between the two titles.
+        :param title1: first title string
+        :param title2: second title string
+        :return: cosine similarity in [0, 1]
+        """
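+        # Sanity check: identical titles score 1.0, while titles sharing no
+        # characters yield orthogonal vectors and score 0.0.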
+        # analyzer='char' for the same reason as in jaccard_similarity
+        vectorizer = TfidfVectorizer(analyzer='char')
+        tfidf_matrix = vectorizer.fit_transform([title1, title2])
+        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
+
+    @classmethod
+    def bert_similarity(cls, title1, title2):
+        """
+        BERT embedding similarity: cosine similarity of mean-pooled
+        last-hidden-state vectors from bert-base-chinese.
+        :param title1: first title string
+        :param title2: second title string
+        :return: cosine similarity of the two sentence embeddings
+        """
+        # lazy-load so callers are not required to call loading_model() first
+        if cls.bert_model is None or cls.bert_tokenizer is None:
+            cls.loading_model()
+
+        def embed_sentences(sentences, model, tokenizer):
+            """
+            Embed sentences by mean-pooling the last hidden state.
+            :param sentences: list of input strings
+            :param model: a loaded BertModel
+            :param tokenizer: the matching BertTokenizer
+            :return: array of shape (len(sentences), hidden_size)
+            """
+            inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
+            with torch.no_grad():
+                outputs = model(**inputs)
+            return outputs.last_hidden_state.mean(dim=1).numpy()
+
+        embeddings = embed_sentences(
+            [title1, title2],
+            cls.bert_model,
+            cls.bert_tokenizer)
+        return cosine_similarity(embeddings)[0][1]
+
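+
+if __name__ == '__main__':
+    # Minimal usage sketch (illustrative; the titles are made up). The BERT
+    # check downloads bert-base-chinese from the Hugging Face hub on first use.
+    a, b = '深度学习入门', '深度学习实战'
+    print(titleSimilarity.title_similarity(a, b))          # overlap-coefficient flag
+    print(titleSimilarity.levenshtein_similarity(a, b))    # edit-distance score
+    print(titleSimilarity.jaccard_similarity(a, b))        # char-level Jaccard
+    print(titleSimilarity.cosine_similarity_titles(a, b))  # TF-IDF cosine
+    print(titleSimilarity.bert_similarity(a, b))           # BERT embedding cosine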