Browse Source

代码重构开发ing

罗俊辉 1 year ago
parent
commit
d0f9a0b6ea
3 changed files with 164 additions and 0 deletions
  1. 12 0
      alg_app.py
  2. 40 0
      applications/accounts.py
  3. 112 0
      applications/algorthims.py

+ 12 - 0
alg_app.py

@@ -4,6 +4,7 @@
 from quart import Quart
 from routes import AlgRoutes
 from applications import AsyncMySQLClient
+from applications.algorthims import titleSimilarity
 
 app = Quart(__name__)
 AsyncMySQL = AsyncMySQLClient(app)
@@ -20,6 +21,17 @@ async def init_db():
     await AsyncMySQL.init_pool()
 
 
@app.before_serving
async def init_bert_model():
    """
    Load the BERT model once before the app starts serving requests.

    Declared async for consistency with the other Quart lifecycle hooks
    (init_db / close_db) — Quart's before_serving contract is a coroutine.
    loading_model() is a classmethod that stores the tokenizer and model on
    the titleSimilarity class itself, so no instance needs to be created or
    kept alive here.
    """
    titleSimilarity.loading_model()
    print("Bert Model has already loaded")
+
+
 @app.after_serving
 async def close_db():
     """

+ 40 - 0
applications/accounts.py

@@ -0,0 +1,40 @@
+"""
+@author: luojunhui
+"""
+import json
+import requests
+
+
class Accounts(object):
    """
    Client for the internal service that serves an account's
    historical-article data (公众号历史文章信息).
    """
    # Base address of the internal article service.
    ROOT_URL = 'http://192.168.100.31:8179'

    @classmethod
    def get_history_articles(cls, account_nickname, filter_same=False,
                             filter_sensitive=False, index_list=None,
                             keys=None, max_time=None, min_time=None,
                             msg_type=None, rate=0, reverse=True,
                             top_n=10000):
        """
        Fetch an account's historical articles from the internal service.

        Bug fix: the original body referenced filter_same, filter_sensitive,
        index_list, keys, max_time, min_time, msg_type, rate and reverse as
        bare names that were never defined, so every call raised NameError.
        They are now keyword parameters with permissive defaults, keeping the
        one-positional-argument call signature backward-compatible.
        TODO(review): confirm these defaults match what the service expects.

        :param account_nickname: nickname of the account to query
        :param filter_same: whether the service should drop duplicates
        :param filter_sensitive: whether the service should drop sensitive items
        :param index_list: message-index filter (None -> empty list)
        :param keys: response fields to return (None -> empty list)
        :param max_time: upper time bound passed to the service
        :param min_time: lower time bound passed to the service
        :param msg_type: message-type filter
        :param rate: rate filter forwarded verbatim
        :param reverse: sort order flag forwarded verbatim
        :param top_n: maximum number of articles to return
        :return: decoded JSON response from the service
        """
        # NOTE(review): 'artlce_list' looks like a typo for 'article_list',
        # but it is kept as-is in case the server route is really spelled
        # this way — confirm against the service before changing it.
        api_url = f'{cls.ROOT_URL}/artlce_list'
        payload = json.dumps({
            "account_nickname": account_nickname,
            "filter_same": filter_same,
            "filter_sensitive": filter_sensitive,
            "index_list": index_list if index_list is not None else [],
            "keys": keys if keys is not None else [],
            "max_time": max_time,
            "min_time": min_time,
            "msg_type": msg_type,
            "rate": rate,
            "reverse": reverse,
            "top_n": top_n,
            "use_max_time": True,
            "use_min_time": True
        })
        # Timeout so a dead service cannot hang the caller forever.
        res = requests.request(
            "POST", api_url, headers={}, data=payload, timeout=30
        ).json()
        return res
+
+
+

+ 112 - 0
applications/algorthims.py

@@ -0,0 +1,112 @@
+"""
+@author: luojunhui
+"""
+from Levenshtein import distance
+from transformers import BertTokenizer, BertModel
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import jaccard_score
+from sklearn.metrics.pairwise import cosine_similarity
+
+
class titleSimilarity(object):
    """
    Title-similarity utilities: character-overlap, Levenshtein, Jaccard,
    TF-IDF cosine and BERT-embedding similarity.
    """
    # Lazily populated by loading_model(); shared by every caller since the
    # similarity methods are all classmethods.
    bert_tokenizer = None
    bert_model = None

    @classmethod
    def loading_model(cls):
        """
        Load the Chinese BERT tokenizer and model once onto the class.
        """
        cls.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        cls.bert_model = BertModel.from_pretrained('bert-base-chinese')

    @classmethod
    def title_similarity(cls, title_a, title_b, td=0.8):
        """
        Character-overlap similarity: share of the smaller title's distinct
        characters that also appear in the other title.

        :param title_a: first title
        :param title_b: second title
        :param td: threshold in [0, 1]; overlap rate >= td counts as similar
        :return: True when the overlap rate reaches the threshold
        """
        if not title_a or not title_b:
            return False
        set_a, set_b = set(title_a), set(title_b)
        # max(..., 1) guards against division by zero; the dead
        # "empty union" branch from the original was removed (both titles
        # are known non-empty here, so the union can never be empty).
        min_len = max(min(len(set_a), len(set_b)), 1)
        rate = len(set_a & set_b) / min_len
        return rate >= td

    @classmethod
    def levenshtein_similarity(cls, title1, title2):
        """
        Normalized edit-distance similarity in [0, 1].

        :param title1: first title
        :param title2: second title
        :return: 1 - distance / max(len); 1.0 for two empty strings
        """
        max_len = max(len(title1), len(title2))
        if max_len == 0:
            # Two empty strings are identical; the original raised
            # ZeroDivisionError here.
            return 1.0
        return 1 - distance(title1, title2) / max_len

    @classmethod
    def jaccard_similarity(cls, title1, title2):
        """
        Jaccard similarity over binary token-presence vectors.

        :param title1: first title
        :param title2: second title
        :return: Jaccard score of the two binary vectors
        """
        vectorizer = CountVectorizer(binary=True)
        count_matrix = vectorizer.fit_transform([title1, title2])
        return jaccard_score(count_matrix[0].toarray()[0],
                             count_matrix[1].toarray()[0])

    @classmethod
    def cosine_similarity_titles(cls, title1, title2):
        """
        Cosine similarity of the two titles' TF-IDF vectors.

        :param title1: first title
        :param title2: second title
        :return: cosine similarity as a float
        """
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([title1, title2])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    @classmethod
    def bert_similarity(cls, title1, title2):
        """
        Cosine similarity of mean-pooled BERT embeddings of the two titles.

        :param title1: first title
        :param title2: second title
        :return: cosine similarity as a float
        """
        if cls.bert_model is None or cls.bert_tokenizer is None:
            # Lazy-load so this method works even when the before_serving
            # hook was skipped; the original crashed on a None model.
            cls.loading_model()

        def embed_sentences(sentences, model, tokenizer):
            """
            Tokenize and embed sentences; mean-pool the last hidden state.
            """
            inputs = tokenizer(sentences, return_tensors='pt',
                               padding=True, truncation=True)
            outputs = model(**inputs)
            return outputs.last_hidden_state.mean(dim=1).detach().numpy()

        embeddings = embed_sentences(
            [title1, title2], cls.bert_model, cls.bert_tokenizer)
        return cosine_similarity(embeddings)[0][1]
+