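"""A thread-safe, process-safe embedding cache.

EmbeddingManager keeps embeddings in an in-memory dict, backed on disk by a
.npy matrix plus a .keys text file. A threading.Lock guards the dict, a
FileLock guards the on-disk files, and a daemon thread saves the cache at a
jittered interval.
"""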
import os
import random
import threading
from time import sleep

import numpy as np
from filelock import FileLock


class EmbeddingManager:
    """In-memory embedding cache, persisted to disk and shared across processes."""

    def __init__(self, model, emb_size=1024, cache_file="cache/embedding_cache", save_interval=600):
        self.model = model
        self.emb_size = emb_size
        self.cache_file = cache_file
        self.cache_file_real = self.cache_file + ".npy"
        self.cache_key_file = f"{self.cache_file}.keys"
        # Jitter the save interval so that multiple processes do not all try to
        # read and write the cache files (and wait on the file lock) at once.
        self.save_interval = save_interval + random.randint(0, save_interval)
        self.cache = {}
        self.lock = threading.Lock()  # Guards self.cache across threads
        self.filelock = FileLock(self.cache_file + ".lock")  # Guards the cache files across processes

        self.load_cache()

        # Start the periodic saving thread
        self.saving_thread = threading.Thread(target=self._periodic_save, daemon=True)
        self.saving_thread.start()

    def _load_cache_unsafe(self):
        """Load cached embeddings from disk; the caller must guarantee both inter-thread and inter-process safety."""
        embedding_data = np.load(self.cache_file_real)
        with open(self.cache_key_file, "r") as fp:
            embedding_keys = [key.rstrip("\n") for key in fp]
        for idx, key in enumerate(embedding_keys):
            self.cache[key] = embedding_data[idx]

    def load_cache(self):
        with self.lock:
            if os.path.exists(self.cache_file_real):
                with self.filelock:
                    self._load_cache_unsafe()
            print("[EmbeddingManager] cache loaded")
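
    # dump_cache first merges what is already on disk into the in-memory cache
    # (another process may have saved newer entries), then writes the merged
    # cache to temporary files, rotates the previous files to .bak, and renames
    # the temporary files into place, all while holding both locks.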
    def dump_cache(self):
        if os.path.dirname(self.cache_file):
            os.makedirs(os.path.dirname(self.cache_file), mode=0o755, exist_ok=True)
        tmp_cache_file = self.cache_file + ".tmp"
        tmp_cache_key_file = self.cache_key_file + ".tmp"
        with self.lock:  # Ensure thread safety first
            with self.filelock:  # Then ensure inter-process safety
                if os.path.exists(self.cache_file_real):
                    self._load_cache_unsafe()  # Merge entries saved by other processes
                keys = list(self.cache.keys())
                cache_to_save = np.zeros((len(keys), self.emb_size), np.float32)
                for idx, key in enumerate(keys):
                    cache_to_save[idx] = self.cache[key]
                np.save(tmp_cache_file, cache_to_save)  # np.save appends ".npy"
                with open(tmp_cache_key_file, "w") as fp:
                    fp.write("\n".join(keys))  # Assumes keys contain no newlines
                if os.path.exists(self.cache_file_real):
                    os.rename(self.cache_file_real, self.cache_file_real + ".bak")
                if os.path.exists(self.cache_key_file):
                    os.rename(self.cache_key_file, self.cache_key_file + ".bak")
                os.rename(tmp_cache_file + ".npy", self.cache_file_real)
                os.rename(tmp_cache_key_file, self.cache_key_file)
        print("[EmbeddingManager] cache dumped")

    def get_embeddings(self, text_list):
        """
        Look up the embedding for each text. Any text missing from the cache is
        embedded with the model, added to the cache, and returned.
        """
        if not isinstance(text_list, list):
            raise TypeError(f"Invalid parameter type: text_list {type(text_list)}")
        embedding_list = np.zeros((len(text_list), self.emb_size), np.float32)
        if not text_list:
            return embedding_list
        new_texts = []
        new_texts_ori_idx = []
        with self.lock:
            for idx, text in enumerate(text_list):
                if text in self.cache:
                    embedding_list[idx] = self.cache[text]
                else:
                    new_texts.append(text)
                    new_texts_ori_idx.append(idx)

        # Run the model outside the lock so other threads are not blocked
        new_embeddings = self.model.get_embeddings(new_texts)
        if new_embeddings.shape[0] > 0 and new_embeddings.shape[1] != self.emb_size:
            raise ValueError("Embedding size mismatch")

        # Store the newly generated embeddings in the cache
        with self.lock:  # Ensure thread-safe access
            for idx, text in enumerate(new_texts):
                if text not in self.cache:  # Re-check in case another thread added it
                    self.cache[text] = new_embeddings[idx]
                embedding_list[new_texts_ori_idx[idx]] = new_embeddings[idx]
        return embedding_list

    def _periodic_save(self):
        """Periodically save the cache to disk."""
        while True:
            sleep(self.save_interval)
            self.dump_cache()


# Only for testing
class DummyModel:
    def padding_text(self, text):
        # Repeat the text so that it is exactly 1024 characters long
        repeats = 1024 // len(text) + 1
        return (text * repeats)[:1024]

    def get_embeddings(self, text_list):
        # Encode each character's code point as one dimension of the embedding
        embeddings = np.zeros((len(text_list), 1024), np.float32)
        for idx, text in enumerate(text_list):
            text = self.padding_text(text)
            embeddings[idx] = np.array([ord(c) for c in text], np.float32)
        return embeddings
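
# A model only needs a get_embeddings(list_of_str) -> np.ndarray method that
# returns an array of shape (len(texts), emb_size). As a sketch (not part of
# this file), an adapter around sentence-transformers might look like the
# following, assuming that package is installed; SentenceTransformerModel is
# a hypothetical name:
#
#     from sentence_transformers import SentenceTransformer
#
#     class SentenceTransformerModel:
#         def __init__(self, name="all-MiniLM-L6-v2"):
#             self.model = SentenceTransformer(name)
#
#         def get_embeddings(self, text_list):
#             # encode() returns one vector per text; all-MiniLM-L6-v2 has dim 384
#             return self.model.encode(text_list)
#
#     manager = EmbeddingManager(SentenceTransformerModel(), emb_size=384)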

if __name__ == "__main__":
    model = DummyModel()
    manager = EmbeddingManager(model)
    print(manager.get_embeddings(["hello"]))
    print(manager.get_embeddings(["world"]))
    print(manager.get_embeddings(["hello world"]))
    manager.dump_cache()
    print(manager.get_embeddings(["new", "word"]))
    manager.dump_cache()