Browse Source

新增 word2vec功能

luojunhui 1 week ago
parent
commit
909d632c37
4 changed files with 44 additions and 9 deletions
  1. 11 4
      alg_app.py
  2. 6 4
      requirements.txt
  3. 18 1
      routes/__init__.py
  4. 9 0
      routes/word_2_vec.py

+ 11 - 4
alg_app.py

@@ -1,12 +1,17 @@
 """
 @author: luojunhui
 """
+import jieba
 from quart import Quart
+from text2vec import Word2Vec
 from similarities import BertSimilarity
 from routes import AlgRoutes
 from applications import AsyncMySQLClient
 from applications.embedding_manager import EmbeddingManager
 
+jieba.initialize()
+print("jieba初始化成功")
+
 app = Quart(__name__)
 AsyncMySQL = AsyncMySQLClient(app)
 
@@ -16,10 +21,12 @@ async def init():
     初始化模型
     """
     await AsyncMySQL.init_pool()
-    model = BertSimilarity(model_name_or_path="BAAI/bge-large-zh-v1.5")
-    embedding_manager = EmbeddingManager(model)
-    print("模型加载成功")
-    app_routes = AlgRoutes(AsyncMySQL, model, embedding_manager)
+    similarity_model = BertSimilarity(model_name_or_path="BAAI/bge-large-zh-v1.5")
+    embedding_manager = EmbeddingManager(similarity_model)
+    print("相似度模型加载成功")
+    word2vec_model = Word2Vec("lili666/text2vec-word2vec-tencent-chinese")
+    print("词向量模型加载成功")
+    app_routes = AlgRoutes(AsyncMySQL, similarity_model, word2vec_model, embedding_manager)
     app.register_blueprint(app_routes)
 
 

+ 6 - 4
requirements.txt

@@ -1,5 +1,5 @@
 aiofiles
-aiohttp
+aiohttp~=3.11.18
 aiomysql~=0.2.0
 aiosignal
 alembic
@@ -15,8 +15,8 @@ openai
 openpyxl
 optuna
 packaging
-pandas
-pymysql
+pandas~=2.2.3
+pymysql~=1.1.1
 pyodps
 quart~=0.19.6
 requests~=2.32.3
@@ -27,4 +27,6 @@ tqdm~=4.66.4
 transformers
 pydantic~=2.6.4
 similarities~=1.1.7
-filelock
+filelock~=3.18.0
+text2vec~=1.3.4
+jieba~=0.42.1

+ 18 - 1
routes/__init__.py

@@ -8,15 +8,32 @@ from .accountArticleRank import AccountArticleRank
 from .nlpServer import NLPServer
 from .articleDBServer import ArticleSpider
 from .accountServer import AccountServer
+from .word_2_vec import process_text
 from applications.articleTools import ArticleDBTools
 
-def AlgRoutes(mysql_client, model, embedding_manager):
+def AlgRoutes(mysql_client, model, word_vec_model, embedding_manager):
     """
     ALG ROUTES
     :return:
     """
     blueprint = Blueprint("LongArticlesAlgServer", __name__)
 
+    @blueprint.route("/embed", methods=["POST"])
+    async def embed():
+        """
+        Test endpoint for the word-vector model.
+        :return: JSON response with the keys "text", "tokens" and "embeddings"
+        """
+        params = await request.get_json()
+        text = params["text"]  # NOTE(review): no validation here; a missing body or "text" key will raise
+        words, vectors = process_text(word_vec_model, text)
+        res = {
+            "text": text,
+            "tokens": words,
+            "embeddings": [vec.tolist() for vec in vectors]  # assumes each vector exposes .tolist() (e.g. a numpy array) - TODO confirm
+        }
+        return jsonify(res)
+
     @blueprint.route("/healthCheck")
     def helloFuture():
         """

+ 9 - 0
routes/word_2_vec.py

@@ -0,0 +1,9 @@
+import jieba
+
+def process_text(model, text):  # returns (tokens, vectors): the jieba tokens of text and one model.encode() result per token
+    words = list(jieba.cut(text))  # segment the text into tokens with jieba (Chinese word segmentation)
+    vectors = []
+    for word in words:
+        vec = model.encode(word)  # obtain the vector for this token via the model's encode method
+        vectors.append(vec)
+    return words, vectors