浏览代码

第一版初始化

luojunhui 18 小时之前
父节点
当前提交
ab07d80841

+ 24 - 6
Dockerfile

@@ -1,17 +1,35 @@
+# 使用轻量级 Python 基础镜像
 FROM python:3.11-slim
 
+# 设置工作目录
 WORKDIR /app
 
-ENV PYTHONDONTWRITEBYTECODE=1
-ENV PYTHONUNBUFFERED=1
-ENV PIP_DISABLE_PIP_VERSION_CHECK=on
-ENV TZ=Asia/Shanghai
+# 环境变量优化
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=on \
+    TZ=Asia/Shanghai \
+    PATH="/root/.local/bin:$PATH"
 
+# 安装系统依赖(构建 wheel、时区等)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        tzdata \
+    && rm -rf /var/lib/apt/lists/*
+
+# 设置时区
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+# 复制 requirements 并安装依赖
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
 
+# 复制项目文件
 COPY . .
 
-EXPOSE 8000
+# 暴露端口
+EXPOSE 8001
 
-CMD ["python3", "-m", "vllm.entrypoints.openai.api_server", "--model", "models/Qwen3-Embedding-0.6B", "--dtype", "float16"]
+# 启动命令
+CMD ["hypercorn", "vector_app:app", "--config", "config.toml"]

+ 5 - 0
applications/api/__init__.py

@@ -0,0 +1,5 @@
+from .embedding import get_basic_embedding
+
+__all__ = [
+    "get_basic_embedding"
+]

+ 24 - 0
applications/api/embedding.py

@@ -0,0 +1,24 @@
+from applications.config import LOCAL_MODEL_CONFIG, VLLM_SERVER_URL
+from applications.utils import AsyncHttpClient
+
+
+async def get_basic_embedding(text: str, model: str):
+    """
+    embedding text into vectors
+    :param text:
+    :param model:
+    :return: the embedding vector for the input text
+    """
+    model_path = LOCAL_MODEL_CONFIG[model]
+    async with AsyncHttpClient(timeout=20) as client:
+        response = await client.post(
+            url=VLLM_SERVER_URL,
+            json={"input": text, "model": model_path},
+            headers={"Content-Type": "application/json"},
+        )
+        return response['data'][0]["embedding"]
+
+
+__all__ = [
+    "get_basic_embedding"
+]

+ 3 - 3
applications/config/__init__.py

@@ -1,7 +1,7 @@
-from .model_config import MODEL_CONFIG, DEFAULT_MODEL, LOCAL_MODEL_CONFIG
+from .model_config import  DEFAULT_MODEL, LOCAL_MODEL_CONFIG, VLLM_SERVER_URL
 
 __all__ = [
-    "MODEL_CONFIG",
     "DEFAULT_MODEL",
-    "LOCAL_MODEL_CONFIG"
+    "LOCAL_MODEL_CONFIG",
+    "VLLM_SERVER_URL"
 ]

+ 5 - 12
applications/config/model_config.py

@@ -1,16 +1,9 @@
-MODEL_CONFIG = {
-    "Qwen3-Embedding-0.6B": {"url": "http://vllm-0.6b:8000/v1/embeddings", "dim": 1536},
-    "Qwen3-Embedding-4B": {"url": "http://vllm-4b:8000/v1/embeddings", "dim": 1536},
-    "Qwen3-Embedding-8B": {"url": "http://vllm-8b:8000/v1/embeddings", "dim": 1536},
-}
-
 LOCAL_MODEL_CONFIG = {
-    "Qwen3-Embedding-0.6B": "models/Qwen3-Embedding-0.6B",
-    "Qwen3-Embedding-4B": "models/Qwen3-Embedding-4B",
-    "Qwen3-Embedding-8B": "models/Qwen3-Embedding-8B",
+    "Qwen3-Embedding-0.6B": "/models/Qwen3-Embedding-0.6B",
+    "Qwen3-Embedding-4B": "/models/Qwen3-Embedding-4B",
+    "Qwen3-Embedding-8B": "/models/Qwen3-Embedding-8B",
 }
 
-DEFAULT_MODEL = "Qwen3-Embedding-0.6B"
-
+DEFAULT_MODEL = "Qwen3-Embedding-4B"
 
-__all__ = ["MODEL_CONFIG", "DEFAULT_MODEL", "LOCAL_MODEL_CONFIG"]
+VLLM_SERVER_URL = "http://localhost:8080"

+ 0 - 6
applications/embedding/__init__.py

@@ -1,6 +0,0 @@
-from .basic import get_basic_embedding, get_local_embedding
-
-__all__ = [
-    "get_basic_embedding",
-    "get_local_embedding"
-]

+ 0 - 38
applications/embedding/basic.py

@@ -1,38 +0,0 @@
-from applications.config import MODEL_CONFIG, LOCAL_MODEL_CONFIG
-from applications.utils import AsyncHttpClient
-
-
-async def get_basic_embedding(text: str, model: str):
-    """
-    embedding text into vectors
-    :param text:
-    :param model:
-    :return:tong
-    """
-    cfg = MODEL_CONFIG[model]
-    async with AsyncHttpClient(timeout=20) as client:
-        response = await client.post(
-            url=cfg["url"],
-            json={"input": text, "model": model},
-            headers={"Content-Type": "application/json"},
-        )
-        return response['data'][0]["embedding"]
-
-
-async def get_local_embedding(text, model):
-    """
-    embedding text into vectors
-    :param text:
-    :param model:
-    :return:
-    """
-    outputs = model.get_embedding([text])
-    embedding = outputs[0]
-    return embedding
-
-
-
-__all__ = [
-    "get_basic_embedding",
-    "get_local_embedding"
-]

+ 1 - 1
config.toml

@@ -1,5 +1,5 @@
 reload = true
-bind = "0.0.0.0:8080"
+bind = "0.0.0.0:8081"
 workers = 1
 keep_alive_timeout = 120  # 保持连接的最大秒数,根据需要调整
 graceful_timeout = 30    # 重启或停止之前等待当前工作完成的时间

+ 6 - 20
docker-compose.yml

@@ -1,29 +1,15 @@
-version: '3.8'
-
 services:
-  triton-server:
-    build:
-      context: .
-      dockerfile: Dockerfile
-    image: my-tritonserver:25.05-vllm-python-py3
+  # vllm服务
+  vllm-qwen:
+    image: vllm-qwen
+    container_name: vllm-qwen
     ports:
       - "8000:8000"
-    environment:
-      - TZ=Asia/Shanghai
-      - PYTHONDONTWRITEBYTECODE=1
-      - PYTHONUNBUFFERED=1
-      - PIP_DISABLE_PIP_VERSION_CHECK=on
-    # 如果需要GPU支持,取消下面的注释
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              count: 1
+              count: all
               capabilities: [gpu]
-    # 如果需要挂载模型数据卷,取消下面的注释
-    # volumes:
-    #   - ./models:/app/models
-    # 如果需要设置资源限制,取消下面的注释
-    # mem_limit: 8g
-    # cpus: 4.0
+    restart: always

+ 4 - 20
routes/buleprint.py

@@ -1,19 +1,19 @@
 from quart import Blueprint, jsonify, request
 
-from applications.config import DEFAULT_MODEL, MODEL_CONFIG
-from applications.embedding import get_basic_embedding, get_local_embedding
+from applications.config import DEFAULT_MODEL, LOCAL_MODEL_CONFIG
+from applications.api import get_basic_embedding
 
 
 server_bp = Blueprint('api', __name__, url_prefix='/api')
 
-def server_routes(llm, vector_db):
+def server_routes(vector_db):
 
     @server_bp.route('/embed', methods=['POST'])
     async def embed():
         body = await request.get_json()
         text = body.get('text')
         model_name = body.get('model', DEFAULT_MODEL)
-        if not MODEL_CONFIG.get(model_name):
+        if not LOCAL_MODEL_CONFIG.get(model_name):
             return jsonify(
                 {"error": "error  model"}
             )
@@ -23,22 +23,6 @@ def server_routes(llm, vector_db):
             "embedding": embedding
         })
 
-    @server_bp.route('/embed_v1', methods=['POST'])
-    async def embed_v1():
-        body = await request.get_json()
-        text = body.get('text')
-        model_name = body.get('model', DEFAULT_MODEL)
-        if not MODEL_CONFIG.get(model_name):
-            return jsonify(
-                {"error": "error  model"}
-            )
-
-        embedding = await get_local_embedding(text, llm)
-        return jsonify({
-            "embedding": embedding
-        })
-
-
     @server_bp.route('/search', methods=['POST'])
     async def search():
         pass

+ 1 - 8
vector_app.py

@@ -1,7 +1,6 @@
 from quart import Quart
 from quart_cors import cors
 # from pymilvus import connections
-from vllm import LLM, SamplingParams
 
 from applications.config import LOCAL_MODEL_CONFIG, DEFAULT_MODEL
 from routes import server_routes
@@ -10,12 +9,6 @@ app = Quart(__name__)
 
 MODEL_PATH = LOCAL_MODEL_CONFIG[DEFAULT_MODEL]
 
-llm = LLM(
-    model=MODEL_PATH,
-    dtype="float16",   # 节省显存
-    trust_remote_code=True
-)
-print(f"{MODEL_PATH} 模型加载完成!")
 
 # 连接向量数据库
 # connections.connect("default", host="milvus", port="19530")
@@ -23,6 +16,6 @@ print(f"{MODEL_PATH} 模型加载完成!")
 connections = None
 
 # 注册路由
-app_route = server_routes(llm, connections)
+app_route = server_routes(connections)
 app.register_blueprint(app_route)