#!/bin/bash
# Multi-worker deployment script - start the API service on a single machine
# Usage: ./deploy_multi_worker.sh [num_workers] [port] [gpu_id] [quantize]
# Examples:
#   ./deploy_multi_worker.sh 2 8080 0      # 2 workers, no quantization
#   ./deploy_multi_worker.sh 3 8080 0 1    # 3 workers, INT8 quantization

set -e

# Configuration
NUM_WORKERS=${1:-2}   # default: 2 workers
PORT=${2:-8080}       # default: port 8080
GPU_ID=${3:-0}        # default: GPU 0
QUANTIZE=${4:-0}      # enable INT8 quantization (0=no, 1=yes)
LLAMA_CHECKPOINT="checkpoints/s2-pro"
DECODER_CHECKPOINT="checkpoints/s2-pro/codec.pth"

# Environment variables
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export CUDA_VISIBLE_DEVICES=${GPU_ID}

echo "========================================="
echo "Fish-Speech Multi-Worker Deployment"
echo "========================================="
echo "Workers:         ${NUM_WORKERS}"
echo "Port:            ${PORT}"
echo "GPU:             ${GPU_ID}"
echo "Precision:       BFloat16 (default, better stability than FP16)"
echo "Quantize (INT8): ${QUANTIZE}"
echo "========================================="

# Build the quantization argument
QUANTIZE_ARG=""
if [ "${QUANTIZE}" = "1" ]; then
    QUANTIZE_ARG="--quantize"
    echo "INT8 quantization enabled: VRAM per worker ~6GB (vs ~12GB with BF16)"
    echo "Recommended workers with INT8: 3-4 per GPU"
else
    echo "BF16 mode: VRAM per worker ~10-12GB"
    echo "Recommended workers with BF16: 2 per GPU"
fi

# Start the API service
# Note: do not pass --half; bfloat16 is used by default (better numerical stability)
python tools/api_server.py \
    --listen 0.0.0.0:${PORT} \
    --llama-checkpoint-path "${LLAMA_CHECKPOINT}" \
    --decoder-checkpoint-path "${DECODER_CHECKPOINT}" \
    --workers 1 \
    --num-workers ${NUM_WORKERS} \
    ${QUANTIZE_ARG}
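
# ---------------------------------------------------------------------------
# Optional: verify the server came up. The api_server.py call above runs in
# the foreground, so issue this from a second shell. The "/v1/health" route
# is an assumption, not a documented endpoint of this build -- substitute
# whatever health or docs route your api_server.py actually exposes, and
# match the port you passed on the command line.
#
#   curl -fsS "http://127.0.0.1:8080/v1/health" && echo "server is up"
# ---------------------------------------------------------------------------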
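
# ---------------------------------------------------------------------------
# Optional pre-flight check: fail fast if the checkpoints are missing instead
# of waiting for Python to error out mid-load. Plain bash, no assumptions
# beyond the paths defined at the top of this script; paste it in above the
# launch command if you want it enforced.
#
#   [ -d "${LLAMA_CHECKPOINT}" ]   || { echo "missing ${LLAMA_CHECKPOINT}" >&2; exit 1; }
#   [ -f "${DECODER_CHECKPOINT}" ] || { echo "missing ${DECODER_CHECKPOINT}" >&2; exit 1; }
# ---------------------------------------------------------------------------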
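
# ---------------------------------------------------------------------------
# Optional: sanity-check NUM_WORKERS against the GPU's actual VRAM. A hedged
# sketch: the 12288/6144 MiB figures are just the ~12GB (BF16) and ~6GB
# (INT8) per-worker estimates quoted above, not measured values. Paste it in
# above the launch command to print the suggestion before starting.
#
#   TOTAL_MB=$(nvidia-smi -i "${GPU_ID}" --query-gpu=memory.total --format=csv,noheader,nounits)
#   PER_WORKER_MB=$(( QUANTIZE == 1 ? 6144 : 12288 ))
#   echo "VRAM suggests at most $(( TOTAL_MB / PER_WORKER_MB )) worker(s)"
# ---------------------------------------------------------------------------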