| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
#!/bin/bash
# Multi-worker deployment script — start the API service on a single machine.
# Usage:   ./deploy_multi_worker.sh [num_workers] [port] [gpu_id] [quantize]
# Examples:
#   ./deploy_multi_worker.sh 2 8080 0     # 2 workers, no quantization
#   ./deploy_multi_worker.sh 3 8080 0 1   # 3 workers, INT8 quantization
set -euo pipefail

# Configuration (positional arguments with defaults).
NUM_WORKERS=${1:-2}   # default: 2 workers
PORT=${2:-8080}       # default: port 8080
GPU_ID=${3:-0}        # default: GPU 0
QUANTIZE=${4:-0}      # enable INT8 quantization (0 = no, 1 = yes)
readonly LLAMA_CHECKPOINT="checkpoints/s2-pro"
readonly DECODER_CHECKPOINT="checkpoints/s2-pro/codec.pth"

# Environment: let the CUDA caching allocator use expandable segments
# (reduces fragmentation across workers) and pin the visible GPU.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export CUDA_VISIBLE_DEVICES="${GPU_ID}"

echo "========================================="
echo "Fish-Speech Multi-Worker Deployment"
echo "========================================="
echo "Workers: ${NUM_WORKERS}"
echo "Port: ${PORT}"
echo "GPU: ${GPU_ID}"
echo "Precision: BFloat16 (default, better stability than FP16)"
echo "Quantize (INT8): ${QUANTIZE}"
echo "========================================="

# Build the optional quantization flag as an array: an empty array expands
# to zero arguments, whereas a quoted empty string would be passed to the
# server as a spurious positional argument (and an unquoted one invites
# word-splitting bugs — SC2086).
quantize_args=()
if [[ "${QUANTIZE}" == "1" ]]; then
  quantize_args+=(--quantize)
  echo "INT8 quantization enabled: VRAM per worker ~6GB (was ~12GB with BF16)"
  echo "Recommended workers with INT8: 3-4 per GPU"
else
  echo "BF16 mode: VRAM per worker ~10-12GB"
  echo "Recommended workers with BF16: 2 per GPU"
fi

# Launch the API server.
# Note: no --half flag — the default bfloat16 has better numerical stability.
# The ${arr[@]+...} guard keeps the empty-array expansion safe under `set -u`
# on bash versions older than 4.4.
python tools/api_server.py \
  --listen "0.0.0.0:${PORT}" \
  --llama-checkpoint-path "${LLAMA_CHECKPOINT}" \
  --decoder-checkpoint-path "${DECODER_CHECKPOINT}" \
  --workers 1 \
  --num-workers "${NUM_WORKERS}" \
  ${quantize_args[@]+"${quantize_args[@]}"}
|