deploy_multi_worker.sh

#!/bin/bash
# Multi-worker deployment script - starts the API service on a single machine
# Usage: ./deploy_multi_worker.sh [num_workers] [port] [gpu_id] [quantize]
# Examples:
#   ./deploy_multi_worker.sh 2 8080 0      # 2 workers, no quantization
#   ./deploy_multi_worker.sh 3 8080 0 1    # 3 workers, INT8 quantization

set -e

# Configuration parameters
NUM_WORKERS=${1:-2}   # default: 2 workers
PORT=${2:-8080}       # default: port 8080
GPU_ID=${3:-0}        # default: GPU 0
QUANTIZE=${4:-0}      # enable INT8 quantization (0=no, 1=yes)

LLAMA_CHECKPOINT="checkpoints/s2-pro"
DECODER_CHECKPOINT="checkpoints/s2-pro/codec.pth"
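
# Sanity check (an addition, assuming the checkpoints live at the paths above):
# fail fast with a clear message instead of a traceback mid-startup.
if [ ! -d "${LLAMA_CHECKPOINT}" ] || [ ! -f "${DECODER_CHECKPOINT}" ]; then
    echo "Error: checkpoints not found at ${LLAMA_CHECKPOINT} / ${DECODER_CHECKPOINT}" >&2
    exit 1
fi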
# Set environment variables
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export CUDA_VISIBLE_DEVICES=${GPU_ID}

echo "========================================="
echo "Fish-Speech Multi-Worker Deployment"
echo "========================================="
echo "Workers: ${NUM_WORKERS}"
echo "Port: ${PORT}"
echo "GPU: ${GPU_ID}"
echo "Precision: BFloat16 (default; more stable than FP16)"
echo "Quantize (INT8): ${QUANTIZE}"
echo "========================================="

# Build the quantization argument
QUANTIZE_ARG=""
if [ "${QUANTIZE}" = "1" ]; then
    QUANTIZE_ARG="--quantize"
    echo "INT8 quantization enabled: ~6GB VRAM per worker (vs ~12GB with BF16)"
    echo "Recommended workers with INT8: 3-4 per GPU"
else
    echo "BF16 mode: ~10-12GB VRAM per worker"
    echo "Recommended workers with BF16: 2 per GPU"
fi
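
# Optional pre-flight check (an addition, not part of the original flow): warn
# if free VRAM on the selected GPU looks too low for the chosen worker count.
# Assumes nvidia-smi is on PATH; the 6GB/12GB-per-worker figures simply reuse
# the rough estimates echoed above.
if command -v nvidia-smi >/dev/null 2>&1; then
    FREE_MB=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i "${GPU_ID}")
    if [ "${QUANTIZE}" = "1" ]; then
        NEED_MB=$((NUM_WORKERS * 6144))
    else
        NEED_MB=$((NUM_WORKERS * 12288))
    fi
    if [ "${FREE_MB}" -lt "${NEED_MB}" ]; then
        echo "Warning: ~${NEED_MB}MB VRAM estimated for ${NUM_WORKERS} workers, but only ${FREE_MB}MB free"
    fi
fi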
# Start the API service
# Note: --half is deliberately omitted; the default bfloat16 has better numerical stability
python tools/api_server.py \
    --listen 0.0.0.0:${PORT} \
    --llama-checkpoint-path "${LLAMA_CHECKPOINT}" \
    --decoder-checkpoint-path "${DECODER_CHECKPOINT}" \
    --workers 1 \
    --num-workers ${NUM_WORKERS} \
    ${QUANTIZE_ARG}
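
# Once the server is up, a quick smoke test might look like the line below.
# The /v1/health route is an assumption based on fish-speech's API server;
# check tools/api_server.py for the actual endpoints exposed by your version.
#   curl http://localhost:${PORT}/v1/health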