@@ -357,7 +357,7 @@ def generate(
     return seq


-def init_model(checkpoint_path, device, precision, compile=False):
+def init_model(checkpoint_path, device, precision, compile=False, quantize=False):
     model = DualARTransformer.from_pretrained(checkpoint_path, load_weights=True)

     logger.info(f"precision: {precision.__class__.__name__}")
@@ -365,9 +365,44 @@ def init_model(checkpoint_path, device, precision, compile=False):
     model = model.to(device=device, dtype=precision)
     logger.info(f"Restored model from checkpoint")

+    # Apply INT8 quantization if requested
+    if quantize:
+        try:
+            import bitsandbytes as bnb
+            logger.info("Applying INT8 quantization with bitsandbytes...")
+
+            # Replace all Linear layers with 8-bit quantized versions
+            def replace_linear_with_int8(module):
+                for name, child in module.named_children():
+                    if isinstance(child, torch.nn.Linear):
+                        # Create 8-bit linear layer
+                        int8_layer = bnb.nn.Linear8bitLt(
+                            child.in_features,
+                            child.out_features,
+                            bias=child.bias is not None,
+                            has_fp16_weights=False,
+                            threshold=6.0,
+                        )
+                        # Copy weights
+                        int8_layer.weight = bnb.nn.Int8Params(
+                            child.weight.data,
+                            requires_grad=False,
+                            has_fp16_weights=False,
+                        )
+                        if child.bias is not None:
+                            int8_layer.bias = child.bias
+                        setattr(module, name, int8_layer)
+                    else:
+                        replace_linear_with_int8(child)
+
+            replace_linear_with_int8(model)
+            logger.info("INT8 quantization applied successfully")
+        except ImportError:
+            logger.error("bitsandbytes not installed. Install with: pip install bitsandbytes")
+            raise
+
     if isinstance(model, DualARTransformer):
         decode_one_token = decode_one_token_ar
-        # prefill_n_tokens = decode_one_token_ar
         logger.info("Using DualARTransformer")
     else:
         raise ValueError("Unsupported model type")
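
For reference, here is the same `Linear8bitLt` swap in isolation: a minimal sketch, assuming a CUDA device and bitsandbytes installed; the 4096x4096 layer size is illustrative. With `has_fp16_weights=False`, the actual INT8 conversion happens when the layer is moved to the GPU:

```python
# Standalone sketch of the swap performed by replace_linear_with_int8 above.
# Assumes CUDA + bitsandbytes; the 4096x4096 layer is illustrative only.
import torch
import bitsandbytes as bnb

fp16 = torch.nn.Linear(4096, 4096, bias=True).half()

int8 = bnb.nn.Linear8bitLt(
    fp16.in_features,
    fp16.out_features,
    bias=True,
    has_fp16_weights=False,  # keep weights in INT8 after quantization
    threshold=6.0,           # LLM.int8() outlier threshold
)
int8.weight = bnb.nn.Int8Params(
    fp16.weight.data, requires_grad=False, has_fp16_weights=False
)
int8.bias = fp16.bias

int8 = int8.cuda()  # quantization is triggered by the move to the GPU
x = torch.randn(2, 4096, dtype=torch.float16, device="cuda")
with torch.no_grad():
    y = int8(x)
print(y.dtype, int8.weight.dtype)  # torch.float16 torch.int8
```

One point worth checking during review: in the patch, `model` is moved to `device` before the swap, so `Int8Params` may be constructed from tensors that already live on the GPU, while bitsandbytes performs the quantization inside its `.cuda()`/`.to()` hook. It is worth asserting that `weight.dtype == torch.int8` after `init_model` returns.
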
@@ -380,7 +415,8 @@ def init_model(checkpoint_path, device, precision, compile=False):
     # Mark whether cache has been initialized
     model._cache_setup_done = False

-    if compile:
+    # Disable compile if quantization is enabled (bitsandbytes INT8 is incompatible with torch.compile)
+    if compile and not quantize:
         logger.info("Compiling function...")
         decode_one_token = torch.compile(
             decode_one_token,
@@ -388,6 +424,8 @@ def init_model(checkpoint_path, device, precision, compile=False):
             mode="default" if torch.cuda.is_available() else None,
             fullgraph=True,
         )
+    elif compile and quantize:
+        logger.warning("torch.compile disabled when quantization is enabled (bitsandbytes compatibility)")

     return model.eval(), decode_one_token

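For context on the guard above: with `fullgraph=True`, `torch.compile` raises on any graph break instead of falling back to eager execution, and bitsandbytes' custom CUDA kernels are a well-known source of graph breaks, so skipping compilation entirely for quantized models is the conservative choice.
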
@@ -775,6 +813,7 @@ def launch_thread_safe_queue(
     precision,
     compile: bool = False,
     num_workers: int = 1,
+    quantize: bool = False,
 ):
     input_queue = queue.Queue()
     init_events = [threading.Event() for _ in range(num_workers)]
@@ -782,7 +821,7 @@ def launch_thread_safe_queue(
     def worker(worker_id, init_event):
         logger.info(f"Worker {worker_id} starting, loading model...")
         model, decode_one_token = init_model(
-            checkpoint_path, device, precision, compile=compile
+            checkpoint_path, device, precision, compile=compile, quantize=quantize
         )
         with torch.device(device):
             model.setup_caches(
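
Downstream, callers opt in through the new flag. A hypothetical invocation, assuming `checkpoint_path` and `device` are the parameters preceding `precision` (they are used in the worker body but sit outside this hunk's context):

```python
# Hypothetical caller; parameter names before `precision` are inferred
# from the worker body, since they fall outside the hunk context.
import torch

handle = launch_thread_safe_queue(  # return value is outside this hunk
    checkpoint_path="checkpoints/model",  # illustrative path
    device="cuda",
    precision=torch.bfloat16,
    compile=False,   # ignored anyway when quantize=True
    num_workers=1,
    quantize=True,   # enable the INT8 path added above
)
```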