2 mēneši atpakaļ · 5e6e36d912
--- a/fish_speech/models/text2semantic/inference.py
+++ b/fish_speech/models/text2semantic/inference.py
@@ -208,7 +208,7 @@ def decode_n_tokens(
 
				 
			
 
				     for i in tqdm(range(num_new_tokens)):
			
 
				         f_start = time.perf_counter()
			
 
				-        with sdpa_kernel(SDPBackend.MATH):
			
 
				+        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
			
 
				             next_token = decode_one_token(
			
 
				                 model=model,
			
 
				                 x=cur_token,
			
@@ -242,101 +242,6 @@ def decode_n_tokens(
 
				 
			
 
				     return torch.cat(new_tokens, dim=1)
			
 
				 
			
 
				-
			
 
				def decode_n_tokens_optimized(
				    model: DualARTransformer,
				    cur_token: torch.Tensor,
				    input_pos: torch.Tensor,
				    num_new_tokens: int,
				    temperature: torch.Tensor,
				    top_p: torch.Tensor,
				    top_k: int,
				    semantic_logit_bias: torch.Tensor,
				    audio_masks: torch.Tensor,
				    audio_parts: torch.Tensor,
				    decode_one_token=decode_one_token_ar,
				):
				    """Decode up to ``num_new_tokens`` steps, one ``decode_one_token`` call each.

				    After every step the first column of the decoded token is stored in a
				    fixed-width history buffer (``RAS_WIN_SIZE`` columns) that is handed
				    back to ``decode_one_token`` as ``previous_tokens``. The buffer is
				    filled through a wrapping write index rather than being shifted, so
				    its columns are not kept in chronological order.

				    Args:
				        model: Forwarded to ``decode_one_token``; ``model.config.num_codebooks``
				            and ``model.tokenizer`` are also read here.
				        cur_token: Input of the first step. After each step it is replaced
				            by the new token viewed as ``(num_codebooks + 1, -1)`` with a
				            leading axis added.
				        input_pos: Position tensor. It is incremented IN PLACE after every
				            step, so the caller's tensor is modified.
				        num_new_tokens: Upper bound on the number of decoding steps.
				        temperature: Forwarded unchanged to ``decode_one_token``.
				        top_p: Forwarded unchanged to ``decode_one_token``.
				        top_k: Forwarded unchanged to ``decode_one_token``.
				        semantic_logit_bias: Forwarded unchanged to ``decode_one_token``.
				        audio_masks: Forwarded unchanged to ``decode_one_token``.
				        audio_parts: Forwarded unchanged to ``decode_one_token``.
				        decode_one_token: Callable that performs a single decoding step.

				    Returns:
				        A list with one cloned ``decode_one_token`` result per executed
				        step, in generation order. The step that produced the end token
				        is included. The list is empty when ``num_new_tokens <= 0``.
				    """
				    # NOTE(review): the sibling ``decode_n_tokens`` returns
				    # ``torch.cat(new_tokens, dim=1)`` while this function returns the raw
				    # list. Callers that swap one for the other must account for that;
				    # confirm which form the call site expects.
				    num_streams = model.config.num_codebooks + 1

				    # History buffer handed to ``decode_one_token`` on every step.
				    previous_tokens = torch.zeros(
				        (num_streams, RAS_WIN_SIZE),
				        dtype=torch.int,
				        device=cur_token.device,
				    )
				    history_len = previous_tokens.size(1)
				    # The first write lands in the last column, then wraps to column 0.
				    write_idx = history_len - 1

				    # Loop-invariant: resolve the end-of-message token id once.
				    im_end_id = model.tokenizer.get_token_id(IM_END_TOKEN)

				    new_tokens = []
				    for _ in range(num_new_tokens):
				        # Restrict scaled-dot-product attention to the flash-attention
				        # backend for the duration of this step.
				        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
				            next_token = decode_one_token(
				                model=model,
				                x=cur_token,
				                input_pos=input_pos,
				                previous_tokens=previous_tokens,
				                temperature=temperature,
				                top_p=top_p,
				                top_k=top_k,
				                semantic_logit_bias=semantic_logit_bias,
				                audio_masks=audio_masks,
				                audio_parts=audio_parts,
				            ).clone()

				        input_pos += 1

				        # One view serves both the next-step input and the history write.
				        next_token_2d = next_token.view(num_streams, -1)
				        cur_token = next_token_2d.unsqueeze(0)

				        # Overwrite the slot at the write index instead of shifting the
				        # whole window, then advance the index with wrap-around.
				        previous_tokens[:, write_idx] = next_token_2d[:, 0]
				        write_idx = (write_idx + 1) % history_len

				        new_tokens.append(next_token)

				        # Stop once the first stream emits the end-of-message token.
				        if cur_token[0, 0, -1] == im_end_id:
				            break

				    return new_tokens
			
 
				-
			
 
				-
			
 
				 @torch.no_grad()
			
 
				 @torch.inference_mode()
			
 
				 def generate(
			
@@ -476,7 +381,7 @@ def generate(
 
				     # =========================
			
 
				     previous_tokens[:, -1, :] = first_token.view(codebook_dim)
			
 
				 
			
 
				-    x = decode_n_tokens_optimized(
			
 
				+    x = decode_n_tokens(
			
 
				         model,
			
 
				         first_token.view(1, codebook_dim, -1),
			
 
				         input_pos,