Преглед изворни кода

Add utilities & whisper vq model

Lengyue пре 2 година
родитељ
комит
a003e5a390

+ 1 - 0
.gitignore

@@ -6,3 +6,4 @@ __pycache__
 /data
 /*.test.sh
 *.filelist
+filelists

+ 27 - 0
preparing_data/split_filelist.py

@@ -0,0 +1,27 @@
+from pathlib import Path
+import click
+import random
+from loguru import logger
+
+@click.command()
+@click.argument('list-file', type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path))
+@click.option('--train-proportion', '-p', type=float, default=0.95)
+def main(list_file, train_proportion):
+    lines = list_file.read_text().splitlines()
+    logger.info(f'Found {len(lines)} lines in {list_file}')
+
+    random.shuffle(lines)
+
+    train_size = int(len(lines) * train_proportion)
+
+    train_file = list_file.with_suffix(f'.train{list_file.suffix}')
+    train_file.write_text('\n'.join(lines[:train_size]))
+
+    test_file = list_file.with_suffix(f'.test{list_file.suffix}')
+    test_file.write_text('\n'.join(lines[train_size:]))
+
+    logger.info(f'Wrote {len(lines[:train_size])} lines to {train_file}')
+    logger.info(f'Wrote {len(lines[train_size:])} lines to {test_file}')
+
+if __name__ == '__main__':
+    main()

+ 1 - 0
requirements.txt

@@ -8,3 +8,4 @@ tensorboard>=2.14.1
 natsort>=8.4.0
 einops>=0.7.0
 librosa>=0.10.1
+vector-quantize-pytorch>=1.9.18

+ 1 - 1
speech_lm/configs/pretrain.yaml

@@ -37,7 +37,7 @@ tokenizer:
 # This is a 300 billion seen token schedule
 schedule:
   max_length: 1024
-  batch_size: 64  # 128 * 4 = 512
+  batch_size: 128  # 128 * 4 = 512
   micro_batch_size: 8
   max_steps: 100000
   save_interval: 5000

+ 33 - 15
speech_lm/configs/hubert_vq.yaml → speech_lm/configs/whisper_vq.yaml

@@ -1,5 +1,5 @@
 paths:
-  run_dir: results/hubert-vq
+  run_dir: results/whisper-vq
   checkpoint_dir: ${paths.run_dir}/checkpoints
 
 hydra:
@@ -23,25 +23,35 @@ trainer:
     version: null
 
 model:
-  _target_: speech_lm.models.hubert_vq.HubertVQDistill
-  model_name_or_path: facebook/hubert-large-ls960-ft
-  vq_layer: -4
+  _target_: speech_lm.models.whisper_vq.WhisperVQ
+  model_name_or_path: "openai/whisper-medium"
+
+  # Quantization
+  codebook_dim: 32
   codebook_size: 4096
-  trainable_layers_before_vq: 2
-  trainable_layers_after_vq: 2
-  vq_loss_weight: 1.0
+  codebook_decay: 0.9
+  threshold_ema_dead_code: 0
+  use_cosine_similarity: true
+  downsample: true
+
+  # Attention
+  post_attention_depth: 2
 
 schedule:
-  batch_size: 32
-  micro_batch_size: 32
-  max_steps: 10000
+  batch_size: 64
+  micro_batch_size: 64
+  max_steps: 1000000
   save_interval: 2000
   gradient_accumulation_steps: "${eval: ${schedule.batch_size} // ${schedule.micro_batch_size}}"
-  clip_grad_norm: 1.0
+  clip_grad_norm: 2.0
+
+train_dataset:
+  _target_: speech_lm.datasets.whisper_vq.WhisperVQDataset
+  filelist: filelists/whisper-vq.train.train.filelist
 
-dataset:
-  _target_: speech_lm.datasets.hubert_vq.HubertVQDataset
-  filelist: libritts-r.filelist
+valid_dataset:
+  _target_: speech_lm.datasets.whisper_vq.WhisperVQDataset
+  filelist: filelists/whisper-vq.train.test.filelist
 
 train_dataloader:
   _target_: torch.utils.data.DataLoader
@@ -49,7 +59,15 @@ train_dataloader:
   batch_size: ${schedule.micro_batch_size}
   num_workers: 4
   collate_fn:
-    _target_: speech_lm.datasets.hubert_vq.HubertVQCollator
+    _target_: speech_lm.datasets.whisper_vq.WhisperVQCollator
+
+valid_dataloader:
+  _target_: torch.utils.data.DataLoader
+  dataset: ${valid_dataset}
+  batch_size: ${schedule.micro_batch_size}
+  num_workers: 4
+  collate_fn:
+    _target_: speech_lm.datasets.whisper_vq.WhisperVQCollator
 
 optimizer:
   _target_: torch.optim.AdamW

+ 9 - 1
speech_lm/datasets/whisper_vq.py

@@ -5,7 +5,7 @@ import librosa
 import torch
 from torch.utils.data import Dataset
 from transformers import WhisperProcessor
-from whisper.audio import load_audio, log_mel_spectrogram, pad_or_trim
+from whisper.audio import HOP_LENGTH, load_audio, log_mel_spectrogram, pad_or_trim
 
 
 class WhisperVQDataset(Dataset):
@@ -25,9 +25,14 @@ class WhisperVQDataset(Dataset):
     def __getitem__(self, idx):
         file = self.files[idx]
         wav = load_audio(file)
+        wav_length = wav.shape[-1]
+        mel_length = wav_length // HOP_LENGTH + 1
+
         wav = pad_or_trim(wav)
         wav = torch.from_numpy(wav).float()
         input_features = log_mel_spectrogram(wav)
+        mel_mask = torch.zeros(input_features.shape[1], dtype=torch.float)
+        mel_mask[:mel_length] = 1
 
         input_ids = file.with_suffix(".whisper.txt").read_text().strip().split("\t")[0]
         input_ids = [int(x) for x in input_ids.split(",")]
@@ -45,6 +50,7 @@ class WhisperVQDataset(Dataset):
             "input_values": wav,
             "input_features": input_features,
             "input_ids": input_ids,
+            "mel_mask": mel_mask,
         }
 
 
@@ -61,6 +67,7 @@ class WhisperVQCollator:
         decoder_attention_mask = []
         decoder_input_ids = []
         input_features = torch.stack([x["input_features"] for x in batch])
+        encoder_attention_mask = torch.stack([x["mel_mask"] for x in batch])
 
         for data in batch:
             values_length = data["input_values"].shape[-1]
@@ -90,6 +97,7 @@ class WhisperVQCollator:
         return {
             "input_values": torch.stack(input_values),
             "input_features": input_features,
+            "encoder_attention_mask": encoder_attention_mask,
             "decoder_input_ids": decoder_input_ids[:, :-1],
             "decoder_attention_mask": decoder_attention_mask[:, :-1],
             "labels": labels[:, 1:],

+ 1 - 17
speech_lm/models/flash_whisper.py

@@ -5,12 +5,8 @@ import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch.nn import CrossEntropyLoss
-from transformers.generation.logits_process import WhisperTimeStampLogitsProcessor
 from transformers.modeling_outputs import BaseModelOutput
 from transformers.models.whisper.modeling_whisper import (
-    WHISPER_INPUTS_DOCSTRING,
-    WHISPER_START_DOCSTRING,
     WhisperAttention,
     WhisperConfig,
     WhisperDecoder,
@@ -19,23 +15,11 @@ from transformers.models.whisper.modeling_whisper import (
     WhisperEncoderLayer,
     WhisperForConditionalGeneration,
     WhisperModel,
-    WhisperPreTrainedModel,
-    _dynamic_time_warping,
-    _median_filter,
-    shift_tokens_right,
-)
-from transformers.models.whisper.tokenization_whisper import TASK_IDS, TO_LANGUAGE_CODE
-from transformers.utils import (
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    logging,
-    replace_return_docstrings,
 )
+from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "WhisperConfig"
-
 
 class FlashWhisperAttention(WhisperAttention):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

+ 0 - 289
speech_lm/models/hubert_vq.py

@@ -1,289 +0,0 @@
-from dataclasses import dataclass
-from typing import Optional
-
-import torch
-from encodec.quantization.core_vq import VectorQuantization
-from torch import nn
-from transformers import HubertModel
-
-
-class HubertVQ(nn.Module):
-    def __init__(
-        self,
-        model_name_or_path: str = "facebook/hubert-large-ls960-ft",
-        vq_layer: int = -4,  # the layer to extract the quantized features
-        codebook_size: int = 1024,
-        trainable_layers_before_vq: int = 2,
-        trainable_layers_after_vq: int = 2,
-    ):
-        super().__init__()
-
-        self.hubert = HubertModel.from_pretrained(model_name_or_path)
-        self.vq_layer = (
-            (self.hubert.config.num_hidden_layers + vq_layer)
-            if vq_layer < 0
-            else vq_layer
-        )
-        self.trainable_layers_before_vq = trainable_layers_before_vq
-        self.trainable_layers_after_vq = trainable_layers_after_vq
-
-        assert (
-            self.vq_layer >= trainable_layers_before_vq
-            and self.vq_layer
-            < self.hubert.config.num_hidden_layers - trainable_layers_after_vq
-        ), "vq_layer must be between trainable_layers_before_vq and num_hidden_layers - trainable_layers_after_vq"
-
-        # Freeze both feature extractor & lm head
-        for param in self.hubert.parameters():
-            param.requires_grad = False
-
-        # Unfreeze layers between vq_layer - trainable_layers_before_vq and vq_layer + trainable_layers_after_vq
-        for param in self.hubert.encoder.layers[
-            self.vq_layer
-            - trainable_layers_before_vq : self.vq_layer
-            + trainable_layers_after_vq
-        ].parameters():
-            param.requires_grad = True
-
-        # Quantization
-        self.quantizer_ln = nn.LayerNorm(self.hubert.config.hidden_size)
-        self.quantizer = VectorQuantization(
-            codebook_size=codebook_size,
-            dim=self.hubert.config.hidden_size,
-            kmeans_init=False,
-        )
-
-    @torch.no_grad()
-    def _get_attention_mask(
-        self, hidden_states: torch.Tensor, attention_mask: torch.Tensor
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        # compute reduced attention_mask corresponding to feature vectors
-        attention_mask = self.hubert._get_feature_vector_attention_mask(
-            hidden_states.shape[1], attention_mask
-        )
-
-        # make sure padded tokens are not attended to
-        expand_attention_mask = attention_mask.unsqueeze(-1).repeat(
-            1, 1, hidden_states.shape[2]
-        )
-        hidden_states[~expand_attention_mask] = 0
-
-        # extend attention_mask
-        attention_mask = 1.0 - attention_mask[:, None, None, :].to(
-            dtype=hidden_states.dtype
-        )
-        attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
-        attention_mask = attention_mask.expand(
-            attention_mask.shape[0],
-            1,
-            attention_mask.shape[-1],
-            attention_mask.shape[-1],
-        )
-
-        return hidden_states, attention_mask
-
-    def encode(
-        self,
-        input_values: Optional[torch.Tensor],
-        attention_mask: Optional[torch.Tensor] = None,
-        mask_time_indices: Optional[torch.FloatTensor] = None,
-    ) -> torch.Tensor:
-        with torch.no_grad():
-            # Extract features
-            extract_features = self.hubert.feature_extractor(input_values)
-            extract_features = extract_features.transpose(1, 2)
-
-            hidden_states = self.hubert.feature_projection(extract_features)
-            hidden_states = self.hubert._mask_hidden_states(
-                hidden_states, mask_time_indices=mask_time_indices
-            )
-
-            position_embeddings = self.hubert.encoder.pos_conv_embed(hidden_states)
-            hidden_states = hidden_states + position_embeddings
-
-            if attention_mask is not None:
-                # compute reduced attention_mask corresponding to feature vectors
-                hidden_states, attention_mask = self._get_attention_mask(
-                    hidden_states, attention_mask
-                )
-
-            # Only do layer norm if do_stable_layer_norm is False
-            if self.hubert.config.do_stable_layer_norm is False:
-                hidden_states = self.hubert.encoder.layer_norm(hidden_states)
-
-            hidden_states = self.hubert.encoder.dropout(hidden_states)
-
-        # Execute transformer
-        for idx, layer_module in enumerate(self.hubert.encoder.layers[: self.vq_layer]):
-            if idx < self.vq_layer - self.trainable_layers_before_vq:
-                with torch.no_grad():
-                    hidden_states = layer_module(hidden_states, attention_mask)[0]
-            else:
-                hidden_states = layer_module(hidden_states, attention_mask)[0]
-
-        return hidden_states
-
-    @torch.no_grad()
-    def decode(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if attention_mask is not None:
-            # compute reduced attention_mask corresponding to feature vectors
-            _, attention_mask = self._get_attention_mask(
-                hidden_states.clone(), attention_mask
-            )
-
-        # Execute transformer
-        for idx, layer_module in enumerate(self.hubert.encoder.layers[self.vq_layer :]):
-            if idx >= self.trainable_layers_after_vq:
-                with torch.no_grad():
-                    hidden_states = layer_module(hidden_states, attention_mask)[0]
-            else:
-                hidden_states = layer_module(hidden_states, attention_mask)[0]
-
-        with torch.no_grad():
-            # Only do layer norm if do_stable_layer_norm is False
-            if self.hubert.config.do_stable_layer_norm is False:
-                hidden_states = self.hubert.encoder.last_layer_norm(hidden_states)
-            else:
-                hidden_states = self.hubert.encoder.layer_norm(hidden_states)
-
-        return hidden_states
-
-    def forward(
-        self,
-        input_values: Optional[torch.Tensor],
-        attention_mask: Optional[torch.Tensor] = None,
-        mask_time_indices: Optional[torch.FloatTensor] = None,
-    ):
-        hidden_states = self.encode(
-            input_values,
-            attention_mask=attention_mask,
-            mask_time_indices=mask_time_indices,
-        )
-
-        # Quantize
-        hidden_states = self.quantizer_ln(hidden_states)
-        quantize, _, vq_loss = self.quantizer(hidden_states.transpose(1, 2))
-        quantize = quantize.transpose(1, 2)
-
-        # Inject position embeddings
-        with torch.no_grad():
-            position_embeddings = self.hubert.encoder.pos_conv_embed(hidden_states)
-
-        quantize = quantize + position_embeddings
-
-        # Decode
-        hidden_states = self.decode(quantize, attention_mask=attention_mask)
-
-        return hidden_states, vq_loss
-
-
-@dataclass
-class HubertVQOutput:
-    loss: torch.Tensor
-    metrics: dict[str, torch.Tensor]
-
-
-class HubertVQDistill(nn.Module):
-    def __init__(
-        self,
-        model_name_or_path: str = "facebook/hubert-large-ls960-ft",
-        vq_layer: int = -4,  # the layer to extract the quantized features
-        codebook_size: int = 1024,
-        trainable_layers_before_vq: int = 2,
-        trainable_layers_after_vq: int = 2,
-        vq_loss_weight: float = 1.0,
-    ):
-        super().__init__()
-
-        self.hubert_vq = HubertVQ(
-            model_name_or_path=model_name_or_path,
-            vq_layer=vq_layer,
-            codebook_size=codebook_size,
-            trainable_layers_before_vq=trainable_layers_before_vq,
-            trainable_layers_after_vq=trainable_layers_after_vq,
-        )
-
-        self.hubert_teacher = HubertModel.from_pretrained(model_name_or_path)
-        self.vq_loss_weight = vq_loss_weight
-
-        # Freeze teacher
-        for param in self.hubert_teacher.parameters():
-            param.requires_grad = False
-
-    def forward(
-        self,
-        input_values: Optional[torch.Tensor],
-        attention_mask: Optional[torch.Tensor] = None,
-        mask_time_indices: Optional[torch.FloatTensor] = None,
-    ) -> HubertVQOutput:
-        hidden_states, vq_loss = self.hubert_vq(
-            input_values,
-            attention_mask=attention_mask,
-            mask_time_indices=mask_time_indices,
-        )
-
-        # Teacher
-        with torch.no_grad():
-            teacher_hidden_states = self.hubert_teacher(
-                input_values,
-                attention_mask=attention_mask,
-                mask_time_indices=mask_time_indices,
-            ).last_hidden_state
-
-        distill_loss = torch.nn.functional.mse_loss(
-            hidden_states, teacher_hidden_states
-        )
-
-        loss = distill_loss + vq_loss * self.vq_loss_weight
-
-        metrics = {
-            "distill_loss": distill_loss,
-            "vq_loss": vq_loss,
-        }
-
-        return HubertVQOutput(loss=loss, metrics=metrics)
-
-
-if __name__ == "__main__":
-    from datasets import load_dataset
-    from transformers import Wav2Vec2Tokenizer
-
-    processor = Wav2Vec2Tokenizer.from_pretrained("facebook/hubert-large-ls960-ft")
-    model = HubertVQ()
-    model.train()
-    print("Loaded model")
-
-    optim = torch.optim.Adam(model.parameters(), lr=1e-4)
-
-    gt_hubert = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
-    gt_hubert.train()
-    print("Loaded ground truth model")
-
-    ds = load_dataset(
-        "patrickvonplaten/librispeech_asr_dummy", "clean", split="validation"
-    )
-    print("Loaded dataset")
-
-    input_values = processor(
-        ds[0]["audio"]["array"], return_tensors="pt"
-    )  # Batch size 1
-
-    optim.zero_grad()
-    # hidden_states = model.decode(model.encode(**input_values))
-    hidden_states, vq_loss = model(**input_values)
-    print(hidden_states, vq_loss)
-
-    gt = gt_hubert(**input_values).last_hidden_state
-
-    loss = torch.nn.functional.mse_loss(hidden_states, gt)
-    print(loss)
-
-    total_loss = loss + vq_loss
-    total_loss.backward()
-    optim.step()
-
-    print("Backward pass done")

+ 202 - 0
speech_lm/models/whisper_vq.py

@@ -0,0 +1,202 @@
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+from vector_quantize_pytorch import VectorQuantize
+from torch import nn
+from speech_lm.models.flash_whisper import (
+    FlashWhisperForConditionalGeneration,
+    FlashWhisperEncoderLayer,
+)
+
+
+@dataclass
+class WhisperVQOutput:
+    """Result of WhisperVQ.forward: the training objective plus log scalars."""
+
+    # Total optimization objective (vq_loss + student_ce_loss + kl_loss).
+    loss: torch.Tensor
+    # Individual terms for logging: vq_loss, student_ce_loss,
+    # teacher_ce_loss, kl_loss.
+    metrics: dict[str, torch.Tensor]
+
+class WhisperVQ(nn.Module):
+    def __init__(
+        self,
+        model_name_or_path: str = "openai/whisper-medium",
+        # Quantization
+        codebook_dim: int = 32,
+        codebook_size: int = 4096,
+        codebook_decay: float = 0.9,
+        threshold_ema_dead_code: int = 0,
+        use_cosine_similarity: bool = True,
+        downsample: bool = True,
+        # Attention
+        post_attention_depth: int = 2,
+    ):
+        super().__init__()
+
+        self.whisper = FlashWhisperForConditionalGeneration.from_pretrained(
+            model_name_or_path
+        )
+
+        # Freeze Whisper
+        for param in self.whisper.parameters():
+            param.requires_grad = False
+
+        # Store vars
+        self.downsample = downsample
+        self.codebook_dim = codebook_dim
+        self.codebook_size = codebook_size
+
+        # Pre-quantization
+        whisper_config = self.whisper.model.config
+        encoder_width = whisper_config.encoder_attention_heads * 64
+
+        self.pre_ln = nn.LayerNorm(encoder_width)
+        self.pre_mlp = nn.Sequential(
+            nn.Linear(encoder_width, whisper_config.encoder_ffn_dim),
+            nn.GELU(),
+            nn.Linear(whisper_config.encoder_ffn_dim, encoder_width),
+        )
+
+        # Quantization
+        self.quantizer = VectorQuantize(
+            dim=encoder_width,
+            codebook_size=codebook_size,
+            codebook_dim=codebook_dim,
+            decay=codebook_decay,
+            commitment_weight=1.0,
+            threshold_ema_dead_code=threshold_ema_dead_code,
+            use_cosine_sim=use_cosine_similarity,
+        )
+        self.pad_embedding = nn.Parameter(torch.randn(encoder_width))
+
+        # Post-quantization
+        self.post_positional_embedding = nn.Embedding(
+            whisper_config.max_source_positions, encoder_width
+        )
+        self.post_attention = nn.Sequential(
+            *[
+                FlashWhisperEncoderLayer(
+                    config=whisper_config,
+                )
+                for _ in range(post_attention_depth)
+            ]
+        )
+        self.post_ln = nn.LayerNorm(encoder_width)
+
+    def encode(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if attention_mask is not None:
+            assert attention_mask.ndim == 2, "Attention mask must be 2D"
+        
+        # Whisper will downsample by 2
+        attention_mask = attention_mask[:, ::2]
+
+        with torch.no_grad():
+            hidden_states = self.whisper.model.encoder(
+                input_features,
+            ).last_hidden_state
+
+            x = hidden_states
+            if self.downsample:
+                x = x.reshape(x.shape[0], x.shape[1] // 2, 2, x.shape[2]).mean(dim=2)
+                attention_mask = attention_mask[:, ::2]
+
+        x = x + self.pre_mlp(self.pre_ln(x))
+        quantized, indices, loss = self.quantizer(x, mask=attention_mask.bool())
+
+        # Fill masked positions with pad embedding
+        if attention_mask is not None:
+            quantized[attention_mask == 0] = self.pad_embedding
+
+        return quantized, indices, loss, hidden_states
+
+    @torch.no_grad()
+    def decode(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        # Upsample
+        if self.downsample:
+            hidden_states = hidden_states.repeat_interleave(2, dim=1)
+
+        # Inject position embeddings
+        positions = torch.arange(0, hidden_states.shape[1], dtype=torch.long, device=hidden_states.device)
+        x = hidden_states + self.post_positional_embedding(positions)
+
+        # Decode
+        for layer in self.post_attention:
+            x = layer(x, None, None)[0]
+        hidden_states = self.post_ln(hidden_states)
+
+        return hidden_states
+
+    def forward(
+        self,
+        input_features: torch.Tensor,
+        encoder_attention_mask: torch.Tensor,
+        decoder_input_ids: torch.Tensor,
+        decoder_attention_mask: torch.Tensor,
+        labels: torch.Tensor,
+        # Audio, not used here
+        input_values: Optional[torch.Tensor] = None,
+    ) -> WhisperVQOutput:
+        quantize, _, vq_loss, teacher_hidden_states = self.encode(
+            input_features=input_features,
+            attention_mask=encoder_attention_mask,
+        )
+        vq_hidden_states = self.decode(quantize)
+
+        # student cross entropy loss
+        outputs = self.whisper(
+            encoder_outputs=(vq_hidden_states,),
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            labels=labels,
+        )
+        student_ce_loss = outputs.loss
+        student_logits = outputs.logits
+
+        # teacher cross entropy loss
+        with torch.no_grad():
+            outputs = self.whisper(
+                encoder_outputs=(teacher_hidden_states,),
+                decoder_input_ids=decoder_input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                labels=labels,
+            )
+            teacher_ce_loss = outputs.loss
+            teacher_logits = outputs.logits
+
+        # KL divergence
+        kl_loss = nn.functional.kl_div(
+            nn.functional.log_softmax(student_logits, dim=-1),
+            nn.functional.softmax(teacher_logits, dim=-1),
+            reduction="batchmean",
+        )
+
+        loss = vq_loss + student_ce_loss + kl_loss
+
+        return WhisperVQOutput(loss=loss, metrics={
+            "vq_loss": vq_loss,
+            "student_ce_loss": student_ce_loss,
+            "teacher_ce_loss": teacher_ce_loss,
+            "kl_loss": kl_loss,
+        })
+
+
+if __name__ == "__main__":
+    from transformers import WhisperProcessor
+    from speech_lm.datasets.whisper_vq import WhisperVQDataset, WhisperVQCollator
+    from torch.utils.data import DataLoader
+
+    processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
+    model = WhisperVQ()
+
+    ds = WhisperVQDataset("filelists/whisper-vq.train.test.filelist", "openai/whisper-medium")
+    loader = DataLoader(ds, batch_size=8, collate_fn=WhisperVQCollator())
+
+    for batch in loader:
+        output = model(**batch)
+        print(output)
+        break