Przeglądaj źródła

remove unused configs

Lengyue 2 lat temu
rodzic
commit
49f84853c4

+ 0 - 63
fish_speech/configs/llama_finetune.yaml

@@ -1,63 +0,0 @@
-defaults:
-  - base
-  - _self_
-
-project: llama_finetune
-
-# Lightning Trainer
-trainer:
-  accumulate_grad_batches: 2
-  gradient_clip_val: 1.0
-  gradient_clip_algorithm: 'norm'
-
-# Dataset Configuration
-tokenizer:
-  _target_: transformers.AutoTokenizer.from_pretrained
-  pretrained_model_name_or_path: fishaudio/speech-lm-300m
-  revision: text-pretrain-10k
-
-# Dataset Configuration
-train_dataset:
-  - _target_: fish_speech.datasets.text.StreamTextDataset
-    repo: fishaudio/cn-hubert-25hz-vq
-    prefix: 'data/train'
-
-val_dataset:
-  _target_: fish_speech.datasets.text.StreamTextDataset
-  repo: fishaudio/cn-hubert-25hz-vq
-  prefix: 'data/test'
-
-data:
-  _target_: fish_speech.datasets.text.TextDataModule
-  train_dataset: ${train_dataset}
-  val_dataset: ${val_dataset}
-  num_workers: 4
-  batch_size: 8
-  tokenizer: ${tokenizer}
-
-# Model Configuration
-model:
-  _target_: fish_speech.models.text2semantic.TextToSemantic
-
-  model:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: fishaudio/speech-lm-300m
-    revision: text-pretrain-10k
-
-  optimizer:
-    _target_: torch.optim.AdamW
-    _partial_: true
-    lr: 1e-4
-    weight_decay: 0.1
-    betas: [0.9, 0.95]
-    eps: 1e-5
-
-  lr_scheduler:
-    _target_: torch.optim.lr_scheduler.LambdaLR
-    _partial_: true
-    lr_lambda:
-      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
-      _partial_: true
-      num_warmup_steps: 2000
-      num_training_steps: ${trainer.max_steps}
-      final_lr_ratio: 0.1

+ 0 - 73
fish_speech/configs/llama_pretrain.yaml

@@ -1,73 +0,0 @@
-defaults:
-  - base
-  - _self_
-
-project: llama_pretrain
-
-# Say we want a 3 trillion seen token schedule
-# 3e12 / 1024 / 512 / 8 = 715255
-# But we use a 100k steps schedule here to save time
-# This is a 400 billion seen token schedule:
-# 1024 * 512 * 8 * 100000 = 419_430_400_000
-
-# Lightning Trainer
-trainer:
-  accumulate_grad_batches: 64
-  gradient_clip_val: 1.0
-  gradient_clip_algorithm: 'norm'
-  num_nodes: 1
-  limit_val_batches: 100 # 100 batches for validation
-
-# Dataset Configuration
-tokenizer:
-  _target_: transformers.AutoTokenizer.from_pretrained
-  pretrained_model_name_or_path: fishaudio/speech-lm-300m
-  revision: init
-
-# Dataset Configuration
-dataset:
-  _target_: fish_speech.datasets.text.InterleaveDataset
-  datasets:
-    - _target_: fish_speech.datasets.text.StreamTextDataset
-      prefix: 'en/'
-    - _target_: fish_speech.datasets.text.StreamTextDataset
-      prefix: 'zh/'
-    - _target_: fish_speech.datasets.text.StreamTextDataset
-      prefix: 'ja/'
-  probabilities: [0.4, 0.3, 0.3]
-  seed: 42
-
-data:
-  _target_: fish_speech.datasets.text.TextDataModule
-  train_dataset: ${dataset}
-  val_dataset: ${dataset}
-  num_workers: 4
-  batch_size: 8
-  tokenizer: ${tokenizer}
-
-# Model Configuration
-model:
-  _target_: fish_speech.models.text2semantic.TextToSemantic
-
-  model:
-    _target_: transformers.AutoModelForCausalLM.from_pretrained
-    pretrained_model_name_or_path: fishaudio/speech-lm-300m
-    revision: init
-
-  optimizer:
-    _target_: torch.optim.AdamW
-    _partial_: true
-    lr: 3e-4
-    weight_decay: 0.1
-    betas: [0.9, 0.95]
-    eps: 1e-5
-
-  lr_scheduler:
-    _target_: torch.optim.lr_scheduler.LambdaLR
-    _partial_: true
-    lr_lambda:
-      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
-      _partial_: true
-      num_warmup_steps: 2000
-      num_training_steps: ${trainer.max_steps}
-      final_lr_ratio: 0.1

+ 10 - 4
fish_speech/configs/text2semantic.yaml → fish_speech/configs/text2semantic_finetune.yaml

@@ -2,15 +2,15 @@ defaults:
   - base
   - _self_
 
-project: text2semantic_400m
-max_length: 1024
+project: text2semantic_400m_multi
+max_length: 4096
 
 # Lightning Trainer
 trainer:
   accumulate_grad_batches: 2
   gradient_clip_val: 1.0
   gradient_clip_algorithm: 'norm'
-  max_steps: 1_000_000
+  max_steps: 10000
   precision: bf16-true
   limit_val_batches: 10
 
@@ -55,7 +55,8 @@ model:
       dim: 1024
       rope_base: 10000
       norm_eps: 1e-5
-      num_codebooks: 0  # single codebook
+      num_codebooks: 4  # four codebooks (multi-codebook VQ)
+      codebook_size: 168 # entries per codebook, incl. special tokens (NOTE: original comment said "160 + 2", which sums to 162, not 168 — verify intended base size)
 
   optimizer:
     _target_: torch.optim.AdamW
@@ -74,3 +75,8 @@ model:
       num_warmup_steps: 2000
       num_training_steps: ${trainer.max_steps}
       final_lr_ratio: 0.1
+
+# Callbacks
+callbacks:
+  model_checkpoint:
+    every_n_train_steps: 1000

+ 0 - 0
fish_speech/configs/text2semantic_multi.yaml → fish_speech/configs/text2semantic_pretrain.yaml


+ 0 - 145
fish_speech/configs/vq_naive.yaml

@@ -1,145 +0,0 @@
-defaults:
-  - base
-  - _self_
-
-project: vq_naive
-
-# Lightning Trainer
-trainer:
-  accelerator: gpu
-  devices: 1
-  strategy: ddp_find_unused_parameters_true
-  gradient_clip_val: 1.0
-  gradient_clip_algorithm: 'norm'
-  precision: bf16-mixed
-  max_steps: 100_000
-  val_check_interval: 5000
-
-sample_rate: 22050
-hop_length: 256
-num_mels: 80
-n_fft: 1024
-win_length: 1024
-segment_size: 512
-
-# Dataset Configuration
-train_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/filelist.split.train
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-  slice_frames: ${segment_size}
-
-val_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/filelist.split.valid
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-
-data:
-  _target_: fish_speech.datasets.vqgan.VQGANDataModule
-  train_dataset: ${train_dataset}
-  val_dataset: ${val_dataset}
-  num_workers: 8
-  batch_size: 128
-  val_batch_size: 16
-
-# Model Configuration
-model:
-  _target_: fish_speech.models.vqgan.VQNaive
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-
-  downsample:
-    _target_: fish_speech.models.vq_diffusion.lit_module.ConvDownSample
-    dims: ["${num_mels}", 512, 256]
-    kernel_sizes: [3, 3]
-    strides: [2, 2]
-
-  mel_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.TextEncoder
-    in_channels: 256
-    out_channels: 256
-    hidden_channels: 192
-    hidden_channels_ffn: 768
-    n_heads: 2
-    n_layers: 6
-    kernel_size: 1
-    dropout: 0.1
-    use_vae: false
-
-  vq_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.VQEncoder
-    in_channels: 256
-    vq_channels: 256
-    codebook_size: 4096
-    downsample: 1
-
-  speaker_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.SpeakerEncoder
-    in_channels: ${num_mels}
-    hidden_channels: 192
-    out_channels: 256
-    num_heads: 2
-    num_layers: 4
-    p_dropout: 0.1
-
-  decoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.TextEncoder
-    in_channels: 256
-    out_channels: ${num_mels}
-    hidden_channels: 192
-    hidden_channels_ffn: 768
-    n_heads: 2
-    n_layers: 8
-    kernel_size: 1
-    use_vae: false
-    dropout: 0.1
-    gin_channels: 256
-    speaker_cond_layer: 4
-
-  mel_transform:
-    _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
-    sample_rate: ${sample_rate}
-    n_fft: ${n_fft}
-    hop_length: ${hop_length}
-    win_length: ${win_length}
-    n_mels: ${num_mels}
-    f_min: 0
-    f_max: 8000
-
-  vocoder:
-    _target_: fish_speech.models.vqgan.modules.decoder.Generator
-    initial_channel: ${num_mels}
-    resblock: "1"
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    upsample_rates: [8, 8, 2, 2]
-    upsample_initial_channel: 512
-    upsample_kernel_sizes: [16, 16, 4, 4]
-    ckpt_path: "checkpoints/hifigan-v1-universal-22050/g_02500000"
-
-  optimizer:
-    _target_: torch.optim.AdamW
-    _partial_: true
-    lr: 1e-4
-    betas: [0.9, 0.999]
-    eps: 1e-5
-
-  lr_scheduler:
-    _target_: torch.optim.lr_scheduler.LambdaLR
-    _partial_: true
-    lr_lambda:
-      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
-      _partial_: true
-      num_warmup_steps: 1000
-      num_training_steps: ${trainer.max_steps}
-      final_lr_ratio: 0.05
-
-callbacks:
-  grad_norm_monitor:
-    sub_module: 
-      - mel_encoder
-      - vq_encoder
-      - speaker_encoder
-      - decoder

+ 0 - 145
fish_speech/configs/vq_naive_40hz.yaml

@@ -1,145 +0,0 @@
-defaults:
-  - base
-  - _self_
-
-project: vq_naive_40hz
-
-# Lightning Trainer
-trainer:
-  accelerator: gpu
-  devices: 4
-  strategy: ddp_find_unused_parameters_true
-  gradient_clip_val: 1.0
-  gradient_clip_algorithm: 'norm'
-  precision: bf16-mixed
-  max_steps: 1_000_000
-  val_check_interval: 5000
-
-sample_rate: 22050
-hop_length: 256
-num_mels: 80
-n_fft: 1024
-win_length: 1024
-segment_size: 512
-
-# Dataset Configuration
-train_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/filelist.split.train
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-  slice_frames: ${segment_size}
-
-val_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/filelist.split.valid
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-
-data:
-  _target_: fish_speech.datasets.vqgan.VQGANDataModule
-  train_dataset: ${train_dataset}
-  val_dataset: ${val_dataset}
-  num_workers: 4
-  batch_size: 32
-  val_batch_size: 16
-
-# Model Configuration
-model:
-  _target_: fish_speech.models.vqgan.VQNaive
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-
-  downsample:
-    _target_: fish_speech.models.vq_diffusion.lit_module.ConvDownSample
-    dims: ["${num_mels}", 256]
-    kernel_sizes: [3]
-    strides: [2]
-
-  mel_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.TextEncoder
-    in_channels: 256
-    out_channels: 256
-    hidden_channels: 192
-    hidden_channels_ffn: 768
-    n_heads: 2
-    n_layers: 6
-    kernel_size: 1
-    dropout: 0.1
-    use_vae: false
-
-  vq_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.VQEncoder
-    in_channels: 256
-    vq_channels: 256
-    codebook_size: 4096
-    downsample: 1
-
-  speaker_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.SpeakerEncoder
-    in_channels: ${num_mels}
-    hidden_channels: 192
-    out_channels: 256
-    num_heads: 2
-    num_layers: 4
-    p_dropout: 0.1
-
-  decoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.TextEncoder
-    in_channels: 256
-    out_channels: ${num_mels}
-    hidden_channels: 192
-    hidden_channels_ffn: 768
-    n_heads: 2
-    n_layers: 8
-    kernel_size: 1
-    use_vae: false
-    dropout: 0.1
-    gin_channels: 256
-    speaker_cond_layer: 4
-
-  mel_transform:
-    _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
-    sample_rate: ${sample_rate}
-    n_fft: ${n_fft}
-    hop_length: ${hop_length}
-    win_length: ${win_length}
-    n_mels: ${num_mels}
-    f_min: 0
-    f_max: 8000
-
-  vocoder:
-    _target_: fish_speech.models.vqgan.modules.decoder.Generator
-    initial_channel: ${num_mels}
-    resblock: "1"
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    upsample_rates: [8, 8, 2, 2]
-    upsample_initial_channel: 512
-    upsample_kernel_sizes: [16, 16, 4, 4]
-    ckpt_path: "checkpoints/hifigan-v1-universal-22050/g_02500000"
-
-  optimizer:
-    _target_: torch.optim.AdamW
-    _partial_: true
-    lr: 1e-4
-    betas: [0.9, 0.999]
-    eps: 1e-5
-
-  lr_scheduler:
-    _target_: torch.optim.lr_scheduler.LambdaLR
-    _partial_: true
-    lr_lambda:
-      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
-      _partial_: true
-      num_warmup_steps: 1000
-      num_training_steps: ${trainer.max_steps}
-      final_lr_ratio: 0.05
-
-callbacks:
-  grad_norm_monitor:
-    sub_module: 
-      - mel_encoder
-      - vq_encoder
-      - speaker_encoder
-      - decoder

+ 0 - 134
fish_speech/configs/vqgan_single.yaml

@@ -1,134 +0,0 @@
-defaults:
-  - base
-  - _self_
-
-project: vqgan_single
-
-# Lightning Trainer
-trainer:
-  accelerator: gpu
-  devices: 4
-  strategy: ddp_find_unused_parameters_true
-  precision: 32
-  max_steps: 1_000_000
-  val_check_interval: 5000
-
-sample_rate: 22050
-hop_length: 256
-num_mels: 80
-n_fft: 1024
-win_length: 1024
-segment_size: 256
-
-# Dataset Configuration
-train_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/filelist.split.train
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-  slice_frames: ${segment_size}
-
-val_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/filelist.split.valid
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-
-data:
-  _target_: fish_speech.datasets.vqgan.VQGANDataModule
-  train_dataset: ${train_dataset}
-  val_dataset: ${val_dataset}
-  num_workers: 4
-  batch_size: 32
-  val_batch_size: 4
-
-# Model Configuration
-model:
-  _target_: fish_speech.models.vqgan.VQGAN
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-  segment_size: 8192
-  freeze_hifigan: false
-
-  downsample:
-    _target_: fish_speech.models.vqgan.modules.encoders.ConvDownSampler
-    dims: ["${num_mels}", 512, 256]
-    kernel_sizes: [3, 3]
-    strides: [2, 2]
-
-  mel_encoder:
-    _target_: fish_speech.models.vqgan.modules.modules.WN
-    hidden_channels: 256
-    kernel_size: 3
-    dilation_rate: 2
-    n_layers: 6
-
-  vq_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.VQEncoder
-    in_channels: 256
-    vq_channels: 256
-    codebook_size: 4096
-    codebook_groups: 1
-    downsample: 1
-
-  speaker_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.SpeakerEncoder
-    in_channels: ${num_mels}
-    hidden_channels: 256
-    out_channels: 512
-    num_layers: 6
-
-  decoder:
-    _target_: fish_speech.models.vqgan.modules.modules.WN
-    hidden_channels: 256
-    out_channels: ${num_mels}
-    kernel_size: 3
-    dilation_rate: 2
-    n_layers: 6
-    gin_channels: 512
-
-  generator:
-    _target_: fish_speech.models.vqgan.modules.decoder.Generator
-    initial_channel: ${num_mels}
-    resblock: "1"
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    upsample_rates: [8, 8, 2, 2]
-    upsample_initial_channel: 512
-    upsample_kernel_sizes: [16, 16, 4, 4]
-    ckpt_path: "checkpoints/hifigan-v1-universal-22050/g_02500000"
-
-  discriminator:
-    _target_: fish_speech.models.vqgan.modules.discriminator.EnsembleDiscriminator
-    ckpt_path: checkpoints/hifigan-v1-universal-22050/do_02500000
-
-  mel_transform:
-    _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
-    sample_rate: ${sample_rate}
-    n_fft: ${n_fft}
-    hop_length: ${hop_length}
-    win_length: ${win_length}
-    n_mels: ${num_mels}
-    f_min: 0
-    f_max: 8000
-
-  optimizer:
-    _target_: torch.optim.AdamW
-    _partial_: true
-    lr: 2e-4
-    betas: [0.8, 0.99]
-    eps: 1e-5
-
-  lr_scheduler:
-    _target_: torch.optim.lr_scheduler.ExponentialLR
-    _partial_: true
-    gamma: 0.999999  # Estimated based on LibriTTS dataset
-
-callbacks:
-  grad_norm_monitor:
-    sub_module: 
-      - generator
-      - discriminator
-      - mel_encoder
-      - vq_encoder
-      - decoder

+ 0 - 134
fish_speech/configs/vqgan_single_2x.yaml

@@ -1,134 +0,0 @@
-defaults:
-  - base
-  - _self_
-
-project: vqgan_single_2x
-
-# Lightning Trainer
-trainer:
-  accelerator: gpu
-  devices: 4
-  strategy: ddp_find_unused_parameters_true
-  precision: 32
-  max_steps: 1_000_000
-  val_check_interval: 5000
-
-sample_rate: 22050
-hop_length: 256
-num_mels: 80
-n_fft: 1024
-win_length: 1024
-segment_size: 256
-
-# Dataset Configuration
-train_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/filelist.split.train
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-  slice_frames: ${segment_size}
-
-val_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/filelist.split.valid
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-
-data:
-  _target_: fish_speech.datasets.vqgan.VQGANDataModule
-  train_dataset: ${train_dataset}
-  val_dataset: ${val_dataset}
-  num_workers: 4
-  batch_size: 32
-  val_batch_size: 4
-
-# Model Configuration
-model:
-  _target_: fish_speech.models.vqgan.VQGAN
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-  segment_size: 8192
-  freeze_hifigan: false
-
-  downsample:
-    _target_: fish_speech.models.vqgan.modules.encoders.ConvDownSampler
-    dims: ["${num_mels}", 512, 384]
-    kernel_sizes: [3, 3]
-    strides: [2, 2]
-
-  mel_encoder:
-    _target_: fish_speech.models.vqgan.modules.modules.WN
-    hidden_channels: 384
-    kernel_size: 3
-    dilation_rate: 2
-    n_layers: 12
-
-  vq_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.VQEncoder
-    in_channels: 384
-    vq_channels: 384
-    codebook_size: 4096
-    codebook_groups: 1
-    downsample: 1
-
-  speaker_encoder:
-    _target_: fish_speech.models.vqgan.modules.encoders.SpeakerEncoder
-    in_channels: ${num_mels}
-    hidden_channels: 384
-    out_channels: 512
-    num_layers: 12
-
-  decoder:
-    _target_: fish_speech.models.vqgan.modules.modules.WN
-    hidden_channels: 384
-    out_channels: ${num_mels}
-    kernel_size: 3
-    dilation_rate: 2
-    n_layers: 12
-    gin_channels: 512
-
-  generator:
-    _target_: fish_speech.models.vqgan.modules.decoder.Generator
-    initial_channel: ${num_mels}
-    resblock: "1"
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    upsample_rates: [8, 8, 2, 2]
-    upsample_initial_channel: 512
-    upsample_kernel_sizes: [16, 16, 4, 4]
-    ckpt_path: "checkpoints/hifigan-v1-universal-22050/g_02500000"
-
-  discriminator:
-    _target_: fish_speech.models.vqgan.modules.discriminator.EnsembleDiscriminator
-    ckpt_path: checkpoints/hifigan-v1-universal-22050/do_02500000
-
-  mel_transform:
-    _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
-    sample_rate: ${sample_rate}
-    n_fft: ${n_fft}
-    hop_length: ${hop_length}
-    win_length: ${win_length}
-    n_mels: ${num_mels}
-    f_min: 0
-    f_max: 8000
-
-  optimizer:
-    _target_: torch.optim.AdamW
-    _partial_: true
-    lr: 2e-4
-    betas: [0.8, 0.99]
-    eps: 1e-5
-
-  lr_scheduler:
-    _target_: torch.optim.lr_scheduler.ExponentialLR
-    _partial_: true
-    gamma: 0.999999  # Estimated based on LibriTTS dataset
-
-callbacks:
-  grad_norm_monitor:
-    sub_module: 
-      - generator
-      - discriminator
-      - mel_encoder
-      - vq_encoder
-      - decoder