# text2semantic.yaml — Hydra training config for the text2semantic model
  1. defaults:
  2. - base
  3. - _self_
  4. project: text2semantic_400m
  5. # Lightning Trainer
  6. trainer:
  7. accumulate_grad_batches: 2
  8. gradient_clip_val: 1.0
  9. gradient_clip_algorithm: 'norm'
  10. max_steps: 1_000_000
  11. precision: bf16-true
  12. limit_val_batches: 10
  13. # Dataset Configuration
  14. tokenizer:
  15. _target_: transformers.AutoTokenizer.from_pretrained
  16. pretrained_model_name_or_path: fishaudio/speech-lm-v1
  17. # Dataset Configuration
  18. train_dataset:
  19. _target_: fish_speech.datasets.text.AutoAugTextDataset
  20. tokenizer: ${tokenizer}
  21. val_dataset:
  22. _target_: fish_speech.datasets.text.AutoAugTextDataset
  23. tokenizer: ${tokenizer}
  24. data:
  25. _target_: fish_speech.datasets.text.TextDataModule
  26. train_dataset: ${train_dataset}
  27. val_dataset: ${val_dataset}
  28. num_workers: 4
  29. batch_size: 16
  30. tokenizer: ${tokenizer}
  31. max_length: 1024
  32. # Model Configuration
  33. model:
  34. _target_: fish_speech.models.text2semantic.TextToSemantic
  35. model:
  36. # ~ 130M parameters, for debug purpose
  37. _target_: fish_speech.models.text2semantic.llama.Transformer
  38. config:
  39. _target_: fish_speech.models.text2semantic.llama.ModelArgs
  40. max_seq_len: 4096
  41. vocab_size: 32312
  42. n_layer: 24
  43. n_head: 16
  44. dim: 1024
  45. rope_base: 10000
  46. norm_eps: 1e-5
  47. codebook_size: 168
  48. num_codebooks: 4
  49. optimizer:
  50. _target_: torch.optim.AdamW
  51. _partial_: true
  52. lr: 1e-4
  53. weight_decay: 0.1
  54. betas: [0.9, 0.95]
  55. eps: 1e-5
  56. lr_scheduler:
  57. _target_: torch.optim.lr_scheduler.LambdaLR
  58. _partial_: true
  59. lr_lambda:
  60. _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
  61. _partial_: true
  62. num_warmup_steps: 2000
  63. num_training_steps: ${trainer.max_steps}
  64. final_lr_ratio: 0.1