train.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. import os
  2. import sys
  3. from typing import Optional
  4. import hydra
  5. import lightning as L
  6. import pyrootutils
  7. import torch
  8. from lightning import Callback, LightningDataModule, LightningModule, Trainer
  9. from lightning.pytorch.loggers import Logger
  10. from lightning.pytorch.strategies import DDPStrategy
  11. from omegaconf import DictConfig, OmegaConf
  12. os.environ.pop("SLURM_NTASKS", None)
  13. os.environ.pop("SLURM_JOB_NAME", None)
  14. os.environ.pop("SLURM_NTASKS_PER_NODE", None)
  15. # register eval resolver and root
  16. pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
  17. # Allow TF32 on Ampere GPUs
  18. torch.set_float32_matmul_precision("high")
  19. torch.backends.cudnn.allow_tf32 = True
  20. # register eval resolver
  21. OmegaConf.register_new_resolver("eval", eval)
  22. import fish_speech.utils as utils
  23. log = utils.RankedLogger(__name__, rank_zero_only=True)
  24. @utils.task_wrapper
  25. def train(cfg: DictConfig) -> tuple[dict, dict]:
  26. """Trains the model. Can additionally evaluate on a testset, using best weights obtained during
  27. training.
  28. This method is wrapped in optional @task_wrapper decorator, that controls the behavior during
  29. failure. Useful for multiruns, saving info about the crash, etc.
  30. Args:
  31. cfg (DictConfig): Configuration composed by Hydra.
  32. Returns:
  33. Tuple[dict, dict]: Dict with metrics and dict with all instantiated objects.
  34. """ # noqa: E501
  35. # set seed for random number generators in pytorch, numpy and python.random
  36. if cfg.get("seed"):
  37. L.seed_everything(cfg.seed, workers=False)
  38. if cfg.get("deterministic"):
  39. torch.use_deterministic_algorithms(True)
  40. log.info(f"Instantiating datamodule <{cfg.data._target_}>")
  41. datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data)
  42. log.info(f"Instantiating model <{cfg.model._target_}>")
  43. model: LightningModule = hydra.utils.instantiate(cfg.model)
  44. log.info("Instantiating callbacks...")
  45. callbacks: list[Callback] = utils.instantiate_callbacks(cfg.get("callbacks"))
  46. log.info("Instantiating loggers...")
  47. logger: list[Logger] = utils.instantiate_loggers(cfg.get("logger"))
  48. log.info(f"Instantiating trainer <{cfg.trainer._target_}>")
  49. trainer: Trainer = hydra.utils.instantiate(
  50. cfg.trainer,
  51. callbacks=callbacks,
  52. logger=logger,
  53. )
  54. object_dict = {
  55. "cfg": cfg,
  56. "datamodule": datamodule,
  57. "model": model,
  58. "callbacks": callbacks,
  59. "logger": logger,
  60. "trainer": trainer,
  61. }
  62. if logger:
  63. log.info("Logging hyperparameters!")
  64. utils.log_hyperparameters(object_dict)
  65. if cfg.get("train"):
  66. log.info("Starting training!")
  67. ckpt_path = cfg.get("ckpt_path")
  68. auto_resume = False
  69. resume_ckpt_path = utils.get_latest_checkpoint(cfg.paths.ckpt_dir)
  70. if resume_ckpt_path is not None:
  71. ckpt_path = resume_ckpt_path
  72. auto_resume = True
  73. if ckpt_path is not None:
  74. log.info(f"Resuming from checkpoint: {ckpt_path}")
  75. # resume weights only is disabled for auto-resume
  76. if cfg.get("resume_weights_only") and auto_resume is False:
  77. log.info("Resuming weights only!")
  78. ckpt = torch.load(ckpt_path, map_location=model.device)
  79. if "state_dict" in ckpt:
  80. ckpt = ckpt["state_dict"]
  81. err = model.load_state_dict(ckpt, strict=False)
  82. log.info(f"Error loading state dict: {err}")
  83. ckpt_path = None
  84. trainer.fit(model=model, datamodule=datamodule, ckpt_path=ckpt_path)
  85. train_metrics = trainer.callback_metrics
  86. if cfg.get("test"):
  87. log.info("Starting testing!")
  88. ckpt_path = trainer.checkpoint_callback.best_model_path
  89. if ckpt_path == "":
  90. log.warning("Best ckpt not found! Using current weights for testing...")
  91. ckpt_path = cfg.get("ckpt_path")
  92. trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path)
  93. log.info(f"Best ckpt path: {ckpt_path}")
  94. test_metrics = trainer.callback_metrics
  95. # merge train and test metrics
  96. metric_dict = {**train_metrics, **test_metrics}
  97. return metric_dict, object_dict
  98. @hydra.main(
  99. version_base="1.3", config_path="./configs", config_name="llama_pretrain.yaml"
  100. )
  101. def main(cfg: DictConfig) -> Optional[float]:
  102. # train the model
  103. train(cfg)
  104. if __name__ == "__main__":
  105. main()