scverse · ori-kron-wis · Jan 12, 2025 · Jan 14, 2025 · Jan 15, 2025 · Jan 15, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,8 @@ to [Semantic Versioning]. Full commit history is available in the
 
 - Add {class}`scvi.external.Decipher` for dimensionality reduction and interpretable
     representation learning in single-cell RNA sequencing data {pr}`3015`, {pr}`3091`.
+- Add an exception callback to {class}`scvi.train.callbacks` in order to save the optimal model
+    during training, instead of failing because of NaNs in gradients or loss {pr}`3159`.
 - Add {class}`scvi.external.RESOLVI` for bias correction in single-cell resolved spatial
     transcriptomics {pr}`3144`.
 

diff --git a/src/scvi/train/_callbacks.py b/src/scvi/train/_callbacks.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 import os
 import warnings
 from collections.abc import Callable
@@ -15,6 +16,7 @@
 from lightning.pytorch.callbacks import Callback, ModelCheckpoint
 from lightning.pytorch.callbacks.early_stopping import EarlyStopping
 from lightning.pytorch.utilities import rank_zero_info
+from lightning.pytorch.utilities.rank_zero import rank_prefixed_message
 
 from scvi import settings
 from scvi.model.base import BaseModelClass
@@ -27,6 +29,8 @@
 
 MetricCallable = Callable[[BaseModelClass], float]
 
+log = logging.getLogger(__name__)
+
 
 class SaveCheckpoint(ModelCheckpoint):
     """``BETA`` Saves model checkpoints based on a monitored metric.
@@ -91,6 +95,7 @@ def __init__(
             )
             kwargs.pop("save_last")
         self.load_best_on_end = load_best_on_end
+        self.loss_is_nan = False
 
         super().__init__(
             dirpath=dirpath,
@@ -161,6 +166,44 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> No
             # For scArches shapes are changed and we don't want to overwrite these changed shapes.
             pyro.get_param_store().set_state(pyro_param_store)
 
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:
+        """Record whether the training loss has become NaN, then defer to the parent hook.
+
+        Sets ``self.loss_is_nan`` to ``True`` as soon as a batch produces a NaN loss so that
+        :meth:`on_exception` can report the cause of a failure.
+
+        Parameters
+        ----------
+        trainer
+            The running trainer; forwarded unchanged to the parent hook.
+        pl_module
+            The module being trained; forwarded unchanged to the parent hook.
+        outputs
+            Output of the training step. Either the loss itself or a dict holding it under
+            the ``"loss"`` key.
+        batch
+            The current batch; forwarded unchanged to the parent hook.
+        batch_idx
+            Index of the current batch; forwarded unchanged to the parent hook.
+        """
+        loss = outputs.get("loss") if isinstance(outputs, dict) else outputs
+        # ``outputs`` may be ``None`` or a dict without a "loss" entry; ``torch.isnan`` only
+        # accepts tensors, so anything else is ignored instead of raising a ``TypeError``.
+        if isinstance(loss, torch.Tensor) and torch.isnan(loss).any():
+            self.loss_is_nan = True
+
+        # This method overrides a ``ModelCheckpoint`` hook; call the parent implementation so
+        # that its own end-of-batch checkpointing logic keeps working.
+        super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx)
+
+    def on_exception(self, trainer, pl_module, exception) -> None:
+        """Restore and save the best model after an unexpected training exception.
+
+        Any exception other than ``KeyboardInterrupt`` (for example one caused by NaN in the
+        loss or gradients) asks the trainer to stop, reloads the state dict stored at
+        ``self.best_model_path`` into ``pl_module.module`` and calls
+        ``self.on_save_checkpoint``. The explanation is kept in ``self.reason`` and the value
+        returned by ``on_save_checkpoint`` in ``self.save_path``.
+
+        Parameters
+        ----------
+        trainer
+            The running trainer; its ``should_stop`` flag is set to ``True``.
+        pl_module
+            The module being trained; ``pl_module.module`` receives the best state dict.
+        exception
+            The exception that interrupted training.
+        """
+        if isinstance(exception, KeyboardInterrupt):
+            # A user interrupt is not a training failure; leave the model untouched.
+            return
+
+        if self.loss_is_nan:
+            self.reason = (
+                "\033[31m[Warning] NaN detected in the loss. Stopping training. "
+                "Saving model....Please load it back and continue training\033[0m"
+            )
+        else:
+            self.reason = (
+                "\033[31m[Warning] Exception occurred during training (Nan or Inf gradients). "
+                "Saving model....Please load it back and continue training\033[0m"
+            )
+        trainer.should_stop = True
+
+        if not self.best_model_path:
+            # No checkpoint has been written yet, so there is no best state to restore.
+            # Loading from an empty path cannot succeed and would only raise a second error
+            # from inside this hook, hiding the exception that actually stopped training.
+            warnings.warn(
+                "Exception occurred during training before any checkpoint was saved; "
+                "no model could be restored.",
+                RuntimeWarning,
+                stacklevel=settings.warnings_stacklevel,
+            )
+            return
+
+        # Only the state dict (third item returned) is needed to roll the module back.
+        _, _, best_state_dict, _ = _load_saved_files(
+            self.best_model_path,
+            load_adata=False,
+            map_location=pl_module.module.device,
+        )
+        pl_module.module.load_state_dict(best_state_dict)
+        self.save_path = self.on_save_checkpoint(trainer)
+        print(self.reason)
+        print(f"Model saved to {self.save_path}")
+        self._log_info(trainer, self.reason, False)
+
+    @staticmethod
+    def _log_info(trainer: pl.Trainer, message: str, log_rank_zero_only: bool) -> None:
+        """Emit ``message`` at INFO level, prefixed with the process rank when distributed.
+
+        Parameters
+        ----------
+        trainer
+            Trainer whose ``world_size`` and ``global_rank`` determine the rank prefix.
+        message
+            Text to log.
+        log_rank_zero_only
+            If ``True`` and more than one process is running, only rank 0 logs the message.
+        """
+        # The rank is only meaningful (and only looked up) when several processes are running.
+        if trainer.world_size > 1:
+            rank = trainer.global_rank
+        else:
+            rank = None
+        prefixed = rank_prefixed_message(message, rank)
+
+        suppressed = log_rank_zero_only and rank is not None and rank != 0
+        if not suppressed:
+            log.info(prefixed)
+
 
 class SubSampleLabels(Callback):
     """Subsample labels."""

diff --git a/src/scvi/train/_trainer.py b/src/scvi/train/_trainer.py
@@ -72,6 +72,8 @@ class Trainer(pl.Trainer):
         and in 'max' mode it will stop when the quantity monitored has stopped increasing.
     enable_progress_bar
         Whether to enable or disable the progress bar.
+    gradient_clip_val
+        Value for gradient clipping. Gradient clipping can be enabled to avoid exploding gradients.
     progress_bar_refresh_rate
         How often to refresh progress bar (in steps). Value 0 disables progress bar.
     simple_progress_bar
@@ -107,6 +109,7 @@ def __init__(
         early_stopping_patience: int = 45,
         early_stopping_mode: Literal["min", "max"] = "min",
         enable_progress_bar: bool = True,
+        gradient_clip_val: int | float = 0,
         progress_bar_refresh_rate: int = 1,
         simple_progress_bar: bool = True,
         logger: Logger | None | bool = None,
@@ -156,6 +159,7 @@ def __init__(
             benchmark=benchmark,
             check_val_every_n_epoch=check_val_every_n_epoch,
             max_epochs=max_epochs,
+            gradient_clip_val=gradient_clip_val,
             default_root_dir=default_root_dir,
             enable_checkpointing=enable_checkpointing,
             num_sanity_val_steps=num_sanity_val_steps,