Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: nan failure during training #3159

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ to [Semantic Versioning]. Full commit history is available in the

- Add {class}`scvi.external.Decipher` for dimensionality reduction and interpretable
representation learning in single-cell RNA sequencing data {pr}`3015`, {pr}`3091`.
- Add an exception callback to {class}`scvi.train.callbacks` in order to save the optimal model during
training instead of failing because of NaNs in gradients or loss {pr}`3159`.
- Add {class}`scvi.external.RESOLVI` for bias correction in single-cell resolved spatial
transcriptomics {pr}`3144`.

Expand Down
43 changes: 43 additions & 0 deletions src/scvi/train/_callbacks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import logging
import os
import warnings
from collections.abc import Callable
Expand All @@ -15,6 +16,7 @@
from lightning.pytorch.callbacks import Callback, ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.utilities import rank_zero_info
from lightning.pytorch.utilities.rank_zero import rank_prefixed_message

from scvi import settings
from scvi.model.base import BaseModelClass
Expand All @@ -27,6 +29,8 @@

MetricCallable = Callable[[BaseModelClass], float]

log = logging.getLogger(__name__)


class SaveCheckpoint(ModelCheckpoint):
"""``BETA`` Saves model checkpoints based on a monitored metric.
Expand Down Expand Up @@ -91,6 +95,7 @@ def __init__(
)
kwargs.pop("save_last")
self.load_best_on_end = load_best_on_end
self.loss_is_nan = False

super().__init__(
dirpath=dirpath,
Expand Down Expand Up @@ -161,6 +166,44 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> No
# For scArches shapes are changed and we don't want to overwrite these changed shapes.
pyro.get_param_store().set_state(pyro_param_store)

def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:
    """Record whether the loss of the batch that just finished contains NaN.

    ``outputs`` is either the loss itself or a ``dict`` holding it under the
    ``"loss"`` key. When no tensor loss is available (``outputs`` is ``None``, or
    is a dict without ``"loss"``) the batch is ignored instead of raising.

    Sets ``self.loss_is_nan`` to ``True`` when any element of the loss is NaN;
    the flag is never reset here.
    """
    # NOTE(review): the parent checkpoint class may implement this hook as well
    # (step-based checkpointing) -- confirm whether ``super()`` should be chained.
    loss = outputs.get("loss") if isinstance(outputs, dict) else outputs
    # ``torch.isnan`` raises ``TypeError`` on non-tensor input, so only inspect
    # real tensors; a missing loss must not abort training from inside a callback.
    if isinstance(loss, torch.Tensor) and torch.isnan(loss).any():
        self.loss_is_nan = True

def on_exception(self, trainer, pl_module, exception) -> None:
    """Restore and save the best model when training fails unexpectedly.

    ``KeyboardInterrupt`` is ignored. For any other exception the method:

    1. stores a human-readable explanation in ``self.reason`` (a NaN-loss message
       when ``self.loss_is_nan`` is set, a generic message otherwise);
    2. sets ``trainer.should_stop`` to ``True``;
    3. loads the state dict found at ``self.best_model_path`` into
       ``pl_module.module`` and saves it via ``self.on_save_checkpoint``,
       storing the result in ``self.save_path``;
    4. prints and logs the explanation.

    If no best checkpoint has been recorded yet (``self.best_model_path`` is
    empty) steps 3 is skipped: there is nothing to restore, and trying to load
    from an empty path would raise inside the handler and hide the original
    training error.
    """
    if isinstance(exception, KeyboardInterrupt):
        return

    if self.loss_is_nan:
        self.reason = (
            "\033[31m[Warning] NaN detected in the loss. Stopping training. "
            "Saving model....Please load it back and continue training\033[0m"
        )
    else:
        self.reason = (
            "\033[31m[Warning] Exception occurred during training (Nan or Inf gradients). "
            "Saving model....Please load it back and continue training\033[0m"
        )
    trainer.should_stop = True

    if not self.best_model_path:
        # Failure happened before any checkpoint was written: report it, but do
        # not attempt to load/save, so the original exception stays visible.
        print(self.reason)
        print("No best checkpoint is available yet; model was not restored or saved.")
        self._log_info(trainer, self.reason, False)
        return

    # Roll the module back to the best recorded weights before saving, so the
    # saved model does not contain the state that triggered the failure.
    _, _, best_state_dict, _ = _load_saved_files(
        self.best_model_path,
        load_adata=False,
        map_location=pl_module.module.device,
    )
    pl_module.module.load_state_dict(best_state_dict)
    self.save_path = self.on_save_checkpoint(trainer)
    print(self.reason)
    print(f"Model saved to {self.save_path}")
    self._log_info(trainer, self.reason, False)

@staticmethod
def _log_info(trainer: pl.Trainer, message: str, log_rank_zero_only: bool) -> None:
    """Emit ``message`` at INFO level on the module logger, tagged with the rank.

    The rank is ``trainer.global_rank`` when ``trainer.world_size`` exceeds one and
    ``None`` otherwise; it is passed to ``rank_prefixed_message`` together with the
    message. With ``log_rank_zero_only`` set, only rank 0 (or a run where the rank
    is ``None``) writes the message.
    """
    is_distributed = trainer.world_size > 1
    current_rank = trainer.global_rank if is_distributed else None
    tagged = rank_prefixed_message(message, current_rank)

    # Stay silent only on a non-zero rank when rank-zero-only logging is requested.
    silenced = current_rank is not None and log_rank_zero_only and current_rank != 0
    if silenced:
        return
    log.info(tagged)


class SubSampleLabels(Callback):
"""Subsample labels."""
Expand Down
4 changes: 4 additions & 0 deletions src/scvi/train/_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ class Trainer(pl.Trainer):
and in 'max' mode it will stop when the quantity monitored has stopped increasing.
enable_progress_bar
Whether to enable or disable the progress bar.
gradient_clip_val
Value for gradient clipping. Gradient clipping can be enabled to avoid exploding gradients.
progress_bar_refresh_rate
How often to refresh progress bar (in steps). Value 0 disables progress bar.
simple_progress_bar
Expand Down Expand Up @@ -107,6 +109,7 @@ def __init__(
early_stopping_patience: int = 45,
early_stopping_mode: Literal["min", "max"] = "min",
enable_progress_bar: bool = True,
gradient_clip_val: int | float = 0,
progress_bar_refresh_rate: int = 1,
simple_progress_bar: bool = True,
logger: Logger | None | bool = None,
Expand Down Expand Up @@ -156,6 +159,7 @@ def __init__(
benchmark=benchmark,
check_val_every_n_epoch=check_val_every_n_epoch,
max_epochs=max_epochs,
gradient_clip_val=gradient_clip_val,
default_root_dir=default_root_dir,
enable_checkpointing=enable_checkpointing,
num_sanity_val_steps=num_sanity_val_steps,
Expand Down