Commit f142ce0

justusschock authored and Borda committed
Fix restarting attribute for lr finder (#15620)
(cherry picked from commit 15184c6)
1 parent 3ccfb80 commit f142ce0

4 files changed, 64 insertions(+), 8 deletions(-)

src/pytorch_lightning/CHANGELOG.md

Lines changed: 7 additions & 1 deletion
@@ -6,9 +6,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [1.8.4] - 2022-12-08
 
+### Changed
+
 - Direct support for compiled models ([#15922](https://github.com/Lightning-AI/lightning/pull/15922))
+
+### Fixed
+
 - Fixed issue with unsupported torch.inference_mode() on hpu backends ([#15918](https://github.com/Lightning-AI/lightning/pull/15918))
-- Fix LRScheduler import for PyTorch 2.0 ([#15940](https://github.com/Lightning-AI/lightning/pull/15940))
+- Fixed LRScheduler import for PyTorch 2.0 ([#15940](https://github.com/Lightning-AI/lightning/pull/15940))
+- Fixed `fit_loop.restarting` to be `False` for lr finder ([#15620](https://github.com/Lightning-AI/lightning/pull/15620))
 
 
 ## [1.8.3] - 2022-11-22

src/pytorch_lightning/callbacks/lr_finder.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ def __init__(
         max_lr: float = 1,
         num_training_steps: int = 100,
         mode: str = "exponential",
-        early_stop_threshold: float = 4.0,
+        early_stop_threshold: Optional[float] = 4.0,
         update_attr: bool = False,
     ) -> None:
         mode = mode.lower()
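The widened `Optional[float]` annotation reflects behaviour the new test relies on: passing `early_stop_threshold=None` disables the diverging-loss early stop during the sweep. A minimal usage sketch, assuming a `LightningModule` named `model` that exposes a `self.learning_rate` attribute (only the callback arguments come from this diff, the rest is illustrative setup):

# Minimal sketch (assumed setup, not part of the diff): run the LR finder callback
# with early stopping disabled, as the new test below does.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateFinder

lr_finder_cb = LearningRateFinder(
    num_training_steps=100,     # length of the sweep, as in the signature above
    early_stop_threshold=None,  # None disables the diverging-loss early stop
    update_attr=True,           # write the suggested lr back to the model attribute
)
trainer = Trainer(max_epochs=3, callbacks=[lr_finder_cb])
# trainer.fit(model)  # `model` is an assumed LightningModule with `self.learning_rate`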

src/pytorch_lightning/tuner/lr_finder.py

Lines changed: 9 additions & 6 deletions
@@ -203,7 +203,7 @@ def lr_find(
     max_lr: float = 1,
     num_training: int = 100,
     mode: str = "exponential",
-    early_stop_threshold: float = 4.0,
+    early_stop_threshold: Optional[float] = 4.0,
     update_attr: bool = False,
 ) -> Optional[_LRFinder]:
     """See :meth:`~pytorch_lightning.tuner.tuning.Tuner.lr_find`"""
@@ -219,6 +219,8 @@ def lr_find(
     ckpt_path = os.path.join(trainer.default_root_dir, f".lr_find_{uuid.uuid4()}.ckpt")
     trainer.save_checkpoint(ckpt_path)
 
+    start_steps = trainer.global_step
+
     # Arguments we adjust during the lr finder, save for restoring
     params = __lr_finder_dump_params(trainer)
 
@@ -239,7 +241,7 @@ def lr_find(
     _try_loop_run(trainer, params)
 
     # Prompt if we stopped early
-    if trainer.global_step != num_training:
+    if trainer.global_step != num_training + start_steps:
         log.info(f"LR finder stopped early after {trainer.global_step} steps due to diverging loss.")
 
     # Transfer results from callback to lr finder object
@@ -263,6 +265,7 @@ def lr_find(
     # Restore initial state of model
     trainer._checkpoint_connector.restore(ckpt_path)
     trainer.strategy.remove_checkpoint(ckpt_path)
+    trainer.fit_loop.restarting = False  # reset restarting flag as checkpoint restoring sets it to True
 
     return lr_finder
 
@@ -282,7 +285,7 @@ def __lr_finder_dump_params(trainer: "pl.Trainer") -> Dict[str, Any]:
     }
 
 
-def __lr_finder_reset_params(trainer: "pl.Trainer", num_training: int, early_stop_threshold: float) -> None:
+def __lr_finder_reset_params(trainer: "pl.Trainer", num_training: int, early_stop_threshold: Optional[float]) -> None:
     from pytorch_lightning.loggers.logger import DummyLogger
 
     trainer.strategy.lr_scheduler_configs = []
@@ -293,8 +296,8 @@ def __lr_finder_reset_params(trainer: "pl.Trainer", num_training: int, early_sto
     trainer.callbacks = [_LRCallback(num_training, early_stop_threshold, progress_bar_refresh_rate=1)]
     # No logging
     trainer.logger = DummyLogger() if trainer.logger is not None else None
-    # Max step set to number of iterations
-    trainer.fit_loop.max_steps = num_training
+    # Max step set to number of iterations starting at current number of iterations
+    trainer.fit_loop.max_steps = num_training + trainer.global_step
     trainer.limit_val_batches = num_training
 
 
@@ -332,7 +335,7 @@ class _LRCallback(Callback):
     def __init__(
         self,
         num_training: int,
-        early_stop_threshold: float = 4.0,
+        early_stop_threshold: Optional[float] = 4.0,
         progress_bar_refresh_rate: int = 0,
         beta: float = 0.98,
     ):
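The substance of the fix lives in this file: `lr_find` saves a temporary checkpoint, runs a shortened fit loop, then restores that checkpoint, and restoring flips `fit_loop.restarting` to `True`, so without the new reset the subsequent real training would behave as if it were resuming from a checkpoint. The sweep budget is also made relative to the step counter so the finder works mid-training. Below is a condensed sketch of the patched flow, not the verbatim source; `_lr_sweep_sketch` is a hypothetical name, every attribute and call it uses appears in the diff above:

# Condensed sketch of the patched lr_find flow (hypothetical wrapper, not the real function).
import os
import uuid

import pytorch_lightning as pl


def _lr_sweep_sketch(trainer: "pl.Trainer", num_training: int) -> None:
    ckpt_path = os.path.join(trainer.default_root_dir, f".lr_find_{uuid.uuid4()}.ckpt")
    trainer.save_checkpoint(ckpt_path)
    start_steps = trainer.global_step                        # new: remember the current step count
    trainer.fit_loop.max_steps = num_training + start_steps  # new: budget is relative to it
    # ... _try_loop_run(trainer, params) drives the shortened fit loop here ...
    trainer._checkpoint_connector.restore(ckpt_path)         # put model and loop state back
    trainer.strategy.remove_checkpoint(ckpt_path)
    trainer.fit_loop.restarting = False                      # new: restoring flips this to True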

tests/tests_pytorch/tuner/test_lr_finder.py

Lines changed: 47 additions & 0 deletions
@@ -438,3 +438,50 @@ def test_if_lr_finder_callback_already_configured():
 
     with pytest.raises(MisconfigurationException, match="Trainer is already configured with a .* callback"):
         trainer.tune(model)
+
+
+def test_lr_finder_callback_restarting(tmpdir):
+    """Test that `LearningRateFinder` does not set restarting=True when loading checkpoint."""
+
+    num_lr_steps = 100
+
+    class MyBoringModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.learning_rate = 0.123
+
+        def on_train_batch_start(self, batch, batch_idx):
+            if getattr(self, "_expected_max_steps", None) is not None:
+                assert self.trainer.fit_loop.max_steps == self._expected_max_steps
+
+        def configure_optimizers(self):
+            return torch.optim.SGD(self.parameters(), lr=self.learning_rate)
+
+    class CustomLearningRateFinder(LearningRateFinder):
+        milestones = (1,)
+
+        def lr_find(self, trainer, pl_module) -> None:
+            pl_module._expected_max_steps = trainer.global_step + self._num_training_steps
+            super().lr_find(trainer, pl_module)
+            pl_module._expected_max_steps = None
+            assert not trainer.fit_loop.restarting
+
+        def on_train_epoch_start(self, trainer, pl_module):
+            if trainer.current_epoch in self.milestones or trainer.current_epoch == 0:
+                self.lr_find(trainer, pl_module)
+
+    model = MyBoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=3,
+        callbacks=[
+            CustomLearningRateFinder(early_stop_threshold=None, update_attr=True, num_training_steps=num_lr_steps)
+        ],
+        limit_train_batches=10,
+        limit_val_batches=0,
+        limit_test_batches=0,
+        num_sanity_val_steps=0,
+        enable_model_summary=False,
+    )
+
+    trainer.fit(model)
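The test drives the scenario that exposed the bug: a `LearningRateFinder` subclass with `milestones = (1,)` re-runs the sweep at epoch 1, when `trainer.global_step` is already above zero, then asserts that `fit_loop.max_steps` was offset by the current step count and that `fit_loop.restarting` is back to `False` after the checkpoint restore. It can be run on its own with `pytest tests/tests_pytorch/tuner/test_lr_finder.py::test_lr_finder_callback_restarting` (standard pytest node-id syntax).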
