From 67d5fa1715e76950d012e6983c906197b40357f9 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Mon, 24 Jun 2024 14:38:30 -0700 Subject: [PATCH 1/7] Update _atomic_save to use transaction when saving a checkpoint. This should make the save more robust to interrupts. --- src/lightning/fabric/utilities/cloud_io.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lightning/fabric/utilities/cloud_io.py b/src/lightning/fabric/utilities/cloud_io.py index af795a4801ef0..c42e3366a91a2 100644 --- a/src/lightning/fabric/utilities/cloud_io.py +++ b/src/lightning/fabric/utilities/cloud_io.py @@ -76,7 +76,10 @@ def _atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None bytesbuffer = io.BytesIO() log.debug(f"Saving checkpoint: {filepath}") torch.save(checkpoint, bytesbuffer) - with fsspec.open(filepath, "wb") as f: + + # We use a transaction here so that if the save gets interrupted, existing checkpoints are not trashed. + fs = fsspec.filesystem(filepath) + with fs.transaction, fs.open(filepath, "wb") as f: f.write(bytesbuffer.getvalue()) From 2aeb759e09aafad6aac49a09783860227b5a560f Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Mon, 24 Jun 2024 16:48:45 -0700 Subject: [PATCH 2/7] Force stringification of filepath argument. --- src/lightning/fabric/utilities/cloud_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/utilities/cloud_io.py b/src/lightning/fabric/utilities/cloud_io.py index c42e3366a91a2..01cae094d1660 100644 --- a/src/lightning/fabric/utilities/cloud_io.py +++ b/src/lightning/fabric/utilities/cloud_io.py @@ -78,7 +78,7 @@ def _atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None torch.save(checkpoint, bytesbuffer) # We use a transaction here so that if the save gets interrupted, existing checkpoints are not trashed. - fs = fsspec.filesystem(filepath) + fs, _ = fsspec.core.url_to_fs(str(filepath)) with fs.transaction, fs.open(filepath, "wb") as f: f.write(bytesbuffer.getvalue()) From f4386eb8156d3067446aff0b52ccc25199537571 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Mon, 24 Jun 2024 17:21:25 -0700 Subject: [PATCH 3/7] Use extracted urlpath to be more idiomatic --- src/lightning/fabric/utilities/cloud_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/utilities/cloud_io.py b/src/lightning/fabric/utilities/cloud_io.py index 01cae094d1660..f79fe4fbb53a3 100644 --- a/src/lightning/fabric/utilities/cloud_io.py +++ b/src/lightning/fabric/utilities/cloud_io.py @@ -78,8 +78,8 @@ def _atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None torch.save(checkpoint, bytesbuffer) # We use a transaction here so that if the save gets interrupted, existing checkpoints are not trashed. - fs, _ = fsspec.core.url_to_fs(str(filepath)) - with fs.transaction, fs.open(filepath, "wb") as f: + fs, urlpath = fsspec.core.url_to_fs(str(filepath)) + with fs.transaction, fs.open(urlpath, "wb") as f: f.write(bytesbuffer.getvalue()) From f3c4846d2e4cf34d64343eee330ae9413666eeef Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Mon, 24 Jun 2024 17:26:21 -0700 Subject: [PATCH 4/7] Update src/lightning/fabric/utilities/cloud_io.py Co-authored-by: awaelchli --- src/lightning/fabric/utilities/cloud_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/utilities/cloud_io.py b/src/lightning/fabric/utilities/cloud_io.py index f79fe4fbb53a3..49b02cd0952d1 100644 --- a/src/lightning/fabric/utilities/cloud_io.py +++ b/src/lightning/fabric/utilities/cloud_io.py @@ -77,7 +77,7 @@ def _atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None log.debug(f"Saving checkpoint: {filepath}") torch.save(checkpoint, bytesbuffer) - # We use a transaction here so that if the save gets interrupted, existing checkpoints are not trashed. + # We use a transaction here to avoid file corruption if the save gets interrupted fs, urlpath = fsspec.core.url_to_fs(str(filepath)) with fs.transaction, fs.open(urlpath, "wb") as f: f.write(bytesbuffer.getvalue()) From c433a4dbc2925027758a39c6bdf55bab93d75b3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 25 Jun 2024 10:47:33 +0200 Subject: [PATCH 5/7] add chlog --- src/lightning/fabric/CHANGELOG.md | 2 +- src/lightning/pytorch/CHANGELOG.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 37322981c503e..6155644ed6709 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- Made saving non-distributed checkpoints fully atomic ([#20011](https://github.com/Lightning-AI/pytorch-lightning/pull/20011)) - diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 08562a9eb8dca..39dd56e8e73aa 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- Made saving non-distributed checkpoints fully atomic ([#20011](https://github.com/Lightning-AI/pytorch-lightning/pull/20011)) - From faf9d1fcd714c32bc1ca8ad8e320b4608b4d26a4 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 25 Jun 2024 20:55:53 +0200 Subject: [PATCH 6/7] set seed to avoid test interactions on global random state --- tests/tests_pytorch/core/test_lightning_optimizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/core/test_lightning_optimizer.py b/tests/tests_pytorch/core/test_lightning_optimizer.py index 89e7c5ee5c5c7..dfb1c26aa6ff6 100644 --- a/tests/tests_pytorch/core/test_lightning_optimizer.py +++ b/tests/tests_pytorch/core/test_lightning_optimizer.py @@ -16,7 +16,8 @@ import pytest import torch -from lightning.pytorch import Trainer + +from lightning.pytorch import Trainer, seed_everything from lightning.pytorch.core.optimizer import LightningOptimizer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.loops.optimization.automatic import Closure @@ -239,6 +240,8 @@ def test_lightning_optimizer_automatic_optimization_lbfgs_zero_grad(tmp_path): """Test zero_grad is called the same number of times as LBFGS requires for reevaluation of the loss in automatic_optimization.""" + seed_everything(0) + class TestModel(BoringModel): def configure_optimizers(self): return torch.optim.LBFGS(self.parameters()) From 852b8c4092da9cd6da36d67654f886e10d52d65b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jun 2024 18:56:30 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_pytorch/core/test_lightning_optimizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tests_pytorch/core/test_lightning_optimizer.py b/tests/tests_pytorch/core/test_lightning_optimizer.py index dfb1c26aa6ff6..4cc079c4cef3a 100644 --- a/tests/tests_pytorch/core/test_lightning_optimizer.py +++ b/tests/tests_pytorch/core/test_lightning_optimizer.py @@ -16,7 +16,6 @@ import pytest import torch - from lightning.pytorch import Trainer, seed_everything from lightning.pytorch.core.optimizer import LightningOptimizer from lightning.pytorch.demos.boring_classes import BoringModel