
Commit 7767fd3

Fix result transfer in multiprocessing launcher on multi-node (#15567)
* Fix result transfer in multiprocessing launcher on multi-node
* add simple test
* add comment
* update test
* changelog
* use tempfile
* fix
* assert None
* unused import
* add comment
1 parent 0886e63 commit 7767fd3

4 files changed (+57 -9 lines)


src/pytorch_lightning/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -61,6 +61,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed an issue with `WandbLogger(log_model=True|'all)` raising an error and not being able to serialize tensors in the metadata ([#15544](https://github.com/Lightning-AI/lightning/pull/15544))
 
+- Fixed model state transfer in multiprocessing launcher when running multi-node ([#15567](https://github.com/Lightning-AI/lightning/pull/15567))
+
 
 ## [1.8.0] - 2022-11-01
 
```

src/pytorch_lightning/strategies/launchers/multiprocessing.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import tempfile
 from collections import UserList
 from dataclasses import dataclass
 from multiprocessing.queues import SimpleQueue
@@ -172,13 +173,14 @@ def _collect_rank_zero_results(self, trainer: "pl.Trainer", results: Any) -> Opt
         # requires to compute the state_dict on all processes in case Metrics are present
         state_dict = trainer.lightning_module.state_dict()
 
-        if self._strategy.global_rank != 0:
+        if self._strategy.local_rank != 0:
             return None
 
         # save the last weights
         weights_path = None
         if trainer.state.fn == TrainerFn.FITTING:
-            weights_path = os.path.join(trainer.default_root_dir, ".temp.ckpt")
+            # use tempdir here to avoid race conditions because the filesystem may be shared between nodes
+            weights_path = os.path.join(tempfile.mkdtemp(), ".temp.ckpt")
         self._strategy.checkpoint_io.save_checkpoint(state_dict, weights_path)
 
         # adds the `callback_metrics` to the queue
```
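
The hunk above makes two related changes: the early return now keys on `local_rank` instead of `global_rank`, so the rank-0 process of every node hands results back to that node's main process, and the temporary weights file goes into a fresh `tempfile.mkdtemp()` directory instead of `trainer.default_root_dir`, which may live on a filesystem shared between nodes. A minimal, self-contained sketch of both points follows; the 2-nodes x 2-processes topology and the helper functions are illustrative, not Lightning code:

```python
import os
import tempfile

# Illustrative topology: 2 nodes x 2 processes per node (not Lightning code).
ranks = [{"node_rank": n, "local_rank": l, "global_rank": 2 * n + l} for n in (0, 1) for l in (0, 1)]

SHARED_ROOT = "/shared/project"  # stands in for a default_root_dir on a shared filesystem


def weights_path_old(rank):
    # old behaviour: only global rank 0 writes, so node 1's main process never receives weights
    if rank["global_rank"] != 0:
        return None
    return os.path.join(SHARED_ROOT, ".temp.ckpt")


def weights_path_new(rank):
    # new behaviour: local rank 0 on each node writes, into a unique per-process temp dir,
    # so writers on different nodes cannot clobber each other's checkpoint
    if rank["local_rank"] != 0:
        return None
    return os.path.join(tempfile.mkdtemp(), ".temp.ckpt")


print([weights_path_old(r) for r in ranks])  # ['/shared/project/.temp.ckpt', None, None, None]
print([weights_path_new(r) for r in ranks])  # a distinct temp path per node-local rank 0, None otherwise
```

Had only the rank check changed, both node-local writers would have targeted the same `.temp.ckpt` under a shared `default_root_dir`; `tempfile.mkdtemp()` sidesteps that collision without any cross-node coordination.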

tests/tests_pytorch/strategies/launchers/test_multiprocessing.py

Lines changed: 48 additions & 0 deletions
```diff
@@ -11,13 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from unittest import mock
 from unittest.mock import ANY, Mock
 
 import pytest
 import torch
 
+from lightning_lite.plugins import ClusterEnvironment
+from pytorch_lightning import Trainer
+from pytorch_lightning.demos.boring_classes import BoringModel
+from pytorch_lightning.strategies import DDPSpawnStrategy
 from pytorch_lightning.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher
+from pytorch_lightning.trainer.states import TrainerFn
 from tests_pytorch.helpers.runif import RunIf
 
 
@@ -76,3 +82,45 @@ def test_global_state_snapshot():
     assert torch.are_deterministic_algorithms_enabled()
     assert not torch.backends.cudnn.benchmark
     assert torch.initial_seed() == 123
+
+
+@pytest.mark.parametrize("trainer_fn", [TrainerFn.FITTING, "other"])
+@pytest.mark.parametrize("fake_node_rank", [0, 1])
+@pytest.mark.parametrize("fake_local_rank", [0, 1])
+def test_collect_rank_zero_results(trainer_fn, fake_node_rank, fake_local_rank, tmpdir):
+    """Tests that the spawn strategy transfers the new weights to the main process and deletes the temporary
+    file."""
+    model = Mock(wraps=BoringModel(), spec=BoringModel)
+    fake_global_rank = 2 * fake_node_rank + fake_local_rank
+
+    cluster_environment = Mock(spec=ClusterEnvironment)
+    cluster_environment.world_size.return_value = 4
+    cluster_environment.node_rank.return_value = fake_node_rank
+    cluster_environment.local_rank.return_value = fake_local_rank
+    cluster_environment.global_rank.return_value = fake_global_rank
+
+    strategy = DDPSpawnStrategy(cluster_environment=cluster_environment)
+    strategy._local_rank = fake_local_rank
+
+    launcher = _MultiProcessingLauncher(strategy=strategy)
+    trainer = Trainer(default_root_dir=tmpdir, strategy=strategy)
+
+    assert strategy.node_rank == fake_node_rank
+    assert strategy.local_rank == fake_local_rank
+    assert strategy.global_rank == fake_global_rank
+
+    trainer.strategy.connect(model)
+    trainer.state.fn = trainer_fn  # pretend we are in a particular trainer state
+
+    spawn_output = launcher._collect_rank_zero_results(trainer, {})
+
+    model.state_dict.assert_called_once()
+    is_fitting = trainer_fn == TrainerFn.FITTING
+    if strategy.local_rank == 0:
+        # on local rank 0 (each node), we expect a temp checkpoint (when fitting)
+        assert not is_fitting or spawn_output.weights_path.endswith(".temp.ckpt")
+        assert not is_fitting or os.path.isfile(spawn_output.weights_path)
+        assert is_fitting or spawn_output.weights_path is None
+    else:
+        # all other ranks don't have outputs (rank 0 needs to handle the output)
+        assert spawn_output is None
```
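
The new test runs in a single process by mocking the `ClusterEnvironment` with a fixed 2-nodes x 2-processes layout, so it needs no GPUs or actual multi-node setup. One way to run just the new parametrizations locally (the invocation below is an example, with the test path taken from the diff):

```python
# Equivalent to: pytest -v -k test_collect_rank_zero_results <test file>
import pytest

pytest.main(
    [
        "-v",
        "-k",
        "test_collect_rank_zero_results",
        "tests/tests_pytorch/strategies/launchers/test_multiprocessing.py",
    ]
)
```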

tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py

Lines changed: 3 additions & 7 deletions
```diff
@@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from datetime import timedelta
-from pathlib import Path
 from unittest import mock
 from unittest.mock import Mock
 
@@ -135,23 +135,19 @@ def test_ddp_spawn_transfer_weights(tmpdir, trainer_fn):
     trainer = Trainer(default_root_dir=tmpdir, strategy=strategy)
     trainer.strategy.connect(model)
     trainer.state.fn = trainer_fn  # pretend we are in a particular trainer state
-    temp_file = Path(tmpdir, ".temp.ckpt")
 
-    assert not temp_file.exists()
     spawn_output = strategy._launcher._collect_rank_zero_results(trainer, {})
 
     model.state_dict.assert_called_once()
     if trainer_fn == TrainerFn.FITTING:
-        assert spawn_output.weights_path == str(temp_file)
-        assert temp_file.exists()
+        assert spawn_output.weights_path.endswith(".temp.ckpt")
+        assert os.path.isfile(spawn_output.weights_path)
     else:
         assert spawn_output.weights_path is None
-        assert not temp_file.exists()
 
     # <-- here would normally be the multiprocessing boundary
     strategy._launcher._recover_results_in_main_process(spawn_output, trainer)
     assert model.load_state_dict.call_count == int(spawn_output.weights_path is not None)
-    assert not temp_file.exists()
 
 
 @mock.patch("torch.distributed.init_process_group")
```
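
For context on the final assertions: after the simulated multiprocessing boundary, the launcher is expected to load the saved weights back into the module on the main process and discard the temporary checkpoint, which is why the exact-path and `temp_file.exists()` bookkeeping could be dropped. Below is a minimal stand-in for that round trip using plain `torch.save`/`torch.load`; it mirrors the contract the test relies on, not the launcher's actual implementation:

```python
import os
import tempfile

import torch
from torch import nn

model = nn.Linear(4, 2)

# worker side (local rank 0): save the trained weights to a unique temp location
weights_path = os.path.join(tempfile.mkdtemp(), ".temp.ckpt")
torch.save(model.state_dict(), weights_path)

# <-- here would normally be the multiprocessing boundary

# main-process side: restore the weights, then clean up the temporary file
model.load_state_dict(torch.load(weights_path))
os.remove(weights_path)
assert not os.path.isfile(weights_path)
```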
