Skip to content

Commit cdb7006

Browse files
Fix ddp_spawn -> ddp fallback logic when on LSF cluster (#15657)
Co-authored-by: awaelchli <[email protected]>
1 parent 61ee3fa commit cdb7006

File tree

6 files changed

+81 −35 lines changed

src/lightning_lite/CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
4040

4141
### Fixed
4242

43-
-
43+
44+
- Fixed the automatic fallback from `LightningLite(strategy="ddp_spawn", ...)` to `LightningLite(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
45+
4446

4547

4648
## [1.8.1] - 2022-11-10

src/lightning_lite/connector.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,10 @@ def _check_strategy_and_fallback(self) -> None:
403403
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
404404

405405
if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
406-
TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
406+
TorchElasticEnvironment.detect()
407+
or KubeflowEnvironment.detect()
408+
or SLURMEnvironment.detect()
409+
or LSFEnvironment.detect()
407410
):
408411
strategy_flag = "ddp"
409412
if strategy_flag == "dp" and self._accelerator_flag == "cpu":

src/pytorch_lightning/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
7777
- Fixed manual optimization raising `AttributeError` with Bagua Strategy ([#12534](https://github.com/PyTorchLightning/pytorch-lightning/issues/12534))
7878
- Fixed the import of `pytorch_lightning` causing a warning 'Redirects are currently not supported in Windows or MacOs' ([#15610](https://github.com/PyTorchLightning/pytorch-lightning/issues/15610))
7979

80+
- Fixed the automatic fallback from `Trainer(strategy="ddp_spawn", ...)` to `Trainer(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
81+
8082

8183
## [1.8.0] - 2022-11-01
8284

src/pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,10 @@ def _check_strategy_and_fallback(self) -> None:
614614
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
615615

616616
if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
617-
TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
617+
TorchElasticEnvironment.detect()
618+
or KubeflowEnvironment.detect()
619+
or SLURMEnvironment.detect()
620+
or LSFEnvironment.detect()
618621
):
619622
strategy_flag = "ddp"
620623
if strategy_flag == "dp" and self._accelerator_flag == "cpu":

tests/tests_lite/test_connector.py

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from lightning_lite.plugins.environments import (
3434
KubeflowEnvironment,
3535
LightningEnvironment,
36+
LSFEnvironment,
3637
SLURMEnvironment,
3738
TorchElasticEnvironment,
3839
)
@@ -201,24 +202,41 @@ class Strat(DDPStrategy):
201202
assert connector.strategy is strategy
202203

203204

204-
@mock.patch.dict(
205-
os.environ,
206-
{
207-
"SLURM_NTASKS": "2",
208-
"SLURM_NTASKS_PER_NODE": "1",
209-
"SLURM_JOB_NAME": "SOME_NAME",
210-
"SLURM_NODEID": "0",
211-
"LOCAL_RANK": "0",
212-
"SLURM_PROCID": "0",
213-
"SLURM_LOCALID": "0",
214-
},
205+
@pytest.mark.parametrize(
206+
"env_vars,expected_environment",
207+
[
208+
(
209+
{
210+
"SLURM_NTASKS": "2",
211+
"SLURM_NTASKS_PER_NODE": "1",
212+
"SLURM_JOB_NAME": "SOME_NAME",
213+
"SLURM_NODEID": "0",
214+
"LOCAL_RANK": "0",
215+
"SLURM_PROCID": "0",
216+
"SLURM_LOCALID": "0",
217+
},
218+
SLURMEnvironment,
219+
),
220+
(
221+
{
222+
"LSB_JOBID": "1",
223+
"LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
224+
"JSM_NAMESPACE_LOCAL_RANK": "1",
225+
"JSM_NAMESPACE_SIZE": "20",
226+
"JSM_NAMESPACE_RANK": "1",
227+
},
228+
LSFEnvironment,
229+
),
230+
],
215231
)
216-
@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0)
217-
def test_dist_backend_accelerator_mapping(*_):
218-
connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
219-
assert isinstance(connector.accelerator, CPUAccelerator)
220-
assert isinstance(connector.strategy, DDPStrategy)
221-
assert connector.strategy.local_rank == 0
232+
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
233+
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
234+
def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
235+
with mock.patch.dict(os.environ, env_vars, clear=True):
236+
trainer = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
237+
assert isinstance(trainer.accelerator, CPUAccelerator)
238+
assert isinstance(trainer.strategy, DDPStrategy)
239+
assert isinstance(trainer.strategy.cluster_environment, expected_environment)
222240

223241

224242
@RunIf(mps=False)

tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from lightning_lite.plugins.environments import (
2626
KubeflowEnvironment,
2727
LightningEnvironment,
28+
LSFEnvironment,
2829
SLURMEnvironment,
2930
TorchElasticEnvironment,
3031
)
@@ -193,24 +194,41 @@ class Strat(DDPStrategy):
193194
assert trainer._accelerator_connector.strategy is strategy
194195

195196

196-
@mock.patch.dict(
197-
os.environ,
198-
{
199-
"SLURM_NTASKS": "2",
200-
"SLURM_NTASKS_PER_NODE": "1",
201-
"SLURM_JOB_NAME": "SOME_NAME",
202-
"SLURM_NODEID": "0",
203-
"LOCAL_RANK": "0",
204-
"SLURM_PROCID": "0",
205-
"SLURM_LOCALID": "0",
206-
},
197+
@pytest.mark.parametrize(
198+
"env_vars,expected_environment",
199+
[
200+
(
201+
{
202+
"SLURM_NTASKS": "2",
203+
"SLURM_NTASKS_PER_NODE": "1",
204+
"SLURM_JOB_NAME": "SOME_NAME",
205+
"SLURM_NODEID": "0",
206+
"LOCAL_RANK": "0",
207+
"SLURM_PROCID": "0",
208+
"SLURM_LOCALID": "0",
209+
},
210+
SLURMEnvironment,
211+
),
212+
(
213+
{
214+
"LSB_JOBID": "1",
215+
"LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
216+
"JSM_NAMESPACE_LOCAL_RANK": "1",
217+
"JSM_NAMESPACE_SIZE": "20",
218+
"JSM_NAMESPACE_RANK": "1",
219+
},
220+
LSFEnvironment,
221+
),
222+
],
207223
)
208-
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
209-
def test_dist_backend_accelerator_mapping(cuda_count_0):
210-
trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2)
224+
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
225+
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
226+
def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
227+
with mock.patch.dict(os.environ, env_vars, clear=True):
228+
trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=2)
211229
assert isinstance(trainer.accelerator, CPUAccelerator)
212230
assert isinstance(trainer.strategy, DDPStrategy)
213-
assert trainer.strategy.local_rank == 0
231+
assert isinstance(trainer.strategy.cluster_environment, expected_environment)
214232

215233

216234
def test_interactive_incompatible_backend_error(mps_count_2, cuda_count_2, monkeypatch):

0 commit comments

Comments (0)