
Commit 5bef756

awaelchli and Borda authored
Remove deprecated torch_distributed_backend logic (#14693)
* Remove deprecated torch_distributed_backend logic
* changelog
* mention deprecated
* imports

Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Jirka <[email protected]>
1 parent ced9487 commit 5bef756

File tree

10 files changed: +43 −161 lines


src/lightning_lite/strategies/deepspeed.py

Lines changed: 1 addition & 4 deletions
@@ -33,7 +33,7 @@
 from lightning_lite.plugins.precision.utils import _fp_to_half
 from lightning_lite.strategies.ddp import DDPStrategy
 from lightning_lite.utilities.apply_func import apply_to_collection
-from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device, log
+from lightning_lite.utilities.distributed import log
 from lightning_lite.utilities.enums import AMPType, PrecisionType
 from lightning_lite.utilities.rank_zero import rank_zero_info
 from lightning_lite.utilities.seed import reset_seed
@@ -450,9 +450,6 @@ def _init_deepspeed_distributed(self) -> None:
         self._process_group_backend = self._get_process_group_backend()
         deepspeed.init_distributed(self._process_group_backend, distributed_port=self.cluster_environment.main_port)

-    def _get_process_group_backend(self) -> str:
-        return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device)
-
     def _set_node_environment_variables(self) -> None:
         assert self.cluster_environment is not None
         os.environ["MASTER_ADDR"] = self.cluster_environment.main_address

src/lightning_lite/utilities/distributed.py

Lines changed: 0 additions & 12 deletions
@@ -3,7 +3,6 @@
 from typing import Any, List, Optional, Tuple, Union

 import torch
-from lightning_utilities.core.rank_zero import rank_zero_deprecation
 from torch import Tensor
 from torch.nn import functional as F

@@ -251,14 +250,3 @@ def tpu_distributed() -> bool:

 def get_default_process_group_backend_for_device(device: torch.device) -> str:
     return "nccl" if device.type == "cuda" else "gloo"
-
-
-def _get_process_group_backend_from_env() -> Optional[str]:
-    torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND")
-    if torch_backend is not None:
-        rank_zero_deprecation(
-            "Environment variable `PL_TORCH_DISTRIBUTED_BACKEND`"
-            " was deprecated in v1.6 and will be removed in v1.8."
-            " Specify `process_group_backend` directly on the strategy constructor."
-        )
-    return torch_backend
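With the environment-variable hook removed, the default backend now depends only on the device type. A minimal illustration of the remaining helper shown above, assuming `lightning_lite` is importable in your environment:

    import torch

    from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device

    # Mirrors the kept one-liner: "nccl" for CUDA devices, "gloo" for everything else.
    assert get_default_process_group_backend_for_device(torch.device("cuda")) == "nccl"
    assert get_default_process_group_backend_for_device(torch.device("cpu")) == "gloo"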

src/pytorch_lightning/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -171,6 +171,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed the deprecated `BaseProfiler` and `AbstractProfiler` classes ([#14404](https://github.com/Lightning-AI/lightning/pull/14404))


+- Removed the deprecated way to set the distributed backend via the environment variable `PL_TORCH_DISTRIBUTED_BACKEND`, in favor of setting the `process_group_backend` in the strategy constructor ([#14693](https://github.com/Lightning-AI/lightning/pull/14693))
+
+
+
 ### Fixed

 - Break HPU Graphs into two parts (forward + backward as one and optimizer as another) for better performance ([#14656](https://github.com/Lightning-AI/lightning/pull/14656))
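For context, a minimal sketch of the replacement API described in the changelog entry above: pass `process_group_backend` to the strategy constructor instead of exporting `PL_TORCH_DISTRIBUTED_BACKEND` (the Trainer arguments here are illustrative):

    from pytorch_lightning import Trainer
    from pytorch_lightning.strategies import DDPStrategy

    # Old (removed): os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo"
    # New: choose the torch.distributed backend on the strategy itself.
    trainer = Trainer(
        accelerator="gpu",
        devices=2,
        strategy=DDPStrategy(process_group_backend="gloo"),
    )

If `process_group_backend` is left unset, the strategy falls back to the device default from `get_default_process_group_backend_for_device` ("nccl" on CUDA, "gloo" otherwise).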

src/pytorch_lightning/strategies/ddp.py

Lines changed: 2 additions & 10 deletions
@@ -31,11 +31,7 @@
 import pytorch_lightning as pl
 from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment
 from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO
-from lightning_lite.utilities.distributed import (
-    _get_process_group_backend_from_env,
-    distributed_available,
-    get_default_process_group_backend_for_device,
-)
+from lightning_lite.utilities.distributed import distributed_available, get_default_process_group_backend_for_device
 from lightning_lite.utilities.distributed import group as _group
 from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available
 from lightning_lite.utilities.optimizer import optimizers_to_device
@@ -213,11 +209,7 @@ def setup_distributed(self) -> None:
         init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout)

     def _get_process_group_backend(self) -> str:
-        return (
-            self._process_group_backend
-            or _get_process_group_backend_from_env()
-            or get_default_process_group_backend_for_device(self.root_device)
-        )
+        return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device)

     def set_world_ranks(self) -> None:
         if self.cluster_environment is None:

src/pytorch_lightning/strategies/ddp_spawn.py

Lines changed: 2 additions & 10 deletions
@@ -26,11 +26,7 @@
 import pytorch_lightning as pl
 from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment
 from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO
-from lightning_lite.utilities.distributed import (
-    _get_process_group_backend_from_env,
-    distributed_available,
-    get_default_process_group_backend_for_device,
-)
+from lightning_lite.utilities.distributed import distributed_available, get_default_process_group_backend_for_device
 from lightning_lite.utilities.distributed import group as _group
 from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available
 from lightning_lite.utilities.optimizer import optimizers_to_device
@@ -187,11 +183,7 @@ def _worker_setup(self, process_idx: int) -> None:
         )

     def _get_process_group_backend(self) -> str:
-        return (
-            self._process_group_backend
-            or _get_process_group_backend_from_env()
-            or get_default_process_group_backend_for_device(self.root_device)
-        )
+        return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device)

     def pre_configure_ddp(self) -> None:
         # if unset, default `find_unused_parameters` `True`

src/pytorch_lightning/strategies/deepspeed.py

Lines changed: 1 addition & 12 deletions
@@ -32,11 +32,6 @@
 import pytorch_lightning as pl
 from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment
 from lightning_lite.plugins.precision.utils import _fp_to_half
-from lightning_lite.utilities.distributed import (
-    _get_process_group_backend_from_env,
-    get_default_process_group_backend_for_device,
-    log,
-)
 from lightning_lite.utilities.enums import AMPType, PrecisionType
 from lightning_lite.utilities.optimizer import optimizers_to_device
 from lightning_lite.utilities.seed import reset_seed
@@ -53,6 +48,7 @@
 from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.types import LRSchedulerConfig, STEP_OUTPUT

+log = logging.getLogger(__name__)
 warning_cache = WarningCache()

 _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
@@ -395,13 +391,6 @@ def _init_deepspeed_distributed(self) -> None:
         self._process_group_backend = self._get_process_group_backend()
         deepspeed.init_distributed(self._process_group_backend, distributed_port=self.cluster_environment.main_port)

-    def _get_process_group_backend(self) -> str:
-        return (
-            self._process_group_backend
-            or _get_process_group_backend_from_env()
-            or get_default_process_group_backend_for_device(self.root_device)
-        )
-
     def _set_node_environment_variables(self) -> None:
         assert self.cluster_environment is not None
         os.environ["MASTER_ADDR"] = self.cluster_environment.main_address

src/pytorch_lightning/strategies/fully_sharded_native.py

Lines changed: 2 additions & 9 deletions
@@ -21,10 +21,7 @@
 import pytorch_lightning as pl
 from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment
 from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO
-from lightning_lite.utilities.distributed import (
-    _get_process_group_backend_from_env,
-    get_default_process_group_backend_for_device,
-)
+from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device
 from lightning_lite.utilities.distributed import group as _group
 from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available
 from lightning_lite.utilities.optimizer import optimizers_to_device
@@ -188,11 +185,7 @@ def setup_environment(self) -> None:
         super().setup_environment()

     def _get_process_group_backend(self) -> str:
-        return (
-            self._process_group_backend
-            or _get_process_group_backend_from_env()
-            or get_default_process_group_backend_for_device(self.root_device)
-        )
+        return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device)

     def set_world_ranks(self) -> None:
         if self.cluster_environment is None:

src/pytorch_lightning/strategies/parallel.py

Lines changed: 1 addition & 18 deletions
@@ -21,16 +21,10 @@
 import pytorch_lightning as pl
 from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment
 from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO
-from lightning_lite.utilities.distributed import (
-    _get_process_group_backend_from_env,
-    all_gather_ddp_if_available,
-    get_default_process_group_backend_for_device,
-    ReduceOp,
-)
+from lightning_lite.utilities.distributed import all_gather_ddp_if_available, ReduceOp
 from pytorch_lightning.plugins import LayerSync
 from pytorch_lightning.plugins.precision import PrecisionPlugin
 from pytorch_lightning.strategies.strategy import Strategy
-from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation


 class ParallelStrategy(Strategy, ABC):
@@ -89,17 +83,6 @@ def distributed_sampler_kwargs(self) -> Dict[str, Any]:
         )
         return distributed_sampler_kwargs

-    @property
-    def torch_distributed_backend(self) -> str:
-        """Deprecated property."""
-        rank_zero_deprecation(
-            "ParallelStrategy.torch_distributed_backend was deprecated in v1.6 and will be removed in v1.8."
-        )
-        pg_backend = _get_process_group_backend_from_env()
-        if pg_backend:
-            return pg_backend
-        return get_default_process_group_backend_for_device(self.root_device)
-
     def reconciliate_processes(self, trace: str) -> None:
         """Function to re-conciliate processes on failure."""


tests/tests_pytorch/deprecated_api/test_remove_1-8.py

Lines changed: 0 additions & 46 deletions
@@ -12,14 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Test deprecated functionality which will be removed in v1.8.0."""
-import os
 import time
 from unittest import mock
 from unittest.mock import Mock

 import numpy as np
 import pytest
-import torch

 import pytorch_lightning
 from lightning_lite.utilities import device_parser
@@ -29,7 +27,6 @@
 from pytorch_lightning.loggers import CSVLogger, Logger
 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
 from pytorch_lightning.profilers import AdvancedProfiler, SimpleProfiler
-from pytorch_lightning.strategies import ParallelStrategy
 from pytorch_lightning.strategies.ipu import LightningIPUModule
 from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks
 from pytorch_lightning.trainer.states import RunningStage
@@ -509,49 +506,6 @@ def test_v1_8_0_lightning_module_use_amp():
     model.use_amp = False


-@mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": "foo"})
-def test_v1_8_0_torch_distributed_backend_env():
-    from lightning_lite.utilities.distributed import _get_process_group_backend_from_env
-
-    with pytest.deprecated_call(
-        match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND`"
-        " was deprecated in v1.6 and will be removed in v1.8."
-    ):
-        _get_process_group_backend_from_env()
-
-
-def test_parallel_strategy_torch_distributed_backend():
-    class CustomParallel(ParallelStrategy):
-        @property
-        def root_device(self) -> torch.device:
-            return torch.device("cpu")
-
-        def model_to_device(self):
-            pass
-
-        @property
-        def is_global_zero(self):
-            return True
-
-        def broadcast(self, obj):
-            return obj
-
-        def reduce(self, tensor):
-            return tensor
-
-        def barrier(self):
-            return
-
-        def all_gather(self, tensor):
-            return tensor
-
-    strategy = CustomParallel()
-    with pytest.deprecated_call(
-        match="ParallelStrategy.torch_distributed_backend was deprecated" " in v1.6 and will be removed in v1.8."
-    ):
-        strategy.torch_distributed_backend
-
-
 def test_trainer_config_device_ids():
     trainer = Trainer(devices=2)
     with pytest.deprecated_call(

tests/tests_pytorch/strategies/test_ddp.py

Lines changed: 30 additions & 40 deletions
@@ -13,7 +13,6 @@
 # limitations under the License.
 import os
 from unittest import mock
-from unittest.mock import patch

 import pytest
 import torch
@@ -59,34 +58,29 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir):


 @RunIf(skip_windows=True)
-@pytest.mark.skipif(torch.cuda.is_available(), reason="test doesn't requires GPU machine")
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"}, clear=True)
 @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
-def test_torch_distributed_backend_env_variables(tmpdir):
+@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2)
+def test_torch_distributed_backend_invalid(_, __, tmpdir):
     """This test set `undefined` as torch backend and should raise an `Backend.UNDEFINED` ValueError."""
-    _environ = {"PL_TORCH_DISTRIBUTED_BACKEND": "undefined", "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"}
-    with patch.dict(os.environ, _environ), patch(
-        "lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2
-    ):
-        with pytest.deprecated_call(match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND` was deprecated in v1.6"):
-            with pytest.raises(ValueError, match="Invalid backend: 'undefined'"):
-                model = BoringModel()
-                trainer = Trainer(
-                    default_root_dir=tmpdir,
-                    fast_dev_run=True,
-                    strategy="ddp",
-                    accelerator="gpu",
-                    devices=2,
-                    logger=False,
-                )
-                trainer.fit(model)
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        strategy=DDPStrategy(process_group_backend="undefined"),
+        accelerator="cuda",
+        devices=2,
+        logger=False,
+    )
+    with pytest.raises(ValueError, match="Invalid backend: 'undefined'"):
+        trainer.fit(model)


 @RunIf(skip_windows=True)
 @mock.patch("torch.cuda.set_device")
 @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1)
 @mock.patch("pytorch_lightning.accelerators.gpu.CUDAAccelerator.is_available", return_value=True)
-@mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": "gloo"}, clear=True)
 def test_ddp_torch_dist_is_available_in_setup(
     mock_gpu_is_available, mock_device_count, mock_cuda_available, mock_set_device, tmpdir
 ):
@@ -98,10 +92,15 @@ def setup(self, stage: str) -> None:
             raise SystemExit()

     model = TestModel()
-    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1)
-    with pytest.deprecated_call(match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND` was deprecated in v1.6"):
-        with pytest.raises(SystemExit):
-            trainer.fit(model)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        strategy=DDPStrategy(process_group_backend="gloo"),
+        accelerator="gpu",
+        devices=1,
+    )
+    with pytest.raises(SystemExit):
+        trainer.fit(model)


 @RunIf(min_cuda_gpus=2, min_torch="1.8.1", standalone=True)
@@ -143,17 +142,15 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule")


 @pytest.mark.parametrize(
-    ["process_group_backend", "env_var", "device_str", "expected_process_group_backend"],
+    ["process_group_backend", "device_str", "expected_process_group_backend"],
     [
-        pytest.param("foo", None, "cpu", "foo"),
-        pytest.param("foo", "BAR", "cpu", "foo"),
-        pytest.param("foo", "BAR", "cuda:0", "foo"),
-        pytest.param(None, "BAR", "cuda:0", "BAR"),
-        pytest.param(None, None, "cuda:0", "nccl"),
-        pytest.param(None, None, "cpu", "gloo"),
+        pytest.param("foo", "cpu", "foo"),
+        pytest.param("foo", "cuda:0", "foo"),
+        pytest.param(None, "cuda:0", "nccl"),
+        pytest.param(None, "cpu", "gloo"),
     ],
 )
-def test_ddp_process_group_backend(process_group_backend, env_var, device_str, expected_process_group_backend):
+def test_ddp_process_group_backend(process_group_backend, device_str, expected_process_group_backend):
     """Test settings for process group backend."""

     class MockDDPStrategy(DDPStrategy):
@@ -166,14 +163,7 @@ def root_device(self):
         return self._root_device

     strategy = MockDDPStrategy(process_group_backend=process_group_backend, root_device=torch.device(device_str))
-    if not process_group_backend and env_var:
-        with mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": env_var}):
-            with pytest.deprecated_call(
-                match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND` was deprecated in v1.6"
-            ):
-                assert strategy._get_process_group_backend() == expected_process_group_backend
-    else:
-        assert strategy._get_process_group_backend() == expected_process_group_backend
+    assert strategy._get_process_group_backend() == expected_process_group_backend


 @pytest.mark.parametrize(
