
Commit 5cdfcd7

Merge branch 'master' into carmocca/mypy-1.0
2 parents 4e1bd5b + da6263a commit 5cdfcd7

8 files changed (+92, -187 lines)


src/lightning/fabric/utilities/imports.py

Lines changed: 2 additions & 5 deletions
@@ -25,10 +25,7 @@
 # 2. The inspection mode via `python -i`: https://stackoverflow.com/a/6879085/1162383
 _IS_INTERACTIVE = hasattr(sys, "ps1") or bool(sys.flags.interactive)

-# We use "base_version" for non-nightly builds as well, because some environments like NVIDIA's PyTorch dockers
-# install PyTorch from source at a commit that doesn't align with the released version tag.
-# See: https://github.com/Lightning-AI/lightning/issues/16644
-_TORCH_GREATER_EQUAL_1_12 = compare_version("torch", operator.ge, "1.12.0", use_base_version=True)
-_TORCH_GREATER_EQUAL_1_13 = compare_version("torch", operator.ge, "1.13.0", use_base_version=True)
+_TORCH_GREATER_EQUAL_1_12 = compare_version("torch", operator.ge, "1.12.0")
+_TORCH_GREATER_EQUAL_1_13 = compare_version("torch", operator.ge, "1.13.0")
 _TORCH_GREATER_EQUAL_2_0 = compare_version("torch", operator.ge, "2.0.0", use_base_version=True)
 _TORCH_GREATER_EQUAL_2_1 = compare_version("torch", operator.ge, "2.1.0", use_base_version=True)
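
This hunk drops `use_base_version=True` from the 1.12/1.13 checks while keeping it for 2.0/2.1. A minimal sketch (not part of the commit; it uses the standard `packaging` library and a made-up version string) of what that flag changes for PyTorch builds whose version carries a pre-release or local tag:

from packaging.version import Version

installed = Version("1.13.0a0+gitabcdef")  # hypothetical torch built from source
print(installed >= Version("1.13.0"))      # False: the pre-release segment sorts below the release
print(Version(installed.base_version) >= Version("1.13.0"))  # True: base_version truncates to "1.13.0"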

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 3 deletions
@@ -163,6 +163,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Changed minimum supported version of `rich` from `10.14.0` to `12.13.0` ([#16798](https://github.com/Lightning-AI/lightning/pull/16798))


+- Removed the `lightning.pytorch.overrides.torch_distributed.broadcast_object_list` function ([#17011](https://github.com/Lightning-AI/lightning/pull/17011))
+
+
 - The `ServableModule` is now an abstract interface ([#17000](https://github.com/Lightning-AI/lightning/pull/17000))


@@ -425,9 +428,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed an issue where `DistributedSampler.set_epoch` wasn't getting called during `trainer.predict` ([#16785](https://github.com/Lightning-AI/lightning/pull/16785), [#16826](https://github.com/Lightning-AI/lightning/pull/16826))


-- Fixed an issue with comparing torch versions when using a version of torch built from source ([#16657](https://github.com/Lightning-AI/lightning/pull/16657))
-
-
 ## [1.9.4] - 2023-03-01

 ### Added

src/lightning/pytorch/core/module.py

Lines changed: 7 additions & 2 deletions
@@ -51,7 +51,12 @@
 from lightning.fabric.utilities.cloud_io import get_filesystem
 from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin
 from lightning.fabric.utilities.distributed import _distributed_available
-from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1
+from lightning.fabric.utilities.imports import (
+    _IS_WINDOWS,
+    _TORCH_GREATER_EQUAL_1_13,
+    _TORCH_GREATER_EQUAL_2_0,
+    _TORCH_GREATER_EQUAL_2_1,
+)
 from lightning.fabric.utilities.types import _MAP_LOCATION_TYPE, _PATH
 from lightning.fabric.wrappers import _FabricOptimizer
 from lightning.pytorch.callbacks.callback import Callback
@@ -64,7 +69,7 @@
 from lightning.pytorch.trainer.connectors.logger_connector.fx_validator import _FxValidator
 from lightning.pytorch.utilities import GradClipAlgorithmType
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
-from lightning.pytorch.utilities.imports import _TORCH_GREATER_EQUAL_1_13, _TORCHMETRICS_GREATER_EQUAL_0_9_1
+from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_0_9_1
 from lightning.pytorch.utilities.rank_zero import rank_zero_debug, rank_zero_warn, WarningCache
 from lightning.pytorch.utilities.signature_utils import is_param_in_hook_signature
 from lightning.pytorch.utilities.types import _METRIC, LRSchedulerPLType, LRSchedulerTypeUnion, STEP_OUTPUT

src/lightning/pytorch/overrides/torch_distributed.py

Lines changed: 0 additions & 170 deletions
This file was deleted.

src/lightning/pytorch/strategies/fsdp.py

Lines changed: 1 addition & 2 deletions
@@ -32,7 +32,7 @@
     _sync_ddp_if_available,
 )
 from lightning.fabric.utilities.distributed import group as _group
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12, _TORCH_GREATER_EQUAL_1_13
 from lightning.fabric.utilities.optimizer import _optimizers_to_device
 from lightning.fabric.utilities.seed import reset_seed
 from lightning.fabric.utilities.types import ProcessGroup, ReduceOp
@@ -44,7 +44,6 @@
 from lightning.pytorch.strategies.strategy import TBroadcast
 from lightning.pytorch.trainer.states import TrainerFn
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
-from lightning.pytorch.utilities.imports import _TORCH_GREATER_EQUAL_1_13
 from lightning.pytorch.utilities.model_helpers import is_overridden
 from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_only
 from lightning.pytorch.utilities.types import STEP_OUTPUT

src/lightning/pytorch/strategies/hpu_parallel.py

Lines changed: 78 additions & 2 deletions
@@ -23,7 +23,6 @@
 from lightning.fabric.plugins import CheckpointIO, ClusterEnvironment
 from lightning.fabric.utilities.distributed import group as _group
 from lightning.pytorch.accelerators.hpu import _HPU_AVAILABLE
-from lightning.pytorch.overrides.torch_distributed import broadcast_object_list
 from lightning.pytorch.plugins.io.hpu_plugin import HPUCheckpointIO
 from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
 from lightning.pytorch.plugins.precision import PrecisionPlugin
@@ -106,7 +105,7 @@ def broadcast(self, obj: object, src: int = 0) -> object:  # type: ignore
         if self.global_rank != src:
             obj = [None]

-        broadcast_object_list(obj, src, group=_group.WORLD)
+        _hpu_broadcast_object_list(obj, src, group=_group.WORLD)
         return obj[0]

     def on_after_backward(self) -> None:
@@ -138,3 +137,80 @@ def teardown(self) -> None:
         # Was set to local rank
         os.environ.pop("ID", None)
         os.environ.pop("HCCL_DISTRIBUTED_BACKEND", None)
+
+
+# The code underneath is taken from PyTorch `torch/distributed/distributed_c10d.py`
+# the distributed backend and tensor type updates for habana backend is done here before broadcast
+def _hpu_broadcast_object_list(object_list, src=0, group=None, device=None):  # type: ignore
+    from torch.distributed import _rank_not_in_group, Backend, broadcast, get_backend, get_rank
+    from torch.distributed.distributed_c10d import _object_to_tensor, _tensor_to_object
+
+    if _rank_not_in_group(group):
+        return
+
+    my_rank = get_rank()
+    # Serialize object_list elements to tensors on src rank.
+    if my_rank == src:
+        tensor_list, size_list = zip(*[_object_to_tensor(obj, device) for obj in object_list])
+        object_sizes_tensor = torch.cat(size_list)
+    else:
+        object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long)
+
+    # Current device selection.
+    # To preserve backwards compatibility, ``device`` is default to ``None``
+    # in which case we run current logic of device selection, i.e.
+    # ``current_device`` is CUDA if backend is NCCL otherwise CPU device. In the
+    # case it is not ``None`` we move the size and object tensors to be
+    # broadcasted to this device.
+    group_backend = get_backend(group)
+    is_nccl_backend = group_backend == Backend.NCCL
+    is_hpu_backend = os.environ.get("HCCL_DISTRIBUTED_BACKEND") == "1"
+    if device is not None:
+        if is_nccl_backend and device.type != "cuda":
+            raise ValueError("device type must be cuda for nccl backend")
+        current_device = device
+    else:
+        current_device = torch.device("cpu")
+        if is_nccl_backend:
+            # See note about using torch.cuda.current_device() here in
+            # docstring. We cannot simply use my_rank since rank == device is
+            # not necessarily true.
+            current_device = torch.device("cuda", torch.cuda.current_device())
+    if is_nccl_backend:
+        object_sizes_tensor = object_sizes_tensor.to(current_device)
+
+    elif is_hpu_backend:
+        current_device = torch.device("hpu")
+        # Workaround: HPU doesn't not support long tensors for collectives
+        if (object_sizes_tensor.type() == "torch.LongTensor") or (object_sizes_tensor.type() == "torch.hpu.LongTensor"):
+            object_sizes_tensor = object_sizes_tensor.int()
+        else:
+            print("unhandled hpu object_sizes_tensor type :: ", object_sizes_tensor.type())
+        object_sizes_tensor = object_sizes_tensor.to(current_device)
+
+    # Broadcast object sizes
+    broadcast(object_sizes_tensor, src=src, group=group)
+
+    # Concatenate and broadcast serialized object tensors
+    if my_rank == src:
+        object_tensor = torch.cat(tensor_list)
+    else:
+        object_tensor = torch.empty(
+            torch.sum(object_sizes_tensor).int().item(),
+            dtype=torch.uint8,
+        )
+
+    if is_nccl_backend or is_hpu_backend:
+        object_tensor = object_tensor.to(current_device)
+
+    broadcast(object_tensor, src=src, group=group)
+    # Deserialize objects using their stored sizes.
+    offset = 0
+    if my_rank != src:
+        for i, obj_size in enumerate(object_sizes_tensor):
+            obj_view = object_tensor[offset : offset + obj_size]
+            obj_view = obj_view.type(torch.uint8)
+            if obj_view.device != torch.device("cpu"):
+                obj_view = obj_view.cpu()
+            offset += obj_size
+            object_list[i] = _tensor_to_object(obj_view, obj_size)
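
For orientation, a minimal usage sketch (not part of the commit; it assumes an initialized process group with the HCCL backend, and `some_payload`/`global_rank` are placeholders), mirroring the `broadcast` method above: every rank passes a list of the same length, the source rank fills it with payloads, and after the call all ranks hold the deserialized objects.

obj = [some_payload] if global_rank == 0 else [None]  # same list length on every rank
_hpu_broadcast_object_list(obj, src=0, group=_group.WORLD)
result = obj[0]  # identical object on every rank after the broadcast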

src/lightning/pytorch/utilities/imports.py

Lines changed: 0 additions & 1 deletion
@@ -21,7 +21,6 @@
 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8)
 _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)
 _PYTHON_GREATER_EQUAL_3_11_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 11)
-_TORCH_GREATER_EQUAL_1_13 = compare_version("torch", operator.ge, "1.13.0", use_base_version=True)
 _TORCHMETRICS_GREATER_EQUAL_0_9_1 = RequirementCache("torchmetrics>=0.9.1")
 _TORCHMETRICS_GREATER_EQUAL_0_11 = RequirementCache("torchmetrics>=0.11.0")  # using new API with task

tests/tests_pytorch/callbacks/test_finetuning_callback.py

Lines changed: 1 addition & 2 deletions
@@ -19,11 +19,10 @@
 from torch.optim import Optimizer, SGD
 from torch.utils.data import DataLoader

-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12, _TORCH_GREATER_EQUAL_1_13
 from lightning.pytorch import LightningModule, seed_everything, Trainer
 from lightning.pytorch.callbacks import BackboneFinetuning, BaseFinetuning, ModelCheckpoint
 from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset
-from lightning.pytorch.utilities.imports import _TORCH_GREATER_EQUAL_1_13


 class TestBackboneFinetuningCallback(BackboneFinetuning):
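
The test module now takes `_TORCH_GREATER_EQUAL_1_13` from `lightning.fabric.utilities.imports` instead of the removed `lightning.pytorch` alias. A hypothetical sketch (not from this diff; the test name is a placeholder) of how such a flag is commonly used to gate a version-specific test:

import pytest

@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_13, reason="requires torch>=1.13")
def test_needs_torch_1_13():
    ...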
