Skip to content

Commit 6500156

Browse files
committed
cuda gpu devices clarification
From the context, it should hopefully be clear that we are talking about NVIDIA CUDA GPUs. Update error message. Update.
1 parent 38e8cb8 commit 6500156

File tree

2 files changed

+12
-13
lines changed

2 files changed

+12
-13
lines changed

src/lightning_lite/accelerators/cuda.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -79,17 +79,17 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None:
7979

8080

8181
def find_usable_cuda_devices(num_devices: int = -1) -> List[int]:
82-
"""Returns a list of all available and usable CUDA GPUs.
82+
"""Returns a list of all available and usable CUDA GPU devices.
8383
8484
A GPU is considered usable if we can successfully move a tensor to the device, and this is what this function
85-
tests for each GPU on the system until the target number of usable GPUs is found.
85+
tests for each GPU on the system until the target number of usable devices is found.
8686
8787
A subset of GPUs on the system might be used by other processes, and if the GPU is configured to operate in
8888
'exclusive' mode (configurable by the admin), then only one process is allowed to occupy it.
8989
9090
Args:
91-
num_devices: The number of GPUs you want to request. By default, this function will return as many as there are
92-
usable GPUs available.
91+
num_devices: The number of devices you want to request. By default, this function will return as many as there
92+
are usable CUDA GPU devices available.
9393
9494
Warning:
9595
If multiple processes call this function at the same time, there can be race conditions in the case where
@@ -98,11 +98,11 @@ def find_usable_cuda_devices(num_devices: int = -1) -> List[int]:
9898
visible_devices = _get_all_visible_cuda_devices()
9999
if not visible_devices:
100100
raise ValueError(
101-
f"You requested to find {num_devices} GPUs but there are no visible CUDA devices on this machine."
101+
f"You requested to find {num_devices} devices but there are no visible CUDA devices on this machine."
102102
)
103103
if num_devices > len(visible_devices):
104104
raise ValueError(
105-
f"You requested to find {num_devices} GPUs but this machine only has {len(visible_devices)} GPUs."
105+
f"You requested to find {num_devices} devices but this machine only has {len(visible_devices)} GPUs."
106106
)
107107

108108
available_devices = []
@@ -122,17 +122,16 @@ def find_usable_cuda_devices(num_devices: int = -1) -> List[int]:
122122

123123
if len(available_devices) != num_devices:
124124
raise RuntimeError(
125-
f"You requested to find {num_devices} GPUs but only {len(available_devices)} are currently available."
126-
f" GPUs {unavailable_devices} are occupied by other processes and can't be"
127-
" used at the moment."
125+
f"You requested to find {num_devices} devices but only {len(available_devices)} are currently available."
126+
f" The devices {unavailable_devices} are occupied by other processes and can't be used at the moment."
128127
)
129128
return available_devices
130129

131130

132131
def _get_all_visible_cuda_devices() -> List[int]:
133-
"""Returns a list of all visible CUDA GPUs.
132+
"""Returns a list of all visible CUDA GPU devices.
134133
135-
GPUs masked by the environment variabale ``CUDA_VISIBLE_DEVICES`` won't be returned here. For example, assume you
134+
Devices masked by the environment variable ``CUDA_VISIBLE_DEVICES`` won't be returned here. For example, assume you
136135
have 8 physical GPUs. If ``CUDA_VISIBLE_DEVICES="1,3,6"``, then this function will return the list ``[0, 1, 2]``
137136
because these are the three visible GPUs after applying the mask ``CUDA_VISIBLE_DEVICES``.
138137
"""

tests/tests_lite/accelerators/test_cuda.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def test_find_usable_cuda_devices_error_handling():
123123

124124
# Asking for GPUs if no GPUs visible
125125
with mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0), pytest.raises(
126-
ValueError, match="You requested to find 2 GPUs but there are no visible CUDA"
126+
ValueError, match="You requested to find 2 devices but there are no visible CUDA"
127127
):
128128
find_usable_cuda_devices(2)
129129

@@ -137,5 +137,5 @@ def test_find_usable_cuda_devices_error_handling():
137137
tensor_mock = Mock(side_effect=RuntimeError) # simulate device placement fails
138138
with mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2), mock.patch(
139139
"lightning_lite.accelerators.cuda.torch.tensor", tensor_mock
140-
), pytest.raises(RuntimeError, match=escape("GPUs [0, 1] are occupied by other processes")):
140+
), pytest.raises(RuntimeError, match=escape("The devices [0, 1] are occupied by other processes")):
141141
find_usable_cuda_devices(2)

0 commit comments

Comments
 (0)