@@ -79,17 +79,17 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None:
79
79
80
80
81
81
def find_usable_cuda_devices (num_devices : int = - 1 ) -> List [int ]:
82
- """Returns a list of all available and usable CUDA GPUs .
82
+ """Returns a list of all available and usable CUDA GPU devices .
83
83
84
84
A GPU is considered usable if we can successfully move a tensor to the device, and this is what this function
85
- tests for each GPU on the system until the target number of usable GPUs is found.
85
+ tests for each GPU on the system until the target number of usable devices is found.
86
86
87
87
A subset of GPUs on the system might be used by other processes, and if the GPU is configured to operate in
88
88
'exclusive' mode (configurable by the admin), then only one process is allowed to occupy it.
89
89
90
90
Args:
91
- num_devices: The number of GPUs you want to request. By default, this function will return as many as there are
92
- usable GPUs available.
91
+ num_devices: The number of devices you want to request. By default, this function will return as many as there
92
+ are usable CUDA GPU devices available.
93
93
94
94
Warning:
95
95
If multiple processes call this function at the same time, there can be race conditions in the case where
@@ -98,11 +98,11 @@ def find_usable_cuda_devices(num_devices: int = -1) -> List[int]:
98
98
visible_devices = _get_all_visible_cuda_devices ()
99
99
if not visible_devices :
100
100
raise ValueError (
101
- f"You requested to find { num_devices } GPUs but there are no visible CUDA devices on this machine."
101
+ f"You requested to find { num_devices } devices but there are no visible CUDA devices on this machine."
102
102
)
103
103
if num_devices > len (visible_devices ):
104
104
raise ValueError (
105
- f"You requested to find { num_devices } GPUs but this machine only has { len (visible_devices )} GPUs."
105
+ f"You requested to find { num_devices } devices but this machine only has { len (visible_devices )} GPUs."
106
106
)
107
107
108
108
available_devices = []
@@ -122,17 +122,16 @@ def find_usable_cuda_devices(num_devices: int = -1) -> List[int]:
122
122
123
123
if len (available_devices ) != num_devices :
124
124
raise RuntimeError (
125
- f"You requested to find { num_devices } GPUs but only { len (available_devices )} are currently available."
126
- f" GPUs { unavailable_devices } are occupied by other processes and can't be"
127
- " used at the moment."
125
+ f"You requested to find { num_devices } devices but only { len (available_devices )} are currently available."
126
+ f" The devices { unavailable_devices } are occupied by other processes and can't be used at the moment."
128
127
)
129
128
return available_devices
130
129
131
130
132
131
def _get_all_visible_cuda_devices () -> List [int ]:
133
- """Returns a list of all visible CUDA GPUs .
132
+ """Returns a list of all visible CUDA GPU devices .
134
133
135
- GPUs masked by the environment variabale ``CUDA_VISIBLE_DEVICES`` won't be returned here. For example, assume you
134
+ Devices masked by the environment variable ``CUDA_VISIBLE_DEVICES`` won't be returned here. For example, assume you
136
135
have 8 physical GPUs. If ``CUDA_VISIBLE_DEVICES="1,3,6"``, then this function will return the list ``[0, 1, 2]``
137
136
because these are the three visible GPUs after applying the mask ``CUDA_VISIBLE_DEVICES``.
138
137
"""
0 commit comments