From 031584f4e9cf4b62c0428808acd3426bc505aa16 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 20 Dec 2022 20:56:21 +0100 Subject: [PATCH 01/23] edge cases --- src/lightning_lite/accelerators/__init__.py | 1 + src/lightning_lite/accelerators/cuda.py | 59 ++++++++++++++++++- src/lightning_lite/utilities/device_parser.py | 2 +- 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/lightning_lite/accelerators/__init__.py b/src/lightning_lite/accelerators/__init__.py index f1abf60f142d9..c38fa5ccb9eb8 100644 --- a/src/lightning_lite/accelerators/__init__.py +++ b/src/lightning_lite/accelerators/__init__.py @@ -13,6 +13,7 @@ from lightning_lite.accelerators.accelerator import Accelerator # noqa: F401 from lightning_lite.accelerators.cpu import CPUAccelerator # noqa: F401 from lightning_lite.accelerators.cuda import CUDAAccelerator # noqa: F401 +from lightning_lite.accelerators.cuda import find_usable_cuda_gpus # noqa: F401 from lightning_lite.accelerators.mps import MPSAccelerator # noqa: F401 from lightning_lite.accelerators.registry import _AcceleratorRegistry, call_register_accelerators from lightning_lite.accelerators.tpu import TPUAccelerator # noqa: F401 diff --git a/src/lightning_lite/accelerators/cuda.py b/src/lightning_lite/accelerators/cuda.py index eb4954f388fb6..a2adc8102a38b 100644 --- a/src/lightning_lite/accelerators/cuda.py +++ b/src/lightning_lite/accelerators/cuda.py @@ -78,10 +78,63 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: ) -def _get_all_available_cuda_gpus() -> List[int]: +def find_usable_cuda_gpus(num_gpus: int = -1) -> List[int]: + """Returns a list of all available and usable CUDA GPUs. + + A GPU is considered usable if we can successfully move a tensor to the device, and this is what this function + tests for each GPU on the system until the target number of usable GPUs is found. 
+
+    A subset of GPUs on the system might be used by other processes, and if the GPU is configured to operate in
+    'exclusive' mode (configurable by the admin), then only one process is allowed to occupy it.
+
+    Args:
+        num_gpus: The number of GPUs you want to request. By default, this function will return as many as there are
+            usable GPUs available.
+
+    Warning:
+        If multiple processes call this function at the same time, there can be race conditions in the case where
+        both processes determine that the device is unoccupied, leading to one of them crashing later on.
     """
-    Returns:
-        A list of all available CUDA GPUs
+    visible_gpus = _get_all_visible_cuda_gpus()
+    if not visible_gpus:
+        raise ValueError(
+            f"You requested to find {num_gpus} GPUs but there are no visible CUDA devices on this machine."
+        )
+    if num_gpus > len(visible_gpus):
+        raise ValueError(
+            f"You requested to find {num_gpus} GPUs but this machine only has {len(visible_gpus)} GPUs."
+        )
+
+    available_gpus = []
+    unavailable_gpus = []
+
+    for gpu_idx in visible_gpus:
+        try:
+            torch.tensor(0, device=torch.device("cuda", gpu_idx))
+        except RuntimeError:
+            unavailable_gpus.append(gpu_idx)
+            continue
+
+        available_gpus.append(gpu_idx)
+        if len(available_gpus) == num_gpus:
+            # exit early if we found the right number of GPUs
+            break
+
+    if num_gpus != -1 and len(available_gpus) != num_gpus:
+        raise RuntimeError(
+            f"You requested to find {num_gpus} GPUs but only {len(available_gpus)} are currently available."
+            f" GPUs {', '.join(str(idx) for idx in unavailable_gpus)} are occupied by other processes and can't be used at the moment."
+        )
+    return available_gpus
+
+
+def _get_all_visible_cuda_gpus() -> List[int]:
+    """Returns a list of all visible CUDA GPUs.
+
+    GPUs masked by the environment variable ``CUDA_VISIBLE_DEVICES`` won't be returned here.
+    For example, assume you have 8 physical GPUs.
If ``CUDA_VISIBLE_DEVICES="1,3,6"``, then this function + will return the list ``[0, 1, 2]`` because these are the three visible GPUs after applying the mask + ``CUDA_VISIBLE_DEVICES``. """ return list(range(num_cuda_devices())) diff --git a/src/lightning_lite/utilities/device_parser.py b/src/lightning_lite/utilities/device_parser.py index 10ca4ce1d6fb6..17b3942822ec3 100644 --- a/src/lightning_lite/utilities/device_parser.py +++ b/src/lightning_lite/utilities/device_parser.py @@ -160,7 +160,7 @@ def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = Fals Returns: A list of all available GPUs """ - cuda_gpus = accelerators.cuda._get_all_available_cuda_gpus() if include_cuda else [] + cuda_gpus = accelerators.cuda._get_all_visible_cuda_gpus() if include_cuda else [] mps_gpus = accelerators.mps._get_all_available_mps_gpus() if include_mps else [] return cuda_gpus + mps_gpus From 5e0c93e49780a9a339980b0bbc9e1004defd3140 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 20 Dec 2022 23:11:30 +0100 Subject: [PATCH 02/23] docs --- .../source-pytorch/accelerators/gpu_basic.rst | 27 ++++++++++++++----- .../accelerators/__init__.py | 1 + 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/docs/source-pytorch/accelerators/gpu_basic.rst b/docs/source-pytorch/accelerators/gpu_basic.rst index 43be718180aa9..900059697d0f1 100644 --- a/docs/source-pytorch/accelerators/gpu_basic.rst +++ b/docs/source-pytorch/accelerators/gpu_basic.rst @@ -88,10 +88,25 @@ The table below lists examples of possible input formats and how they are interp | "-1" | str | [0, 1, 2, ...] | all available GPUs | +------------------+-----------+---------------------+---------------------------------+ -.. note:: - When specifying number of ``devices`` as an integer ``devices=k``, setting the trainer flag - ``auto_select_gpus=True`` will automatically help you find ``k`` GPUs that are not - occupied by other processes. 
This is especially useful when GPUs are configured
-    to be in "exclusive mode", such that only one process at a time can access them.
-    For more details see the :doc:`trainer guide <../common/trainer>`.
+Find usable CUDA devices
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you want to run several experiments at the same time on your machine, for example for a hyperparameter sweep, then you can
+use the following utility function to pick GPU indices that are "accessible", without having to change your code every time.
+
+.. code-block:: python
+
+    from lightning.pytorch.accelerators import find_usable_cuda_gpus
+
+    # Find two GPUs on the system that are not already occupied
+    trainer = Trainer(accelerator="cuda", devices=find_usable_cuda_gpus(2))
+
+    from lightning.lite.accelerators import find_usable_cuda_gpus
+
+    # Works with LightningLite too
+    lite = LightningLite(accelerator="cuda", devices=find_usable_cuda_gpus(2))
+
+
+This is especially useful when GPUs are configured to be in "exclusive compute mode", such that only one process at a time is allowed to access the device.
+This special mode is often enabled on server GPUs or systems shared among multiple users.
diff --git a/src/pytorch_lightning/accelerators/__init__.py b/src/pytorch_lightning/accelerators/__init__.py
index fb5fa4ede2a0d..2f54a683ba2a0 100644
--- a/src/pytorch_lightning/accelerators/__init__.py
+++ b/src/pytorch_lightning/accelerators/__init__.py
@@ -11,6 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
from lightning_lite.accelerators.registry import _AcceleratorRegistry, call_register_accelerators +from lightning_lite.accelerators.cuda import find_usable_cuda_gpus # noqa: F401 from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.cuda import CUDAAccelerator # noqa: F401 From db87f29edd356ffa82532c360ddce3720d9dd4c1 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 20 Dec 2022 23:33:42 +0100 Subject: [PATCH 03/23] update --- docs/source-pytorch/common/trainer.rst | 4 +++ .../connectors/accelerator_connector.py | 15 +++++++-- src/pytorch_lightning/trainer/trainer.py | 6 +++- .../tuner/auto_gpu_select.py | 31 ++++++++++++++++--- 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index 1eb3f270fa1a2..d226303c2afcc 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -338,6 +338,10 @@ before any training. auto_select_gpus ^^^^^^^^^^^^^^^^ +.. warning:: ``auto_select_gpus`` has been deprecated in v1.9.0 and will be removed in v1.10.0. + Please use the function ``pytorch_lightning.accelerators.find_usable_cuda_gpus`` instead. + + .. raw:: html