Fix test suite when running on MPS-enabled hardware (#14708)

awaelchli · web-flow · commit 35c65b028714 · 2022-09-16T19:21:36.000Z
diff --git a/_notebooks b/_notebooks
@@ -1 +1 @@
-Subproject commit 8a36a41548f34c44ac455d515a72994487e85813
+Subproject commit 6d5634b7942180e6ba4a30bfbd74926d1c22f1eb
diff --git a/tests/tests_lite/accelerators/test_mps.py b/tests/tests_lite/accelerators/test_mps.py
@@ -16,6 +16,7 @@
 from tests_lite.helpers.runif import RunIf
 
 from lightning_lite.accelerators.mps import MPSAccelerator
+from lightning_lite.utilities.exceptions import MisconfigurationException
 
 _MAYBE_MPS = "mps" if MPSAccelerator.is_available() else "cpu"  # torch.device(mps) only works on torch>=1.12
 
@@ -39,11 +40,17 @@ def test_init_device_with_wrong_device_type():
     "devices,expected",
     [
         (1, [torch.device(_MAYBE_MPS, 0)]),
-        (2, [torch.device(_MAYBE_MPS, 0), torch.device(_MAYBE_MPS, 1)]),
         ([0], [torch.device(_MAYBE_MPS, 0)]),
-        # TODO(lite): This case passes with the implementation from PL, but looks like a bug
-        ([0, 2], [torch.device(_MAYBE_MPS, 0), torch.device(_MAYBE_MPS, 1)]),
+        ("1", [torch.device(_MAYBE_MPS, 0)]),
+        ("0,", [torch.device(_MAYBE_MPS, 0)]),
     ],
 )
 def test_get_parallel_devices(devices, expected):
     assert MPSAccelerator.get_parallel_devices(devices) == expected
+
+
+@RunIf(mps=True)
+@pytest.mark.parametrize("devices", [2, [0, 2], "2", "0,2"])
+def test_get_parallel_devices_invalid_request(devices):
+    with pytest.raises(MisconfigurationException, match="But your machine only has"):
+        MPSAccelerator.get_parallel_devices(devices)
diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py
@@ -689,6 +689,7 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint):
 def test_trainer_gpus(monkeypatch, trainer_kwargs):
     monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True)
     monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 4)
+    monkeypatch.setattr(device_parser, "_get_all_available_mps_gpus", lambda: list(range(4)))
     trainer = Trainer(**trainer_kwargs)
     with pytest.deprecated_call(
         match=(
diff --git a/tests/tests_pytorch/lite/test_wrappers.py b/tests/tests_pytorch/lite/test_wrappers.py
@@ -204,7 +204,7 @@ def test_lite_dataloader_iterator():
         ("cpu", "cpu"),
         pytest.param("cpu", "cuda:0", marks=RunIf(min_cuda_gpus=1)),
         pytest.param("cuda:0", "cpu", marks=RunIf(min_cuda_gpus=1)),
-        pytest.param("cpu", "mps", marks=RunIf(mps=True)),
+        # pytest.param("cpu", "mps", marks=RunIf(mps=True)),  # TODO: Add once torch.equal is supported
         pytest.param("mps", "cpu", marks=RunIf(mps=True)),
     ],
 )
@@ -222,12 +222,12 @@ def test_lite_dataloader_device_placement(src_device_str, dest_device_str):
     iterator = iter(lite_dataloader)
 
     batch0 = next(iterator)
-    # TODO: This should be torch.equal, but not supported on MPS at this time (torch 1.12)
-    assert torch.allclose(batch0, torch.tensor([0, 1], device=dest_device))
+    # TODO: torch.equal is not supported on MPS (torch 1.12); re-enable the cpu->mps case once it is
+    assert torch.equal(batch0, torch.tensor([0, 1], device=dest_device))
 
     batch1 = next(iterator)
-    # TODO: This should be torch.equal, but not supported on MPS at this time (torch 1.12)
-    assert torch.allclose(batch1["data"], torch.tensor([2, 3], device=dest_device))
+    # TODO: torch.equal is not supported on MPS (torch 1.12); re-enable the cpu->mps case once it is
+    assert torch.equal(batch1["data"], torch.tensor([2, 3], device=dest_device))
 
 
 def test_lite_optimizer_wraps():
diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py
@@ -91,7 +91,6 @@ def device_count():
     monkeypatch.setattr(device_parser, "num_cuda_devices", device_count)
 
 
-# Asking for a gpu when non are available will result in a MisconfigurationException
 @pytest.mark.parametrize(
     ["devices", "expected_root_gpu", "strategy"],
     [
@@ -104,8 +103,11 @@ def device_count():
         ("-1", None, "ddp"),
     ],
 )
-def test_root_gpu_property_0_raising(mocked_device_count_0, devices, expected_root_gpu, strategy):
-    with pytest.raises(MisconfigurationException):
+@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False)
+@mock.patch("lightning_lite.accelerators.cuda.CUDAAccelerator.is_available", return_value=False)
+def test_root_gpu_property_0_raising(_, __, devices, expected_root_gpu, strategy):
+    """Test that asking for a GPU when none are available will result in a MisconfigurationException."""
+    with pytest.raises(MisconfigurationException, match="No supported gpu backend found!"):
         Trainer(accelerator="gpu", devices=devices, strategy=strategy)
 
 
diff --git a/tests/tests_pytorch/plugins/test_amp_plugins.py b/tests/tests_pytorch/plugins/test_amp_plugins.py
@@ -39,6 +39,7 @@ class MyApexPlugin(ApexMixedPrecisionPlugin):
     pass
 
 
+@RunIf(mps=False)
 @mock.patch.dict(
     os.environ,
     {
diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py
@@ -87,7 +87,8 @@ def test_ranks_available_manual_strategy_selection(mock_gpu_acc_available, strat
 )
 @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=4)
-def test_ranks_available_automatic_strategy_selection(mock0, mock1, trainer_kwargs):
+@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=list(range(4)))
+def test_ranks_available_automatic_strategy_selection(_, __, ___, trainer_kwargs):
     """Test that the rank information is readily available after Trainer initialization."""
     num_nodes = 2
     trainer_kwargs.update(num_nodes=num_nodes)
diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 from unittest import mock
 
 import pytest
@@ -58,9 +57,8 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir):
 
 
 @RunIf(skip_windows=True)
-@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"}, clear=True)
-@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2)
+@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=list(range(2)))
 def test_torch_distributed_backend_invalid(_, __, tmpdir):
     """This test set `undefined` as torch backend and should raise an `Backend.UNDEFINED` ValueError."""
     model = BoringModel()
diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
@@ -214,7 +214,8 @@ def test_dist_backend_accelerator_mapping(*_):
 
 
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2)
-def test_ipython_incompatible_backend_error(_, monkeypatch):
+@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1])
+def test_ipython_incompatible_backend_error(_, __, monkeypatch):
     monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True)
     with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"):
         Trainer(strategy="ddp", accelerator="gpu", devices=2)
@@ -252,6 +253,7 @@ def test_ipython_compatible_strategy_ddp_fork(monkeypatch):
     assert trainer.strategy.launcher.is_interactive_compatible
 
 
+@RunIf(mps=False)
 @pytest.mark.parametrize(
     ["strategy", "strategy_class"],
     [
@@ -462,7 +464,7 @@ def test_strategy_choice_ddp_fork_cpu():
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2)
 @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
 def test_strategy_choice_ddp(*_):
-    trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1)
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="cuda", devices=1)
     assert isinstance(trainer.accelerator, CUDAAccelerator)
     assert isinstance(trainer.strategy, DDPStrategy)
     assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment)
@@ -471,8 +473,8 @@ def test_strategy_choice_ddp(*_):
 @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2)
 @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
-def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock):
-    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="gpu", devices=1)
+def test_strategy_choice_ddp_spawn(*_):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cuda", devices=1)
     assert isinstance(trainer.accelerator, CUDAAccelerator)
     assert isinstance(trainer.strategy, DDPSpawnStrategy)
     assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment)
@@ -515,13 +517,10 @@ def test_strategy_choice_ddp_slurm(_, __, strategy, job_name, expected_env):
         "TORCHELASTIC_RUN_ID": "1",
     },
 )
-@mock.patch("torch.cuda.set_device")
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2)
 @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
-@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
-@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
 def test_strategy_choice_ddp_te(*_):
-    trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=2)
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="cuda", devices=2)
     assert isinstance(trainer.accelerator, CUDAAccelerator)
     assert isinstance(trainer.strategy, DDPStrategy)
     assert isinstance(trainer.strategy.cluster_environment, TorchElasticEnvironment)
@@ -562,12 +561,10 @@ def test_strategy_choice_ddp_cpu_te(*_):
         "RANK": "1",
     },
 )
-@mock.patch("torch.cuda.set_device")
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1)
 @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
-@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
 def test_strategy_choice_ddp_kubeflow(*_):
-    trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1)
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="cuda", devices=1)
     assert isinstance(trainer.accelerator, CUDAAccelerator)
     assert isinstance(trainer.strategy, DDPStrategy)
     assert isinstance(trainer.strategy.cluster_environment, KubeflowEnvironment)
@@ -780,10 +777,10 @@ def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_acce
     assert isinstance(trainer.accelerator, expected_accelerator_class)
 
 
+@RunIf(mps=False)
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1)
 def test_gpu_accelerator_backend_choice_cuda(_):
     trainer = Trainer(accelerator="gpu")
-
     assert trainer._accelerator_connector._accelerator_flag == "cuda"
     assert isinstance(trainer.accelerator, CUDAAccelerator)
 
diff --git a/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py b/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py
@@ -48,6 +48,7 @@ def test_pick_multiple_gpus_more_than_available(*_):
         pick_multiple_gpus(3)
 
 
+@RunIf(mps=False)
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2)
 @mock.patch("pytorch_lightning.trainer.connectors.accelerator_connector.pick_multiple_gpus", return_value=[1])
 def test_auto_select_gpus(*_):
diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py
@@ -113,6 +113,7 @@ def test_num_stepping_batches_accumulate_gradients(accumulate_grad_batches, expe
     assert trainer.estimated_stepping_batches == expected_steps
 
 
+@RunIf(mps=False)
 @pytest.mark.parametrize(
     ["trainer_kwargs", "estimated_steps"],
     [
diff --git a/tests/tests_pytorch/trainer/test_supporters.py b/tests/tests_pytorch/trainer/test_supporters.py
@@ -316,11 +316,10 @@ def test_nested_calc_num_data(input_data, compute_func, expected_length):
 @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
 @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2)
 @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True)
+@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1])
 @pytest.mark.parametrize("use_fault_tolerant", [False, True])
 @pytest.mark.parametrize("replace_sampler_ddp", [False, True])
-def test_combined_data_loader_validation_test(
-    cuda_available_mock, device_count_mock, use_fault_tolerant, replace_sampler_ddp, tmpdir
-):
+def test_combined_data_loader_validation_test(_, __, ___, use_fault_tolerant, replace_sampler_ddp, tmpdir):
     """This test makes sure distributed sampler has been properly injected in dataloaders when using
     CombinedLoader."""
 
diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py

Original file line number	Diff line number	Diff line change
`@@ -39,6 +39,7 @@ class MyApexPlugin(ApexMixedPrecisionPlugin):`
`39`	`39`	`pass`
`40`	`40`
`41`	`41`
	`42`	`+@RunIf(mps=False)`
`42`	`43`	`@mock.patch.dict(`
`43`	`44`	`os.environ,`
`44`	`45`	`{`