Lightning-AI · Borda · Dec 8, 2022 · Nov 10, 2022 · Nov 21, 2022 · Nov 21, 2022
@@ -30,6 +30,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed debugging with VSCode IDE ([#15747](https://github.com/Lightning-AI/lightning/pull/15747))
 
+- Fixed MPS error for multinode component (defaults to CPU on MPS devices now, as distributed operations are not supported by PyTorch on MPS) ([#15748](https://github.com/Lightning-AI/lightning/pull/15748))
+
+
 
 ## [1.8.2] - 2022-11-17
 
@@ -52,12 +55,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed bi-directional queues sending delta with Drive Component name changes ([#15642](https://github.com/Lightning-AI/lightning/pull/15642))
 - Fixed CloudRuntime works collection with structures and accelerated multi node startup time ([#15650](https://github.com/Lightning-AI/lightning/pull/15650))
 - Fixed catimage import ([#15712](https://github.com/Lightning-AI/lightning/pull/15712))
-- Parse all lines in app file looking for shebangs to run commands ([#15714](https://github.com/Lightning-AI/lightning/pull/15714))
-
+- Parse all lines in app file looking for shebangs to run commands ([#15714](https://github.com/Lightning-AI/lightning/pull/15714))
 - Fixed setting property to the LightningFlow ([#15750](https://github.com/Lightning-AI/lightning/pull/15750))
 
 
 
+
 ## [1.8.1] - 2022-11-10
 
 

@@ -1,4 +1,5 @@
 import os
+import warnings
 from dataclasses import dataclass
 from typing import Any, Callable, Type
 
@@ -31,6 +32,7 @@ def run(
         nprocs: int,
     ):
         from lightning.lite import LightningLite
+        from lightning.lite.accelerators import MPSAccelerator
         from lightning.lite.strategies import DDPSpawnShardedStrategy, DDPSpawnStrategy
 
         # Used to configure PyTorch progress group
@@ -52,7 +54,18 @@ def run(
         def pre_fn(lite, *args, **kwargs):
             kwargs["devices"] = nprocs
             kwargs["num_nodes"] = num_nodes
-            kwargs["accelerator"] = "auto"
+
+            if MPSAccelerator.is_available():
+                old_acc_value = kwargs.get("accelerator", "auto")
+                kwargs["accelerator"] = "cpu"
+
+                if old_acc_value != kwargs["accelerator"]:
+                    warnings.warn(
+                        "Forcing accelerator=cpu as other accelerators (specifically MPS) are not supported "
+                        "by PyTorch for distributed training on mps capable devices"
+                    )
+            else:
+                kwargs["accelerator"] = "auto"
             strategy = kwargs.get("strategy", None)
             if strategy:
                 if isinstance(strategy, str):
@@ -61,14 +74,18 @@ def pre_fn(lite, *args, **kwargs):
                     elif strategy == "ddp_sharded_spawn":
                         strategy = "ddp_sharded"
                 elif isinstance(strategy, (DDPSpawnStrategy, DDPSpawnShardedStrategy)):
-                    raise Exception("DDP Spawned strategies aren't supported yet.")
+                    raise ValueError("DDP Spawned strategies aren't supported yet.")
+
+            kwargs["strategy"] = strategy
+
             return {}, args, kwargs
 
         tracer = Tracer()
         tracer.add_traced(LightningLite, "__init__", pre_fn=pre_fn)
         tracer._instrument()
-        work_run()
+        ret_val = work_run()
         tracer._restore()
+        return ret_val
 
 
 class LiteMultiNode(MultiNode):

@@ -88,7 +88,7 @@ def run(
         elif world_size > 1:
             raise Exception("Torch distributed should be available.")
 
-        work_run(world_size, node_rank, global_rank, local_rank)
+        return work_run(world_size, node_rank, global_rank, local_rank)
 
 
 class PyTorchSpawnMultiNode(MultiNode):

@@ -1,4 +1,5 @@
 import os
+import warnings
 from dataclasses import dataclass
 from typing import Any, Callable, Type
 
@@ -30,8 +31,9 @@ def run(
         node_rank: int,
         nprocs: int,
     ):
-        from lightning.lite.strategies import DDPSpawnShardedStrategy, DDPSpawnStrategy
         from lightning.pytorch import Trainer as LTrainer
+        from lightning.pytorch.accelerators import MPSAccelerator
+        from lightning.pytorch.strategies import DDPSpawnShardedStrategy, DDPSpawnStrategy
         from pytorch_lightning import Trainer as PLTrainer
 
         # Used to configure PyTorch progress group
@@ -50,7 +52,15 @@ def run(
         def pre_fn(trainer, *args, **kwargs):
             kwargs["devices"] = nprocs
             kwargs["num_nodes"] = num_nodes
-            kwargs["accelerator"] = "auto"
+            if MPSAccelerator.is_available():
+                old_acc_value = kwargs.get("accelerator", "auto")
+                kwargs["accelerator"] = "cpu"
+
+                if old_acc_value != kwargs["accelerator"]:
+                    warnings.warn(
+                        "Forcing accelerator=cpu as other accelerators (specifically MPS) are not supported "
+                        "by PyTorch for distributed training on mps capable devices"
+                    )
             strategy = kwargs.get("strategy", None)
             if strategy:
                 if isinstance(strategy, str):
@@ -59,15 +69,17 @@ def pre_fn(trainer, *args, **kwargs):
                     elif strategy == "ddp_sharded_spawn":
                         strategy = "ddp_sharded"
                 elif isinstance(strategy, (DDPSpawnStrategy, DDPSpawnShardedStrategy)):
-                    raise Exception("DDP Spawned strategies aren't supported yet.")
+                    raise ValueError("DDP Spawned strategies aren't supported yet.")
+                kwargs["strategy"] = strategy
             return {}, args, kwargs
 
         tracer = Tracer()
         tracer.add_traced(PLTrainer, "__init__", pre_fn=pre_fn)
         tracer.add_traced(LTrainer, "__init__", pre_fn=pre_fn)
         tracer._instrument()
-        work_run()
+        ret_val = work_run()
         tracer._restore()
+        return ret_val
 
 
 class LightningTrainerMultiNode(MultiNode):

@@ -0,0 +1,98 @@
+import os
+from copy import deepcopy
+from functools import partial
+from unittest import mock
+
+import pytest
+from lightning_utilities.core.imports import module_available
+from tests_app.helpers.utils import no_warning_call
+
+import lightning as L
+from lightning_app.components.multi_node.lite import _LiteRunExecutor
+
+
+def dummy_callable(**kwargs):
+    ll = L.lite.LightningLite(**kwargs)
+    return ll._all_passed_kwargs
+
+
+def dummy_init(self, **kwargs):
+    """Stand-in ``__init__`` that only stores the received keyword arguments on the instance."""
+    self._all_passed_kwargs = kwargs
+
+
+def _get_args_after_tracer_injection(**kwargs):
+    with mock.patch.object(L.lite.LightningLite, "__init__", dummy_init):
+        ret_val = _LiteRunExecutor.run(
+            local_rank=0,
+            work_run=partial(dummy_callable, **kwargs),
+            main_address="1.2.3.4",
+            main_port=5,
+            node_rank=6,
+            num_nodes=7,
+            nprocs=8,
+        )
+        env_vars = deepcopy(os.environ)
+    return ret_val, env_vars
+
+
+@pytest.mark.skipif(not module_available("lightning.lite"), reason="Lightning.lite not available")
+@pytest.mark.skipif(not L.lite.accelerators.MPSAccelerator.is_available(), reason="mps not available")
+@pytest.mark.parametrize("accelerator_given,accelerator_expected", [("cpu", "cpu"), ("auto", "cpu"), ("gpu", "cpu")])
+def test_lite_run_executor_mps_forced_cpu(accelerator_given, accelerator_expected):
+    warning_str = (
+        r"Forcing accelerator=cpu as other accelerators \(specifically MPS\) are not supported "
+        + "by PyTorch for distributed training on mps capable devices"
+    )
+    if accelerator_expected != accelerator_given:
+        warning_context = pytest.warns(UserWarning, match=warning_str)
+    else:
+        warning_context = no_warning_call(match=warning_str + "*")
+
+    with warning_context:
+        ret_val, env_vars = _get_args_after_tracer_injection(accelerator=accelerator_given)
+    assert ret_val["accelerator"] == accelerator_expected
+
+
+@pytest.mark.parametrize(
+    "args_given,args_expected",
+    [
+        (
+            {
+                "devices": 1,
+                "num_nodes": 1,
+                "accelerator": "gpu",
+            },
+            {"devices": 8, "num_nodes": 7, "accelerator": "auto"},
+        ),
+        ({"strategy": "ddp_spawn"}, {"strategy": "ddp"}),
+        ({"strategy": "ddp_sharded_spawn"}, {"strategy": "ddp_sharded"}),
+    ],
+)
+def test_trainer_run_executor_arguments_choices(args_given: dict, args_expected: dict):
+
+    # ddp with mps devices not available (tested separately, just patching here for cross-os testing of other args)
+    if L.lite.accelerators.MPSAccelerator.is_available():
+        args_expected["accelerator"] = "cpu"
+
+    ret_val, env_vars = _get_args_after_tracer_injection(**args_given)
+
+    for k, v in args_expected.items():
+        assert ret_val[k] == v
+
+    assert env_vars["MASTER_ADDR"] == "1.2.3.4"
+    assert env_vars["MASTER_PORT"] == "5"
+    assert env_vars["GROUP_RANK"] == "6"
+    assert env_vars["RANK"] == str(0 + 6 * 8)
+    assert env_vars["LOCAL_RANK"] == "0"
+    assert env_vars["WORLD_SIZE"] == str(7 * 8)
+    assert env_vars["LOCAL_WORLD_SIZE"] == "8"
+    assert env_vars["TORCHELASTIC_RUN_ID"] == "1"
+    assert env_vars["LT_CLI_USED"] == "1"
+
+
+def test_lite_run_executor_invalid_strategy_instances():
+    with pytest.raises(ValueError, match="DDP Spawned strategies aren't supported yet."):
+        _, _ = _get_args_after_tracer_injection(strategy=L.lite.strategies.DDPSpawnStrategy())
+
+    with pytest.raises(ValueError, match="DDP Spawned strategies aren't supported yet."):
+        _, _ = _get_args_after_tracer_injection(strategy=L.lite.strategies.DDPSpawnShardedStrategy())
@@ -0,0 +1,99 @@
+import os
+from copy import deepcopy
+from functools import partial
+from unittest import mock
+
+import pytest
+from lightning_utilities.core.imports import module_available
+from tests_app.helpers.utils import no_warning_call
+
+import lightning as L
+from lightning_app.components.multi_node.trainer import _LightningTrainerRunExecutor
+
+
+def dummy_callable(**kwargs):
+    t = L.pytorch.Trainer(**kwargs)
+    return t._all_passed_kwargs
+
+
+def dummy_init(self, **kwargs):
+    """Stand-in ``__init__`` that only stores the received keyword arguments on the instance."""
+    self._all_passed_kwargs = kwargs
+
+
+def _get_args_after_tracer_injection(**kwargs):
+    with mock.patch.object(L.pytorch.Trainer, "__init__", dummy_init):
+        ret_val = _LightningTrainerRunExecutor.run(
+            local_rank=0,
+            work_run=partial(dummy_callable, **kwargs),
+            main_address="1.2.3.4",
+            main_port=5,
+            node_rank=6,
+            num_nodes=7,
+            nprocs=8,
+        )
+        env_vars = deepcopy(os.environ)
+    return ret_val, env_vars
+
+
+@pytest.mark.skipif(not module_available("lightning.pytorch"), reason="lightning.pytorch not available")
+@pytest.mark.skipif(
+    not L.pytorch.accelerators.MPSAccelerator.is_available(), reason="MPS not available but required for this test"
+)
+@pytest.mark.parametrize("accelerator_given,accelerator_expected", [("cpu", "cpu"), ("auto", "cpu"), ("gpu", "cpu")])
+def test_trainer_run_executor_mps_forced_cpu(accelerator_given, accelerator_expected):
+    warning_str = (
+        r"Forcing accelerator=cpu as other accelerators \(specifically MPS\) are not supported "
+        + "by PyTorch for distributed training on mps capable devices"
+    )
+    if accelerator_expected != accelerator_given:
+        warning_context = pytest.warns(UserWarning, match=warning_str)
+    else:
+        warning_context = no_warning_call(match=warning_str + "*")
+
+    with warning_context:
+        ret_val, env_vars = _get_args_after_tracer_injection(accelerator=accelerator_given)
+    assert ret_val["accelerator"] == accelerator_expected
+
+
+@pytest.mark.parametrize(
+    "args_given,args_expected",
+    [
+        (
+            {
+                "devices": 1,
+                "num_nodes": 1,
+                "accelerator": "gpu",
+            },
+            {"devices": 8, "num_nodes": 7, "accelerator": "auto"},
+        ),
+        ({"strategy": "ddp_spawn"}, {"strategy": "ddp"}),
+        ({"strategy": "ddp_sharded_spawn"}, {"strategy": "ddp_sharded"}),
+    ],
+)
+def test_trainer_run_executor_arguments_choices(args_given: dict, args_expected: dict):
+
+    # ddp with mps devices not available (tested separately, just patching here for cross-os testing of other args)
+    if L.pytorch.accelerators.MPSAccelerator.is_available():
+        args_expected["accelerator"] = "cpu"
+
+    ret_val, env_vars = _get_args_after_tracer_injection(**args_given)
+
+    for k, v in args_expected.items():
+        assert ret_val[k] == v
+
+    assert env_vars["MASTER_ADDR"] == "1.2.3.4"
+    assert env_vars["MASTER_PORT"] == "5"
+    assert env_vars["GROUP_RANK"] == "6"
+    assert env_vars["RANK"] == str(0 + 6 * 8)
+    assert env_vars["LOCAL_RANK"] == "0"
+    assert env_vars["WORLD_SIZE"] == str(7 * 8)
+    assert env_vars["LOCAL_WORLD_SIZE"] == "8"
+    assert env_vars["TORCHELASTIC_RUN_ID"] == "1"
+
+
+def test_trainer_run_executor_invalid_strategy_instances():
+    with pytest.raises(ValueError, match="DDP Spawned strategies aren't supported yet."):
+        _, _ = _get_args_after_tracer_injection(strategy=L.pytorch.strategies.DDPSpawnStrategy())
+
+    with pytest.raises(ValueError, match="DDP Spawned strategies aren't supported yet."):
+        _, _ = _get_args_after_tracer_injection(strategy=L.pytorch.strategies.DDPSpawnShardedStrategy())
diff --git a/tests/tests_app/helpers/__init__.py b/tests/tests_app/helpers/__init__.py
diff --git a/tests/tests_app/helpers/utils.py b/tests/tests_app/helpers/utils.py
@@ -0,0 +1,27 @@
+import re
+from contextlib import contextmanager
+from typing import Optional, Type
+
+import pytest
+
+
+@contextmanager
+def no_warning_call(expected_warning: Type[Warning] = UserWarning, match: Optional[str] = None):
+    with pytest.warns(None) as record:
+        yield
+
+    if match is None:
+        try:
+            w = record.pop(expected_warning)
+        except AssertionError:
+            # no warning raised
+            return
+    else:
+        for w in record.list:
+            if w.category is expected_warning and re.compile(match).search(w.message.args[0]):
+                break
+        else:
+            return
+
+    msg = "A warning" if expected_warning is None else f"`{expected_warning.__name__}`"
+    raise AssertionError(f"{msg} was raised: {w}")