Merged

82 commits
3e187d1
update
tchaton Nov 6, 2022
092b36a
update
tchaton Nov 6, 2022
fcb2ea2
update
tchaton Nov 6, 2022
6481338
Merge branch 'master' into add_multi_node_examples
tchaton Nov 6, 2022
e1271ce
update
tchaton Nov 6, 2022
478d0f0
Merge branch 'add_multi_node_examples' of https://github.com/Lightnin…
tchaton Nov 6, 2022
0c5a079
update
tchaton Nov 6, 2022
804c5cb
update
tchaton Nov 6, 2022
baf1cae
update
tchaton Nov 6, 2022
a393f58
update
tchaton Nov 6, 2022
38f1c72
update
tchaton Nov 6, 2022
4ddb3ae
update
tchaton Nov 6, 2022
ed93320
update
tchaton Nov 6, 2022
402b6fd
update
tchaton Nov 6, 2022
dece823
update
tchaton Nov 6, 2022
4b7e8af
update
tchaton Nov 6, 2022
db336d3
update
tchaton Nov 6, 2022
2cd0d54
update
tchaton Nov 6, 2022
7da57cd
update
tchaton Nov 6, 2022
651590e
update
tchaton Nov 6, 2022
17ac6db
update
tchaton Nov 6, 2022
589ff92
update
tchaton Nov 6, 2022
d221d35
update
tchaton Nov 6, 2022
53597c7
Merge branch 'master' into add_multi_node_examples
tchaton Nov 6, 2022
fa6def5
update
tchaton Nov 6, 2022
0adcdf3
Merge branch 'add_multi_node_examples' of https://github.com/Lightnin…
tchaton Nov 6, 2022
f2fa720
update
tchaton Nov 6, 2022
7778c58
update
tchaton Nov 6, 2022
c005373
update
tchaton Nov 6, 2022
f8fda2e
update
tchaton Nov 6, 2022
00119f6
update
tchaton Nov 6, 2022
0f4e5e5
update
tchaton Nov 6, 2022
9e437df
update
tchaton Nov 6, 2022
dec2def
update
tchaton Nov 6, 2022
2c4b26c
update
tchaton Nov 6, 2022
b596dbe
update
tchaton Nov 6, 2022
45fba58
update
tchaton Nov 6, 2022
8484601
update
tchaton Nov 6, 2022
45dde23
update
tchaton Nov 6, 2022
09b1b5d
update
tchaton Nov 6, 2022
0732886
update
tchaton Nov 6, 2022
6259800
update
tchaton Nov 7, 2022
f48004d
update
tchaton Nov 7, 2022
56b4bc9
update
tchaton Nov 7, 2022
e45ea15
update
tchaton Nov 7, 2022
089b677
update
tchaton Nov 7, 2022
5b14153
update
tchaton Nov 7, 2022
6433868
update
tchaton Nov 7, 2022
7ed3313
update
tchaton Nov 7, 2022
3e78c0c
update
tchaton Nov 7, 2022
7c8e82f
update
tchaton Nov 7, 2022
af7eb60
update
tchaton Nov 7, 2022
b89c47d
update
tchaton Nov 7, 2022
bef788a
update
tchaton Nov 7, 2022
61b23d9
update
tchaton Nov 7, 2022
381e013
update
tchaton Nov 7, 2022
44a1cc2
Merge branch 'master' into expose_work_runner
tchaton Nov 7, 2022
ea01249
update
tchaton Nov 7, 2022
e39d1da
Merge branch 'expose_work_runner' of https://github.com/Lightning-AI/…
tchaton Nov 7, 2022
e079633
update
tchaton Nov 7, 2022
3ea7b54
update
tchaton Nov 7, 2022
7fde3c5
update
tchaton Nov 7, 2022
a60a480
update
tchaton Nov 7, 2022
f0a402e
update
tchaton Nov 7, 2022
9921cf1
update
tchaton Nov 7, 2022
6067c9a
update
tchaton Nov 7, 2022
34d618e
update
tchaton Nov 7, 2022
2af5549
Merge branch 'master' into expose_work_runner
tchaton Nov 7, 2022
2227f35
Merge branch 'master' into expose_work_runner
tchaton Nov 7, 2022
a909169
Apply suggestions from code review
Borda Nov 7, 2022
7975fa8
update
tchaton Nov 8, 2022
512bd73
update
tchaton Nov 8, 2022
9ac00b1
Merge branch 'master' into expose_work_runner
tchaton Nov 8, 2022
86e27c9
update
tchaton Nov 8, 2022
6c03a31
update
tchaton Nov 8, 2022
ffea172
update
tchaton Nov 8, 2022
e5a4880
add note
tchaton Nov 8, 2022
8c59907
Update examples/app_multi_node/README.md
tchaton Nov 8, 2022
74dfa0f
Merge branch 'master' into expose_work_runner
tchaton Nov 8, 2022
b157a21
update
tchaton Nov 8, 2022
e1bce24
Merge branch 'expose_work_runner' of https://github.com/Lightning-AI/…
tchaton Nov 8, 2022
41b6b81
Merge branch 'master' into expose_work_runner
tchaton Nov 8, 2022
2 changes: 2 additions & 0 deletions .github/workflows/ci-app-examples.yml
@@ -100,6 +100,8 @@ jobs:
if: ${{ matrix.pkg-name != 'lightning' }}
run: |
python .actions/assistant.py copy_replace_imports --source_dir="./examples" --source_import="lightning.app,lightning" --target_import="lightning_app,lightning_app"
python .actions/assistant.py copy_replace_imports --source_dir="./examples" --source_import="lightning_app.lite" --target_import="lightning_lite"
python .actions/assistant.py copy_replace_imports --source_dir="./examples" --source_import="lightning_app.pytorch" --target_import="pytorch_lightning"

- name: Switch coverage scope
run: python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_app'))" >> $GITHUB_ENV
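For context, a minimal sketch of what the two new rewrite passes do to the examples (assuming `copy_replace_imports` performs a plain string substitution on import paths, as its arguments suggest):

```python
# As written in the examples (unified "lightning" package):
from lightning.lite import LightningLite

# The first pass rewrites "lightning" -> "lightning_app", yielding
# "lightning_app.lite"; the new pass then maps that onto the standalone
# package, so the CI copy of the example ends up importing:
from lightning_lite import LightningLite  # noqa: F811 (illustrative re-import)
```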
18 changes: 14 additions & 4 deletions examples/app_multi_node/README.md
@@ -6,32 +6,42 @@ Lightning makes multi-node training simple by providing a simple interface.

You can run the raw PyTorch multi-node examples by running the following commands.

Here is an example where you set up and spawn the processes yourself.

```bash
lightning run app app_torch_work.py
```

or you can use the built-in component for it.

```bash
lightning run app app_component_torch.py
```
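Either command can also be submitted to the cloud by appending the Lightning CLI's `--cloud` flag, e.g. `lightning run app app_component_torch.py --cloud`.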

## Multi Node with raw PyTorch + Lite

You can run multi-node raw PyTorch with Lite by running the following commands.

This removes all the boilerplate around the distributed strategy, while you remain in control of your loops.

```bash
lightning run app app_lite_work.py
lightning run app app_component_lite.py
```

## Multi Node with PyTorch Lightning

Lightning supports running PyTorch Lightning from a script or within a Lightning Work.

### Multi Node PyTorch Lightning Script
You can either run a script directly:

```bash
lightning run app app_pl_script.py
```

### Multi Node PyTorch Lightning Work
or run your code within a Work:

```bash
lightning run app app_pl_work.py
lightning run app app_component_pl.py
```

## Multi Node with any framework
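For reference, here is a minimal sketch of the framework-agnostic `MultiNode` component this section refers to, following the constructor shown in `multi_node/base.py` further below (the work class is illustrative):

```python
import lightning as L
from lightning.app.components import MultiNode


class AnyFrameworkWork(L.LightningWork):
    def run(self):
        # Framework-agnostic distributed setup and training go here.
        print("Running on one node of the cluster.")


app = L.LightningApp(
    MultiNode(
        AnyFrameworkWork,
        num_nodes=2,
        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
    )
)
```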
37 changes: 37 additions & 0 deletions examples/app_multi_node/app_component_lite.py
@@ -0,0 +1,37 @@
import torch

import lightning as L
from lightning.app.components import LiteMultiNode
from lightning.lite import LightningLite


class LitePyTorchDistributed(L.LightningWork):
@staticmethod
def run():
# 1. Create LightningLite.
lite = LightningLite(strategy="ddp", precision="bf16")

# 2. Prepare distributed model and optimizer.
model = torch.nn.Linear(32, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
model, optimizer = lite.setup(model, optimizer)
criterion = torch.nn.MSELoss()

# 3. Train the model for 50 steps.
for step in range(50):
model.zero_grad()
x = torch.randn(64, 32).to(lite.device)
output = model(x)
loss = criterion(output, torch.ones_like(output))
print(f"global_rank: {lite.global_rank} step: {step} loss: {loss}")
lite.backward(loss)
optimizer.step()


app = L.LightningApp(
LiteMultiNode(
LitePyTorchDistributed,
cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
num_nodes=2,
)
)
24 changes: 24 additions & 0 deletions examples/app_multi_node/app_component_pl.py
@@ -0,0 +1,24 @@
import lightning as L
from lightning.app.components import PyTorchLightningMultiNode
from lightning.pytorch.demos.boring_classes import BoringModel


class PyTorchLightningDistributed(L.LightningWork):
@staticmethod
def run():
model = BoringModel()
trainer = L.Trainer(
max_epochs=10,
strategy="ddp",
)
trainer.fit(model)


compute = L.CloudCompute("gpu-fast-multi") # 4 x V100
app = L.LightningApp(
PyTorchLightningMultiNode(
PyTorchLightningDistributed,
num_nodes=2,
cloud_compute=compute,
)
)
46 changes: 46 additions & 0 deletions examples/app_multi_node/app_component_torch.py
@@ -0,0 +1,46 @@
import torch
from torch.nn.parallel.distributed import DistributedDataParallel

import lightning as L
from lightning.app.components import PyTorchSpawnMultiNode


class PyTorchDistributed(L.LightningWork):

# Note: only static methods are supported with `PyTorchSpawnMultiNode` for now
@staticmethod
def run(
world_size: int,
node_rank: int,
global_rank: int,
local_rank: int,
):
# 1. Prepare distributed model
model = torch.nn.Linear(32, 2)
device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
device_ids = [local_rank] if torch.cuda.is_available() else None
model = DistributedDataParallel(model.to(device), device_ids=device_ids)

# 2. Prepare loss and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# 3. Train the model for 50 steps.
for step in range(50):
model.zero_grad()
x = torch.randn(64, 32).to(device)
output = model(x)
loss = criterion(output, torch.ones_like(output))
print(f"global_rank: {global_rank} step: {step} loss: {loss}")
loss.backward()
optimizer.step()


compute = L.CloudCompute("gpu-fast-multi") # 4 x V100
app = L.LightningApp(
PyTorchSpawnMultiNode(
PyTorchDistributed,
num_nodes=2,
cloud_compute=compute,
)
)
59 changes: 0 additions & 59 deletions examples/app_multi_node/app_lite_work.py

This file was deleted.

38 changes: 0 additions & 38 deletions examples/app_multi_node/app_pl_work.py

This file was deleted.

2 changes: 1 addition & 1 deletion examples/app_multi_node/app_torch_work.py
@@ -60,7 +60,7 @@ def run(
)


compute = L.CloudCompute("gpu-fast-multi") # 4xV100
compute = L.CloudCompute("gpu-fast-multi") # 4 x V100
app = L.LightningApp(
MultiNode(
PyTorchDistributed,
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -59,7 +59,10 @@ warn_no_return = "False"
# the list can be generated with:
# mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",'
module = [
"lightning_app.components.multi_node",
"lightning_app.components.multi_node.lite",
"lightning_app.components.multi_node.base",
"lightning_app.components.multi_node.pytorch_spawn",
"lightning_app.components.multi_node.pl",
"lightning_app.api.http_methods",
"lightning_app.api.request_types",
"lightning_app.cli.commands.app_commands",
1 change: 1 addition & 0 deletions requirements/app/examples.txt
@@ -1 +1,2 @@
pytorch-lightning>=1.8.0
lightning_lite
2 changes: 1 addition & 1 deletion src/lightning_app/CHANGELOG.md
@@ -14,7 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Added a `MultiNode` Component to run distributed computation with any framework ([#15524](https://github.com/Lightning-AI/lightning/pull/15524))

-
- Expose `RunWorkExecutor` to the work and provide default ones for the `MultiNode` Component ([#15561](https://github.com/Lightning-AI/lightning/pull/15561))


### Changed
10 changes: 9 additions & 1 deletion src/lightning_app/components/__init__.py
@@ -1,6 +1,11 @@
from lightning_app.components.database.client import DatabaseClient
from lightning_app.components.database.server import Database
from lightning_app.components.multi_node import MultiNode
from lightning_app.components.multi_node import (
LiteMultiNode,
MultiNode,
PyTorchLightningMultiNode,
PyTorchSpawnMultiNode,
)
from lightning_app.components.python.popen import PopenPythonScript
from lightning_app.components.python.tracer import Code, TracerPythonScript
from lightning_app.components.serve.gradio import ServeGradio
@@ -18,6 +23,9 @@
"ServeStreamlit",
"ModelInferenceAPI",
"MultiNode",
"LiteMultiNode",
"LightningTrainingComponent",
"PyTorchLightningScriptRunner",
"PyTorchSpawnMultiNode",
"PyTorchLightningMultiNode",
]
6 changes: 6 additions & 0 deletions src/lightning_app/components/multi_node/__init__.py
@@ -0,0 +1,6 @@
from lightning_app.components.multi_node.base import MultiNode
from lightning_app.components.multi_node.lite import LiteMultiNode
from lightning_app.components.multi_node.pl import PyTorchLightningMultiNode
from lightning_app.components.multi_node.pytorch_spawn import PyTorchSpawnMultiNode

__all__ = ["LiteMultiNode", "MultiNode", "PyTorchSpawnMultiNode", "PyTorchLightningMultiNode"]
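With the updated `__all__` above, the new components are importable directly from `lightning_app.components` (or `lightning.app.components` in the unified package, as the examples do):

```python
from lightning_app.components import (
    LiteMultiNode,
    MultiNode,
    PyTorchLightningMultiNode,
    PyTorchSpawnMultiNode,
)
```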
src/lightning_app/components/multi_node/base.py
@@ -1,10 +1,11 @@
from typing import Any, Type
from typing import Any, Callable, Optional, Type, Union

from lightning_app import structures
from lightning_app.core.flow import LightningFlow
from lightning_app.core.work import LightningWork
from lightning_app.utilities.enum import WorkStageStatus
from lightning_app.utilities.packaging.cloud_compute import CloudCompute
from lightning_app.utilities.proxies import WorkRunExecutor


class MultiNode(LightningFlow):
@@ -13,6 +14,7 @@ def __init__(
work_cls: Type["LightningWork"],
num_nodes: int,
cloud_compute: "CloudCompute",
executor_cls: Optional[Union[Type[WorkRunExecutor], Callable]] = None,
*work_args: Any,
**work_kwargs: Any,
) -> None:
@@ -48,6 +50,7 @@ def run(
work_cls: The work to be executed
num_nodes: Number of nodes.
cloud_compute: The cloud compute object used in the cloud.
executor_cls: Customize the work run method execution.
work_args: Arguments to be provided to the work on instantiation.
work_kwargs: Keywords arguments to be provided to the work on instantiation.
"""
@@ -58,6 +61,10 @@
self._cloud_compute = cloud_compute
self._work_args = work_args
self._work_kwargs = work_kwargs

if executor_cls:
self._work_kwargs["run_executor_cls"] = executor_cls

self.has_started = False

def run(self) -> None:
@@ -74,6 +81,7 @@ def run(self) -> None:
parallel=True,
)
)

# Starting node `node_rank` ...
self.ws[-1].start()

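Since `executor_cls` is forwarded to the work as the `run_executor_cls` kwarg (see the `__init__` change above), a custom executor can hook into how each node's `run` is invoked. A minimal sketch, assuming `WorkRunExecutor` is invoked via `__call__` (the exact interface lives in `lightning_app.utilities.proxies` and may differ):

```python
from lightning_app.utilities.proxies import WorkRunExecutor


class LoggingWorkRunExecutor(WorkRunExecutor):
    """Hypothetical executor that logs around the work's run method."""

    def __call__(self, *args, **kwargs):
        print("Starting the work's run method.")
        result = super().__call__(*args, **kwargs)
        print("Finished the work's run method.")
        return result


# Passed through the MultiNode constructor shown above, e.g.:
# MultiNode(MyWork, num_nodes=2, cloud_compute=L.CloudCompute("gpu-fast-multi"),
#           executor_cls=LoggingWorkRunExecutor)
```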