Commit f9a6573

[App] Expose Run Work Executor (#15561)
1 parent 3ea8903 commit f9a6573

File tree: 25 files changed, +497 / -133 lines

examples/app_multi_node/README.md

Lines changed: 17 additions & 7 deletions
@@ -6,36 +6,46 @@ Lightning makes multi-node training simple by providing a simple interface
 
 You can run the multi-node raw PyTorch by running the following commands.
 
+Here is an example where you spawn your processes yourself.
+
+```bash
+lightning run app train_pytorch.py
+```
+
+or you can use the built-in component for it.
+
 ```bash
-lightning run app app_torch_work.py
+lightning run app train_pytorch_spawn.py
 ```
 
 ## Multi Node with raw PyTorch + Lite
 
 You can run the multi-node raw PyTorch and Lite by running the following commands.
 
 ```bash
-lightning run app app_lite_work.py
+lightning run app train_lite.py
 ```
 
+Using Lite, you retain control over your loops while getting access to all of Lightning's distributed strategies with minimal changes.
+
 ## Multi Node with PyTorch Lightning
 
 Lightning supports running PyTorch Lightning from a script or within a Lightning Work.
 
-### Multi Node PyTorch Lightning Script
+You can either run a script directly
 
 ```bash
-lightning run app app_pl_script.py
+lightning run app train_pl_script.py
 ```
 
-### Multi Node PyTorch Lightning Work
+or run your code within a work.
 
 ```bash
-lightning run app app_pl_work.py
+lightning run app train_pl.py
 ```
 
 ## Multi Node with any frameworks
 
 ```bash
-lightning run app app_generic_work.py
+lightning run app train_any.py
 ```
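
Each of the commands above points at a small app file with the same overall shape: a `LightningWork` whose `run` method receives the multi-node coordinates, wrapped in a multi-node component together with a `CloudCompute`. Below is a minimal sketch of that pattern, assuming the generic `MultiNode` component is importable from `lightning.app.components` like the other components in this commit, with the `run` signature inferred from the `train_any.py` diff further down:

```python
import lightning as L
from lightning.app.components import MultiNode


class AnyDistributedComponent(L.LightningWork):
    def run(
        self,
        main_address: str,
        main_port: int,
        num_nodes: int,
        node_rank: int,
    ):
        # MultiNode hands every node the address/port of the main node plus its own
        # rank, so any framework can bootstrap its process group from these values.
        print(f"ADD YOUR DISTRIBUTED CODE: {main_address} {main_port} {num_nodes} {node_rank}.")


app = L.LightningApp(
    MultiNode(
        AnyDistributedComponent,
        num_nodes=2,
        cloud_compute=L.CloudCompute("gpu"),
    )
)
```

You would launch such a file with `lightning run app train_any.py`, as the README commands show.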

examples/app_multi_node/app_lite_work.py

Lines changed: 0 additions & 59 deletions
This file was deleted.

examples/app_multi_node/app_pl_work.py

Lines changed: 0 additions & 38 deletions
This file was deleted.

examples/app_multi_node/app_generic_work.py renamed to examples/app_multi_node/train_any.py

Lines changed: 1 addition & 2 deletions
@@ -13,11 +13,10 @@ def run(
         print(f"ADD YOUR DISTRIBUTED CODE: {main_address} {main_port} {num_nodes} {node_rank}.")
 
 
-compute = L.CloudCompute("gpu")
 app = L.LightningApp(
     MultiNode(
         AnyDistributedComponent,
         num_nodes=2,
-        cloud_compute=compute,
+        cloud_compute=L.CloudCompute("gpu"),
     )
 )

examples/app_multi_node/train_lite.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+import torch
+
+import lightning as L
+from lightning.app.components import LiteMultiNode
+from lightning.lite import LightningLite
+
+
+class LitePyTorchDistributed(L.LightningWork):
+    @staticmethod
+    def run():
+        # 1. Create LightningLite.
+        lite = LightningLite(strategy="ddp", precision="bf16")
+
+        # 2. Prepare distributed model and optimizer.
+        model = torch.nn.Linear(32, 2)
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+        model, optimizer = lite.setup(model, optimizer)
+        criterion = torch.nn.MSELoss()
+
+        # 3. Train the model for 50 steps.
+        for step in range(50):
+            model.zero_grad()
+            x = torch.randn(64, 32).to(lite.device)
+            output = model(x)
+            loss = criterion(output, torch.ones_like(output))
+            print(f"global_rank: {lite.global_rank} step: {step} loss: {loss}")
+            lite.backward(loss)
+            optimizer.step()
+
+
+# Run over 2 nodes of 4 x V100
+app = L.LightningApp(
+    LiteMultiNode(
+        LitePyTorchDistributed,
+        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
+        num_nodes=2,
+    )
+)
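
The loop above trains on freshly sampled random tensors. If you want real batches, Lite can also wrap a `DataLoader`; here is a small sketch of how the `run` body could be extended, assuming `setup_dataloaders` is available on this `LightningLite` version (the dataset and the class name `LitePyTorchDistributedWithData` are illustrative, not part of the commit):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

import lightning as L
from lightning.lite import LightningLite


class LitePyTorchDistributedWithData(L.LightningWork):
    @staticmethod
    def run():
        lite = LightningLite(strategy="ddp", precision="bf16")

        # Illustrative regression data; setup_dataloaders is expected to attach a
        # distributed sampler so each rank sees its own shard and to move batches
        # to the right device.
        dataset = TensorDataset(torch.randn(1024, 32), torch.ones(1024, 2))
        dataloader = lite.setup_dataloaders(DataLoader(dataset, batch_size=64))

        model = torch.nn.Linear(32, 2)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        model, optimizer = lite.setup(model, optimizer)
        criterion = torch.nn.MSELoss()

        for step, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            print(f"global_rank: {lite.global_rank} step: {step} loss: {loss}")
            lite.backward(loss)
            optimizer.step()
```

Such a work would be passed to `LiteMultiNode` exactly like `LitePyTorchDistributed` above.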

examples/app_multi_node/train_pl.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+import lightning as L
+from lightning.app.components import PyTorchLightningMultiNode
+from lightning.pytorch.demos.boring_classes import BoringModel
+
+
+class PyTorchLightningDistributed(L.LightningWork):
+    @staticmethod
+    def run():
+        model = BoringModel()
+        trainer = L.Trainer(
+            max_epochs=10,
+            strategy="ddp",
+        )
+        trainer.fit(model)
+
+
+# Run over 2 nodes of 4 x V100
+app = L.LightningApp(
+    PyTorchLightningMultiNode(
+        PyTorchLightningDistributed,
+        num_nodes=2,
+        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
+    )
+)
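
`BoringModel` is Lightning's demo `LightningModule`; any module of your own can take its place in the work above. Here is a minimal illustrative substitute (the architecture, data, and the name `TinyRegression` are made up for this sketch and are not part of the commit):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

import lightning as L


class TinyRegression(L.LightningModule):
    """Illustrative stand-in for BoringModel: one linear layer trained with MSE."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = torch.nn.functional.mse_loss(self.layer(x), y)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)

    def train_dataloader(self):
        dataset = TensorDataset(torch.randn(1024, 32), torch.ones(1024, 2))
        return DataLoader(dataset, batch_size=64)
```

Swapping `BoringModel()` for `TinyRegression()` inside `run` leaves the multi-node wiring untouched.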

examples/app_multi_node/app_pl_script.py renamed to examples/app_multi_node/train_pl_script.py

Lines changed: 2 additions & 1 deletion
@@ -2,9 +2,10 @@
 from lightning.app.components import LightningTrainingComponent
 from lightning.app.utilities.packaging.cloud_compute import CloudCompute
 
+# Run over 2 nodes of 4 x V100
 app = L.LightningApp(
     LightningTrainingComponent(
-        "train.py",
+        "pl_boring_script.py",
         num_nodes=2,
         cloud_compute=CloudCompute("gpu-fast-multi"),
     ),
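
The component now launches `pl_boring_script.py` instead of `train.py`. That script is not part of this diff; purely as an illustration of the kind of script `LightningTrainingComponent` runs on every node, it could be as small as the following (contents hypothetical, modeled on the `BoringModel` usage elsewhere in this commit):

```python
# Hypothetical contents of pl_boring_script.py -- not shown in this diff.
import lightning as L
from lightning.pytorch.demos.boring_classes import BoringModel

if __name__ == "__main__":
    trainer = L.Trainer(max_epochs=10, strategy="ddp")
    trainer.fit(BoringModel())
```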

examples/app_multi_node/app_torch_work.py renamed to examples/app_multi_node/train_pytorch.py

Lines changed: 2 additions & 9 deletions
@@ -38,13 +38,6 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no
         loss.backward()
         optimizer.step()
 
-    # 5. Verify all processes have the same weights at the end of training.
-    weight = model.module.weight.clone()
-    torch.distributed.all_reduce(weight)
-    assert torch.equal(model.module.weight, weight / world_size)
-
-    print("Multi Node Distributed Training Done!")
-
 
 class PyTorchDistributed(L.LightningWork):
     def run(
@@ -60,11 +53,11 @@ def run(
         )
 
 
-compute = L.CloudCompute("gpu-fast-multi")  # 4xV100
+# Run over 2 nodes of 4 x V100
 app = L.LightningApp(
     MultiNode(
         PyTorchDistributed,
         num_nodes=2,
-        cloud_compute=compute,
+        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
    )
 )
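
`train_pytorch.py` is the example the README describes as spawning your own processes: the hunks above only show the end of the training loop and the app wiring, while the part of the file that launches one process per local GPU is not included in the diff. Below is a rough sketch of that launch step, assuming `torch.multiprocessing.spawn` is used and that `run` receives the same coordinates as the other examples (the stub body of `distributed_train` merely stands in for the real function defined earlier in the file):

```python
import torch

import lightning as L


def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int):
    # Stand-in for the real distributed_train in train_pytorch.py: it would set up
    # the process group against main_address/main_port and run the DDP training loop.
    print(f"node {node_rank} / local_rank {local_rank} -> {main_address}:{main_port} ({num_nodes} nodes)")


class PyTorchDistributed(L.LightningWork):
    def run(
        self,
        main_address: str,
        main_port: int,
        num_nodes: int,
        node_rank: int,
    ):
        # "Spawn your processes yourself": start one process per local GPU
        # (or a single CPU process); each process receives its local rank first.
        nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
        torch.multiprocessing.spawn(
            distributed_train,
            args=(main_address, main_port, num_nodes, node_rank),
            nprocs=nprocs,
        )
```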

examples/app_multi_node/train_pytorch_spawn.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+import torch
+from torch.nn.parallel.distributed import DistributedDataParallel
+
+import lightning as L
+from lightning.app.components import PyTorchSpawnMultiNode
+
+
+class PyTorchDistributed(L.LightningWork):
+
+    # Note: Only staticmethods are supported for now with `PyTorchSpawnMultiNode`
+    @staticmethod
+    def run(
+        world_size: int,
+        node_rank: int,
+        global_rank: str,
+        local_rank: int,
+    ):
+        # 1. Prepare distributed model
+        model = torch.nn.Linear(32, 2)
+        device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
+        device_ids = device if torch.cuda.is_available() else None
+        model = DistributedDataParallel(model, device_ids=device_ids).to(device)
+
+        # 2. Prepare loss and optimizer
+        criterion = torch.nn.MSELoss()
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+
+        # 3. Train the model for 50 steps.
+        for step in range(50):
+            model.zero_grad()
+            x = torch.randn(64, 32).to(device)
+            output = model(x)
+            loss = criterion(output, torch.ones_like(output))
+            print(f"global_rank: {global_rank} step: {step} loss: {loss}")
+            loss.backward()
+            optimizer.step()
+
+
+# Run over 2 nodes of 4 x V100
+app = L.LightningApp(
+    PyTorchSpawnMultiNode(
+        PyTorchDistributed,
+        num_nodes=2,
+        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
+    )
+)
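
One small note on the model setup above: PyTorch's `DistributedDataParallel` documents `device_ids` as a list containing the single device of the wrapped module, and modules are typically moved to their device before wrapping. Here is an editorial variant along those lines, offered as a suggestion rather than as what the commit ships:

```python
import torch
from torch.nn.parallel.distributed import DistributedDataParallel


def build_ddp_model(local_rank: int) -> torch.nn.Module:
    # Move the module to its device first, then wrap it. On GPU, device_ids is a
    # one-element list with this process's device index; on CPU it stays None.
    device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
    model = torch.nn.Linear(32, 2).to(device)
    device_ids = [local_rank] if torch.cuda.is_available() else None
    return DistributedDataParallel(model, device_ids=device_ids)
```

Constructing `DistributedDataParallel` still requires an initialized process group, which in this example is presumably set up before `run` is called.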

0 commit comments