
Commit 24a1b52

Merge branch 'master' into carmocca/delay-convert-module
2 parents cc2687b + c7c42dc commit 24a1b52

16 files changed: 104 additions, 80 deletions

.azure/gpu-benchmarks.yml

Lines changed: 2 additions & 1 deletion
@@ -46,7 +46,8 @@ jobs:
     variables:
       DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
     container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.0-cuda11.8.0"
+      # TODO: Upgrade to Python 3.11
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
       options: "--gpus=all --shm-size=32g"
     strategy:
       matrix:
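
As an aside, the `DEVICES` variable in the hunk above derives the GPU indices for the job from the Azure agent's name with a small Python one-liner. A quick illustration of the same expression in plain Python (the agent name shown is made up):

```python
# Mimics: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
agent_name = "lit-rtx-3090_0,1"  # hypothetical Agent.Name value
devices = agent_name.split("_")[-1]
print(devices)  # -> 0,1
```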

.azure/gpu-tests-fabric.yml

Lines changed: 3 additions & 5 deletions
@@ -56,14 +56,12 @@ jobs:
       options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp"
     strategy:
       matrix:
+        # TODO: Upgrade to Python 3.11
         "Fabric | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.0"
-          PACKAGE_NAME: "fabric"
-        "Fabric | future":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.2-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
           PACKAGE_NAME: "fabric"
         "Lightning | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
           PACKAGE_NAME: "lightning"
     workspace:
       clean: all

.azure/gpu-tests-pytorch.yml

Lines changed: 2 additions & 5 deletions
@@ -48,15 +48,12 @@ jobs:
     cancelTimeoutInMinutes: "2"
     strategy:
       matrix:
+        # TODO: Upgrade to Python 3.11
         "PyTorch | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.0"
-          PACKAGE_NAME: "pytorch"
-        "PyTorch | future":
-          # todo: failed to install `pygame` with py3.11
           image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
           PACKAGE_NAME: "pytorch"
         "Lightning | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
           PACKAGE_NAME: "lightning"
     pool: lit-rtx-3090
     variables:

.github/checkgroup.yml

Lines changed: 0 additions & 6 deletions
@@ -21,15 +21,12 @@ subprojects:
     checks:
       - "pl-cpu (macOS-11, lightning, 3.8, 1.13, oldest)"
       - "pl-cpu (macOS-11, lightning, 3.10, 1.13)"
-      - "pl-cpu (macOS-11, lightning, 3.10, 2.0)"
       - "pl-cpu (macOS-11, lightning, 3.10, 2.1)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.8, 1.13, oldest)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.13)"
-      - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.0)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)"
       - "pl-cpu (windows-2022, lightning, 3.8, 1.13, oldest)"
       - "pl-cpu (windows-2022, lightning, 3.10, 1.13)"
-      - "pl-cpu (windows-2022, lightning, 3.10, 2.0)"
       - "pl-cpu (windows-2022, lightning, 3.10, 2.1)"
       - "pl-cpu (macOS-11, pytorch, 3.8, 1.13)"
       - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.13)"
@@ -190,15 +187,12 @@ subprojects:
     checks:
       - "fabric-cpu (macOS-11, lightning, 3.8, 1.13, oldest)"
      - "fabric-cpu (macOS-11, lightning, 3.10, 1.13)"
-      - "fabric-cpu (macOS-11, lightning, 3.10, 2.0)"
       - "fabric-cpu (macOS-11, lightning, 3.11, 2.1)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 1.13, oldest)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 1.13)"
-      - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.0)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.1)"
       - "fabric-cpu (windows-2022, lightning, 3.8, 1.13, oldest)"
       - "fabric-cpu (windows-2022, lightning, 3.10, 1.13)"
-      - "fabric-cpu (windows-2022, lightning, 3.10, 2.0)"
       - "fabric-cpu (windows-2022, lightning, 3.11, 2.1)"
       - "fabric-cpu (macOS-11, fabric, 3.8, 1.13)"
       - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 1.13)"

.github/workflows/ci-tests-fabric.yml

Lines changed: 0 additions & 4 deletions
@@ -43,13 +43,9 @@ jobs:
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" }
           # only run PyTorch latest
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
-          - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
-          - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           - { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
-          # only run PyTorch future
           - { os: "macOS-12", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
           - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }

.github/workflows/ci-tests-pytorch.yml

Lines changed: 0 additions & 4 deletions
@@ -47,13 +47,9 @@ jobs:
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" }
           # only run PyTorch latest
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
-          - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
-          - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
-          # only run PyTorch future
           - { os: "macOS-12", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
           - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }

docs/source-fabric/advanced/compile.rst

Lines changed: 26 additions & 23 deletions
@@ -220,6 +220,7 @@ On PyTorch 2.2 and later, ``torch.compile`` will detect dynamism automatically a
 
 Numbers produced with NVIDIA A100 SXM4 40GB, PyTorch 2.2.0, CUDA 12.1.
 
+
 ----
 
 
@@ -255,17 +256,33 @@ Naturally, the tradoff here is that it will consume a bit more memory.
 
 You can find a full list of compile options in the `PyTorch documentation <https://pytorch.org/docs/stable/generated/torch.compile.html>`_.
 
+
+----
+
+
+**************************************
+A note about torch.compile in practice
+**************************************
+
+In practice, you will find that ``torch.compile`` often doesn't work well and can even be counter-productive.
+Compilation may fail with cryptic error messages that are impossible to debug without help from the PyTorch team.
+It is also not uncommon that ``torch.compile`` will produce a significantly *slower* model or one with much higher memory usage.
+On top of that, the compilation phase itself can be incredibly slow, taking several minutes to finish.
+For these reasons, we recommend that you don't waste too much time trying to apply ``torch.compile`` during development, and rather evaluate its effectiveness toward the end when you are about to launch long-running, expensive experiments.
+Always compare the speed and memory usage of the compiled model against the original model!
+
+
 ----
 
 
-*******************************************************
-(Experimental) Apply torch.compile over FSDP, DDP, etc.
-*******************************************************
+*************************************
+Using torch.compile with FSDP and DDP
+*************************************
 
 As stated earlier, we recommend that you compile the model before calling ``fabric.setup()``.
-However, if you are using DDP or FSDP with Fabric, the compilation won't incorporate the distributed calls inside these wrappers by default.
-In an experimental feature, you can let ``fabric.setup()`` reapply the ``torch.compile`` call after the model gets wrapped in DDP/FSDP internally.
-In the future, this option will become the default.
+In the case of DDP and FSDP, ``fabric.setup()`` will automatically reapply the ``torch.compile`` call after the model gets wrapped in DDP/FSDP internally.
+This will ensure that the compilation can incorporate the distributed calls and optimize them.
+However, should you have issues compiling DDP and FSDP models, you can opt out of this feature:
 
 .. code-block:: python
@@ -275,25 +292,11 @@ In the future, this option will become the default.
     # Compile the model
     model = torch.compile(model)
 
-    # Default: `fabric.setup()` will not reapply the compilation over DDP/FSDP
-    model = fabric.setup(model, _reapply_compile=False)
-
-    # Recompile the model over DDP/FSDP (experimental)
+    # Default: `fabric.setup()` will configure compilation over DDP/FSDP for you
     model = fabric.setup(model, _reapply_compile=True)
 
+    # Turn it off if you see issues with DDP/FSDP
+    model = fabric.setup(model, _reapply_compile=False)
 
-----
-
-
-**************************************
-A note about torch.compile in practice
-**************************************
-
-In practice, you will find that ``torch.compile`` often doesn't work well and can even be counter-productive.
-Compilation may fail with cryptic error messages that are impossible to debug without help from the PyTorch team.
-It is also not uncommon that ``torch.compile`` will produce a significantly *slower* model or one with much higher memory usage.
-On top of that, the compilation phase itself can be incredibly slow, taking several minutes to finish.
-For these reasons, we recommend that you don't waste too much time trying to apply ``torch.compile`` during development, and rather evaluate its effectiveness toward the end when you are about to launch long-running, expensive experiments.
-Always compare the speed and memory usage of the compiled model against the original model!
 
 |
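
For readers who only skim the diff: a minimal, self-contained sketch of the workflow the updated docs describe. The toy model and Fabric arguments are illustrative; per the docs above, `fabric.setup()` now re-applies `torch.compile` over DDP/FSDP by default, and `_reapply_compile=False` opts out.

```python
import torch
import torch.nn as nn
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cuda", devices=2, strategy="ddp")
fabric.launch()

model = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))  # toy model

# Compile before setup, as the docs recommend
model = torch.compile(model)

# Default behavior per the updated docs: compilation is re-applied after DDP wrapping
model = fabric.setup(model)

# Opt out if the wrapped model fails to compile:
# model = fabric.setup(model, _reapply_compile=False)
```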

src/lightning/data/streaming/data_processor.py

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
 from urllib import parse
 
+import numpy as np
 from tqdm.auto import tqdm as _tqdm
 
 from lightning import seed_everything
@@ -290,7 +291,7 @@ def _map_items_to_workers_weighted(
         else:
             print(f"Worker {worker_id} gets ({len(worker_items[worker_id])}) items for a total weight of {size}.")
 
-    return [worker_items[worker_id] for worker_id in worker_ids_this_node]
+    return [np.random.permutation(worker_items[worker_id]).tolist() for worker_id in worker_ids_this_node]
 
 
 def _get_num_bytes(item: Any, base_path: str) -> int:
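
To make the behavioral change above concrete: each worker's assigned item list is now shuffled before being returned, rather than kept in assignment order. A standalone illustration (the file names are made up):

```python
import numpy as np

worker_items = {0: ["a.bin", "b.bin", "c.bin"], 1: ["d.bin", "e.bin"]}
worker_ids_this_node = [0, 1]

# Old behavior: items returned in assignment order
ordered = [worker_items[w] for w in worker_ids_this_node]

# New behavior: each worker's items are randomly permuted
shuffled = [np.random.permutation(worker_items[w]).tolist() for w in worker_ids_this_node]

print(ordered)   # [['a.bin', 'b.bin', 'c.bin'], ['d.bin', 'e.bin']]
print(shuffled)  # same items per worker, but in random order
```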

src/lightning/data/streaming/resolver.py

Lines changed: 6 additions & 2 deletions
@@ -303,13 +303,17 @@ def _execute(
     if not _LIGHTNING_SDK_AVAILABLE:
         raise ModuleNotFoundError("The `lightning_sdk` is required.")
 
+    lightning_skip_install = os.getenv("LIGHTNING_SKIP_INSTALL", "")
+    if lightning_skip_install:
+        lightning_skip_install = f" LIGHTNING_SKIP_INSTALL={lightning_skip_install} "
+
     lightning_branch = os.getenv("LIGHTNING_BRANCH", "")
     if lightning_branch:
-        lightning_branch = f" LIGHTNING_BRANCH={lightning_branch}"
+        lightning_branch = f" LIGHTNING_BRANCH={lightning_branch} "
 
     studio = Studio()
     job = studio._studio_api.create_data_prep_machine_job(
-        command or f"cd {os.getcwd()} &&{lightning_branch} python {' '.join(sys.argv)}",
+        command or f"cd {os.getcwd()} &&{lightning_skip_install}{lightning_branch} python {' '.join(sys.argv)}",
         name=name,
         num_instances=num_nodes,
         studio_id=studio._studio.id,
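
For context on the change above: the two environment variables are turned into space-padded prefixes that get spliced into the job's shell command. A rough sketch of the resulting string, with illustrative values and a hypothetical script name:

```python
import os

# Illustrative values; in _execute() these come from the caller's environment
skip_install = "1"
branch = "master"

lightning_skip_install = f" LIGHTNING_SKIP_INSTALL={skip_install} " if skip_install else ""
lightning_branch = f" LIGHTNING_BRANCH={branch} " if branch else ""

command = f"cd {os.getcwd()} &&{lightning_skip_install}{lightning_branch} python train.py"
print(command)
# e.g. "cd /teamspace/project && LIGHTNING_SKIP_INSTALL=1  LIGHTNING_BRANCH=master  python train.py"
```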

src/lightning/fabric/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed parsing of v100s GPUs in `get_available_flops` ([#18952](https://github.com/Lightning-AI/lightning/pull/18952))
 - Fixed issue where the `precision="transformer-engine"` argument would not replace layers by default ([#19082](https://github.com/Lightning-AI/lightning/pull/19082))
+- Fixed the input validation logic in `FSDPStrategy` to accept a `device_mesh` ([#19392](https://github.com/Lightning-AI/lightning/pull/19392))
 
 
 ## [2.1.4] - 2024-01-31
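
The `device_mesh` entry above refers to the `FSDPStrategy` constructor argument. A hedged sketch of how it might be passed; the 2-tuple mesh shape is an assumption about the accepted format, not taken from this diff:

```python
from lightning.fabric import Fabric
from lightning.fabric.strategies import FSDPStrategy

# Assumption: a (replicate, shard) tuple describing a hybrid-sharding mesh is accepted
strategy = FSDPStrategy(device_mesh=(2, 4))
fabric = Fabric(accelerator="cuda", devices=8, strategy=strategy)
```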
