squash all commits

awaelchli · awaelchli · commit a3c97c911398 · 2023-01-20T13:23:30.000+01:00
diff --git a/src/lightning_fabric/fabric.py b/src/lightning_fabric/fabric.py
@@ -519,6 +519,7 @@ def save(self, path: Union[str, Path], state: Dict[str, Union[nn.Module, Optimiz
             state: A dictionary with contents to be saved. If the dict contains modules or optimizers, their
                 state-dict will be retrieved and converted automatically.
         """
+        # TODO: validate deepspeed model with self._models_setup > 1
         return self._strategy.save_checkpoint(path=path, state=_unwrap_objects(state))
 
     def load(
@@ -537,6 +538,9 @@ def load(
             The remaining items that were not restored into the given state dictionary. If no state dictionary is
             given, the full checkpoint will be returned.
         """
+        # TODO: validate deepspeed model with self._models_setup > 1
+        # if isinstance(self._strategy, DeepSpeedStrategy):
+
         return self._strategy.load_checkpoint(path=path, state=state)
 
     def launch(self, function: Optional[Callable[["Fabric"], Any]] = None, *args: Any, **kwargs: Any) -> Any:
diff --git a/src/lightning_fabric/strategies/deepspeed.py b/src/lightning_fabric/strategies/deepspeed.py
@@ -17,8 +17,9 @@
 import os
 import platform
 from contextlib import contextmanager
+from itertools import chain
 from pathlib import Path
-from typing import Any, Dict, Generator, Iterable, List, Mapping, Optional, Tuple, TYPE_CHECKING, Union
+from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, TYPE_CHECKING, Union
 
 import torch
 from lightning_utilities.core.imports import RequirementCache
@@ -376,22 +377,37 @@ def save_checkpoint(
         Raises:
             TypeError if the unused ``storage_options`` gets passed.
         """
-        # broadcast the path from rank 0 to ensure all the states are saved in a common path
-        path = self.broadcast(path)
-
         if storage_options is not None:
             raise TypeError(
                 f"`{self.__class__.__name__}.save_checkpoint(..., storage_options=...)` is not supported because"
                 f" {self.__class__.__name__} does not use the `CheckpointIO`."
             )
+        # validate that the deepspeed engine recorded in this strategy corresponds with the model the user
+        # is handling
+        # TODO: we support multiple models with deepspeed, redo this error
+        if self._deepspeed_engine not in state.values():
+            raise ValueError(
+                "Could not find a deepspeed model in the provided checkpoint state. Please provide the model as"
+                " part of the state like so: `save_checkpoint(..., state={'model': model, ...})`. Make sure"
+                " you set up the model (and optimizers if any) through the strategy before saving the checkpoint."
+            )
 
+        # broadcast the path from rank 0 to ensure all the states are saved in a common path
+        path = self.broadcast(path)
+
+        # split the checkpoint into two parts:
+        # 1) the deepspeed engine encapsulating both the model and optionally the optimizer(s)
+        # 2) the rest of the user's state, which in deepspeed is called `client state`
         excluded_objects = (self._deepspeed_engine, self._deepspeed_engine.optimizer)
         state = {k: v for k, v in state.items() if v not in excluded_objects}
+        # there might be other stateful objects unrelated to the deepspeed engine - convert them to a state_dict
         state = self._convert_stateful_objects_in_state(state)
-        # Use deepspeed's internal checkpointing function to handle partitioned weights across processes
+        # use deepspeed's internal checkpointing function to handle partitioned weights across processes
         self._deepspeed_engine.save_checkpoint(path, client_state=state, tag="checkpoint")
 
-    def load_checkpoint(self, path: _PATH, state: Optional[Dict[str, Union[Module, Optimizer, Any]]] = None) -> Dict[str, Any]:
+    def load_checkpoint(
+        self, path: _PATH, state: Optional[Dict[str, Union[Module, Optimizer, Any]]] = None
+    ) -> Dict[str, Any]:
         """Load the contents from a checkpoint and restore the state of the given objects.
 
         Args:
@@ -404,25 +420,43 @@ def load_checkpoint(self, path: _PATH, state: Optional[Dict[str, Union[Module, O
             given, the full checkpoint will be returned.
         """
         if self.load_full_weights and self.zero_stage_3:
-            # Broadcast to ensure we load from the rank 0 checkpoint
-            # This doesn't have to be the case when using deepspeed sharded checkpointing
+            # This code path enables loading a checkpoint from a non-deepspeed checkpoint or from
+            # a consolidated checkpoint
             path = self.broadcast(path)
             return super().load_checkpoint(path=path, state=state)
 
-        if self._deepspeed_engine not in state.values():
-            # TODO
-            raise ValueError()
-        optimzer_state_requested = bool(len([item for item in state.values() if isinstance(item, Optimizer)]))
-
         torch.cuda.empty_cache()
-        _, client_state = self._deepspeed_engine.load_checkpoint(
-            path, load_optimizer_states=optimzer_state_requested, load_lr_scheduler_states=False
+
+        from deepspeed import DeepSpeedEngine
+
+        modules = chain(*(module.modules() for module in state.values() if isinstance(module, Module)))
+        engines = [engine for engine in modules if isinstance(engine, DeepSpeedEngine)]
+        if len(engines) == 0:
+            raise ValueError(
+                "Could not find a deepspeed model in the provided checkpoint state. Please provide the model as"
+                " part of the state like so: `load_checkpoint(..., state={'model': model, ...})`. Make sure"
+                " you set up the model (and optimizers if any) through the strategy before loading the checkpoint."
+            )
+        elif len(engines) > 1:
+            raise ValueError(
+                "Found multiple DeepSpeed engine modules in the given state. Saving checkpoints with DeepSpeed is"
+                " currently limited to a single model per checkpoint. To save multiple model checkpoints, call the"
+                " save method for each model separately with a different path."
+            )
+        engine = engines[0]
+
+        optimzer_state_requested = bool(len([item for item in state.values() if isinstance(item, Optimizer)]))
+        _, client_state = engine.load_checkpoint(
+            path,
+            tag="checkpoint",
+            load_optimizer_states=optimzer_state_requested,
+            load_lr_scheduler_states=False,
+            load_module_strict=True,  # TODO: make strict loading configurable
         )
         if client_state is None:
-            # TODO: fix message
             raise ValueError(
-                "DeepSpeed was unable to load the checkpoint. Ensure you passed in a DeepSpeed compatible checkpoint "
-                "or a single checkpoint file with `Trainer(strategy=DeepSpeedStrategy(load_full_weights=True))`."
+                "DeepSpeed was unable to load the checkpoint. Ensure you passed in a DeepSpeed compatible checkpoint"
+                " or a single checkpoint file by setting `DeepSpeedStrategy(..., load_full_weights=True)`."
             )
         for k, v in client_state.copy().items():
             if k not in state:
diff --git a/tests/tests_fabric/strategies/test_deepspeed_integration.py b/tests/tests_fabric/strategies/test_deepspeed_integration.py
@@ -241,7 +241,7 @@ def _make_block(self):
 
 
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
-def test_deepspeed_multigpu_stage_3(tmpdir):
+def test_deepspeed_multigpu_stage_3():
     """Test to ensure ZeRO Stage 3 works with a parallel model."""
     fabric = ModelParallelClassification(
         strategy=DeepSpeedStrategy(stage=3),
@@ -255,7 +255,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir):
 @RunIf(deepspeed=True)
 @mock.patch("deepspeed.init_distributed", autospec=True)
 @pytest.mark.parametrize("platform", ["Linux", "Windows"])
-def test_deepspeed_env_variables_on_platforms(deepspeed_dist_mock, tmpdir, platform):
+def test_deepspeed_env_variables_on_platforms(deepspeed_dist_mock, platform):
     """Test to ensure that we set up distributed communication correctly.
 
     When using Windows, ranks environment variables should not be set, and DeepSpeed should handle this.
@@ -279,7 +279,7 @@ def test_deepspeed_env_variables_on_platforms(deepspeed_dist_mock, tmpdir, platf
 
 
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
-def test_deepspeed_specific_gpu_device_index(tmpdir):
+def test_deepspeed_specific_gpu_device_index():
     """Test that the DeepSpeed strategy can run on specific device indices."""
 
     class RunFabric(BoringFabric):
@@ -295,7 +295,7 @@ def step(self, model, batch):
 
 
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, bf16_cuda=True)
-def test_deepspeed_with_bfloat16_precision(tmpdir):
+def test_deepspeed_with_bfloat16_precision():
     """Test that the DeepSpeed strategy works with bfloat16 precision."""
 
     class Model(nn.Module):
@@ -322,3 +322,88 @@ def step(self, model, batch):
     assert fabric._strategy.precision.precision == "bf16"
     assert fabric._strategy.config["zero_optimization"]["stage"] == 3
     fabric.run()
+
+
+def _assert_saved_model_is_equal(fabric, model, checkpoint_path):
+    """Convert the saved checkpoint to a single file with the model weights consolidated to easily verify the full
+    weights in float32 precision."""
+    from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
+
+    assert isinstance(fabric.strategy, DeepSpeedStrategy)
+
+    # carry out the check only on rank 0
+    if fabric.is_global_zero:
+        if fabric.strategy.config["zero_optimization"]["stage"] in (2, 3):
+            single_ckpt_path = checkpoint_path / "single_model.pt"
+            # the tag is hardcoded in DeepSpeedStrategy
+            convert_zero_checkpoint_to_fp32_state_dict(checkpoint_path, single_ckpt_path, tag="checkpoint")
+            state_dict = torch.load(single_ckpt_path)
+        else:
+            # 'checkpoint' is the tag, hardcoded in DeepSpeedStrategy
+            single_ckpt_path = checkpoint_path / "checkpoint" / "mp_rank_00_model_states.pt"
+            state_dict = torch.load(single_ckpt_path)["module"]
+
+        model = model.cpu()
+
+        # assert model parameters are identical after loading
+        for orig_param, saved_model_param in zip(model.parameters(), state_dict.values()):
+            # perform the equality check in the same precision
+            saved_model_param = saved_model_param.cpu().to(orig_param.dtype)
+            assert torch.equal(orig_param, saved_model_param)
+
+    fabric.barrier()
+
+
+@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, bf16_cuda=True)
+@pytest.mark.parametrize("stage", [1, 2, 3])
+def test_deepspeed_save_load_checkpoint_zero_3(stage, tmp_path):
+    """Test that DeepSpeed stage 1, 2, and 3 model checkpoints can be saved and loaded successfully."""
+    from deepspeed import DeepSpeedEngine
+
+    fabric = Fabric(accelerator="cuda", devices=2, strategy=DeepSpeedStrategy(stage=stage), precision="bf16")
+    fabric.launch()
+
+    checkpoint_path = fabric.broadcast(tmp_path / "deepspeed-checkpoint")
+
+    with fabric.sharded_model():
+        model = BoringModel()
+
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
+    model, optimizer = fabric.setup(model, optimizer)
+    assert isinstance(model._forward_module, DeepSpeedEngine)
+
+    # TODO(fabric): The dtype on the model is not correct, should be torch.bfloat16
+    assert model.dtype == torch.float32
+    assert next(model.parameters()).dtype == torch.bfloat16
+
+    # dummy training step
+    output = model(torch.randn(1, 32).to(fabric.device))
+    loss = output.sum()
+    fabric.backward(loss)
+    optimizer.step()
+    optimizer.zero_grad()
+
+    state = {"model": model, "optimizer": optimizer, "steps": 1}
+    fabric.save(checkpoint_path, state)
+
+    fabric.barrier()
+
+    # re-init all objects and resume
+    fabric = Fabric(accelerator="cuda", devices=2, strategy=DeepSpeedStrategy(stage=stage), precision="bf16")
+    fabric.launch()
+    with fabric.sharded_model():
+        model = BoringModel()
+
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
+    model, optimizer = fabric.setup(model, optimizer)
+    state = {"model": model, "optimizer": optimizer, "steps": 0}
+
+    metadata = fabric.load(checkpoint_path, state)
+    fabric.barrier()
+
+    # check user data in state reloaded
+    assert state["steps"] == 1
+    # the remainder of the deepspeed checkpoint contains metadata
+    assert "ds_version" in metadata
+
+    _assert_saved_model_is_equal(fabric, model, checkpoint_path)