def _run_multiple_stages(trainer, model, model_path: Optional[str] = None):
    """Run fit and test with *trainer*, save checkpoints, and verify the saved weights.

    Args:
        trainer: the ``Trainer`` driving all stages.
        model: the ``LightningModule`` to fit, test, and checkpoint.
        model_path: optional explicit checkpoint path; falls back to the
            checkpoint callback's last saved path when not given.
    """
    trainer.fit(model)
    trainer.test(model)

    # Broadcast so every rank resolves the same checkpoint path.
    resolved = trainer.strategy.broadcast(model_path)
    if not resolved:
        resolved = trainer.checkpoint_callback.last_model_path
    model_path = Path(resolved)

    # Save another checkpoint after testing, without optimizer states
    trainer.save_checkpoint(model_path.with_name("after-test"))
    trainer.save_checkpoint(model_path, weights_only=True)

    _assert_save_equality(trainer, model_path, cls=model.__class__)
@@ -270,13 +274,13 @@ def training_step(self, batch, batch_idx):
270
274
trainer .fit (model )
271
275
272
276
273
@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
@pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))])
def test_fsdp_strategy_checkpoint(tmpdir, precision):
    """Test that checkpoints are saved correctly with FSDP across multiple GPUs, and all stages can be run."""
    # NOTE: the docstring previously said "a single GPU", but the test now
    # requires and runs on 2 devices (see the RunIf marker and `devices=2`).
    model = TestFSDPModel()
    trainer = Trainer(
        default_root_dir=tmpdir, accelerator="gpu", devices=2, strategy="fsdp", precision=precision, max_epochs=1
    )
    _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
0 commit comments