
Commit 24a1b52

Merge branch 'master' into carmocca/delay-convert-module
2 parents cc2687b + c7c42dc commit 24a1b52

16 files changed: 104 additions, 80 deletions

.azure/gpu-benchmarks.yml

Lines changed: 2 additions & 1 deletion
@@ -46,7 +46,8 @@ jobs:
     variables:
       DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
     container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.0-cuda11.8.0"
+      # TODO: Upgrade to Python 3.11
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
       options: "--gpus=all --shm-size=32g"
     strategy:
       matrix:
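
As an aside, the `DEVICES` variable in the hunk above derives the GPU indices for the job from the Azure agent's name with a small Python one-liner. A quick illustration of the same expression in plain Python (the agent name shown is made up):

```python
# Mimics: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
agent_name = "lit-rtx-3090_0,1"  # hypothetical Agent.Name value
devices = agent_name.split("_")[-1]
print(devices)  # -> 0,1
```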

.azure/gpu-tests-fabric.yml

Lines changed: 3 additions & 5 deletions
@@ -56,14 +56,12 @@ jobs:
       options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp"
     strategy:
       matrix:
+        # TODO: Upgrade to Python 3.11
         "Fabric | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.0"
-          PACKAGE_NAME: "fabric"
-        "Fabric | future":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.2-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
           PACKAGE_NAME: "fabric"
         "Lightning | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
           PACKAGE_NAME: "lightning"
     workspace:
       clean: all

.azure/gpu-tests-pytorch.yml

Lines changed: 2 additions & 5 deletions
@@ -48,15 +48,12 @@ jobs:
     cancelTimeoutInMinutes: "2"
     strategy:
       matrix:
+        # TODO: Upgrade to Python 3.11
         "PyTorch | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.0"
-          PACKAGE_NAME: "pytorch"
-        "PyTorch | future":
-          # todo: failed to install `pygame` with py3.11
           image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
           PACKAGE_NAME: "pytorch"
         "Lightning | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
           PACKAGE_NAME: "lightning"
     pool: lit-rtx-3090
     variables:

.github/checkgroup.yml

Lines changed: 0 additions & 6 deletions
@@ -21,15 +21,12 @@ subprojects:
     checks:
       - "pl-cpu (macOS-11, lightning, 3.8, 1.13, oldest)"
       - "pl-cpu (macOS-11, lightning, 3.10, 1.13)"
-      - "pl-cpu (macOS-11, lightning, 3.10, 2.0)"
       - "pl-cpu (macOS-11, lightning, 3.10, 2.1)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.8, 1.13, oldest)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.13)"
-      - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.0)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)"
       - "pl-cpu (windows-2022, lightning, 3.8, 1.13, oldest)"
       - "pl-cpu (windows-2022, lightning, 3.10, 1.13)"
-      - "pl-cpu (windows-2022, lightning, 3.10, 2.0)"
       - "pl-cpu (windows-2022, lightning, 3.10, 2.1)"
       - "pl-cpu (macOS-11, pytorch, 3.8, 1.13)"
       - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.13)"
@@ -190,15 +187,12 @@ subprojects:
     checks:
       - "fabric-cpu (macOS-11, lightning, 3.8, 1.13, oldest)"
      - "fabric-cpu (macOS-11, lightning, 3.10, 1.13)"
-      - "fabric-cpu (macOS-11, lightning, 3.10, 2.0)"
       - "fabric-cpu (macOS-11, lightning, 3.11, 2.1)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 1.13, oldest)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 1.13)"
-      - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.0)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.1)"
       - "fabric-cpu (windows-2022, lightning, 3.8, 1.13, oldest)"
       - "fabric-cpu (windows-2022, lightning, 3.10, 1.13)"
-      - "fabric-cpu (windows-2022, lightning, 3.10, 2.0)"
       - "fabric-cpu (windows-2022, lightning, 3.11, 2.1)"
       - "fabric-cpu (macOS-11, fabric, 3.8, 1.13)"
       - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 1.13)"

.github/workflows/ci-tests-fabric.yml

Lines changed: 0 additions & 4 deletions
@@ -43,13 +43,9 @@ jobs:
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" }
           # only run PyTorch latest
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
-          - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
-          - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           - { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
-          # only run PyTorch future
           - { os: "macOS-12", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
           - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }

.github/workflows/ci-tests-pytorch.yml

Lines changed: 0 additions & 4 deletions
@@ -47,13 +47,9 @@ jobs:
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" }
           # only run PyTorch latest
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
-          - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
-          - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
-          # only run PyTorch future
           - { os: "macOS-12", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
           - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }

docs/source-fabric/advanced/compile.rst

Lines changed: 26 additions & 23 deletions
@@ -220,6 +220,7 @@ On PyTorch 2.2 and later, ``torch.compile`` will detect dynamism automatically a
 
 Numbers produced with NVIDIA A100 SXM4 40GB, PyTorch 2.2.0, CUDA 12.1.
 
+
 ----
 
 
@@ -255,17 +256,33 @@ Naturally, the tradoff here is that it will consume a bit more memory.
 
 You can find a full list of compile options in the `PyTorch documentation <https://pytorch.org/docs/stable/generated/torch.compile.html>`_.
 
+
+----
+
+
+**************************************
+A note about torch.compile in practice
+**************************************
+
+In practice, you will find that ``torch.compile`` often doesn't work well and can even be counter-productive.
+Compilation may fail with cryptic error messages that are impossible to debug without help from the PyTorch team.
+It is also not uncommon that ``torch.compile`` will produce a significantly *slower* model or one with much higher memory usage.
+On top of that, the compilation phase itself can be incredibly slow, taking several minutes to finish.
+For these reasons, we recommend that you don't waste too much time trying to apply ``torch.compile`` during development, and rather evaluate its effectiveness toward the end when you are about to launch long-running, expensive experiments.
+Always compare the speed and memory usage of the compiled model against the original model!
+
+
 ----
 
 
-*******************************************************
-(Experimental) Apply torch.compile over FSDP, DDP, etc.
-*******************************************************
+*************************************
+Using torch.compile with FSDP and DDP
+*************************************
 
 As stated earlier, we recommend that you compile the model before calling ``fabric.setup()``.
-However, if you are using DDP or FSDP with Fabric, the compilation won't incorporate the distributed calls inside these wrappers by default.
-In an experimental feature, you can let ``fabric.setup()`` reapply the ``torch.compile`` call after the model gets wrapped in DDP/FSDP internally.
-In the future, this option will become the default.
+In the case of DDP and FSDP, ``fabric.setup()`` will automatically reapply the ``torch.compile`` call after the model gets wrapped in DDP/FSDP internally.
+This will ensure that the compilation can incorporate the distributed calls and optimize them.
+However, should you have issues compiling DDP and FSDP models, you can opt out of this feature:
 
 .. code-block:: python
@@ -275,25 +292,11 @@ In the future, this option will become the default.
     # Compile the model
     model = torch.compile(model)
 
-    # Default: `fabric.setup()` will not reapply the compilation over DDP/FSDP
-    model = fabric.setup(model, _reapply_compile=False)
-
-    # Recompile the model over DDP/FSDP (experimental)
+    # Default: `fabric.setup()` will configure compilation over DDP/FSDP for you
     model = fabric.setup(model, _reapply_compile=True)
 
+    # Turn it off if you see issues with DDP/FSDP
+    model = fabric.setup(model, _reapply_compile=False)
 
-----
-
-
-**************************************
-A note about torch.compile in practice
-**************************************
-
-In practice, you will find that ``torch.compile`` often doesn't work well and can even be counter-productive.
-Compilation may fail with cryptic error messages that are impossible to debug without help from the PyTorch team.
-It is also not uncommon that ``torch.compile`` will produce a significantly *slower* model or one with much higher memory usage.
-On top of that, the compilation phase itself can be incredibly slow, taking several minutes to finish.
-For these reasons, we recommend that you don't waste too much time trying to apply ``torch.compile`` during development, and rather evaluate its effectiveness toward the end when you are about to launch long-running, expensive experiments.
-Always compare the speed and memory usage of the compiled model against the original model!
 
 |
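
For readers who only skim the diff: a minimal, self-contained sketch of the workflow the updated docs describe. The toy model and Fabric arguments are illustrative; per the docs above, `fabric.setup()` now re-applies `torch.compile` over DDP/FSDP by default, and `_reapply_compile=False` opts out.

```python
import torch
import torch.nn as nn
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cuda", devices=2, strategy="ddp")
fabric.launch()

model = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))  # toy model

# Compile before setup, as the docs recommend
model = torch.compile(model)

# Default behavior per the updated docs: compilation is re-applied after DDP wrapping
model = fabric.setup(model)

# Opt out if the wrapped model fails to compile:
# model = fabric.setup(model, _reapply_compile=False)
```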

src/lightning/data/streaming/data_processor.py

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
 from urllib import parse
 
+import numpy as np
 from tqdm.auto import tqdm as _tqdm
 
 from lightning import seed_everything
@@ -290,7 +291,7 @@ def _map_items_to_workers_weighted(
         else:
             print(f"Worker {worker_id} gets ({len(worker_items[worker_id])}) items for a total weight of {size}.")
 
-    return [worker_items[worker_id] for worker_id in worker_ids_this_node]
+    return [np.random.permutation(worker_items[worker_id]).tolist() for worker_id in worker_ids_this_node]
 
 
 def _get_num_bytes(item: Any, base_path: str) -> int:
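
To make the behavioral change above concrete: each worker's assigned item list is now shuffled before being returned, rather than kept in assignment order. A standalone illustration (the file names are made up):

```python
import numpy as np

worker_items = {0: ["a.bin", "b.bin", "c.bin"], 1: ["d.bin", "e.bin"]}
worker_ids_this_node = [0, 1]

# Old behavior: items returned in assignment order
ordered = [worker_items[w] for w in worker_ids_this_node]

# New behavior: each worker's items are randomly permuted
shuffled = [np.random.permutation(worker_items[w]).tolist() for w in worker_ids_this_node]

print(ordered)   # [['a.bin', 'b.bin', 'c.bin'], ['d.bin', 'e.bin']]
print(shuffled)  # same items per worker, but in random order
```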

src/lightning/data/streaming/resolver.py

Lines changed: 6 additions & 2 deletions
@@ -303,13 +303,17 @@ def _execute(
     if not _LIGHTNING_SDK_AVAILABLE:
         raise ModuleNotFoundError("The `lightning_sdk` is required.")
 
+    lightning_skip_install = os.getenv("LIGHTNING_SKIP_INSTALL", "")
+    if lightning_skip_install:
+        lightning_skip_install = f" LIGHTNING_SKIP_INSTALL={lightning_skip_install} "
+
     lightning_branch = os.getenv("LIGHTNING_BRANCH", "")
     if lightning_branch:
-        lightning_branch = f" LIGHTNING_BRANCH={lightning_branch}"
+        lightning_branch = f" LIGHTNING_BRANCH={lightning_branch} "
 
     studio = Studio()
     job = studio._studio_api.create_data_prep_machine_job(
-        command or f"cd {os.getcwd()} &&{lightning_branch} python {' '.join(sys.argv)}",
+        command or f"cd {os.getcwd()} &&{lightning_skip_install}{lightning_branch} python {' '.join(sys.argv)}",
         name=name,
         num_instances=num_nodes,
         studio_id=studio._studio.id,
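
For context on the change above: the two environment variables are turned into space-padded prefixes that get spliced into the job's shell command. A rough sketch of the resulting string, with illustrative values and a hypothetical script name:

```python
import os

# Illustrative values; in _execute() these come from the caller's environment
skip_install = "1"
branch = "master"

lightning_skip_install = f" LIGHTNING_SKIP_INSTALL={skip_install} " if skip_install else ""
lightning_branch = f" LIGHTNING_BRANCH={branch} " if branch else ""

command = f"cd {os.getcwd()} &&{lightning_skip_install}{lightning_branch} python train.py"
print(command)
# e.g. "cd /teamspace/project && LIGHTNING_SKIP_INSTALL=1  LIGHTNING_BRANCH=master  python train.py"
```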

src/lightning/fabric/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed parsing of v100s GPUs in `get_available_flops` ([#18952](https://github.com/Lightning-AI/lightning/pull/18952))
 - Fixed issue where the `precision="transformer-engine"` argument would not replace layers by default ([#19082](https://github.com/Lightning-AI/lightning/pull/19082))
+- Fixed the input validation logic in `FSDPStrategy` to accept a `device_mesh` ([#19392](https://github.com/Lightning-AI/lightning/pull/19392))
 
 
 ## [2.1.4] - 2024-01-31
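
The `device_mesh` entry above refers to the `FSDPStrategy` constructor argument. A hedged sketch of how it might be passed; the 2-tuple mesh shape is an assumption about the accepted format, not taken from this diff:

```python
from lightning.fabric import Fabric
from lightning.fabric.strategies import FSDPStrategy

# Assumption: a (replicate, shard) tuple describing a hybrid-sharding mesh is accepted
strategy = FSDPStrategy(device_mesh=(2, 4))
fabric = Fabric(accelerator="cuda", devices=8, strategy=strategy)
```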
