From 2bdc588f76de6aa5f3615155181db4af9124e735 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Fri, 20 Dec 2024 02:16:48 +0900 Subject: [PATCH 01/46] bump: testing with future torch 2.6 --- .github/workflows/ci-tests-fabric.yml | 6 +++++- .github/workflows/ci-tests-pytorch.yml | 6 +++++- .github/workflows/docker-build.yml | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index c2fda73dcf1f4..87b3656a592c3 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -79,6 +79,10 @@ jobs: - { os: "macOS-14", pkg-name: "fabric", python-version: "3.9", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.9", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "fabric", python-version: "3.9", pytorch-version: "2.1" } + # adding recently cut Torch 2.6 + - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.6" } + - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.6" } + - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.6" } timeout-minutes: 25 # because of building grpcio on Mac env: PACKAGE_NAME: ${{ matrix.pkg-name }} @@ -127,7 +131,7 @@ jobs: - name: Env. 
variables run: | # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.5' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.6' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index ba8519ea8ed8a..8ba3fa58a7bb0 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -83,6 +83,10 @@ jobs: - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "2.1" } + # adding recently cut Torch 2.6 + - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.6" } + - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.6" } + - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.6" } timeout-minutes: 50 env: PACKAGE_NAME: ${{ matrix.pkg-name }} @@ -133,7 +137,7 @@ jobs: - name: Env. 
variables run: | # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.5' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.6' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'pytorch_lightning'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 09ae3adc45ac6..859bef808c762 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -109,6 +109,7 @@ jobs: - { python_version: "3.11", pytorch_version: "2.3.1", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.4.1", cuda_version: "12.1.0" } - { python_version: "3.12", pytorch_version: "2.5.1", cuda_version: "12.1.0" } + - { python_version: "3.12", pytorch_version: "2.6.0", cuda_version: "12.4.1" } steps: - uses: actions/checkout@v4 - uses: docker/setup-buildx-action@v3 From 89153f0a14992ac5b08e8513311a87bc224f10d5 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Fri, 20 Dec 2024 02:31:50 +0900 Subject: [PATCH 02/46] bump `typing-extensions` --- requirements/fabric/base.txt | 2 +- requirements/pytorch/base.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 42c055e85ca7d..70cd75c1c0d37 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -4,5 +4,5 @@ torch >=2.1.0, <2.6.0 fsspec[http] >=2022.5.0, <2024.4.0 packaging >=20.0, <=23.1 -typing-extensions >=4.4.0, <4.10.0 +typing-extensions >=4.4.0, <4.11.0 lightning-utilities >=0.10.0, <0.12.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 
94aca759c37e2..cdf3cc03e2985 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -7,5 +7,5 @@ PyYAML >=5.4, <6.1.0 fsspec[http] >=2022.5.0, <2024.4.0 torchmetrics >=0.7.0, <1.5.0 # needed for using fixed compare_version packaging >=20.0, <=23.1 -typing-extensions >=4.4.0, <4.10.0 +typing-extensions >=4.4.0, <4.11.0 lightning-utilities >=0.10.0, <0.12.0 From bc3cbd97ce6cecc8ef2eeb0035ecdbc1348b2866 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 7 Jan 2025 20:38:52 +0900 Subject: [PATCH 03/46] TORCHINDUCTOR_CACHE_DIR --- tests/tests_pytorch/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index ea5207516cad1..5200ac2e408e9 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -93,6 +93,7 @@ def restore_env_variables(): "TF_CPP_MIN_LOG_LEVEL", "TF_GRPC_DEFAULT_OPTIONS", "XLA_FLAGS", + "TORCHINDUCTOR_CACHE_DIR", # leaked by torch.compile } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" From 6cfa48c19d4764f11f7d57d85b83d201bebb0cbd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 11:39:16 +0000 Subject: [PATCH 04/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_pytorch/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 5200ac2e408e9..b7cac2c215057 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -93,7 +93,7 @@ def restore_env_variables(): "TF_CPP_MIN_LOG_LEVEL", "TF_GRPC_DEFAULT_OPTIONS", "XLA_FLAGS", - "TORCHINDUCTOR_CACHE_DIR", # leaked by torch.compile + "TORCHINDUCTOR_CACHE_DIR", # leaked by torch.compile } leaked_vars.difference_update(allowlist) assert not 
leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" From 0e1d047c9721914711f8ed0bf31f433e977eb959 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 7 Jan 2025 23:16:09 +0900 Subject: [PATCH 05/46] --extra-index-url=https://download.pytorch.org/whl/cpu/ --- .github/actions/pip-wheels/action.yml | 4 ++-- .github/workflows/_legacy-checkpoints.yml | 6 +++--- .github/workflows/ci-tests-fabric.yml | 6 +++--- .github/workflows/ci-tests-pytorch.yml | 8 ++++---- .github/workflows/docs-build.yml | 4 ++-- .github/workflows/release-pkg.yml | 4 ++-- tests/tests_pytorch/models/test_onnx.py | 5 ++--- 7 files changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/actions/pip-wheels/action.yml b/.github/actions/pip-wheels/action.yml index 28d6e346b7aa2..19f2e7bf5e182 100644 --- a/.github/actions/pip-wheels/action.yml +++ b/.github/actions/pip-wheels/action.yml @@ -46,8 +46,8 @@ runs: run: | # cat requirements.dump pip wheel -r requirements.dump --prefer-binary \ - --wheel-dir=.wheels \ - -f ${{ inputs.torch-url }} -f ${{ inputs.wheel-dir }} + --wheel-dir=".wheels" \ + --extra-index-url=${{ inputs.torch-url }} -f ${{ inputs.wheel-dir }} ls -lh .wheels/ shell: bash diff --git a/.github/workflows/_legacy-checkpoints.yml b/.github/workflows/_legacy-checkpoints.yml index 0161ab57bca52..4107633424388 100644 --- a/.github/workflows/_legacy-checkpoints.yml +++ b/.github/workflows/_legacy-checkpoints.yml @@ -43,7 +43,7 @@ on: env: LEGACY_FOLDER: "tests/legacy" - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" + TORCH_URL: "https://download.pytorch.org/whl/cpu/" defaults: run: @@ -67,12 +67,12 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 timeout-minutes: 20 - run: pip install . -f ${TORCH_URL} + run: pip install . 
--extra-index-url="${TORCH_URL}" if: inputs.pl_version == '' - name: Install PL version timeout-minutes: 20 - run: pip install "pytorch-lightning==${{ inputs.pl_version }}" -f ${TORCH_URL} + run: pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}" if: inputs.pl_version != '' - name: Adjust tests -> PL diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index aac28f1f593e9..7d4a60532c1aa 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -88,8 +88,8 @@ jobs: PACKAGE_NAME: ${{ matrix.pkg-name }} FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} PYPI_CACHE_DIR: "_pip-wheels" - TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/torch_stable.html" - TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch" + TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/" + TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" # TODO: Remove this - Enable running MPS tests on this platform DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: @@ -143,7 +143,7 @@ jobs: timeout-minutes: 20 run: | pip install -e ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" -U --prefer-binary \ - --find-links="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" + --extra-index-url="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" pip list - name: Dump handy wheels if: github.event_name == 'push' && github.ref == 'refs/heads/master' diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 31cc44cd365e6..ae7fc6f59e77d 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -90,9 +90,9 @@ jobs: timeout-minutes: 50 env: PACKAGE_NAME: ${{ matrix.pkg-name }} - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" - TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/torch_stable.html" 
- TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch" + TORCH_URL: "https://download.pytorch.org/whl/cpu/" + TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/" + TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} PYPI_CACHE_DIR: "_pip-wheels" # TODO: Remove this - Enable running MPS tests on this platform @@ -150,7 +150,7 @@ jobs: run: | pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" -U --prefer-binary \ -r requirements/_integrations/accelerators.txt \ - --find-links="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" + --extra-index-url="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" pip list - name: Drop LAI from extensions if: ${{ matrix.pkg-name != 'lightning' }} diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml index d84b7bed7a34a..12d03f591e774 100644 --- a/.github/workflows/docs-build.yml +++ b/.github/workflows/docs-build.yml @@ -46,7 +46,7 @@ defaults: env: FREEZE_REQUIREMENTS: "1" - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" + TORCH_URL: "https://download.pytorch.org/whl/cpu/" PYPI_CACHE_DIR: "_pip-wheels" PYPI_LOCAL_DIR: "pypi_pkgs/" @@ -106,7 +106,7 @@ jobs: mkdir -p ${PYPI_CACHE_DIR} # in case cache was not hit ls -lh ${PYPI_CACHE_DIR} pip install .[all] -U -r requirements/${{ matrix.pkg-name }}/docs.txt \ - -f ${PYPI_LOCAL_DIR} -f ${PYPI_CACHE_DIR} -f ${TORCH_URL} + -f ${PYPI_LOCAL_DIR} -f ${PYPI_CACHE_DIR} --extra-index-url="${TORCH_URL}" pip list - name: Install req. 
for Notebooks/tutorials if: matrix.pkg-name == 'pytorch' diff --git a/.github/workflows/release-pkg.yml b/.github/workflows/release-pkg.yml index c7828d70f7103..9786c2f57b3c7 100644 --- a/.github/workflows/release-pkg.yml +++ b/.github/workflows/release-pkg.yml @@ -23,7 +23,7 @@ defaults: env: FREEZE_REQUIREMENTS: 1 - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" + TORCH_URL: "https://download.pytorch.org/whl/cpu/" PYTHON_VER: "3.9" jobs: @@ -60,7 +60,7 @@ jobs: python-version: ${{ env.PYTHON_VER }} - name: install Package run: | - pip install . -f ${TORCH_URL} + pip install . --extra-index-url="${TORCH_URL}" pip list - name: package Version id: lai-package diff --git a/tests/tests_pytorch/models/test_onnx.py b/tests/tests_pytorch/models/test_onnx.py index ee670cd66e871..9f99dfa7277ef 100644 --- a/tests/tests_pytorch/models/test_onnx.py +++ b/tests/tests_pytorch/models/test_onnx.py @@ -118,10 +118,9 @@ def test_verbose_param(tmp_path, capsys): model.example_input_array = torch.randn(5, 32) file_path = os.path.join(tmp_path, "model.onnx") - with patch("torch.onnx.log", autospec=True) as test: + with patch("torch.onnx.log", autospec=True) as mocked: model.to_onnx(file_path, verbose=True) - args, _ = test.call_args - prefix, _ = args + (prefix, _), _ = mocked.call_args assert prefix == "Exported graph: " From 6946f4752310357a8092825e722ef01c4b7c0502 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 00:56:30 +0900 Subject: [PATCH 06/46] onnx.log max_torch="2.6.0" --- tests/tests_pytorch/models/test_onnx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/models/test_onnx.py b/tests/tests_pytorch/models/test_onnx.py index 9f99dfa7277ef..7436d1e9a56a2 100644 --- a/tests/tests_pytorch/models/test_onnx.py +++ b/tests/tests_pytorch/models/test_onnx.py @@ -111,7 +111,7 @@ def test_model_saves_on_multi_gpu(tmp_path): assert os.path.exists(file_path) is True -@RunIf(onnx=True) +@RunIf(onnx=True, 
max_torch="2.6.0") def test_verbose_param(tmp_path, capsys): """Test that output is present when verbose parameter is set.""" model = BoringModel() From 7530f5777ce152587a8f74a1f51c6dbf83e4ceea Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 01:06:17 +0900 Subject: [PATCH 07/46] todo --- tests/tests_pytorch/models/test_onnx.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_pytorch/models/test_onnx.py b/tests/tests_pytorch/models/test_onnx.py index 7436d1e9a56a2..46787d987f9b8 100644 --- a/tests/tests_pytorch/models/test_onnx.py +++ b/tests/tests_pytorch/models/test_onnx.py @@ -111,6 +111,7 @@ def test_model_saves_on_multi_gpu(tmp_path): assert os.path.exists(file_path) is True +# todo: investigate where the logging happening in torch.onnx for PT 2.6+ @RunIf(onnx=True, max_torch="2.6.0") def test_verbose_param(tmp_path, capsys): """Test that output is present when verbose parameter is set.""" From 26a8864b21258d494f46c0899dba052a095e33ca Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 17:58:05 +0900 Subject: [PATCH 08/46] docker --- dockers/base-cuda/Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 0e56f2fa93bd9..0da0cf9b2de9f 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -92,9 +92,8 @@ RUN \ -r requirements/pytorch/extra.txt \ -r requirements/pytorch/test.txt \ -r requirements/pytorch/strategies.txt \ - --find-links="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM//'.'/''}/torch_stable.html" \ - --find-links="https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM//'.'/''}/torch" \ - --find-links="https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM//'.'/''}/pytorch-triton" + --extra-index-url="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM//'.'/''}/" \ + --extra-index-url="https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM//'.'/''}/" RUN \ # Show what we have From 
e52f392841c9d855ffe02cbad5b4218012699eb8 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 18:15:00 +0900 Subject: [PATCH 09/46] gpu --- .azure/gpu-tests-fabric.yml | 7 +++++-- .azure/gpu-tests-pytorch.yml | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 0d970a552cecc..fb6c5d755b0dc 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -57,11 +57,14 @@ jobs: strategy: matrix: "Fabric | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.1" PACKAGE_NAME: "fabric" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" PACKAGE_NAME: "lightning" + "Fabric | future": + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" + PACKAGE_NAME: "fabric" workspace: clean: all steps: diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index e09ad011908cb..b4ac03241efe6 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -50,11 +50,14 @@ jobs: strategy: matrix: "PyTorch | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.1" PACKAGE_NAME: "pytorch" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" PACKAGE_NAME: "lightning" + "PyTorch | future": + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" + PACKAGE_NAME: "pytorch" pool: lit-rtx-3090 variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) From 
e7d800f91e20b335bf368a3fd3b4aa207bd3471e Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 18:20:40 +0900 Subject: [PATCH 10/46] gpu --- .azure/gpu-tests-fabric.yml | 5 ++++- .azure/gpu-tests-pytorch.yml | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index fb6c5d755b0dc..ce7f4d99fc6e5 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -56,8 +56,11 @@ jobs: options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp" strategy: matrix: + "Fabric | oldest": + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + PACKAGE_NAME: "fabric" "Fabric | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.5-cuda12.1.1" PACKAGE_NAME: "fabric" "Lightning | latest": image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b4ac03241efe6..56ebc8a54b2a6 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -49,8 +49,11 @@ jobs: cancelTimeoutInMinutes: "2" strategy: matrix: + "PyTorch | oldest": + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + PACKAGE_NAME: "pytorch" "PyTorch | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.5-cuda12.1.1" PACKAGE_NAME: "pytorch" "Lightning | latest": image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" From 9fb71dce7c0744f38ea212d522585c7bacfb71c3 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 18:32:20 +0900 Subject: [PATCH 11/46] check --- .azure/gpu-tests-fabric.yml | 2 +- .azure/gpu-tests-pytorch.yml | 1 - requirements/pytorch/check-avail-extras.py | 4 ++++ 3 files changed, 5 
insertions(+), 2 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index ce7f4d99fc6e5..058ff8af3beeb 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -120,7 +120,7 @@ jobs: set -e python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" - python -c "import bitsandbytes" + python requirements/pytorch/check-avail-extras.py displayName: "Env details" - bash: python -m pytest lightning_fabric diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 56ebc8a54b2a6..4f9bd1ddd687f 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -135,7 +135,6 @@ jobs: python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" python requirements/pytorch/check-avail-extras.py - python -c "import bitsandbytes" displayName: "Env details" - bash: python -m pytest pytorch_lightning diff --git a/requirements/pytorch/check-avail-extras.py b/requirements/pytorch/check-avail-extras.py index 3ab8d2848c3f0..cebafa587a359 100644 --- a/requirements/pytorch/check-avail-extras.py +++ b/requirements/pytorch/check-avail-extras.py @@ -4,3 +4,7 @@ import matplotlib # noqa: F401 import omegaconf # noqa: F401 import rich # noqa: F401 + + import torch # noqa: F401 + if torch.cuda.is_available(): + import bitsandbytes # noqa: F401 From daa09206f7df81617e13e15247be3c7ef2668890 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:33:19 +0000 Subject: [PATCH 12/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- requirements/pytorch/check-avail-extras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/check-avail-extras.py b/requirements/pytorch/check-avail-extras.py 
index cebafa587a359..a9f86282bd782 100644 --- a/requirements/pytorch/check-avail-extras.py +++ b/requirements/pytorch/check-avail-extras.py @@ -4,7 +4,7 @@ import matplotlib # noqa: F401 import omegaconf # noqa: F401 import rich # noqa: F401 + import torch - import torch # noqa: F401 if torch.cuda.is_available(): import bitsandbytes # noqa: F401 From 6c26f4851b3c4e77270ade994020e8ae81b354a3 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 18:37:26 +0900 Subject: [PATCH 13/46] gpu --- .azure/gpu-tests-fabric.yml | 2 +- .azure/gpu-tests-pytorch.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 058ff8af3beeb..6446df50dd2af 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -60,7 +60,7 @@ jobs: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" PACKAGE_NAME: "fabric" "Fabric | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.5-cuda12.1.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" PACKAGE_NAME: "fabric" "Lightning | latest": image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 4f9bd1ddd687f..6cdf71aed9470 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -53,7 +53,7 @@ jobs: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" PACKAGE_NAME: "pytorch" "PyTorch | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.5-cuda12.1.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" PACKAGE_NAME: "pytorch" "Lightning | latest": image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" From 3a066811f5e45dcd0ea4971ec40186864ec5eaf4 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 20:01:10 +0900 Subject: 
[PATCH 14/46] bitsandbytes --- .azure/gpu-tests-fabric.yml | 11 +++++++++++ .azure/gpu-tests-pytorch.yml | 11 +++++++++++ requirements/pytorch/check-avail-extras.py | 4 ---- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 6446df50dd2af..a2737c30c0e97 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -123,6 +123,17 @@ jobs: python requirements/pytorch/check-avail-extras.py displayName: "Env details" + - bash: | + # get pytorch version + PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + # FixMe: uninstall bitsandbytes for pytorch 2.6 as it is not compatible with `triton.ops` + if [[ "${PYTORCH_VERSION}" == "2.6.0" ]]; then + pip uninstall -y bitsandbytes + else + python -c "import bitsandbytes" + fi + displayName: "Handle bitsandbytes" + - bash: python -m pytest lightning_fabric workingDirectory: src # without succeeded this could run even if the job has already failed diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 6cdf71aed9470..9eda6aa59e89d 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -137,6 +137,17 @@ jobs: python requirements/pytorch/check-avail-extras.py displayName: "Env details" + - bash: | + # get pytorch version + PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + # FixMe: uninstall bitsandbytes for pytorch 2.6 as it is not compatible with `triton.ops` + if [[ "${PYTORCH_VERSION}" == "2.6.0" ]]; then + pip uninstall -y bitsandbytes + else + python -c "import bitsandbytes" + fi + displayName: "Handle bitsandbytes" + - bash: python -m pytest pytorch_lightning workingDirectory: src # without succeeded this could run even if the job has already failed diff --git a/requirements/pytorch/check-avail-extras.py b/requirements/pytorch/check-avail-extras.py index a9f86282bd782..3ab8d2848c3f0 100644 --- 
a/requirements/pytorch/check-avail-extras.py +++ b/requirements/pytorch/check-avail-extras.py @@ -4,7 +4,3 @@ import matplotlib # noqa: F401 import omegaconf # noqa: F401 import rich # noqa: F401 - import torch - - if torch.cuda.is_available(): - import bitsandbytes # noqa: F401 From d31f23cc94810f7e2620348a220e74326c881e84 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 20:05:43 +0900 Subject: [PATCH 15/46] bitsandbytes --- .azure/gpu-tests-fabric.yml | 14 +++++++------- .azure/gpu-tests-pytorch.yml | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index a2737c30c0e97..a42d9364c4c22 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -116,13 +116,6 @@ jobs: pip install setuptools==75.6.0 jsonargparse==4.35.0 displayName: "Install package & dependencies" - - bash: | - set -e - python requirements/collect_env_details.py - python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" - python requirements/pytorch/check-avail-extras.py - displayName: "Env details" - - bash: | # get pytorch version PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") @@ -134,6 +127,13 @@ jobs: fi displayName: "Handle bitsandbytes" + - bash: | + set -e + python requirements/collect_env_details.py + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" + python requirements/pytorch/check-avail-extras.py + displayName: "Env details" + - bash: python -m pytest lightning_fabric workingDirectory: src # without succeeded this could run even if the job has already failed diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 9eda6aa59e89d..1ef00a6dd9d25 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -130,13 +130,6 @@ jobs: condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'lightning')) displayName: 
"Drop PL for LAI" - - bash: | - set -e - python requirements/collect_env_details.py - python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" - python requirements/pytorch/check-avail-extras.py - displayName: "Env details" - - bash: | # get pytorch version PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") @@ -148,6 +141,13 @@ jobs: fi displayName: "Handle bitsandbytes" + - bash: | + set -e + python requirements/collect_env_details.py + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" + python requirements/pytorch/check-avail-extras.py + displayName: "Env details" + - bash: python -m pytest pytorch_lightning workingDirectory: src # without succeeded this could run even if the job has already failed From 97cf6ed632ccc35c736fa4563875d3863ab3a6f3 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 20:16:53 +0900 Subject: [PATCH 16/46] set -e --- .azure/gpu-tests-fabric.yml | 1 + .azure/gpu-tests-pytorch.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index a42d9364c4c22..d5980be0c48d4 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -111,6 +111,7 @@ jobs: displayName: "Adjust dependencies" - bash: | + set -e extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))") pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}" pip install setuptools==75.6.0 jsonargparse==4.35.0 diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 1ef00a6dd9d25..b763636ca5740 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -115,6 +115,7 @@ jobs: displayName: "Adjust dependencies" - bash: | + set -e extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))") pip install -e ".[${extra}dev]" pytest-timeout -U 
--find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}" pip install setuptools==75.6.0 jsonargparse==4.35.0 From 5e9009b12245c404ddd3c8c46288be933cd2f570 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 21:31:05 +0900 Subject: [PATCH 17/46] fix --- .azure/gpu-tests-fabric.yml | 5 ++--- .azure/gpu-tests-pytorch.yml | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index d5980be0c48d4..da0483dbdae53 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -83,9 +83,8 @@ jobs: displayName: "set env. vars" - bash: | echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}" - echo "##vso[task.setvariable variable=TORCHVISION_URL]https://download.pytorch.org/whl/test/cu124/torchvision-0.19.0%2Bcu124-cp${PYTHON_VERSION_MM}-cp${PYTHON_VERSION_MM}-linux_x86_64.whl" condition: endsWith(variables['Agent.JobName'], 'future') - displayName: "set env. vars 4 future" + displayName: "extend env. vars 4 future" - bash: | echo $(DEVICES) @@ -113,7 +112,7 @@ jobs: - bash: | set -e extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))") - pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}" + pip install -e ".[${extra}dev]" pytest-timeout -U --extra-index-url="${TORCH_URL}" pip install setuptools==75.6.0 jsonargparse==4.35.0 displayName: "Install package & dependencies" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b763636ca5740..81ed24bbe3806 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -87,9 +87,8 @@ jobs: displayName: "set env. 
vars" - bash: | echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}" - echo "##vso[task.setvariable variable=TORCHVISION_URL]https://download.pytorch.org/whl/test/cu124/torchvision-0.19.0%2Bcu124-cp${PYTHON_VERSION_MM}-cp${PYTHON_VERSION_MM}-linux_x86_64.whl" condition: endsWith(variables['Agent.JobName'], 'future') - displayName: "set env. vars 4 future" + displayName: "extend env. vars 4 future" - bash: | echo $(DEVICES) @@ -117,7 +116,7 @@ jobs: - bash: | set -e extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))") - pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}" + pip install -e ".[${extra}dev]" pytest-timeout -U --extra-index-url="${TORCH_URL}" pip install setuptools==75.6.0 jsonargparse==4.35.0 displayName: "Install package & dependencies" From 165d38a59cf66a1d854e5c8ca729ff789c2ccdc2 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 8 Jan 2025 21:42:43 +0900 Subject: [PATCH 18/46] future --- .azure/gpu-tests-fabric.yml | 5 ++--- .azure/gpu-tests-pytorch.yml | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index da0483dbdae53..43880fc5ba4ff 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -117,10 +117,9 @@ jobs: displayName: "Install package & dependencies" - bash: | - # get pytorch version - PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") # FixMe: uninstall bitsandbytes for pytorch 2.6 as it is not compatible with `triton.ops` - if [[ "${PYTORCH_VERSION}" == "2.6.0" ]]; then + # if the job name include key word future, then uninstall bitsandbytes + if [[ "$(Agent.JobName)" == *"future"* ]]; then pip uninstall -y bitsandbytes else python -c "import bitsandbytes" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 81ed24bbe3806..317913bc6f1b1 
100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -131,10 +131,9 @@ jobs: displayName: "Drop PL for LAI" - bash: | - # get pytorch version - PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") # FixMe: uninstall bitsandbytes for pytorch 2.6 as it is not compatible with `triton.ops` - if [[ "${PYTORCH_VERSION}" == "2.6.0" ]]; then + # if the job name include key word future, then uninstall bitsandbytes + if [[ "$(Agent.JobName)" == *"future"* ]]; then pip uninstall -y bitsandbytes else python -c "import bitsandbytes" From 7024590cf9b61a13680947dd8c55d983ae7bdf32 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Thu, 9 Jan 2025 01:16:40 +0900 Subject: [PATCH 19/46] batch --- .azure/gpu-tests-fabric.yml | 1 + .azure/gpu-tests-pytorch.yml | 1 + tests/run_standalone_tests.sh | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 43880fc5ba4ff..0dede598b58b3 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -49,6 +49,7 @@ jobs: FREEZE_REQUIREMENTS: "1" PIP_CACHE_DIR: "/var/tmp/pip" PL_RUN_CUDA_TESTS: "1" + PL_STANDALONE_TESTS_BATCH_SIZE: "2" container: image: $(image) # default shm size is 64m. Increase it to avoid: diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 317913bc6f1b1..7e52df9216027 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -67,6 +67,7 @@ jobs: FREEZE_REQUIREMENTS: "1" PIP_CACHE_DIR: "/var/tmp/pip" PL_RUN_CUDA_TESTS: "1" + PL_STANDALONE_TESTS_BATCH_SIZE: "3" container: image: $(image) # default shm size is 64m. 
Increase it to avoid: diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh index 75a52e16c57dc..136198ad4cf31 100755 --- a/tests/run_standalone_tests.sh +++ b/tests/run_standalone_tests.sh @@ -17,7 +17,7 @@ set -e # Batch size for testing: Determines how many standalone test invocations run in parallel # It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set -test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-3}" +test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-5}" source="${PL_STANDALONE_TESTS_SOURCE:-"lightning"}" # this is the directory where the tests are located test_dir=$1 # parse the first argument From 83a01f1507a36ceaf89566e7740097c5f7f10563 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 22 Jan 2025 10:37:09 +0100 Subject: [PATCH 20/46] Empty-Commit From 9e2280643bda18447c44d6f6716334e4f8a8b582 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Fri, 14 Feb 2025 17:15:42 +0100 Subject: [PATCH 21/46] update --- .azure/gpu-tests-fabric.yml | 7 ++----- .azure/gpu-tests-pytorch.yml | 7 ++----- .github/workflows/ci-tests-fabric.yml | 28 ++++++++++++-------------- .github/workflows/ci-tests-pytorch.yml | 28 ++++++++++++-------------- 4 files changed, 30 insertions(+), 40 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 0dede598b58b3..7ebcee9f7e29e 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -61,14 +61,11 @@ jobs: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" PACKAGE_NAME: "fabric" "Fabric | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" PACKAGE_NAME: "fabric" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" - PACKAGE_NAME: "lightning" - "Fabric | future": image: 
"pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" - PACKAGE_NAME: "fabric" + PACKAGE_NAME: "lightning" workspace: clean: all steps: diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 7e52df9216027..eddb13d6f99ef 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -53,14 +53,11 @@ jobs: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" PACKAGE_NAME: "pytorch" "PyTorch | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" PACKAGE_NAME: "pytorch" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.1" - PACKAGE_NAME: "lightning" - "PyTorch | future": image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" - PACKAGE_NAME: "pytorch" + PACKAGE_NAME: "lightning" pool: lit-rtx-3090 variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 7d4a60532c1aa..17228fcc729c8 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -56,33 +56,31 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues - - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.5.1" } + - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } + - { os: 
"ubuntu-22.04", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } + - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } # "oldest" versions tests, only on minimum Python - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } + - { os: "macOS-14", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - { os: "ubuntu-20.04", - pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest", } - { os: "windows-2022", - pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest", } # "fabric" installs the standalone package - - { os: "macOS-14", pkg-name: "fabric", python-version: "3.9", pytorch-version: "2.1" } - - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.9", pytorch-version: "2.1" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.9", pytorch-version: "2.1" } - # adding recently cut Torch 2.6 - - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.6" } - - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.6" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.6" } + - { os: "macOS-14", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } + - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } + - { os: "windows-2022", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } + # adding recently cut Torch 2.7 - FUTURE + # - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.7" } + # - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.7" } + # - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.7" } timeout-minutes: 25 # because of building grpcio on Mac env: PACKAGE_NAME: ${{ 
matrix.pkg-name }} @@ -130,8 +128,8 @@ jobs: - name: Env. variables run: | - # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.6' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + # Switch PyTorch URL between stable and test/future + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.7' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index ae7fc6f59e77d..3967c94466e61 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -60,33 +60,31 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.5.1" } + - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } + - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } + - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } # "oldest" versions tests, only on minimum Python - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } + - { 
os: "macOS-14", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - { os: "ubuntu-20.04", - pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest", } - { os: "windows-2022", - pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest", } # "pytorch" installs the standalone package - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "2.1" } - - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "2.1" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "2.1" } - # adding recently cut Torch 2.6 - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.6" } - - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.6" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.6" } + - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } + - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } + - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } + # adding recently cut Torch 2.7 - FUTURE + - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } + - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } + - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } timeout-minutes: 50 env: PACKAGE_NAME: ${{ matrix.pkg-name }} @@ -136,8 +134,8 @@ jobs: - name: Env. 
variables run: | - # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.6' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + # Switch PyTorch URL between stable and test/future + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.7' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'pytorch_lightning'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage From bee93b631a82bb5dbe33996397af447974b76c50 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 16:17:00 +0000 Subject: [PATCH 22/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/workflows/ci-tests-fabric.yml | 14 ++------------ .github/workflows/ci-tests-pytorch.yml | 14 ++------------ 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 17228fcc729c8..5496144371e9e 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -61,18 +61,8 @@ jobs: - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } # "oldest" versions tests, only on minimum Python - { os: "macOS-14", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - - { - os: "ubuntu-20.04", - python-version: "3.9", - pytorch-version: "2.1", - requires: "oldest", - } - - { - os: "windows-2022", - python-version: "3.9", - pytorch-version: "2.1", - requires: "oldest", - } + - { os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } + - { os: "windows-2022", python-version: "3.9", pytorch-version: "2.1", requires: 
"oldest" } # "fabric" installs the standalone package - { os: "macOS-14", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 3967c94466e61..70da745fd26c4 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -65,18 +65,8 @@ jobs: - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } # "oldest" versions tests, only on minimum Python - { os: "macOS-14", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - - { - os: "ubuntu-20.04", - python-version: "3.9", - pytorch-version: "2.1", - requires: "oldest", - } - - { - os: "windows-2022", - python-version: "3.9", - pytorch-version: "2.1", - requires: "oldest", - } + - { os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } + - { os: "windows-2022", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } # "pytorch" installs the standalone package - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } From 4607a55638b0d92f03272f62a733d2a49bc5568f Mon Sep 17 00:00:00 2001 From: Jirka B Date: Fri, 14 Feb 2025 17:20:18 +0100 Subject: [PATCH 23/46] if [[ "$(image)" == *"torch2.6"* ]]; then --- .azure/gpu-tests-fabric.yml | 2 +- .azure/gpu-tests-pytorch.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 7ebcee9f7e29e..451f56629b088 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -117,7 +117,7 @@ jobs: - bash: | # FixMe: uninstall bitsandbytes for pytorch 2.6 as it is not compatible with `triton.ops` # if the job name include key word 
future, then uninstall bitsandbytes - if [[ "$(Agent.JobName)" == *"future"* ]]; then + if [[ "$(image)" == *"torch2.6"* ]]; then pip uninstall -y bitsandbytes else python -c "import bitsandbytes" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index eddb13d6f99ef..b94f828d1be10 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -131,7 +131,7 @@ jobs: - bash: | # FixMe: uninstall bitsandbytes for pytorch 2.6 as it is not compatible with `triton.ops` # if the job name include key word future, then uninstall bitsandbytes - if [[ "$(Agent.JobName)" == *"future"* ]]; then + if [[ "$(image)" == *"torch2.6"* ]]; then pip uninstall -y bitsandbytes else python -c "import bitsandbytes" From fc4811c59a461c1c78b1e89d478920d2cedc0344 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Fri, 14 Feb 2025 18:06:46 +0100 Subject: [PATCH 24/46] update --- .github/workflows/ci-tests-fabric.yml | 8 ++++---- .github/workflows/ci-tests-pytorch.yml | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 5496144371e9e..3ab7345472926 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -60,9 +60,9 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } # "oldest" versions tests, only on minimum Python - - { os: "macOS-14", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - - { os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - - { os: "windows-2022", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } + - { os: "macOS-14", pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } + - { os: "ubuntu-20.04", pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } 
+ - { os: "windows-2022", pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } # "fabric" installs the standalone package - { os: "macOS-14", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } @@ -86,7 +86,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ matrix.python-version || '3.x' }} - name: basic setup run: pip install -q -r .actions/requirements.txt diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 70da745fd26c4..f856cc7541327 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -64,17 +64,17 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } # "oldest" versions tests, only on minimum Python - - { os: "macOS-14", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - - { os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - - { os: "windows-2022", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } + - { os: "macOS-14", pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } + - { os: "ubuntu-20.04", pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } + - { os: "windows-2022", pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } # "pytorch" installs the standalone package - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } # adding recently cut Torch 2.7 - 
FUTURE - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } - - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } + # - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } + # - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } + # - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } timeout-minutes: 50 env: PACKAGE_NAME: ${{ matrix.pkg-name }} @@ -91,7 +91,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ matrix.python-version || '3.x' }} - name: basic setup run: pip install -q -r .actions/requirements.txt From cfa27a765c89cdd5b425fc73ced83accaa351f52 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Fri, 14 Feb 2025 18:10:34 +0100 Subject: [PATCH 25/46] 3.9 --- .github/workflows/ci-tests-fabric.yml | 2 +- .github/workflows/ci-tests-pytorch.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 3ab7345472926..f451ce0fd8b17 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -86,7 +86,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version || '3.x' }} + python-version: ${{ matrix.python-version || '3.9' }} - name: basic setup run: pip install -q -r .actions/requirements.txt diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index f856cc7541327..9c981e3a87536 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -91,7 +91,7 @@ jobs: - name: Set up 
Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version || '3.x' }} + python-version: ${{ matrix.python-version || '3.9' }} - name: basic setup run: pip install -q -r .actions/requirements.txt From cdbd1a64457dbe15a98d1f29a8a8fae985f1d3b9 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Fri, 14 Feb 2025 20:01:29 +0100 Subject: [PATCH 26/46] PL_STANDALONE_TESTS_BATCH_SIZE --- .azure/gpu-tests-fabric.yml | 2 +- .azure/gpu-tests-pytorch.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 451f56629b088..01e654bc83af5 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -49,7 +49,7 @@ jobs: FREEZE_REQUIREMENTS: "1" PIP_CACHE_DIR: "/var/tmp/pip" PL_RUN_CUDA_TESTS: "1" - PL_STANDALONE_TESTS_BATCH_SIZE: "2" + PL_STANDALONE_TESTS_BATCH_SIZE: "1" # todo container: image: $(image) # default shm size is 64m. Increase it to avoid: diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b94f828d1be10..d7bfd8318b9fb 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -64,7 +64,7 @@ jobs: FREEZE_REQUIREMENTS: "1" PIP_CACHE_DIR: "/var/tmp/pip" PL_RUN_CUDA_TESTS: "1" - PL_STANDALONE_TESTS_BATCH_SIZE: "3" + PL_STANDALONE_TESTS_BATCH_SIZE: "2" # todo container: image: $(image) # default shm size is 64m. 
Increase it to avoid: From 3d8f484a6fb2b43e0fff5a7eb33a8ad89f74dbfb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 19:02:46 +0000 Subject: [PATCH 27/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/gpu-tests-fabric.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 01e654bc83af5..a8121502760e3 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -49,7 +49,7 @@ jobs: FREEZE_REQUIREMENTS: "1" PIP_CACHE_DIR: "/var/tmp/pip" PL_RUN_CUDA_TESTS: "1" - PL_STANDALONE_TESTS_BATCH_SIZE: "1" # todo + PL_STANDALONE_TESTS_BATCH_SIZE: "1" # todo container: image: $(image) # default shm size is 64m. Increase it to avoid: From 6559d4f3001274d424e67da4fa3f32c68cc82fce Mon Sep 17 00:00:00 2001 From: Jirka B Date: Sat, 15 Feb 2025 00:04:24 +0100 Subject: [PATCH 28/46] python -c "import bitsandbytes" --- .azure/gpu-tests-fabric.yml | 8 +------- .azure/gpu-tests-pytorch.yml | 8 +------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index a8121502760e3..032511bca9702 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -115,13 +115,7 @@ jobs: displayName: "Install package & dependencies" - bash: | - # FixMe: uninstall bitsandbytes for pytorch 2.6 as it is not compatible with `triton.ops` - # if the job name include key word future, then uninstall bitsandbytes - if [[ "$(image)" == *"torch2.6"* ]]; then - pip uninstall -y bitsandbytes - else - python -c "import bitsandbytes" - fi + python -c "import bitsandbytes" displayName: "Handle bitsandbytes" - bash: | diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index d7bfd8318b9fb..2bf1aba2f63e3 100644 --- a/.azure/gpu-tests-pytorch.yml +++ 
b/.azure/gpu-tests-pytorch.yml @@ -129,13 +129,7 @@ jobs: displayName: "Drop PL for LAI" - bash: | - # FixMe: uninstall bitsandbytes for pytorch 2.6 as it is not compatible with `triton.ops` - # if the job name include key word future, then uninstall bitsandbytes - if [[ "$(image)" == *"torch2.6"* ]]; then - pip uninstall -y bitsandbytes - else - python -c "import bitsandbytes" - fi + python -c "import bitsandbytes" displayName: "Handle bitsandbytes" - bash: | From 2355ae29ba8749e3d7405104f97342ba8a558e81 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Tue, 18 Feb 2025 10:31:29 +0100 Subject: [PATCH 29/46] Bump bitsandbytes upper bound for CI --- requirements/fabric/strategies.txt | 4 ++-- requirements/pytorch/extra.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index 394aceb39cd6b..a0dfe64538cbd 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -6,5 +6,5 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict -bitsandbytes >=0.44.0,<0.44.2; sys_platform == 'linux' or sys_platform == 'win32' -bitsandbytes >=0.42.0,<0.43.0 ; sys_platform == 'darwin' +bitsandbytes >=0.44.0,<0.45.3; sys_platform == 'linux' or sys_platform == 'win32' +bitsandbytes >=0.42.0,<0.45.3 ; sys_platform == 'darwin' diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 70c6548817b4a..2cd4e44a2eaf1 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -8,5 +8,5 @@ hydra-core >=1.2.0, <1.4.0 jsonargparse[signatures] >=4.27.7, <=4.35.0 rich >=12.3.0, <13.6.0 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute -bitsandbytes >=0.44.0,<0.44.2; sys_platform == 'linux' or 
sys_platform == 'win32' -bitsandbytes >=0.42.0,<0.43.0 ; sys_platform == 'darwin' +bitsandbytes >=0.44.0,<0.45.3; sys_platform == 'linux' or sys_platform == 'win32' +bitsandbytes >=0.42.0,<0.45.3 ; sys_platform == 'darwin' From d2f4b0a9a8f750aea112c038c838932140dfff64 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Tue, 18 Feb 2025 11:05:09 +0100 Subject: [PATCH 30/46] Bump bitsandbytes lower bound for CI --- requirements/fabric/strategies.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index a0dfe64538cbd..9d3d6d599bdb3 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -6,5 +6,5 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict -bitsandbytes >=0.44.0,<0.45.3; sys_platform == 'linux' or sys_platform == 'win32' -bitsandbytes >=0.42.0,<0.45.3 ; sys_platform == 'darwin' +bitsandbytes >=0.45.2,<0.45.3; sys_platform == 'linux' or sys_platform == 'win32' +bitsandbytes >=0.45.2,<0.45.3 ; sys_platform == 'darwin' From f5bd47a7740380bd5d67ad7183fbc81929c2a75a Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 08:48:52 +0100 Subject: [PATCH 31/46] COVERAGE_SOURCE --- .azure/gpu-tests-fabric.yml | 1 - .azure/gpu-tests-pytorch.yml | 1 - tests/run_standalone_tests.sh | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index f555a25ea3e78..a807175005a4d 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -151,7 +151,6 @@ jobs: - bash: bash ./run_standalone_tests.sh "tests_fabric" workingDirectory: tests/ env: - PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE) PL_RUN_STANDALONE_TESTS: "1" displayName: "Testing: fabric 
standalone" timeoutInMinutes: "10" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 74ba666062906..24d29255f1b82 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -172,7 +172,6 @@ jobs: workingDirectory: tests/ env: PL_USE_MOCKED_MNIST: "1" - PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE) PL_RUN_STANDALONE_TESTS: "1" displayName: "Testing: PyTorch standalone tests" timeoutInMinutes: "35" diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh index fb4dbe11a3618..1e373525f0c60 100755 --- a/tests/run_standalone_tests.sh +++ b/tests/run_standalone_tests.sh @@ -20,7 +20,7 @@ test_batch_size="${NUM_PARALLEL_TESTS:-5}" # Source directory for coverage runs can be set with CODECOV_SOURCE and defaults to lightning. -codecov_source="${CODECOV_SOURCE:-"lightning"}" +codecov_source="${COVERAGE_SOURCE:-"lightning"}" # The test directory is passed as the first argument to the script test_dir=$1 # parse the first argument From 837ef680fe4c4b0f12bc07981aaaa6bf164e3ef5 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 08:59:08 +0100 Subject: [PATCH 32/46] bitsandbytes --- requirements/fabric/strategies.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index 9d3d6d599bdb3..7e2a9dfeb7763 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -6,5 +6,5 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict -bitsandbytes >=0.45.2,<0.45.3; sys_platform == 'linux' or sys_platform == 'win32' -bitsandbytes >=0.45.2,<0.45.3 ; sys_platform == 'darwin' +bitsandbytes >=0.45.2,<0.45.3; platform_system != "Darwin" + From ce5a21974f4e7a8c3841f5b9760bf1cee027c0f3 Mon Sep 
17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Mar 2025 08:03:43 +0000 Subject: [PATCH 33/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- requirements/fabric/strategies.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index 7e2a9dfeb7763..5b7f170cbd866 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -7,4 +7,3 @@ # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict bitsandbytes >=0.45.2,<0.45.3; platform_system != "Darwin" - From d6cc6adcac1a23dd4126c736b3220590e5a7d623 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 09:28:55 +0100 Subject: [PATCH 34/46] external --- .azure/gpu-tests-fabric.yml | 4 +- .azure/gpu-tests-pytorch.yml | 4 +- tests/README.md | 6 +- tests/run_standalone_tests.sh | 152 ---------------------------------- 4 files changed, 9 insertions(+), 157 deletions(-) delete mode 100755 tests/run_standalone_tests.sh diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index a807175005a4d..6f074949f70bf 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -148,7 +148,9 @@ jobs: displayName: "Testing: fabric standard" timeoutInMinutes: "10" - - bash: bash ./run_standalone_tests.sh "tests_fabric" + - bash: | + wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh + bash ./run_standalone_tests.sh "tests_fabric" workingDirectory: tests/ env: PL_RUN_STANDALONE_TESTS: "1" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 24d29255f1b82..365482babd112 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -168,7 +168,9 @@ jobs: displayName: "Testing: 
PyTorch standard" timeoutInMinutes: "35" - - bash: bash ./run_standalone_tests.sh "tests_pytorch" + - bash: | + wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh + bash ./run_standalone_tests.sh "tests_pytorch" workingDirectory: tests/ env: PL_USE_MOCKED_MNIST: "1" diff --git a/tests/README.md b/tests/README.md index 8f015d3386fc3..fdb3a9acab7f1 100644 --- a/tests/README.md +++ b/tests/README.md @@ -64,9 +64,9 @@ You can rely on our CI to make sure all these tests pass. There are certain standalone tests, which you can run using: ```bash -./tests/run_standalone_tests.sh tests/tests_pytorch/trainer/ -# or run a specific test -./tests/run_standalone_tests.sh -k test_multi_gpu_model_ddp +cd tests/ +wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh +bash ./run_standalone_tests.sh tests_pytorch/ ``` ## Running Coverage diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh deleted file mode 100755 index 1e373525f0c60..0000000000000 --- a/tests/run_standalone_tests.sh +++ /dev/null @@ -1,152 +0,0 @@ -#!/bin/bash -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# THIS FILE ASSUMES IT IS RUN INSIDE THE tests DIRECTORY.
- -# Batch size for testing: Determines how many standalone test invocations run in parallel -# It can be set through the env variable NUM_PARALLEL_TESTS and defaults to 5 if not set -test_batch_size="${NUM_PARALLEL_TESTS:-5}" - -# Source directory for coverage runs can be set with CODECOV_SOURCE and defaults to lightning. -codecov_source="${COVERAGE_SOURCE:-"lightning"}" - -# The test directory is passed as the first argument to the script -test_dir=$1 # parse the first argument - -# There is also timeout for the tests. -# It can be set through the env variable TEST_TIMEOUT and defaults to 1200 seconds if not set 1200 seconds -test_timeout="${TEST_TIMEOUT:-1200}" - -# Temporary file to store the collected tests -COLLECTED_TESTS_FILE="collected_tests.txt" - -ls -lh . # show the contents of the directory - -# Python arguments for running the tests and coverage -defaults=" -m coverage run --source ${codecov_source} --append -m pytest --no-header -v -s --color=yes --timeout=${test_timeout} --durations=0 " -echo "Using defaults: ${defaults}" - -# Get the list of parametrizations. we need to call them separately. the last two lines are removed. -# note: if there's a syntax error, this will fail with some garbled output -python -um pytest ${test_dir} -q --collect-only --pythonwarnings ignore 2>&1 > $COLLECTED_TESTS_FILE -# Early terminate if collection failed (e.g. syntax error) -if [[ $? 
!= 0 ]]; then - cat $COLLECTED_TESTS_FILE - printf "ERROR: test collection failed!\n" - exit 1 -fi - -# Initialize empty array -tests=() - -# Read from file line by line -while IFS= read -r line; do - # Only keep lines containing "test_" - if [[ $line == *"test_"* ]]; then - # Extract part after test_dir/ - pruned_line="${line#*${test_dir}/}" - tests+=("${test_dir}/$pruned_line") - fi -done < $COLLECTED_TESTS_FILE - -# Count tests -test_count=${#tests[@]} - -# Display results -printf "collected $test_count tests:\n-------------------\n" -printf "%s\n" "${tests[@]}" -printf "\n===================\n" - -# if test count is one print warning -if [[ $test_count -eq 1 ]]; then - printf "WARNING: only one test found!\n" -elif [ $test_count -eq 0 ]; then - printf "ERROR: no tests found!\n" - exit 1 -fi - -# clear all the collected reports -rm -f parallel_test_output-*.txt # in case it exists, remove it - -status=0 # aggregated script status -report="" # final report -pids=() # array of PID for running tests -test_ids=() # array of indexes of running tests -failed_tests=() # array of failed tests -printf "Running $test_count tests in batches of $test_batch_size:\n" -for i in "${!tests[@]}"; do - test=${tests[$i]} - printf "* Running test $((i+1))/$test_count: $test\n" - - # execute the test in the background - # redirect to a log file that buffers test output. since the tests will run in the background, - # we cannot let them output to std{out,err} because the outputs would be garbled together - python ${defaults} "$test" &> "parallel_test_output-$i.txt" & - test_ids+=($i) # save the test's id in an array with running tests - pids+=($!) 
# save the PID in an array with running tests - - # if we reached the batch size, wait for all tests to finish - if (( (($i + 1) % $test_batch_size == 0) || $i == $test_count-1 )); then - printf "-> Waiting for batch to finish: $(IFS=' '; echo "${pids[@]}")\n" - # wait for running tests - for j in "${!test_ids[@]}"; do - i=${test_ids[$j]} # restore the global test's id - pid=${pids[$j]} # restore the particular PID - test=${tests[$i]} # restore the test name - printf "? Waiting for $tests >> parallel_test_output-$i.txt (PID: $pid)\n" - wait -n $pid - # get the exit status of the test - test_status=$? - # add row to the final report - report+="Ran\t$test\t>> exit:$test_status\n" - if [[ $test_status != 0 ]]; then - # add the test to the failed tests array - failed_tests+=($i) - # Process exited with a non-zero exit status - status=$test_status - fi - done - printf "Starting over with a new batch...\n" - test_ids=() # reset the test's id array - pids=() # reset the PID array - fi -done - -# print test report with exit code for each test -printf '=%.s' {1..80} -printf "\n$report" -printf '=%.s' {1..80} -printf '\n' - -# print failed tests from duped logs -if [[ ${#failed_tests[@]} -gt 0 ]]; then - printf "Failed tests:\n" - for i in "${failed_tests[@]}"; do - printf '\n%.s' {1..5} - printf '=%.s' {1..80} - printf "\n${tests[$i]}\n" - printf '-%.s' {1..80} - printf "\n" - # show the output of the failed test - cat "parallel_test_output-$i.txt" - printf "\n" - printf '=%.s' {1..80} - done -else - printf "All tests passed!\n" -fi - -# exit with the worse test result -exit $status From 1895897db31b12ab2234919211bc0f5226e6a73c Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 09:49:22 +0100 Subject: [PATCH 35/46] _BITSANDBYTES_AVAILABLE --- src/lightning/fabric/plugins/precision/bitsandbytes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/plugins/precision/bitsandbytes.py 
b/src/lightning/fabric/plugins/precision/bitsandbytes.py index ecb1d8a442655..b78157d1c4074 100644 --- a/src/lightning/fabric/plugins/precision/bitsandbytes.py +++ b/src/lightning/fabric/plugins/precision/bitsandbytes.py @@ -40,7 +40,7 @@ log = logging.getLogger(__name__) -_BITSANDBYTES_AVAILABLE = RequirementCache("bitsandbytes>=0.42.0") +_BITSANDBYTES_AVAILABLE = RequirementCache("bitsandbytes") class BitsandbytesPrecision(Precision): From e343e6bc420306dd2e2dc24088175c3406e6b2a5 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 09:52:46 +0100 Subject: [PATCH 36/46] link --- tests/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/README.md b/tests/README.md index fdb3a9acab7f1..9265caf4b412e 100644 --- a/tests/README.md +++ b/tests/README.md @@ -39,7 +39,7 @@ Note: if your computer does not have multi-GPU or TPU these tests are skipped. **GitHub Actions:** For convenience, you can also use your own GHActions building which will be triggered with each commit. This is useful if you do not test against all required dependency versions. -**Docker:** Another option is to utilize the [pytorch lightning cuda base docker image](https://hub.docker.com/repository/docker/pytorchlightning/pytorch_lightning/tags?page=1&name=cuda). You can then run: +**Docker:** Another option is to utilize the [pytorch lightning cuda base docker image](https://hub.docker.com/r/pytorchlightning/pytorch_lightning/tags?name=cuda). 
You can then run: ```bash python -m pytest src/lightning/pytorch tests/tests_pytorch -v From 3d56092f7e232f850a356797969f9f5b1695b139 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 10:00:18 +0100 Subject: [PATCH 37/46] bb --- .azure/gpu-tests-fabric.yml | 8 ++++---- .azure/gpu-tests-pytorch.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 6f074949f70bf..9d9973d8fece0 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -114,10 +114,6 @@ jobs: pip install setuptools==75.6.0 jsonargparse==4.35.0 displayName: "Install package & dependencies" - - bash: | - python -c "import bitsandbytes" - displayName: "Handle bitsandbytes" - - bash: | set -e python requirements/collect_env_details.py @@ -125,6 +121,10 @@ jobs: python requirements/pytorch/check-avail-extras.py displayName: "Env details" + - bash: | + python -c "import bitsandbytes" + displayName: "Handle bitsandbytes" + - bash: python -m pytest lightning_fabric workingDirectory: src # without succeeded this could run even if the job has already failed diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 365482babd112..8ac9727d314cc 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -128,10 +128,6 @@ jobs: condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'lightning')) displayName: "Drop PL for LAI" - - bash: | - python -c "import bitsandbytes" - displayName: "Handle bitsandbytes" - - bash: | set -e python requirements/collect_env_details.py @@ -139,6 +135,10 @@ jobs: python requirements/pytorch/check-avail-extras.py displayName: "Env details" + - bash: | + python -c "import bitsandbytes" + displayName: "Handle bitsandbytes" + - bash: python -m pytest pytorch_lightning workingDirectory: src # without succeeded this could run even if the job has already failed From be212ca33995bc53f050aaf2873fce0c098f1cac Mon Sep 17 00:00:00 
2001 From: Jirka B Date: Tue, 11 Mar 2025 10:11:37 +0100 Subject: [PATCH 38/46] bitsandbytes --- .azure/gpu-tests-fabric.yml | 5 +---- .azure/gpu-tests-pytorch.yml | 5 +---- requirements/pytorch/extra.txt | 3 +-- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 9d9973d8fece0..3442aa942cea0 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -119,11 +119,8 @@ jobs: python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" python requirements/pytorch/check-avail-extras.py - displayName: "Env details" - - - bash: | python -c "import bitsandbytes" - displayName: "Handle bitsandbytes" + displayName: "Env details" - bash: python -m pytest lightning_fabric workingDirectory: src diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 8ac9727d314cc..41cbf200e75a9 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -133,11 +133,8 @@ jobs: python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" python requirements/pytorch/check-avail-extras.py - displayName: "Env details" - - - bash: | python -c "import bitsandbytes" - displayName: "Handle bitsandbytes" + displayName: "Env details" - bash: python -m pytest pytorch_lightning workingDirectory: src diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 2cd4e44a2eaf1..e14cb38297caa 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -8,5 +8,4 @@ hydra-core >=1.2.0, <1.4.0 jsonargparse[signatures] >=4.27.7, <=4.35.0 rich >=12.3.0, <13.6.0 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute -bitsandbytes >=0.44.0,<0.45.3; sys_platform == 'linux' or sys_platform == 'win32' -bitsandbytes >=0.42.0,<0.45.3 ; sys_platform == 'darwin' 
+bitsandbytes >=0.45.2,<0.45.3; platform_system != "Darwin" From b98ed1081e25c8351af44a2c30c8f6920f6945e4 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 10:18:06 +0100 Subject: [PATCH 39/46] reduce-overhead --- tests/tests_fabric/strategies/test_ddp_integration.py | 6 +++--- tests/tests_fabric/strategies/test_fsdp_integration.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/tests_fabric/strategies/test_ddp_integration.py b/tests/tests_fabric/strategies/test_ddp_integration.py index 70dd25aa99603..f0f70035db9e6 100644 --- a/tests/tests_fabric/strategies/test_ddp_integration.py +++ b/tests/tests_fabric/strategies/test_ddp_integration.py @@ -84,8 +84,8 @@ def test_reapply_compile(): fabric.launch() model = BoringModel() - compile_kwargs = {"mode": "reduce-overhead"} - compiled_model = torch.compile(model, **compile_kwargs) + # compile_kwargs = {"mode": "reduce-overhead"} + compiled_model = torch.compile(model) # , **compile_kwargs torch.compile.reset_mock() fabric_model = fabric.setup(compiled_model, _reapply_compile=True) @@ -93,7 +93,7 @@ def test_reapply_compile(): assert isinstance(fabric_model._forward_module, OptimizedModule) assert isinstance(fabric_model._forward_module._orig_mod, DistributedDataParallel) # Assert we called compile again with the same arguments, but on the DDP-wrapped module - torch.compile.assert_called_with(fabric_model._forward_module._orig_mod, **compile_kwargs) + torch.compile.assert_called_with(fabric_model._forward_module._orig_mod) # , **compile_kwargs assert fabric_model._original_module == model assert fabric_model._forward_module._orig_mod.module == model diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py index 11a7a1a6f8f7f..552e8477a6bc4 100644 --- a/tests/tests_fabric/strategies/test_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_fsdp_integration.py @@ -411,8 +411,8 @@ def test_reapply_compile(): 
fabric.launch() model = BoringModel() - compile_kwargs = {"mode": "reduce-overhead"} - compiled_model = torch.compile(model, **compile_kwargs) + # compile_kwargs = {"mode": "reduce-overhead"} + compiled_model = torch.compile(model) # , **compile_kwargs torch.compile.reset_mock() fabric_model = fabric.setup(compiled_model, _reapply_compile=True) @@ -421,7 +421,7 @@ def test_reapply_compile(): assert isinstance(fabric_model._forward_module._orig_mod, FullyShardedDataParallel) # Assert we called compile again with the same arguments, but on the FSDP-wrapped module - torch.compile.assert_called_with(fabric_model._forward_module._orig_mod, **compile_kwargs) + torch.compile.assert_called_with(fabric_model._forward_module._orig_mod) # , **compile_kwargs assert fabric_model._original_module == model assert fabric_model._forward_module._orig_mod.module == model From 5c021d5ceae16409663dbf56444c6d031edf566d Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 10:30:06 +0100 Subject: [PATCH 40/46] @pytest.mark.filterwarnings("ignore::FutureWarning") --- tests/tests_fabric/plugins/precision/test_bitsandbytes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py index 152f9a1c01fe9..36ef596ad8d85 100644 --- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py @@ -95,6 +95,7 @@ def __init__(self): @RunIf(min_cuda_gpus=1, max_torch="2.4") +@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable") @pytest.mark.parametrize( ("args", "expected"), From a12a1ec30ff3c1cee575e62f805c1ac00bbe7d9a Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 10:32:37 +0100 Subject: [PATCH 41/46] not _BITSANDBYTES_AVAILABLE --- tests/tests_fabric/plugins/precision/test_bitsandbytes.py | 2 +- 
tests/tests_pytorch/plugins/precision/test_bitsandbytes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py index 36ef596ad8d85..19a3aa4adc9f2 100644 --- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py @@ -27,7 +27,7 @@ from tests_fabric.helpers.runif import RunIf -@pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") +@pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) diff --git a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py index 8f331e26f979d..d9b698dee510a 100644 --- a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py @@ -24,7 +24,7 @@ from lightning.pytorch.plugins.precision.bitsandbytes import BitsandbytesPrecision -@pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") +@pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) From 7b19eb7b424be4984a512d37a77fe3bccf16078d Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Tue, 11 Mar 2025 10:35:33 +0100 Subject: [PATCH 42/46] Apply suggestions from code review --- .azure/gpu-tests-fabric.yml | 1 - .azure/gpu-tests-pytorch.yml | 1 - tests/tests_fabric/plugins/precision/test_bitsandbytes.py | 2 +- 
tests/tests_pytorch/plugins/precision/test_bitsandbytes.py | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 3442aa942cea0..4d738d9110599 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -49,7 +49,6 @@ jobs: FREEZE_REQUIREMENTS: "1" PIP_CACHE_DIR: "/var/tmp/pip" PL_RUN_CUDA_TESTS: "1" - PL_STANDALONE_TESTS_BATCH_SIZE: "1" # todo container: image: $(image) # default shm size is 64m. Increase it to avoid: diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 41cbf200e75a9..414f98dab3f66 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -64,7 +64,6 @@ jobs: FREEZE_REQUIREMENTS: "1" PIP_CACHE_DIR: "/var/tmp/pip" PL_RUN_CUDA_TESTS: "1" - PL_STANDALONE_TESTS_BATCH_SIZE: "2" # todo container: image: $(image) # default shm size is 64m. Increase it to avoid: diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py index 19a3aa4adc9f2..36ef596ad8d85 100644 --- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py @@ -27,7 +27,7 @@ from tests_fabric.helpers.runif import RunIf -@pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") +@pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) diff --git a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py index d9b698dee510a..8f331e26f979d 100644 --- a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py @@ -24,7 +24,7 @@ from 
lightning.pytorch.plugins.precision.bitsandbytes import BitsandbytesPrecision -@pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") +@pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) From 89c13f37d525f9c84cda59cb513836f68ff6937b Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 10:40:52 +0100 Subject: [PATCH 43/46] RunIf(mps=False) --- tests/tests_fabric/plugins/precision/test_bitsandbytes.py | 1 + tests/tests_pytorch/plugins/precision/test_bitsandbytes.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py index 36ef596ad8d85..430f36b308941 100644 --- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py @@ -28,6 +28,7 @@ @pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") +@RunIf(mps=False) # skip on MPS as Bitsandbytes is only supported on CUDA GPUs def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) diff --git a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py index 8f331e26f979d..5cb513f4d2de0 100644 --- a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py @@ -22,9 +22,11 @@ from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.plugins.precision.bitsandbytes import BitsandbytesPrecision +from 
tests_pytorch.helpers.runif import RunIf @pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") +@RunIf(mps=False) # skip on MPS as Bitsandbytes is only supported on CUDA GPUs def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) From 056ed0dfaf188b6b5fd76bef9b52d6edcaa7fbf7 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 11:00:56 +0100 Subject: [PATCH 44/46] test_bitsandbytes_plugin --- tests/tests_fabric/plugins/precision/test_bitsandbytes.py | 3 ++- tests/tests_pytorch/plugins/precision/test_bitsandbytes.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py index 430f36b308941..9c0a61dac876c 100644 --- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License +import platform import sys from unittest.mock import Mock @@ -28,7 +29,7 @@ @pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") -@RunIf(mps=False) # skip on MPS as Bitsandbytes is only supported on CUDA GPUs +@pytest.mark.skipif(platform.system() == "Darwin") # skip on Mac as Bitsandbytes is only supported on CUDA GPUs def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) diff --git a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py index 5cb513f4d2de0..b560091aea22b 100644 --- a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License +import platform import sys from unittest.mock import Mock @@ -22,11 +23,10 @@ from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.plugins.precision.bitsandbytes import BitsandbytesPrecision -from tests_pytorch.helpers.runif import RunIf @pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") -@RunIf(mps=False) # skip on MPS as Bitsandbytes is only supported on CUDA GPUs +@pytest.mark.skipif(platform.system() == "Darwin") # skip on Mac as Bitsandbytes is only supported on CUDA GPUs def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) From 3a2499e98fab185d45ba179eade2c5c1f6770746 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 11:10:26 +0100 Subject: [PATCH 45/46] reason --- tests/tests_fabric/plugins/precision/test_bitsandbytes.py | 2 +- tests/tests_pytorch/plugins/precision/test_bitsandbytes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py index 9c0a61dac876c..f529b631d2374 100644 --- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py @@ -29,7 +29,7 @@ @pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") -@pytest.mark.skipif(platform.system() == "Darwin") # skip on Mac as Bitsandbytes is only supported on CUDA GPUs +@pytest.mark.skipif(platform.system() == "Darwin", reason="Bitsandbytes is only supported on CUDA GPUs") # skip on Mac def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes 
monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) diff --git a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py index b560091aea22b..a478a2b9831a1 100644 --- a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py @@ -26,7 +26,7 @@ @pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") -@pytest.mark.skipif(platform.system() == "Darwin") # skip on Mac as Bitsandbytes is only supported on CUDA GPUs +@pytest.mark.skipif(platform.system() == "Darwin", reason="Bitsandbytes is only supported on CUDA GPUs") # skip on Mac def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) From 84cae4785128d4b53eaaa17296f50ca3a2f4fc8b Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 11 Mar 2025 12:48:54 +0100 Subject: [PATCH 46/46] _TORCH_LESS_EQUAL_2_6 --- src/lightning/fabric/utilities/imports.py | 1 + tests/tests_fabric/strategies/test_ddp_integration.py | 9 ++++++--- tests/tests_fabric/strategies/test_fsdp_integration.py | 9 ++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/utilities/imports.py b/src/lightning/fabric/utilities/imports.py index a1c5a6f6dcd1b..5a9ec1edc1ca8 100644 --- a/src/lightning/fabric/utilities/imports.py +++ b/src/lightning/fabric/utilities/imports.py @@ -34,6 +34,7 @@ _TORCH_EQUAL_2_4_0 = compare_version("torch", operator.eq, "2.4.0") _TORCH_GREATER_EQUAL_2_4 = compare_version("torch", operator.ge, "2.4.0") _TORCH_GREATER_EQUAL_2_4_1 = compare_version("torch", operator.ge, "2.4.1") +_TORCH_LESS_EQUAL_2_6 = compare_version("torch", operator.le, "2.6.0") _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) diff --git a/tests/tests_fabric/strategies/test_ddp_integration.py 
b/tests/tests_fabric/strategies/test_ddp_integration.py index f0f70035db9e6..3ed76211e5d6d 100644 --- a/tests/tests_fabric/strategies/test_ddp_integration.py +++ b/tests/tests_fabric/strategies/test_ddp_integration.py @@ -23,6 +23,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel from lightning.fabric import Fabric +from lightning.fabric.utilities.imports import _TORCH_LESS_EQUAL_2_6 from tests_fabric.helpers.runif import RunIf from tests_fabric.strategies.test_single_device import _run_test_clip_gradients from tests_fabric.test_fabric import BoringModel @@ -84,8 +85,10 @@ def test_reapply_compile(): fabric.launch() model = BoringModel() - # compile_kwargs = {"mode": "reduce-overhead"} - compiled_model = torch.compile(model) # , **compile_kwargs + # currently (PyTorch 2.6) using reduce-overhead here causes a RuntimeError: + # Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run. + compile_kwargs = {"mode": "reduce-overhead"} if _TORCH_LESS_EQUAL_2_6 else {} + compiled_model = torch.compile(model, **compile_kwargs) torch.compile.reset_mock() fabric_model = fabric.setup(compiled_model, _reapply_compile=True) @@ -93,7 +96,7 @@ def test_reapply_compile(): assert isinstance(fabric_model._forward_module, OptimizedModule) assert isinstance(fabric_model._forward_module._orig_mod, DistributedDataParallel) # Assert we called compile again with the same arguments, but on the DDP-wrapped module - torch.compile.assert_called_with(fabric_model._forward_module._orig_mod) # , **compile_kwargs + torch.compile.assert_called_with(fabric_model._forward_module._orig_mod, **compile_kwargs) assert fabric_model._original_module == model assert fabric_model._forward_module._orig_mod.module == model diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py index 552e8477a6bc4..576a0df38b966 100644 --- a/tests/tests_fabric/strategies/test_fsdp_integration.py +++
b/tests/tests_fabric/strategies/test_fsdp_integration.py @@ -29,6 +29,7 @@ from lightning.fabric import Fabric from lightning.fabric.plugins import FSDPPrecision from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities.imports import _TORCH_LESS_EQUAL_2_6 from lightning.fabric.utilities.load import _load_distributed_checkpoint from lightning.fabric.wrappers import _FabricOptimizer from tests_fabric.helpers.datasets import RandomDataset @@ -411,8 +412,10 @@ def test_reapply_compile(): fabric.launch() model = BoringModel() - # compile_kwargs = {"mode": "reduce-overhead"} - compiled_model = torch.compile(model) # , **compile_kwargs + # currently (PyTorch 2.6) using reduce-overhead here causes a RuntimeError: + # Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run. + compile_kwargs = {"mode": "reduce-overhead"} if _TORCH_LESS_EQUAL_2_6 else {} + compiled_model = torch.compile(model, **compile_kwargs) torch.compile.reset_mock() fabric_model = fabric.setup(compiled_model, _reapply_compile=True) @@ -421,7 +424,7 @@ def test_reapply_compile(): assert isinstance(fabric_model._forward_module._orig_mod, FullyShardedDataParallel) # Assert we called compile again with the same arguments, but on the FSDP-wrapped module - torch.compile.assert_called_with(fabric_model._forward_module._orig_mod) # , **compile_kwargs + torch.compile.assert_called_with(fabric_model._forward_module._orig_mod, **compile_kwargs) assert fabric_model._original_module == model assert fabric_model._forward_module._orig_mod.module == model