Skip to content

Commit 1dcc5eb

Browse files
authored
Merge branch 'master' into bugfix/14209_num_stepping_batches
2 parents 1dcb94c + 1a3fe39 commit 1dcc5eb

File tree

95 files changed

+1040
-1866
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+1040
-1866
lines changed

.azure/app-cloud-e2e.yml

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ variables:
2424

2525
jobs:
2626
- job: App_cloud_e2e_testing
27-
pool:
28-
vmImage: 'ubuntu-latest'
27+
pool: azure-cpus
28+
container:
29+
image: mcr.microsoft.com/playwright/python:v1.25.2-focal
30+
options: "--shm-size=2g"
2931
timeoutInMinutes: "30"
3032
cancelTimeoutInMinutes: "2"
3133
strategy:
@@ -56,6 +58,7 @@ jobs:
5658
clean: all
5759
steps:
5860
- bash: |
61+
whoami
5962
python --version
6063
pip --version
6164
displayName: 'Info'
@@ -80,10 +83,10 @@ jobs:
8083

8184
- bash: |
8285
python -m pip install playwright
83-
python -m playwright install --with-deps
86+
python -m playwright install # --with-deps
8487
displayName: 'Install Playwright system dependencies'
8588
86-
- bash: pip install -e .
89+
- bash: pip install -e . --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
8790
displayName: 'Install lightning'
8891

8992
- bash: |
@@ -110,12 +113,12 @@ jobs:
110113
TEST_APP_NAME: $(name)
111114
HAR_LOCATION: './artifacts/hars'
112115
SLOW_MO: '50'
113-
LAI_USER: $(LAI_USER)
114-
LAI_PASS: $(LAI_PASS)
115-
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
116-
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
116+
# LAI_USER: $(LAI_USER)
117+
# LAI_PASS: $(LAI_PASS)
118+
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
119+
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
117120
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
118-
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
121+
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
119122
displayName: 'Run the tests'
120123
121124
- publish: '$(Build.ArtifactStagingDirectory)/videos'
@@ -125,16 +128,16 @@ jobs:
125128
- bash: |
126129
time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()"
127130
env:
128-
LAI_USER: $(LAI_USER)
129-
LAI_PASS: $(LAI_PASS)
130-
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
131-
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
131+
# LAI_USER: $(LAI_USER)
132+
# LAI_PASS: $(LAI_PASS)
133+
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
134+
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
132135
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
133-
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
136+
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
134137
PR_NUMBER: $(local_id)
135138
TEST_APP_NAME: $(name)
136-
GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
137-
GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
138-
GRID_URL: $(LIGHTNING_CLOUD_URL)
139-
_GRID_USERNAME: $(LIGHTNING_USERNAME)
139+
# GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
140+
# GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
141+
# GRID_URL: $(LIGHTNING_CLOUD_URL)
142+
# _GRID_USERNAME: $(LIGHTNING_USERNAME)
140143
displayName: 'Clean Previous Apps'

.azure/gpu-tests.yml

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ jobs:
4444

4545
- bash: |
4646
CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}')
47-
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
47+
FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
4848
echo $CHANGED_FILES > changed_files.txt
4949
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
5050
echo $MATCHES
@@ -72,12 +72,15 @@ jobs:
7272
set -e
7373
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
7474
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)"
75+
TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
7576
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
7677
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
78+
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
79+
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION}
80+
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
7781
pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0"
78-
pip install -e .[strategies]
79-
pip install -U deepspeed # TODO: remove when docker images are upgraded
80-
pip install --requirement requirements/pytorch/devel.txt
82+
pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
83+
pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
8184
pip list
8285
env:
8386
PACKAGE_NAME: pytorch
@@ -120,6 +123,15 @@ jobs:
120123
timeoutInMinutes: "35"
121124
condition: eq(variables['continue'], '1')
122125

126+
- bash: bash run_standalone_tasks.sh
127+
workingDirectory: tests/tests_pytorch
128+
env:
129+
PL_USE_MOCKED_MNIST: "1"
130+
PL_RUN_CUDA_TESTS: "1"
131+
displayName: 'Testing: PyTorch standalone tasks'
132+
timeoutInMinutes: "10"
133+
condition: eq(variables['continue'], '1')
134+
123135
- bash: |
124136
python -m coverage report
125137
python -m coverage xml

.azure/hpu-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545
pip --version
4646
sudo pip uninstall -y lightning pytorch-lightning
4747
pip install fire
48-
python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext
48+
python .actions/assistant.py requirements-prune-pkgs torch,torchvision
4949
pip install ".[extra,test]"
5050
pip list
5151
env:

.github/workflows/ci-pytorch-dockers.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ jobs:
7575
push: ${{ env.PUSH_TO_HUB }}
7676
tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }}
7777
timeout-minutes: 60
78-
- uses: ravsamhq/notify-slack-action@v1
78+
- uses: ravsamhq/notify-slack-action@v2
7979
if: failure() && env.PUSH_TO_HUB == 'true'
8080
with:
8181
status: ${{ job.status }}
@@ -117,7 +117,7 @@ jobs:
117117
push: ${{ env.PUSH_TO_HUB }}
118118
tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
119119
timeout-minutes: 95
120-
- uses: ravsamhq/notify-slack-action@v1
120+
- uses: ravsamhq/notify-slack-action@v2
121121
if: failure() && env.PUSH_TO_HUB == 'true'
122122
with:
123123
status: ${{ job.status }}
@@ -155,7 +155,7 @@ jobs:
155155
push: ${{ env.PUSH_TO_HUB }}
156156
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
157157
timeout-minutes: 95
158-
- uses: ravsamhq/notify-slack-action@v1
158+
- uses: ravsamhq/notify-slack-action@v2
159159
if: failure() && env.PUSH_TO_HUB == 'true'
160160
with:
161161
status: ${{ job.status }}
@@ -199,7 +199,7 @@ jobs:
199199
push: ${{ env.PUSH_TO_HUB }}
200200
tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }}
201201
timeout-minutes: 10
202-
- uses: ravsamhq/notify-slack-action@v1
202+
- uses: ravsamhq/notify-slack-action@v2
203203
if: failure() && env.PUSH_TO_HUB == 'true'
204204
with:
205205
status: ${{ job.status }}
@@ -235,7 +235,7 @@ jobs:
235235
push: ${{ env.PUSH_TO_HUB }}
236236
tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }}
237237
timeout-minutes: 10
238-
- uses: ravsamhq/notify-slack-action@v1
238+
- uses: ravsamhq/notify-slack-action@v2
239239
if: failure() && env.PUSH_TO_HUB == 'true'
240240
with:
241241
status: ${{ job.status }}

.github/workflows/events-nightly.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
# report failure to Slack
4949
- name: Slack notification
5050
if: failure() && github.event_name == 'schedule'
51-
uses: ravsamhq/notify-slack-action@v1
51+
uses: ravsamhq/notify-slack-action@v2
5252
with:
5353
status: ${{ job.status }}
5454
token: ${{ secrets.GITHUB_TOKEN }}

dockers/base-conda/Dockerfile

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ RUN \
3434
# https://github.com/NVIDIA/nvidia-docker/issues/1631
3535
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
3636
apt-get update -qq --fix-missing && \
37+
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
38+
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
39+
MAX_ALLOWED_NCCL=2.11.4 && \
40+
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
3741
apt-get install -y --no-install-recommends \
3842
build-essential \
3943
cmake \
@@ -42,17 +46,15 @@ RUN \
4246
curl \
4347
unzip \
4448
ca-certificates \
45-
libopenmpi-dev
46-
47-
RUN \
49+
libopenmpi-dev \
50+
libnccl2=$TO_INSTALL_NCCL \
51+
libnccl-dev=$TO_INSTALL_NCCL && \
4852
# Install conda and python.
4953
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
5054
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
5155
chmod +x ~/miniconda.sh && \
5256
~/miniconda.sh -b && \
53-
rm ~/miniconda.sh
54-
55-
RUN \
57+
rm ~/miniconda.sh && \
5658
# Cleaning
5759
apt-get autoremove -y && \
5860
apt-get clean && \
@@ -76,11 +78,11 @@ RUN \
7678
conda update -n base -c defaults conda && \
7779
CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \
7880
conda create -y --name $CONDA_ENV \
79-
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \
81+
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision cudatoolkit=${CUDA_VERSION_MM} \
8082
-c nvidia -c pytorch -c pytorch-test && \
8183
conda init bash && \
8284
# NOTE: this requires that the channel is presented in the yaml before packages \
83-
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
85+
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
8486
python prune.py && \
8587
rm prune.py && \
8688
cat environment.yml && \
@@ -100,7 +102,7 @@ RUN \
100102
pip list | grep torch && \
101103
python -c "import torch; print(torch.__version__)" && \
102104
pip install -q fire && \
103-
python assistant.py requirements_prune_pkgs torch,torchvision,torchtext && \
105+
python assistant.py requirements_prune_pkgs torch,torchvision && \
104106
# Install remaining requirements
105107
pip install --no-cache-dir -r requirements/pytorch/base.txt \
106108
-r requirements/pytorch/extra.txt \

dockers/base-cuda/Dockerfile

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,11 @@ RUN \
3737
# https://github.com/NVIDIA/nvidia-docker/issues/1631
3838
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
3939
apt-get update -qq --fix-missing && \
40-
apt-get install -y --no-install-recommends \
40+
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
41+
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
42+
MAX_ALLOWED_NCCL=2.11.4 && \
43+
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
44+
apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
4145
build-essential \
4246
pkg-config \
4347
cmake \
@@ -50,19 +54,17 @@ RUN \
5054
libopenmpi-dev \
5155
openmpi-bin \
5256
ssh \
53-
&& \
54-
57+
libnccl2=$TO_INSTALL_NCCL \
58+
libnccl-dev=$TO_INSTALL_NCCL && \
5559
# Install python
5660
add-apt-repository ppa:deadsnakes/ppa && \
5761
apt-get install -y \
5862
python${PYTHON_VERSION} \
5963
python${PYTHON_VERSION}-distutils \
6064
python${PYTHON_VERSION}-dev \
6165
&& \
62-
6366
update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
6467
update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \
65-
6668
# Cleaning
6769
apt-get autoremove -y && \
6870
apt-get clean && \
@@ -78,7 +80,6 @@ RUN \
7880
wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \
7981
python${PYTHON_VERSION} get-pip.py && \
8082
rm get-pip.py && \
81-
8283
pip install -q fire && \
8384
# Disable cache \
8485
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
@@ -91,16 +92,6 @@ RUN \
9192
pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
9293
rm assistant.py
9394

94-
RUN \
95-
apt-get purge -y cmake && \
96-
wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
97-
tar -zxvf cmake-3.20.2.tar.gz && \
98-
cd cmake-3.20.2 && \
99-
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
100-
make && \
101-
make install && \
102-
cmake --version
103-
10495
ENV \
10596
HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
10697
HOROVOD_GPU_OPERATIONS=NCCL \

dockers/tpu-tests/tpu_test_cases.jsonnet

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ local tputests = base.BaseTest {
3737
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
3838
export PL_RUN_TPU_TESTS=1
3939
cd tests/tests_pytorch
40+
set -e
4041
coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
4142
echo "\n||| Running standalone tests |||\n"
4243
bash run_standalone_tests.sh

docs/source-pytorch/accelerators/hpu_basic.rst

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,40 @@ It uses :class:`~pytorch_lightning.strategies.hpu_parallel.HPUParallelStrategy`
4747
4848
----
4949

50+
Scale-out on Gaudis
51+
-------------------
52+
53+
To train a Lightning model on multiple HPU nodes, set the ``num_nodes`` parameter of the ``Trainer`` class to the number of available nodes.
54+
55+
.. code-block:: python
56+
57+
trainer = Trainer(accelerator="hpu", devices=8, strategy="hpu_parallel", num_nodes=2)
58+
59+
In addition to this, the following environment variables need to be set to establish communication across nodes. Check out the documentation on :doc:`Cluster Environment <../clouds/cluster>` for more details.
60+
61+
- *MASTER_PORT* - required; has to be a free port on the machine with NODE_RANK 0
62+
- *MASTER_ADDR* - required (except for NODE_RANK 0); address of the node with NODE_RANK 0
63+
- *WORLD_SIZE* - required; how many workers are in the cluster
64+
- *NODE_RANK* - required; id of the node in the cluster
65+
66+
The trainer needs to be instantiated on every node participating in the training.
67+
68+
On Node 1:
69+
70+
.. code-block:: bash
71+
72+
MASTER_ADDR=<MASTER_ADDR> MASTER_PORT=<MASTER_PORT> NODE_RANK=0 WORLD_SIZE=16
73+
python -m some_model_trainer.py (--arg1 ... train script args...)
74+
75+
On Node 2:
76+
77+
.. code-block:: bash
78+
79+
MASTER_ADDR=<MASTER_ADDR> MASTER_PORT=<MASTER_PORT> NODE_RANK=1 WORLD_SIZE=16
80+
python -m some_model_trainer.py (--arg1 ... train script args...)
81+
82+
----
83+
5084
Select Gaudis automatically
5185
---------------------------
5286

docs/source-pytorch/advanced/model_parallel.rst

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -212,14 +212,31 @@ PyTorch Fully Sharded Training
212212
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
213213

214214
PyTorch has its own version of `FSDP <https://pytorch.org/docs/stable/fsdp.html>`_ which is upstreamed from their `fairscale <https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html>`__ project.
215-
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_. The API is pretty similar to that of FairScale.
215+
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_, but it is recommended to use it with PyTorch v1.12 or later, which is what
216+
Lightning supports. The API is pretty similar to that of FairScale.
216217

217-
.. note::
218-
Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``.
219-
This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``.
220-
This is a limitation of Fully Sharded Training that will be resolved in the future.
221218

222-
To activate parameter sharding, you must wrap your model using the``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.
219+
Auto Wrapping
220+
"""""""""""""
221+
Model layers should be wrapped in FSDP in a nested way to reduce peak memory usage and enable communication and computation overlapping. The
222+
simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. You don't
223+
have to ``wrap`` layers manually as in the case of manual wrapping.
224+
225+
.. code-block:: python
226+
227+
model = BoringModel()
228+
trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16)
229+
trainer.fit(model)
230+
231+
232+
Read more `here <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/#auto-wrapping>`__.
233+
234+
235+
Manual Wrapping
236+
"""""""""""""""
237+
238+
Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate
239+
parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.
223240

224241
When not using Fully Sharded these wrap functions are a no-op. This means once the changes have been made, there is no need to remove the changes for other strategies.
225242

0 commit comments

Comments
 (0)