Commit 42628f9

Merge branch 'master' into xccl_enable
2 parents ef2c671 + 41fcead commit 42628f9

138 files changed: +8619 −881 lines

.github/workflows/cpu-torch-latest.yml

Lines changed: 3 additions & 3 deletions
@@ -42,7 +42,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout 981c276
+          # git checkout 981c276
           git rev-parse --short HEAD
           pip install .

@@ -59,5 +59,5 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.6"
-          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.6"
+          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.7"
+          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.7"
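
Note: the --torch_ver value bumped here (and in the other workflows below) is not a built-in pytest flag; it is a custom option registered by the repository's test conftest and checked against the installed PyTorch. The following is a minimal sketch of how such an option can be wired up, assuming a hypothetical conftest.py; only the option name comes from the commands above, everything else is illustrative.

# Hypothetical conftest.py sketch (illustration only, not DeepSpeed's actual conftest):
# registers a --torch_ver option like the one used in the pytest commands above and
# aborts the session if the installed torch does not match the expected major.minor.
import pytest
import torch


def pytest_addoption(parser):
    parser.addoption("--torch_ver", default=None, type=str)


@pytest.fixture(scope="session", autouse=True)
def check_torch_version(request):
    expected = request.config.getoption("--torch_ver")
    if expected is None:
        return
    installed = ".".join(torch.__version__.split(".")[:2])  # e.g. "2.7" from "2.7.0+cu124"
    if installed != expected:
        pytest.exit(f"Expected torch {expected} but found torch {installed}", returncode=1)

With such a conftest in place, a run like "pytest unit/ --torch_ver=2.7" fails fast if the runner image ships a different PyTorch than the workflow expects.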

.github/workflows/hpu-gaudi2-nightly.yml

Lines changed: 3 additions & 1 deletion
@@ -21,7 +21,7 @@ jobs:
     # The type of runner that the job will run on
     runs-on: [self-hosted, intel, gaudi2]
     container:
-      image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+      image: vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
       ports:
         - 80
       options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

@@ -45,6 +45,8 @@ jobs:
       test_zero_leaf_module.py
       test_zero_offloadpp.py
       test_zero_tiled.py
+      test_autotp_training.py
+      test_ulysses.py

     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:

.github/workflows/hpu-gaudi2.yml

Lines changed: 4 additions & 2 deletions
@@ -39,7 +39,7 @@ jobs:
     # The type of runner that the job will run on
     runs-on: [self-hosted, intel, gaudi2]
     container:
-      image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+      image: vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
       ports:
         - 80
       options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

@@ -94,6 +94,8 @@ jobs:
       test_zero_nesting_init.py
       test_zeropp.py
       (test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam))
+      (test_linear.py and (TestLoRALinear or TestBasicLinear))
+      (test_ctx.py and TestEngine)

     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:

@@ -112,7 +114,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout 981c276
+          # git checkout 981c276
           git rev-parse --short HEAD
           pip install .

.github/workflows/nv-a6000.yml

Lines changed: 4 additions & 4 deletions
@@ -23,7 +23,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.09-py3
+      image: nvcr.io/nvidia/pytorch:24.12-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"

@@ -43,7 +43,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if you need to use an older transformers version temporarily in case of breakage
-          git checkout 981c276
+          # git checkout 981c276
           git rev-parse --short HEAD
           python -m pip install .
       - name: Install deepspeed

@@ -58,8 +58,8 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12"
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.6" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.6" --cuda_ver="12"
       - name: MII unit tests
         run: |
           BRANCH="main"

.github/workflows/nv-flash-attn.yml

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.09-py3
+      image: nvcr.io/nvidia/pytorch:24.12-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"

@@ -53,7 +53,7 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.6" --cuda_ver="12"
       - name: Open GitHub issue if nightly CI fails
         if: ${{ failure() && (github.event_name == 'schedule') }}
         uses: JasonEtco/create-an-issue@v2

.github/workflows/nv-human-eval.yml

Lines changed: 2 additions & 2 deletions
@@ -11,7 +11,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.09-py3
+      image: nvcr.io/nvidia/pytorch:24.12-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"

@@ -50,4 +50,4 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.6" --cuda_ver="12"

.github/workflows/nv-pre-compile-ops.yml

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ jobs:
           #python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
       - name: Compile DeepSpeed Ops
         run: |
-          DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
+          DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
       - name: DS Report
         run: |
           ds_report
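
Note: the DS_BUILD_* variables in this step follow DeepSpeed's convention of selecting which ops are pre-compiled at pip-install time; the newly appended DS_BUILD_DEEP_COMPILE=0 excludes the deep-compile op from the prebuild. Below is a rough sketch of how a build script might interpret such flags; the helper name and defaults are assumptions, only the environment-variable convention is taken from the command above.

# Rough sketch (not DeepSpeed's setup.py): interpret DS_BUILD_<OP> environment flags
# of the kind passed in the workflow step above.
import os


def op_enabled(op_name: str) -> bool:
    # DS_BUILD_OPS=1 requests pre-building everything, unless an op is
    # explicitly disabled, e.g. DS_BUILD_DEEP_COMPILE=0 in the command above.
    build_all = os.environ.get("DS_BUILD_OPS", "0") == "1"
    flag = os.environ.get(f"DS_BUILD_{op_name.upper()}")
    if flag is not None:
        return flag == "1"
    return build_all


if __name__ == "__main__":
    for op in ("sparse_attn", "evoformer_attn", "deep_compile"):
        print(op, "->", "prebuild" if op_enabled(op) else "skip (JIT at runtime)")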

.github/workflows/nv-torch-latest-v100.yml

Lines changed: 2 additions & 2 deletions
@@ -44,7 +44,7 @@ jobs:

       - name: Install deepspeed
         run: |
-          pip install .[dev,1bit,autotuning]
+          pip install .[dev,1bit,autotuning,deepcompile]
           ds_report

       - name: Python environment

@@ -55,5 +55,5 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          pytest $PYTEST_OPTS --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4"
+          pytest -x $PYTEST_OPTS --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4"
           pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4"

.github/workflows/nv-torch-nightly-v100.yml

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout 981c276
+          # git checkout 981c276
           git rev-parse --short HEAD
           pip install .

.github/workflows/setup-venv/action.yml

Lines changed: 3 additions & 1 deletion
@@ -6,7 +6,9 @@ runs:
     - id: update-env
       run: |
         sudo apt-get update
-        sudo apt-get install -y libaio-dev
+        # Temporary disable nvme UTs
+        # sudo apt-get install -y libaio-dev
+        sudo apt remove -y libaio-dev
         python -m pip install --user --upgrade pip
         python -m pip install --user --upgrade virtualenv
       shell: bash
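
Note: per the in-diff comment, removing libaio-dev temporarily disables the NVMe/async-I/O unit tests on these runners. As an illustration only (not the project's actual skip logic), a test module can guard itself against the missing library like this, using just the standard library:

# Illustration only: skip async-I/O tests when libaio is not installed,
# which is the effect of the apt change above.
import ctypes.util

import pytest

# ctypes.util.find_library returns None when the shared library cannot be located.
pytestmark = pytest.mark.skipif(
    ctypes.util.find_library("aio") is None,
    reason="libaio is not available; NVMe/async-I/O tests are disabled",
)


def test_aio_placeholder():
    # Stand-in body; real tests would exercise the async I/O op here.
    assert True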
