Commit 42628f9

Merge branch 'master' into xccl_enable
2 parents ef2c671 + 41fcead commit 42628f9

138 files changed: +8619 −881 lines

.github/workflows/cpu-torch-latest.yml

Lines changed: 3 additions & 3 deletions
@@ -42,7 +42,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout 981c276
+          # git checkout 981c276
           git rev-parse --short HEAD
           pip install .

@@ -59,5 +59,5 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.6"
-          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.6"
+          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.7"
+          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.7"
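
Note: the --torch_ver value bumped here (and in the other workflows below) is not a built-in pytest flag; it is a custom option registered by the repository's test conftest and checked against the installed PyTorch. The following is a minimal sketch of how such an option can be wired up, assuming a hypothetical conftest.py; only the option name comes from the commands above, everything else is illustrative.

# Hypothetical conftest.py sketch (illustration only, not DeepSpeed's actual conftest):
# registers a --torch_ver option like the one used in the pytest commands above and
# aborts the session if the installed torch does not match the expected major.minor.
import pytest
import torch


def pytest_addoption(parser):
    parser.addoption("--torch_ver", default=None, type=str)


@pytest.fixture(scope="session", autouse=True)
def check_torch_version(request):
    expected = request.config.getoption("--torch_ver")
    if expected is None:
        return
    installed = ".".join(torch.__version__.split(".")[:2])  # e.g. "2.7" from "2.7.0+cu124"
    if installed != expected:
        pytest.exit(f"Expected torch {expected} but found torch {installed}", returncode=1)

With such a conftest in place, a run like "pytest unit/ --torch_ver=2.7" fails fast if the runner image ships a different PyTorch than the workflow expects.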

.github/workflows/hpu-gaudi2-nightly.yml

Lines changed: 3 additions & 1 deletion
@@ -21,7 +21,7 @@ jobs:
     # The type of runner that the job will run on
     runs-on: [self-hosted, intel, gaudi2]
     container:
-      image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+      image: vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
       ports:
         - 80
       options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

@@ -45,6 +45,8 @@ jobs:
       test_zero_leaf_module.py
       test_zero_offloadpp.py
       test_zero_tiled.py
+      test_autotp_training.py
+      test_ulysses.py

     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:

.github/workflows/hpu-gaudi2.yml

Lines changed: 4 additions & 2 deletions
@@ -39,7 +39,7 @@ jobs:
     # The type of runner that the job will run on
     runs-on: [self-hosted, intel, gaudi2]
     container:
-      image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+      image: vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
       ports:
         - 80
       options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

@@ -94,6 +94,8 @@ jobs:
       test_zero_nesting_init.py
       test_zeropp.py
       (test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam))
+      (test_linear.py and (TestLoRALinear or TestBasicLinear))
+      (test_ctx.py and TestEngine)

     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:

@@ -112,7 +114,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout 981c276
+          # git checkout 981c276
           git rev-parse --short HEAD
           pip install .

.github/workflows/nv-a6000.yml

Lines changed: 4 additions & 4 deletions
@@ -23,7 +23,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.09-py3
+      image: nvcr.io/nvidia/pytorch:24.12-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"

@@ -43,7 +43,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if you need to use an older transformers version temporarily in case of breakage
-          git checkout 981c276
+          # git checkout 981c276
           git rev-parse --short HEAD
           python -m pip install .
       - name: Install deepspeed

@@ -58,8 +58,8 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12"
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.6" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.6" --cuda_ver="12"
       - name: MII unit tests
         run: |
           BRANCH="main"

.github/workflows/nv-flash-attn.yml

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.09-py3
+      image: nvcr.io/nvidia/pytorch:24.12-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"

@@ -53,7 +53,7 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.6" --cuda_ver="12"
       - name: Open GitHub issue if nightly CI fails
         if: ${{ failure() && (github.event_name == 'schedule') }}
         uses: JasonEtco/create-an-issue@v2

.github/workflows/nv-human-eval.yml

Lines changed: 2 additions & 2 deletions
@@ -11,7 +11,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.09-py3
+      image: nvcr.io/nvidia/pytorch:24.12-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"

@@ -50,4 +50,4 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.6" --cuda_ver="12"

.github/workflows/nv-pre-compile-ops.yml

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ jobs:
           #python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
       - name: Compile DeepSpeed Ops
         run: |
-          DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
+          DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
       - name: DS Report
         run: |
           ds_report
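
Note: the DS_BUILD_* variables in this step follow DeepSpeed's convention of selecting which ops are pre-compiled at pip-install time; the newly appended DS_BUILD_DEEP_COMPILE=0 excludes the deep-compile op from the prebuild. Below is a rough sketch of how a build script might interpret such flags; the helper name and defaults are assumptions, only the environment-variable convention is taken from the command above.

# Rough sketch (not DeepSpeed's setup.py): interpret DS_BUILD_<OP> environment flags
# of the kind passed in the workflow step above.
import os


def op_enabled(op_name: str) -> bool:
    # DS_BUILD_OPS=1 requests pre-building everything, unless an op is
    # explicitly disabled, e.g. DS_BUILD_DEEP_COMPILE=0 in the command above.
    build_all = os.environ.get("DS_BUILD_OPS", "0") == "1"
    flag = os.environ.get(f"DS_BUILD_{op_name.upper()}")
    if flag is not None:
        return flag == "1"
    return build_all


if __name__ == "__main__":
    for op in ("sparse_attn", "evoformer_attn", "deep_compile"):
        print(op, "->", "prebuild" if op_enabled(op) else "skip (JIT at runtime)")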

.github/workflows/nv-torch-latest-v100.yml

Lines changed: 2 additions & 2 deletions
@@ -44,7 +44,7 @@ jobs:

       - name: Install deepspeed
         run: |
-          pip install .[dev,1bit,autotuning]
+          pip install .[dev,1bit,autotuning,deepcompile]
           ds_report

       - name: Python environment

@@ -55,5 +55,5 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          pytest $PYTEST_OPTS --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4"
+          pytest -x $PYTEST_OPTS --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4"
           pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4"

.github/workflows/nv-torch-nightly-v100.yml

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout 981c276
+          # git checkout 981c276
           git rev-parse --short HEAD
           pip install .

.github/workflows/setup-venv/action.yml

Lines changed: 3 additions & 1 deletion
@@ -6,7 +6,9 @@ runs:
     - id: update-env
       run: |
         sudo apt-get update
-        sudo apt-get install -y libaio-dev
+        # Temporary disable nvme UTs
+        # sudo apt-get install -y libaio-dev
+        sudo apt remove -y libaio-dev
         python -m pip install --user --upgrade pip
         python -m pip install --user --upgrade virtualenv
       shell: bash
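
Note: per the in-diff comment, removing libaio-dev temporarily disables the NVMe/async-I/O unit tests on these runners. As an illustration only (not the project's actual skip logic), a test module can guard itself against the missing library like this, using just the standard library:

# Illustration only: skip async-I/O tests when libaio is not installed,
# which is the effect of the apt change above.
import ctypes.util

import pytest

# ctypes.util.find_library returns None when the shared library cannot be located.
pytestmark = pytest.mark.skipif(
    ctypes.util.find_library("aio") is None,
    reason="libaio is not available; NVMe/async-I/O tests are disabled",
)


def test_aio_placeholder():
    # Stand-in body; real tests would exercise the async I/O op here.
    assert True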
