Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
77c7744
modify accelerate tests
mengfei25 Sep 1, 2025
4b6013a
modify transformers tests
mengfei25 Sep 1, 2025
c8cc64c
Merge branch 'main' into mengfeil/modify-extra-tests
mengfei25 Sep 1, 2025
00aa549
update
mengfei25 Sep 1, 2025
42e6541
Merge branch 'mengfeil/modify-extra-tests' of https://github.com/inte…
mengfei25 Sep 1, 2025
2de397e
split transformers test jobs
mengfei25 Sep 1, 2025
782a62c
update
mengfei25 Sep 1, 2025
4df28d5
update
mengfei25 Sep 1, 2025
cbc00e3
update
mengfei25 Sep 2, 2025
bec7bb1
Merge branch 'main' into mengfeil/modify-extra-tests
mengfei25 Sep 2, 2025
29705c1
update
mengfei25 Sep 2, 2025
50c015e
Merge branch 'mengfeil/modify-extra-tests' of https://github.com/inte…
mengfei25 Sep 2, 2025
6727196
update
mengfei25 Sep 2, 2025
10c6152
update
mengfei25 Sep 2, 2025
5663c85
update
mengfei25 Sep 2, 2025
081fad6
update
mengfei25 Sep 2, 2025
608c5c6
update
mengfei25 Sep 2, 2025
5e9a5be
Merge branch 'main' into mengfeil/modify-extra-tests
mengfei25 Sep 2, 2025
12d8646
update
mengfei25 Sep 2, 2025
638762d
update
mengfei25 Sep 3, 2025
c3275a3
accelerate only need 1 card
mengfei25 Sep 3, 2025
0303d81
update
mengfei25 Sep 3, 2025
824cfef
update
mengfei25 Sep 3, 2025
0a159d9
Merge branch 'main' into mengfeil/modify-extra-tests
mengfei25 Sep 3, 2025
234985e
Merge branch 'main' into mengfeil/modify-extra-tests
mengfei25 Sep 6, 2025
379cf35
Merge branch 'main' into mengfeil/modify-extra-tests
mengfei25 Sep 6, 2025
b456364
modify container args
mengfei25 Sep 9, 2025
856bd09
remove workspace cleanup before checkout
mengfei25 Sep 9, 2025
462b387
modify ZE_AFFINITY_MASK in container
mengfei25 Sep 10, 2025
9b49c07
transformers mutli shards
mengfei25 Sep 10, 2025
fc3ac8e
update
mengfei25 Sep 10, 2025
dda9f7d
Merge branch 'main' into mengfeil/modify-extra-tests
mengfei25 Sep 10, 2025
b47b279
set numactl to distribute CPUs
mengfei25 Sep 10, 2025
344d370
fix lint
mengfei25 Sep 10, 2025
86bfbce
rollback to 856bd09
mengfei25 Sep 11, 2025
01af1f6
accelerate tests parallel with ZE_AFFINITY_MASK=n
mengfei25 Sep 11, 2025
acbdde6
split transformers test jobs
mengfei25 Sep 1, 2025
8ad779c
cleanup
mengfei25 Sep 11, 2025
70e031d
cleanup
mengfei25 Sep 11, 2025
da60b8e
cleanup
mengfei25 Sep 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 58 additions & 42 deletions .github/workflows/_linux_accelerate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ on:
runner:
required: true
type: string
default: 'linux.idc.xpu'
default: 'pvc_rolling'
description: Runner label
accelerate:
required: false
Expand All @@ -45,11 +45,15 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash {0}

jobs:
conditions-filter:
name: conditions-filter
if: ${{ github.event.pull_request.draft == false }}
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
timeout-minutes: 10
env:
GH_TOKEN: ${{ github.token }}
Expand All @@ -66,22 +70,46 @@ jobs:
disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)"
echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}"

Torch-XPU-Accelerate-Tests:
runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
prepare:
runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }}
needs: conditions-filter
if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_accelerate')) }}
outputs:
runner_id: ${{ steps.runner-info.outputs.runner_id }}
user_id: ${{ steps.runner-info.outputs.user_id }}
render_id: ${{ steps.runner-info.outputs.render_id }}
hostname: ${{ steps.runner-info.outputs.hostname }}
pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }}
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Get runner
id: runner-info
uses: ./.github/actions/get-runner

tests:
runs-on: ${{ needs.prepare.outputs.runner_id }}
needs: prepare
container:
image: mengfeili/intel-pvc-driver:1146-1136
volumes:
- ${{ github.workspace }}:${{ github.workspace }}
options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }}
--security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
-u ${{ needs.prepare.outputs.user_id }}
-e ZE_AFFINITY_MASK
env:
WORK_DIR: 'accelerate'
PYTORCH_DEBUG_XPU_FALLBACK: 1
HF_HUB_ETAG_TIMEOUT: 120
HF_HUB_DOWNLOAD_TIMEOUT: 120
PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
env:
WORK_DIR: 'accelerate'
NEOReadDebugKeys: 0
DisableScratchPages: 0
accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.6.0' }}
transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }}
python: ${{ inputs.python != '' && inputs.python || '3.10' }}
PYTORCH_DEBUG_XPU_FALLBACK: 1
ZE_AFFINITY_MASK: 0
PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
HF_HUB_ETAG_TIMEOUT: 120
HF_HUB_DOWNLOAD_TIMEOUT: 120
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
Expand All @@ -93,25 +121,22 @@ jobs:
repository: huggingface/accelerate
ref: ${{ env.accelerate }}
path: accelerate
- name: Create unique Conda ENV name
- name: Setup python-${{ env.python }}
uses: actions/setup-python@v5
with:
python-version: ${{ env.python }}
- name: Check python
run: |
random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs)
echo "CONDA_ENV_NAME=hf_accelerate_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV
- name: Prepare Conda ENV
which python && python -V
which pip && pip list
pip install -U pip wheel setuptools
- name: Install pytorch and deps
run: |
echo "Using Conda ENV name: $CONDA_ENV_NAME"
conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
conda create -y -n $CONDA_ENV_NAME python=${{ env.python }}
source activate $CONDA_ENV_NAME
pip install junitparser pytest-timeout
pip install junitparser
pip install transformers==${{ env.transformers }}
- name: Prepare Stock XPU Pytorch
run: |
source activate $CONDA_ENV_NAME
pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
- name: Prepare Accelerate
run: |
source activate $CONDA_ENV_NAME
cd $WORK_DIR
pip install -e .
pip install -e ".[testing]"
Expand All @@ -120,7 +145,6 @@ jobs:
cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
- name: Report installed versions
run: |
source activate $CONDA_ENV_NAME
echo "pip installed packages:"
pip list | tee ${{ github.workspace }}/$WORK_DIR/tests_log/pip_list.txt
echo "lspci gpu devices:"
Expand All @@ -131,16 +155,17 @@ jobs:
xpu-smi discovery -y --json --dump -1
- name: Sanity check installed packages
run: |
source activate $CONDA_ENV_NAME
# Use latest pytest
pip install -U pytest pytest-timeout pytest-xdist
# These checks are to exit earlier if for any reason torch
# packages were reinstalled back to CUDA versions (not expected).
pip show torch | grep Version | grep xpu
pip show torchaudio | grep Version | grep xpu
pip show torchvision | grep Version | grep xpu
python -c 'import torch; exit(not torch.xpu.is_available())'
- name: Run tests
printenv
- name: Run tests on ${{ needs.prepare.outputs.hostname }}
run: |
source activate $CONDA_ENV_NAME
cd $WORK_DIR && rm -rf reports && mkdir -p reports
# Excluding tests due to:
# * tests/test_examples.py::FeatureExamplesTests::test_profiler fails on
Expand All @@ -150,8 +175,7 @@ jobs:
# * tests/test_big_modeling.py::test_dispatch_model_tied_weights_memory_with_nested_offload_cpu fails
# with OOM. That's a new test added by https://github.com/huggingface/accelerate/pull/3445
pattern="not test_profiler and not test_gated and not test_dispatch_model_tied_weights_memory_with_nested_offload_cpu"
cmd=(python3 -m pytest --timeout 600 -rsf --junitxml=reports/accelerate.xml -k "$pattern" \
tests/)
cmd=(python -m pytest --junitxml=reports/accelerate.xml -k "$pattern" tests/)
{
echo "### Running"
echo "\`\`\`"
Expand All @@ -162,28 +186,20 @@ jobs:
- name: Print result tables
if: ${{ ! cancelled() }}
run: |
source activate $CONDA_ENV_NAME
cd $WORK_DIR
{
echo "### Results"
python3 $PARSE_JUNIT reports/accelerate.xml --stats
python $PARSE_JUNIT reports/accelerate.xml --stats
echo "### Failed"
python3 $PARSE_JUNIT reports/accelerate.xml --errors --failed
python $PARSE_JUNIT reports/accelerate.xml --errors --failed
echo "### Skipped"
python3 $PARSE_JUNIT reports/accelerate.xml --skipped
python $PARSE_JUNIT reports/accelerate.xml --skipped
} >> $GITHUB_STEP_SUMMARY
- name: Print environment
if: ${{ ! cancelled() }}
uses: ./torch-xpu-ops/.github/actions/print-environment
with:
conda: $CONDA_ENV_NAME
pip_packages: 'accelerate transformers'
- name: Clean up
if: ${{ always() }}
run: |
if [ -n "$CONDA_ENV_NAME" ]; then
conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
fi
- name: Upload Test log
if: ${{ ! cancelled() }}
uses: actions/upload-artifact@v4
Expand Down
Loading
Loading