Reviewer notes (free text ahead of the first "diff --git" header; git-apply and
patch skip it):
  * The patch text was re-flowed to one line per diff line. YAML indentation is
    reconstructed from workflow structure; verify context lines against the
    target files before applying.
  * "Sanity check installed packages" in both workflows now starts with `set -e`.
    This patch sets the default shell to `bash {0}`, which has no implicit -e,
    so a failing `pip show ... | grep xpu` would otherwise not fail the step
    (in the accelerate workflow the trailing `printenv` always succeeds).
    Affected hunk headers were recounted; the `index` blob ids are stale.
  * Transformers "Clean up" uses the quoted runtime "$HF_HOME" for ls/du/rm,
    i.e. the same variable its guard tests.

diff --git a/.github/workflows/_linux_accelerate.yml b/.github/workflows/_linux_accelerate.yml
index a40601d62..decf4e612 100644
--- a/.github/workflows/_linux_accelerate.yml
+++ b/.github/workflows/_linux_accelerate.yml
@@ -26,7 +26,7 @@ on:
       runner:
         required: true
         type: string
-        default: 'linux.idc.xpu'
+        default: 'pvc_rolling'
         description: Runner label
       accelerate:
         required: false
@@ -45,11 +45,15 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+defaults:
+  run:
+    shell: bash {0}
+
 jobs:
   conditions-filter:
     name: conditions-filter
     if: ${{ github.event.pull_request.draft == false }}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     timeout-minutes: 10
     env:
       GH_TOKEN: ${{ github.token }}
@@ -66,22 +70,46 @@ jobs:
           disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)"
           echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}"
 
-  Torch-XPU-Accelerate-Tests:
-    runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
+  prepare:
+    runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }}
     needs: conditions-filter
     if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_accelerate')) }}
+    outputs:
+      runner_id: ${{ steps.runner-info.outputs.runner_id }}
+      user_id: ${{ steps.runner-info.outputs.user_id }}
+      render_id: ${{ steps.runner-info.outputs.render_id }}
+      hostname: ${{ steps.runner-info.outputs.hostname }}
+      pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }}
+    steps:
+      - name: Checkout torch-xpu-ops
+        uses: actions/checkout@v4
+      - name: Get runner
+        id: runner-info
+        uses: ./.github/actions/get-runner
+
+  tests:
+    runs-on: ${{ needs.prepare.outputs.runner_id }}
+    needs: prepare
+    container:
+      image: mengfeili/intel-pvc-driver:1146-1136
+      volumes:
+        - ${{ github.workspace }}:${{ github.workspace }}
+      options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }}
+        --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
+        -u ${{ needs.prepare.outputs.user_id }}
+        -e ZE_AFFINITY_MASK
+      env:
+        WORK_DIR: 'accelerate'
+        PYTORCH_DEBUG_XPU_FALLBACK: 1
+        HF_HUB_ETAG_TIMEOUT: 120
+        HF_HUB_DOWNLOAD_TIMEOUT: 120
+        PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
+        AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
+        PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
     env:
-      WORK_DIR: 'accelerate'
-      NEOReadDebugKeys: 0
-      DisableScratchPages: 0
       accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.6.0' }}
       transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }}
       python: ${{ inputs.python != '' && inputs.python || '3.10' }}
-      PYTORCH_DEBUG_XPU_FALLBACK: 1
-      ZE_AFFINITY_MASK: 0
-      PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
-      HF_HUB_ETAG_TIMEOUT: 120
-      HF_HUB_DOWNLOAD_TIMEOUT: 120
     steps:
       - name: Checkout torch-xpu-ops
         uses: actions/checkout@v4
@@ -93,25 +121,22 @@ jobs:
           repository: huggingface/accelerate
           ref: ${{ env.accelerate }}
           path: accelerate
-      - name: Create unique Conda ENV name
+      - name: Setup python-${{ env.python }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.python }}
+      - name: Check python
         run: |
-          random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs)
-          echo "CONDA_ENV_NAME=hf_accelerate_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV
-      - name: Prepare Conda ENV
+          which python && python -V
+          which pip && pip list
+          pip install -U pip wheel setuptools
+      - name: Install pytorch and deps
         run: |
-          echo "Using Conda ENV name: $CONDA_ENV_NAME"
-          conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
-          conda create -y -n $CONDA_ENV_NAME python=${{ env.python }}
-          source activate $CONDA_ENV_NAME
-          pip install junitparser pytest-timeout
+          pip install junitparser
           pip install transformers==${{ env.transformers }}
-      - name: Prepare Stock XPU Pytorch
-        run: |
-          source activate $CONDA_ENV_NAME
           pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
       - name: Prepare Accelerate
         run: |
-          source activate $CONDA_ENV_NAME
           cd $WORK_DIR
           pip install -e .
           pip install -e ".[testing]"
@@ -120,7 +145,6 @@ jobs:
           cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
       - name: Report installed versions
         run: |
-          source activate $CONDA_ENV_NAME
           echo "pip installed packages:"
           pip list | tee ${{ github.workspace }}/$WORK_DIR/tests_log/pip_list.txt
           echo "lspci gpu devices:"
@@ -131,16 +155,19 @@ jobs:
           xpu-smi discovery -y --json --dump -1
       - name: Sanity check installed packages
         run: |
-          source activate $CONDA_ENV_NAME
+          # The default shell here is 'bash {0}' (no implicit -e): stop on the first failing check.
+          set -e
+          # Use latest pytest
+          pip install -U pytest pytest-timeout pytest-xdist
           # These checks are to exit earlier if for any reason torch
           # packages were reinstalled back to CUDA versions (not expected).
           pip show torch | grep Version | grep xpu
           pip show torchaudio | grep Version | grep xpu
           pip show torchvision | grep Version | grep xpu
           python -c 'import torch; exit(not torch.xpu.is_available())'
-      - name: Run tests
+          printenv
+      - name: Run tests on ${{ needs.prepare.outputs.hostname }}
         run: |
-          source activate $CONDA_ENV_NAME
           cd $WORK_DIR && rm -rf reports && mkdir -p reports
           # Excluding tests due to:
           # * tests/test_examples.py::FeatureExamplesTests::test_profiler fails on
@@ -150,8 +177,7 @@ jobs:
           # * tests/test_big_modeling.py::test_dispatch_model_tied_weights_memory_with_nested_offload_cpu fails
           #   with OOM. That's a new test added by https://github.com/huggingface/accelerate/pull/3445
           pattern="not test_profiler and not test_gated and not test_dispatch_model_tied_weights_memory_with_nested_offload_cpu"
-          cmd=(python3 -m pytest --timeout 600 -rsf --junitxml=reports/accelerate.xml -k "$pattern" \
-            tests/)
+          cmd=(python -m pytest --junitxml=reports/accelerate.xml -k "$pattern" tests/)
           {
             echo "### Running"
             echo "\`\`\`"
@@ -162,28 +188,20 @@ jobs:
       - name: Print result tables
         if: ${{ ! cancelled() }}
         run: |
-          source activate $CONDA_ENV_NAME
           cd $WORK_DIR
           {
             echo "### Results"
-            python3 $PARSE_JUNIT reports/accelerate.xml --stats
+            python $PARSE_JUNIT reports/accelerate.xml --stats
             echo "### Failed"
-            python3 $PARSE_JUNIT reports/accelerate.xml --errors --failed
+            python $PARSE_JUNIT reports/accelerate.xml --errors --failed
             echo "### Skipped"
-            python3 $PARSE_JUNIT reports/accelerate.xml --skipped
+            python $PARSE_JUNIT reports/accelerate.xml --skipped
           } >> $GITHUB_STEP_SUMMARY
       - name: Print environment
         if: ${{ ! cancelled() }}
         uses: ./torch-xpu-ops/.github/actions/print-environment
         with:
-          conda: $CONDA_ENV_NAME
           pip_packages: 'accelerate transformers'
-      - name: Clean up
-        if: ${{ always() }}
-        run: |
-          if [ -n "$CONDA_ENV_NAME" ]; then
-            conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
-          fi
       - name: Upload Test log
         if: ${{ ! cancelled() }}
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml
index 2d8bfd4af..fc3deaf7a 100644
--- a/.github/workflows/_linux_transformers.yml
+++ b/.github/workflows/_linux_transformers.yml
@@ -21,7 +21,7 @@ on:
       runner:
         required: true
         type: string
-        default: 'linux.idc.xpu'
+        default: 'pvc_rolling'
         description: Runner label
       driver:
         required: false
@@ -58,8 +58,6 @@ env:
   HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
   HF_HUB_ETAG_TIMEOUT: 120
   HF_HUB_DOWNLOAD_TIMEOUT: 120
-  NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
-  DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
   python: ${{ inputs.python != '' && inputs.python || '3.10' }}
   accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.7.0'}}
   datasets: ${{ inputs.datasets != '' && inputs.datasets || 'v3.6.0'}}
@@ -76,8 +74,12 @@ env:
     libswresample-dev
     libswscale-dev
     pciutils
-  PYTEST_TIMEOUT: 600
   TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu'
+  AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
+
+defaults:
+  run:
+    shell: bash {0}
 
 jobs:
   conditions-filter:
@@ -101,7 +103,7 @@ jobs:
           echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}"
 
   prepare:
-    runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
+    runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }}
     needs: conditions-filter
     if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_transformers')) }}
     outputs:
@@ -109,6 +111,11 @@ jobs:
       torchvision: ${{ steps.getver.outputs.torchvision }}
       torchaudio: ${{ steps.getver.outputs.torchaudio }}
       triton: ${{ steps.getver.outputs.triton }}
+      runner_id: ${{ steps.runner-info.outputs.runner_id }}
+      user_id: ${{ steps.runner-info.outputs.user_id }}
+      render_id: ${{ steps.runner-info.outputs.render_id }}
+      hostname: ${{ steps.runner-info.outputs.hostname }}
+      pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }}
     steps:
       - id: getver
         run: |
@@ -127,10 +134,28 @@ jobs:
           echo "torchvision=$torchvision" | tee -a "$GITHUB_OUTPUT"
           echo "torchaudio=$torchaudio" | tee -a "$GITHUB_OUTPUT"
           echo "triton=$triton" | tee -a "$GITHUB_OUTPUT"
+      - name: Checkout torch-xpu-ops
+        uses: actions/checkout@v4
+      - name: Get runner
+        id: runner-info
+        uses: ./.github/actions/get-runner
 
   tests:
     needs: prepare
-    runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
+    runs-on: ${{ needs.prepare.outputs.runner_id }}
+    container:
+      image: mengfeili/intel-pvc-driver:1146-1136
+      volumes:
+        - ${{ github.workspace }}:${{ github.workspace }}
+      options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }}
+        --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
+        -u ${{ needs.prepare.outputs.user_id }}
+        -e ZE_AFFINITY_MASK
+      env:
+        PYTORCH_DEBUG_XPU_FALLBACK: '1'
+        TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
+        # enable pytest parallel run, and continue others if meets crash case such as segmentation fault
+        PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
     strategy:
       fail-fast: false
       max-parallel: 1
@@ -152,52 +177,16 @@ jobs:
           # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals)
           # * https://github.com/huggingface/transformers/issues/36267 (marian tests)
           - test_case: 'tests_models_0'
-            cmd: 'tests/models --num-shards 16 --shard-id 0 --ignore=tests/models/marian/test_modeling_marian.py'
+            cmd: 'tests/models --num-shards 4 --shard-id 0 --ignore=tests/models/marian/test_modeling_marian.py'
             filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
           - test_case: 'tests_models_1'
-            cmd: 'tests/models --num-shards 16 --shard-id 1 --ignore=tests/models/marian/test_modeling_marian.py'
+            cmd: 'tests/models --num-shards 4 --shard-id 1 --ignore=tests/models/marian/test_modeling_marian.py'
             filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
           - test_case: 'tests_models_2'
-            cmd: 'tests/models --num-shards 16 --shard-id 2 --ignore=tests/models/marian/test_modeling_marian.py'
+            cmd: 'tests/models --num-shards 4 --shard-id 2 --ignore=tests/models/marian/test_modeling_marian.py'
             filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
           - test_case: 'tests_models_3'
-            cmd: 'tests/models --num-shards 16 --shard-id 3 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_4'
-            cmd: 'tests/models --num-shards 16 --shard-id 4 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_5'
-            cmd: 'tests/models --num-shards 16 --shard-id 5 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_6'
-            cmd: 'tests/models --num-shards 16 --shard-id 6 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_7'
-            cmd: 'tests/models --num-shards 16 --shard-id 7 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_8'
-            cmd: 'tests/models --num-shards 16 --shard-id 8 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_9'
-            cmd: 'tests/models --num-shards 16 --shard-id 9 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_10'
-            cmd: 'tests/models --num-shards 16 --shard-id 10 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_11'
-            cmd: 'tests/models --num-shards 16 --shard-id 11 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_12'
-            cmd: 'tests/models --num-shards 16 --shard-id 12 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_13'
-            cmd: 'tests/models --num-shards 16 --shard-id 13 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_14'
-            cmd: 'tests/models --num-shards 16 --shard-id 14 --ignore=tests/models/marian/test_modeling_marian.py'
-            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
-          - test_case: 'tests_models_15'
-            cmd: 'tests/models --num-shards 16 --shard-id 15 --ignore=tests/models/marian/test_modeling_marian.py'
+            cmd: 'tests/models --num-shards 4 --shard-id 3 --ignore=tests/models/marian/test_modeling_marian.py'
             filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
           # Excluding tests due to:
           # * Some ray tests hang, reason unknown
@@ -212,9 +201,6 @@ jobs:
           - test_case: 'tests_utils'
             cmd: '--ignore=tests/utils/test_import_utils.py tests/utils'
             filter: 'not test_load_img_url_timeout'
-    env:
-      PYTORCH_DEBUG_XPU_FALLBACK: '1'
-      TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
     steps:
       - name: Checkout torch-xpu-ops
         uses: actions/checkout@v4
@@ -253,20 +239,18 @@ jobs:
             sleep 1;
             if (( $SECONDS - start_time > 60 )); then false; fi
           done
-      - name: Create unique Conda ENV name
-        run: |
-          random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs)
-          echo "CONDA_ENV_NAME=hf_transformers_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV
-      - name: Prepare Conda ENV
+      - name: Setup python-${{ env.python }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.python }}
+      - name: Check python
         run: |
-          echo "Using Conda ENV name: $CONDA_ENV_NAME"
-          conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
-          conda create -y -n $CONDA_ENV_NAME python=${{ env.python }}
-          source activate $CONDA_ENV_NAME
-          pip install junitparser pytest-shard pytest-timeout
-      - name: Prepare Stock XPU Pytorch
+          which python && python -V
+          which pip && pip list
+          pip install -U pip wheel setuptools
+      - name: Prepare pytorch and deps
         run: |
-          source activate $CONDA_ENV_NAME
+          pip install junitparser
           pip install $TORCH_INDEX \
             torch==${{ needs.prepare.outputs.torch }} \
             torchvision==${{ needs.prepare.outputs.torchvision }} \
@@ -275,7 +259,6 @@ jobs:
       - name: Prepare Transformers
         run: |
           pwd
-          source activate $CONDA_ENV_NAME
           cd transformers
           pip install \
             accelerate==${{ env.accelerate }} \
@@ -287,7 +270,6 @@ jobs:
           cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
       - name: Report installed versions
         run: |
-          source activate $CONDA_ENV_NAME
           LOGS_DIR="${{ github.workspace }}/transformers/logs"
           echo "pip installed packages:"
           pip list | tee "$LOGS_DIR/pip_list-$TEST_CASE.txt"
@@ -299,36 +281,37 @@ jobs:
           xpu-smi discovery -y --json --dump -1
       - name: Sanity check installed packages
         run: |
-          source activate $CONDA_ENV_NAME
+          # The default shell here is 'bash {0}' (no implicit -e): stop on the first failing check.
+          set -e
+          # Use latest pytest
+          pip install -U pytest pytest-timeout pytest-xdist pytest-shard
           # These checks are to exit earlier if for any reason Transformers
           # reinstalled torch packages back to CUDA versions (not expected).
           pip show torch | grep Version | grep xpu
           pip show torchaudio | grep Version | grep xpu
           pip show torchvision | grep Version | grep xpu
           python -c 'import torch; exit(not torch.xpu.is_available())'
-      - name: Run tests
+      - name: Run tests on ${{ needs.prepare.outputs.hostname }}
         run: |
-          source activate $CONDA_ENV_NAME
           cd transformers
-          python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml \
-            -k "${{ matrix.test.filter}}" ${{ matrix.test.cmd }} || true
+          python -m pytest --make-reports=${TEST_CASE} --junit-xml=reports/${TEST_CASE}.xml \
+            -k "${{ matrix.test.filter}}" ${{ matrix.test.cmd }} || true
       - name: Check for errors in tests
         run: |
-          source activate $CONDA_ENV_NAME
-          python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml
+          python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml
       - name: Print environment
         if: ${{ ! cancelled() }}
         uses: ./torch-xpu-ops/.github/actions/print-environment
         with:
-          conda: $CONDA_ENV_NAME
           pip_packages: 'accelerate datasets transformers'
           to: 'transformers/logs/environment-$TEST_CASE.md'
       - name: Clean up
         if: ${{ always() }}
         run: |
-          du -sh ${{ env.HF_HOME }} || true
-          if [ -n "$CONDA_ENV_NAME" ]; then
-            conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
+          if [ -d "$HF_HOME" ]; then
+            ls -al "$HF_HOME"
+            du -sh "$HF_HOME"
+            rm -rf "$HF_HOME"
           fi
       - name: Upload reports
         if: ${{ ! cancelled() }}
@@ -346,7 +329,7 @@ jobs:
   report:
     needs: tests
     if: ${{ success() || failure() }}
-    runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
+    runs-on: ubuntu-24.04
     steps:
       - name: Download reports
         uses: actions/download-artifact@v4
@@ -366,16 +349,12 @@ jobs:
         uses: actions/checkout@v4
         with:
           path: torch-xpu-ops
-      - name: Create unique Conda ENV name
-        run: |
-          random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs)
-          echo "CONDA_ENV_NAME=hf_transformers_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV
-      - name: Prepare Conda ENV
+      - name: Setup python-${{ env.python }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.python }}
+      - name: Install pip deps
         run: |
-          echo "Using Conda ENV name: $CONDA_ENV_NAME"
-          conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
-          conda create -y -n $CONDA_ENV_NAME python=${{ env.python }}
-          source activate $CONDA_ENV_NAME
           pip install junitparser
       - name: Print results table
         if: ${{ ! cancelled() }}
@@ -409,8 +388,7 @@ jobs:
       - name: Print baseline difference
         if: ${{ ! cancelled() }}
         run: |
-          source activate $CONDA_ENV_NAME
-          python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true
+          python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true
       - name: Print failure lines
         if: ${{ ! cancelled() }}
         run: |
@@ -477,9 +455,3 @@ jobs:
           for f in $(find transformers/logs -name "environment-*.md"); do
             diff $f $first_md
           done
-      - name: Clean up
-        if: ${{ always() }}
-        run: |
-          if [ -n "$CONDA_ENV_NAME" ]; then
-            conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
-          fi