Merged

Commits (40)
77c7744  modify accelerate tests (mengfei25, Sep 1, 2025)
4b6013a  modify transformers tests (mengfei25, Sep 1, 2025)
c8cc64c  Merge branch 'main' into mengfeil/modify-extra-tests (mengfei25, Sep 1, 2025)
00aa549  update (mengfei25, Sep 1, 2025)
42e6541  Merge branch 'mengfeil/modify-extra-tests' of https://github.com/inte… (mengfei25, Sep 1, 2025)
2de397e  split transformers test jobs (mengfei25, Sep 1, 2025)
782a62c  update (mengfei25, Sep 1, 2025)
4df28d5  update (mengfei25, Sep 1, 2025)
cbc00e3  update (mengfei25, Sep 2, 2025)
bec7bb1  Merge branch 'main' into mengfeil/modify-extra-tests (mengfei25, Sep 2, 2025)
29705c1  update (mengfei25, Sep 2, 2025)
50c015e  Merge branch 'mengfeil/modify-extra-tests' of https://github.com/inte… (mengfei25, Sep 2, 2025)
6727196  update (mengfei25, Sep 2, 2025)
10c6152  update (mengfei25, Sep 2, 2025)
5663c85  update (mengfei25, Sep 2, 2025)
081fad6  update (mengfei25, Sep 2, 2025)
608c5c6  update (mengfei25, Sep 2, 2025)
5e9a5be  Merge branch 'main' into mengfeil/modify-extra-tests (mengfei25, Sep 2, 2025)
12d8646  update (mengfei25, Sep 2, 2025)
638762d  update (mengfei25, Sep 3, 2025)
c3275a3  accelerate only need 1 card (mengfei25, Sep 3, 2025)
0303d81  update (mengfei25, Sep 3, 2025)
824cfef  update (mengfei25, Sep 3, 2025)
0a159d9  Merge branch 'main' into mengfeil/modify-extra-tests (mengfei25, Sep 3, 2025)
234985e  Merge branch 'main' into mengfeil/modify-extra-tests (mengfei25, Sep 6, 2025)
379cf35  Merge branch 'main' into mengfeil/modify-extra-tests (mengfei25, Sep 6, 2025)
b456364  modify container args (mengfei25, Sep 9, 2025)
856bd09  remove workspace cleanup before checkout (mengfei25, Sep 9, 2025)
462b387  modify ZE_AFFINITY_MASK in container (mengfei25, Sep 10, 2025)
9b49c07  transformers mutli shards (mengfei25, Sep 10, 2025)
fc3ac8e  update (mengfei25, Sep 10, 2025)
dda9f7d  Merge branch 'main' into mengfeil/modify-extra-tests (mengfei25, Sep 10, 2025)
b47b279  set numactl to distribute CPUs (mengfei25, Sep 10, 2025)
344d370  fix lint (mengfei25, Sep 10, 2025)
86bfbce  rollback to 856bd09 (mengfei25, Sep 11, 2025)
01af1f6  accelerate tests parallel with ZE_AFFINITY_MASK=n (mengfei25, Sep 11, 2025)
acbdde6  split transformers test jobs (mengfei25, Sep 1, 2025)
8ad779c  cleanup (mengfei25, Sep 11, 2025)
70e031d  cleanup (mengfei25, Sep 11, 2025)
da60b8e  cleanup (mengfei25, Sep 11, 2025)
70 changes: 52 additions & 18 deletions .github/actions/get-runner/action.yml
@@ -9,26 +9,28 @@ outputs:
value: ${{ steps.runner.outputs.render_id }}
hostname:
value: ${{ steps.runner.outputs.hostname }}
ZE_AFFINITY_MASK:
value: ${{ steps.tests.outputs.ZE_AFFINITY_MASK }}
xpu_num:
value: ${{ steps.runner.outputs.xpu_num }}
value: ${{ steps.tests.outputs.xpu_num }}
cpus_per_xpu:
value: ${{ steps.runner.outputs.cpus_per_xpu }}
value: ${{ steps.tests.outputs.cpus_per_xpu }}
pytest_extra_args:
value: ${{ steps.runner.outputs.pytest_extra_args }}
value: ${{ steps.tests.outputs.pytest_extra_args }}
numactl_args:
value: ${{ steps.tests.outputs.numactl_args }}

runs:
using: composite
steps:
- name: Get runner
- name: Show runner
shell: bash -xe {0}
id: runner
run: |
# get test runner
echo "runner_id=$(echo ${RUNNER_NAME} |sed 's/\-[0-9]$//')" |tee -a ${GITHUB_OUTPUT}
echo "user_id=$(id -u)" |tee -a ${GITHUB_OUTPUT}
echo "user_id=$(id -u):$(id -g)" |tee -a ${GITHUB_OUTPUT}
echo "render_id=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT}
echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT}
# show host info
lscpu
lshw -C display
free -h
@@ -37,6 +39,9 @@ runs:
uname -a
# clinfo hang and reboot system to recover
timeout 120 clinfo --list || sudo reboot
- name: Check scaling_governor
shell: bash -xe {0}
run: |
scaling_governor=$(cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq)
if [ $(sudo -n true > /dev/null 2>&1 && echo $? || echo $?) -eq 0 ];then
if [ "${scaling_governor}" != "performance" ];then
@@ -52,31 +57,60 @@ runs:
echo "[INFO] You do NOT have ROOT permission to set system config."
echo " The frequency governor is ${scaling_governor}."
fi
- name: Info for tests
shell: bash -xe {0}
id: tests
run: |
# cpu number
cpu_num="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk 'BEGIN{sum=1}{sum*=$NF}END{printf sum}')"
xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{
if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1}
}' |wc -l)"
cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')"
# available gpu card number
xpu_num="$(xpu-smi discovery --dump 1 2>&1 |grep -c "[0-9]")"
# total gpu card number
pci_base_class_mask=0x00ff0000
pci_base_class_display=0x00030000
pci_vendor_id_intel=0x8086
gpu_card_num=0
for var in $(ls /sys/bus/pci/devices)
do
pci_class="$(cat /sys/bus/pci/devices/${var}/class)"
pci_vendor="$(cat /sys/bus/pci/devices/${var}/vendor)"
is_xpu="$(python -c "if (${pci_class} & ${pci_base_class_mask}) == ${pci_base_class_display} and ${pci_vendor} == ${pci_vendor_id_intel}: print('yes')")"
if [ "${is_xpu}" == "yes" ];then
echo "Detected Intel GPU at /sys/bus/pci/devices/${var}"
gpu_card_num=$[ $gpu_card_num + 1 ]
fi
done
# get available gpus
ZE_AFFINITY_MASK="$(xpu-smi discovery 2>&1 |grep 'DRM Device: /dev/' |sed 's/.*card//;s/[^0-9].*//' |sort -n |uniq |awk '{
if (NR == 1) { first = $1; }
if (first > 0) { printf("%s,", $1 - 1); } else { printf("%s,", $1); }
}' |sed 's/,$//')"
cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${gpu_card_num}" '{printf c/x}')"
# get pytest args for pytest parallel
numactl_args="$(echo |awk -v c="${cpus_per_xpu}" -v ze="${ZE_AFFINITY_MASK}" '{
split(ze, x, ",");
for (i in x) {
printf(" numactl -l -C %d-%d ;", c*x[i], c*x[i]+c-1);
}
}')"
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
if (x > 0) {
split(z, xpu_list, ",");
for (i=0;i<x;i++) {
if (z != "") {
ze = xpu_list[i+1];
} else {
ze = i;
}
ze = xpu_list[i+1];
printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
ze, cx, i*cx, (i+1)*cx-1);
ze, cx, ze*cx, (ze+1)*cx-1);
}
}else {
printf(" -n 1 ");
}
}')"
echo "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" |tee -a ${GITHUB_OUTPUT}
echo "xpu_num=${xpu_num}" |tee -a ${GITHUB_OUTPUT}
echo "cpus_per_xpu=${cpus_per_xpu}" |tee -a ${GITHUB_OUTPUT}
echo "numactl_args=${numactl_args}" |tee -a ${GITHUB_OUTPUT}
echo "pytest_extra_args=${pytest_extra_args}" |tee -a ${GITHUB_OUTPUT}
- name: Cleanup host
- name: Cleanup runner
shell: bash -xe {0}
run: |
# clean docker cache
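Note: the numactl_args and pytest_extra_args strings built above are easiest to follow with concrete values. Below is a minimal sketch assuming a hypothetical runner with two available cards (ZE_AFFINITY_MASK=0,1) and 16 CPUs per card; the awk bodies are copied from the step above, with the pytest branch reduced to the x > 0 case.

#!/bin/bash
# Illustrative values only; on a real runner these come from lscpu and xpu-smi.
cpus_per_xpu=16
ZE_AFFINITY_MASK="0,1"
xpu_num=2

# One "numactl -l -C <start>-<end>" prefix per card, ';'-separated.
numactl_args="$(echo | awk -v c="${cpus_per_xpu}" -v ze="${ZE_AFFINITY_MASK}" '{
  split(ze, x, ",");
  for (i in x) { printf(" numactl -l -C %d-%d ;", c*x[i], c*x[i]+c-1); }
}')"
echo "${numactl_args}"
# -> numactl -l -C 0-15 ; numactl -l -C 16-31 ;

# One pytest-xdist worker per card, each pinned to its card's CPU range.
pytest_extra_args="$(echo | awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
  split(z, xpu_list, ",");
  for (i=0;i<x;i++) {
    ze = xpu_list[i+1];
    printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
      ze, cx, ze*cx, (ze+1)*cx-1);
  }
}')"
echo "${pytest_extra_args}"
# -> --tx popen//env:ZE_AFFINITY_MASK=0//env:OMP_NUM_THREADS=16//python="numactl -l -C 0-15 python"
#    --tx popen//env:ZE_AFFINITY_MASK=1//env:OMP_NUM_THREADS=16//python="numactl -l -C 16-31 python"

# The PCI walk above keys off base class 0x03 (display) and Intel's vendor ID;
# with example values read from /sys/bus/pci/devices/*/class and .../vendor:
pci_class=0x030000; pci_vendor=0x8086
python -c "if (${pci_class} & 0x00ff0000) == 0x00030000 and ${pci_vendor} == 0x8086: print('yes')"
# -> yes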
4 changes: 2 additions & 2 deletions .github/actions/linux-e2etest/action.yml
@@ -75,12 +75,12 @@ runs:
else
xpu_id=${var}
fi
numactl --localalloc --physcpubind=${cpu_list} bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${var} &
bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${var} &
done
else
for test_model in $(echo ${MODEL_ONLY_NAME} |sed 's/,/ /g')
do
numactl --localalloc bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model}
bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model}
done
fi
wait
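Note: with this change the caller no longer pins CPUs itself; the per-card numactl prefix is applied inside inductor_xpu_test.sh (next file). A sketch of the resulting per-card fan-out, with hypothetical suite/dtype/mode/scenario values:

# Hypothetical values for illustration; the real ones come from the job matrix.
suite=huggingface; dt=float32; mode=inference; scenario=accuracy; xpu_num=2
for var in $(seq 0 $((xpu_num - 1))); do
  # One background run per card; CPU and memory pinning happen inside the script.
  bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${var} static ${xpu_num} ${var} &
done
wait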
7 changes: 4 additions & 3 deletions .github/scripts/inductor_xpu_test.sh
@@ -60,6 +60,7 @@ fi

ulimit -n 1048576
ZE_AFFINITY_MASK=${CARD} \
eval python benchmarks/dynamo/"${SUITE}".py --"${SCENARIO}" --"${Real_DT}" -d "${DEVICE}" -n10 "${DT_extra}" "${Mode_extra}" \
"${Shape_extra}" "${partition_flags}" "${Model_only_extra}" --backend=inductor --cold-start-latency --timeout=10800 \
--output="${LOG_DIR}"/"${LOG_NAME}".csv 2>&1 | tee "${LOG_DIR}"/"${LOG_NAME}"_card"${CARD}".log
eval $(echo ${numactl_args}|awk -F ';' -v i=$[${CARD}+1] '{print $i}') \
python benchmarks/dynamo/"${SUITE}".py --"${SCENARIO}" --"${Real_DT}" -d "${DEVICE}" -n10 "${DT_extra}" "${Mode_extra}" \
"${Shape_extra}" "${partition_flags}" "${Model_only_extra}" --backend=inductor --cold-start-latency --timeout=10800 \
--output="${LOG_DIR}"/"${LOG_NAME}".csv 2>&1 | tee "${LOG_DIR}"/"${LOG_NAME}"_card"${CARD}".log
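Note: the eval prefix above selects the ';'-separated field of numactl_args that belongs to this card; awk fields are 1-indexed, hence CARD+1. Standalone, with the hypothetical two-card string from the get-runner sketch (using $((...)) in place of the script's older $[...] arithmetic):

numactl_args=" numactl -l -C 0-15 ; numactl -l -C 16-31 ;"   # assumed input
CARD=1
prefix="$(echo ${numactl_args} | awk -F ';' -v i=$((CARD+1)) '{print $i}')"
echo "${prefix}"
# -> numactl -l -C 16-31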
103 changes: 61 additions & 42 deletions .github/workflows/_linux_accelerate.yml
@@ -26,7 +26,7 @@ on:
runner:
required: true
type: string
default: 'linux.idc.xpu'
default: 'pvc_rolling'
description: Runner label
accelerate:
required: false
@@ -45,11 +45,15 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash {0}

jobs:
conditions-filter:
name: conditions-filter
if: ${{ github.event.pull_request.draft == false }}
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
timeout-minutes: 10
env:
GH_TOKEN: ${{ github.token }}
@@ -66,22 +70,46 @@ jobs:
disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)"
echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}"

Torch-XPU-Accelerate-Tests:
runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
prepare:
runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }}
needs: conditions-filter
if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_accelerate')) }}
outputs:
runner_id: ${{ steps.runner-info.outputs.runner_id }}
user_id: ${{ steps.runner-info.outputs.user_id }}
render_id: ${{ steps.runner-info.outputs.render_id }}
hostname: ${{ steps.runner-info.outputs.hostname }}
ZE_AFFINITY_MASK: ${{ steps.runner-info.outputs.ZE_AFFINITY_MASK }}
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Get runner
id: runner-info
uses: ./.github/actions/get-runner

tests:
runs-on: ${{ needs.prepare.outputs.runner_id }}
needs: prepare
container:
image: mengfeili/intel-pvc-driver:1146-1136
volumes:
- ${{ github.workspace }}:${{ github.workspace }}
options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }}
--security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
-u ${{ needs.prepare.outputs.user_id }}
env:
ZE_AFFINITY_MASK: ${{ needs.prepare.outputs.ZE_AFFINITY_MASK }}
WORK_DIR: 'accelerate'
PYTORCH_DEBUG_XPU_FALLBACK: 1
HF_HUB_ETAG_TIMEOUT: 120
HF_HUB_DOWNLOAD_TIMEOUT: 120
PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread -n 1
env:
WORK_DIR: 'accelerate'
NEOReadDebugKeys: 0
DisableScratchPages: 0
accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.6.0' }}
transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }}
python: ${{ inputs.python != '' && inputs.python || '3.10' }}
PYTORCH_DEBUG_XPU_FALLBACK: 1
ZE_AFFINITY_MASK: 0
PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
HF_HUB_ETAG_TIMEOUT: 120
HF_HUB_DOWNLOAD_TIMEOUT: 120
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
@@ -93,25 +121,22 @@ jobs:
repository: huggingface/accelerate
ref: ${{ env.accelerate }}
path: accelerate
- name: Create unique Conda ENV name
- name: Setup python-${{ env.python }}
uses: actions/setup-python@v5
with:
python-version: ${{ env.python }}
- name: Check python
run: |
random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs)
echo "CONDA_ENV_NAME=hf_accelerate_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV
- name: Prepare Conda ENV
which python && python -V
which pip && pip list
pip install -U pip wheel setuptools
- name: Install pytorch and deps
run: |
echo "Using Conda ENV name: $CONDA_ENV_NAME"
conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
conda create -y -n $CONDA_ENV_NAME python=${{ env.python }}
source activate $CONDA_ENV_NAME
pip install junitparser pytest-timeout
pip install junitparser
pip install transformers==${{ env.transformers }}
- name: Prepare Stock XPU Pytorch
run: |
source activate $CONDA_ENV_NAME
pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
- name: Prepare Accelerate
run: |
source activate $CONDA_ENV_NAME
cd $WORK_DIR
pip install -e .
pip install -e ".[testing]"
@@ -120,7 +145,6 @@ jobs:
cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
- name: Report installed versions
run: |
source activate $CONDA_ENV_NAME
echo "pip installed packages:"
pip list | tee ${{ github.workspace }}/$WORK_DIR/tests_log/pip_list.txt
echo "lspci gpu devices:"
@@ -131,16 +155,20 @@ jobs:
xpu-smi discovery -y --json --dump -1
- name: Sanity check installed packages
run: |
source activate $CONDA_ENV_NAME
# Use latest pytest
pip install -U pytest pytest-timeout pytest-xdist
# These checks are to exit earlier if for any reason torch
# packages were reinstalled back to CUDA versions (not expected).
pip show torch | grep Version | grep xpu
pip show torchaudio | grep Version | grep xpu
pip show torchvision | grep Version | grep xpu
python -c 'import torch; exit(not torch.xpu.is_available())'
- name: Run tests
printenv
- name: Run tests on ${{ needs.prepare.outputs.hostname }}
run: |
source activate $CONDA_ENV_NAME
# use 1 GPU only for tests
# which also can get the 1st GPU from 1 card only runner and some cards lost runner
export ZE_AFFINITY_MASK="$(echo ${ZE_AFFINITY_MASK} |sed 's/,.*//')"
cd $WORK_DIR && rm -rf reports && mkdir -p reports
# Excluding tests due to:
# * tests/test_examples.py::FeatureExamplesTests::test_profiler fails on
@@ -150,8 +178,7 @@ jobs:
# * tests/test_big_modeling.py::test_dispatch_model_tied_weights_memory_with_nested_offload_cpu fails
# with OOM. That's a new test added by https://github.com/huggingface/accelerate/pull/3445
pattern="not test_profiler and not test_gated and not test_dispatch_model_tied_weights_memory_with_nested_offload_cpu"
cmd=(python3 -m pytest --timeout 600 -rsf --junitxml=reports/accelerate.xml -k "$pattern" \
tests/)
cmd=(python -m pytest --junitxml=reports/accelerate.xml -k "$pattern" tests/)
{
echo "### Running"
echo "\`\`\`"
@@ -162,28 +189,20 @@ jobs:
- name: Print result tables
if: ${{ ! cancelled() }}
run: |
source activate $CONDA_ENV_NAME
cd $WORK_DIR
{
echo "### Results"
python3 $PARSE_JUNIT reports/accelerate.xml --stats
python $PARSE_JUNIT reports/accelerate.xml --stats
echo "### Failed"
python3 $PARSE_JUNIT reports/accelerate.xml --errors --failed
python $PARSE_JUNIT reports/accelerate.xml --errors --failed
echo "### Skipped"
python3 $PARSE_JUNIT reports/accelerate.xml --skipped
python $PARSE_JUNIT reports/accelerate.xml --skipped
} >> $GITHUB_STEP_SUMMARY
- name: Print environment
if: ${{ ! cancelled() }}
uses: ./torch-xpu-ops/.github/actions/print-environment
with:
conda: $CONDA_ENV_NAME
pip_packages: 'accelerate transformers'
- name: Clean up
if: ${{ always() }}
run: |
if [ -n "$CONDA_ENV_NAME" ]; then
conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
fi
- name: Upload Test log
if: ${{ ! cancelled() }}
uses: actions/upload-artifact@v4
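Note: the accelerate job deliberately uses a single GPU, so the sed call in "Run tests" keeps only the first entry of a possibly multi-card mask. A quick check with assumed values:

ZE_AFFINITY_MASK="2,3"   # assumed: the runner exposes cards 2 and 3
export ZE_AFFINITY_MASK="$(echo ${ZE_AFFINITY_MASK} | sed 's/,.*//')"
echo "${ZE_AFFINITY_MASK}"
# -> 2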
12 changes: 7 additions & 5 deletions .github/workflows/_linux_e2e.yml
@@ -62,10 +62,9 @@ jobs:
hostname: ${{ steps.runner-info.outputs.hostname }}
xpu_num: ${{ steps.runner-info.outputs.xpu_num }}
cpus_per_xpu: ${{ steps.runner-info.outputs.cpus_per_xpu }}
ZE_AFFINITY_MASK: ${{ steps.runner-info.outputs.ZE_AFFINITY_MASK }}
numactl_args: ${{ steps.runner-info.outputs.numactl_args }}
steps:
- name: Cleanup workspace
run: |
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Get runner
@@ -80,9 +79,12 @@ jobs:
image: mengfeili/intel-pvc-driver:1146-1136
volumes:
- ${{ github.workspace }}:${{ github.workspace }}
options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
-u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }}
options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.runner.outputs.render_id }}
--security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
-u ${{ needs.runner.outputs.user_id }}
env:
ZE_AFFINITY_MASK: ${{ needs.runner.outputs.ZE_AFFINITY_MASK }}
numactl_args: ${{ needs.runner.outputs.numactl_args }}
xpu_num: ${{ needs.runner.outputs.xpu_num }}
cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }}
MODEL_ONLY_NAME: ${{ inputs.model }}
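Note: the container now runs as the host user and gains /dev/dri access through supplementary video and render groups, rather than by smuggling the render GID into the group half of -u. A rough plain-docker equivalent of the options above (the image tag is taken from this workflow; everything else is a sketch):

docker run --rm \
  --device=/dev/mem --device=/dev/dri \
  --group-add video \
  --group-add "$(getent group render | cut -d: -f3)" \
  --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g \
  -u "$(id -u):$(id -g)" \
  mengfeili/intel-pvc-driver:1146-1136 \
  ls -l /dev/dri

The same option change is applied to _linux_op_benchmark.yml below.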
10 changes: 5 additions & 5 deletions .github/workflows/_linux_op_benchmark.yml
@@ -37,10 +37,8 @@ jobs:
user_id: ${{ steps.runner-info.outputs.user_id }}
render_id: ${{ steps.runner-info.outputs.render_id }}
hostname: ${{ steps.runner-info.outputs.hostname }}
ZE_AFFINITY_MASK: ${{ steps.runner-info.outputs.ZE_AFFINITY_MASK }}
steps:
- name: Cleanup workspace
run: |
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Get runner
@@ -57,9 +55,11 @@ jobs:
image: mengfeili/intel-pvc-driver:1146-1136
volumes:
- ${{ github.workspace }}:${{ github.workspace }}
options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
-u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }}
options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.runner.outputs.render_id }}
--security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
-u ${{ needs.runner.outputs.user_id }}
env:
ZE_AFFINITY_MASK: ${{ needs.runner.outputs.ZE_AFFINITY_MASK }}
AGENT_TOOLSDIRECTORY: /opt/xpu-tool
steps:
- name: Checkout torch-xpu-ops