Skip to content

[Nightly] Enable bisect search #1849

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions .github/scripts/bisect_search.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/bin/bash
# Build one (pytorch, torch-xpu-ops) revision pair and run the search case
# against it.
#
# Usage:
#   bisect_search.sh --WORKSPACE=<dir> --PYTORCH_VERSION=<rev|search> \
#                    --TORCH_XPU_OPS_VERSION=<rev|search>
#
# Every NAME=value argument (leading dashes are optional) is exported verbatim.
# A version of "search" is replaced by the HEAD commit of the git checkout in
# the current directory.
#
# Environment read: GITHUB_WORKSPACE, SEARCH_CHECK, SEARCH_CASE,
#                   SEARCH_CRITERIA, SEARCH_GOOD_VALUE.
# Outputs: build/test logs under ${GITHUB_WORKSPACE}/gs-logs and one CSV row
#          appended to gs-logs/summary.csv and ${WORKSPACE}/result.csv.
# Exit status: 0 when the case passes, non-zero when it or the build fails.
set -xe
export GIT_PAGER=cat

# Init params
WORKSPACE=$(realpath "${WORKSPACE:-"/tmp"}")
PYTORCH_VERSION=${PYTORCH_VERSION:-"main"}
TORCH_XPU_OPS_VERSION=${TORCH_XPU_OPS_VERSION:-"main"}
for var; do
  # Drop the leading dashes, then export NAME=value without going through
  # eval so the value can never be executed as shell code.
  assignment="${var#"${var%%[!-]*}"}"
  if [[ "${assignment}" != *=* ]]; then
    echo "Invalid argument (expected NAME=value): ${var}" >&2
    exit 1
  fi
  export "${assignment%%=*}=${assignment#*=}"
done

if [ "${PYTORCH_VERSION}" == "search" ]; then
  PYTORCH_VERSION="$(git rev-parse HEAD)"
fi
if [ "${TORCH_XPU_OPS_VERSION}" == "search" ]; then
  TORCH_XPU_OPS_VERSION="$(git rev-parse HEAD)"
fi

script_dir="$(dirname "$(realpath "$0")")"
log_dir="${GITHUB_WORKSPACE}/gs-logs"
build_log="${log_dir}/build-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log"
test_log="${log_dir}/test-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log"
mkdir -p "${log_dir}"

# Clean WORKSPACE
mkdir -p "${WORKSPACE}"
rm -rf "${WORKSPACE:?}/"* || sudo rm -rf "${WORKSPACE:?}/"*

# Build pytorch
pip uninstall -y torch
source "${script_dir}/env.sh" 2> /dev/null
build_status=0
"${script_dir}/build.sh" \
  --WORKSPACE="${WORKSPACE}" \
  --PYTORCH_VERSION="${PYTORCH_VERSION}" \
  --TORCH_XPU_OPS_VERSION="${TORCH_XPU_OPS_VERSION}" \
  > "${build_log}" 2>&1 || build_status=$?
if [ "${build_status}" -ne 0 ]; then
  tail -n 100 "${build_log}"
  echo "Build got failed"
  # NOTE(review): under `git bisect run` an exit status of 1 marks the commit
  # as bad; 125 would skip an unbuildable commit instead. Confirm which one
  # is intended before changing it, callers compare this status.
  exit 1
fi
pip list | grep torch

# Run the command string $1 in a subshell (so a `cd` or `exit` inside the
# case cannot affect this script), send all of its output to ${test_log} and
# leave its exit status in ${test_status}.
run_search_case() {
  test_status=0
  (eval "$1") > "${test_log}" 2>&1 || test_status=$?
}

# Test
test_result=1
case "${SEARCH_CHECK}" in
  accuracy)
    cd "${WORKSPACE}/pytorch"
    # presumably removed so the installed torch is imported rather than the
    # source tree in the current directory -- TODO confirm
    rm -rf torch
    run_search_case "${SEARCH_CASE} --output=${WORKSPACE}/tmp.csv"
    if [ "${test_status}" -eq 0 ]; then
      # Column 4 of the last CSV row carries the accuracy verdict.
      acc_result=$(tail -n 1 "${WORKSPACE}/tmp.csv" | awk -F, '{print $4}')
      if [[ "${acc_result}" == "pass"* ]]; then
        test_result=0
      fi
    fi
    ;;
  performance)
    cd "${WORKSPACE}/pytorch"
    rm -rf torch
    run_search_case "${SEARCH_CASE} --output=${WORKSPACE}/tmp.csv"
    if [ "${test_status}" -eq 0 ]; then
      # Column 5 of the last CSV row carries the measured value.
      perf_result=$(tail -n 1 "${WORKSPACE}/tmp.csv" | awk -F, '{print $5}')
      # Pass when current/baseline stays above (1 - criteria). A zero
      # baseline cannot be compared and is reported as a failure instead of
      # letting awk abort on a division by zero.
      test_result=$(awk \
        -v current="${perf_result}" \
        -v baseline="${SEARCH_GOOD_VALUE:-0.00001}" \
        -v criteria="${SEARCH_CRITERIA}" \
        'BEGIN {
          if (baseline + 0 == 0) {
            print "1"
            exit
          }
          if (current / baseline > (1 - criteria)) {
            print "0"
          } else {
            print "1"
          }
        }')
    fi
    ;;
  ut_regressions | ut_extended | ut_xpu)
    case "${SEARCH_CHECK}" in
      ut_regressions) ut_dir="test/regressions" ;;
      ut_extended) ut_dir="test/xpu/extended" ;;
      ut_xpu) ut_dir="test/xpu" ;;
    esac
    cd "${WORKSPACE}/pytorch/third_party/torch-xpu-ops/${ut_dir}"
    run_search_case "${SEARCH_CASE}"
    if [ "${test_status}" -eq 0 ]; then
      test_result=0
    fi
    ;;
  *)
    # Any other check type: run the case from the current directory.
    run_search_case "${SEARCH_CASE}"
    if [ "${test_status}" -eq 0 ]; then
      test_result=0
    fi
    ;;
esac

# Test result
cat "${test_log}"
echo "${test_result},${acc_result},${perf_result},${PYTORCH_VERSION},${TORCH_XPU_OPS_VERSION}" |
  tee -a "${log_dir}/summary.csv" | tee -a "${WORKSPACE}/result.csv"
exit "${test_result}"
1 change: 0 additions & 1 deletion .github/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ git submodule sync && git submodule update --init --recursive
python -m pip install -r requirements.txt
python -m pip install mkl-static mkl-include
export USE_STATIC_MKL=1
export USE_XCCL=1
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \
intel-cmplr-lib-rt==2025.1.1 | \
intel-cmplr-lib-ur==2025.1.1 | \
Expand Down
12 changes: 7 additions & 5 deletions .github/scripts/env.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#!/bin/bash
# Load the oneAPI component environments (compiler, pti, umf, ccl, mpi) and
# print the compiler version and the visible SYCL devices.
# XPU_ONEAPI_PATH selects the oneAPI install root; it defaults to
# /opt/intel/oneapi when unset or empty.

XPU_ONEAPI_PATH=${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"}

# Only the configurable root is sourced: also sourcing the hard-coded
# /opt/intel/oneapi copies would fail (or load a second toolchain) whenever
# oneAPI is installed somewhere else.
source "${XPU_ONEAPI_PATH}/compiler/latest/env/vars.sh"
source "${XPU_ONEAPI_PATH}/pti/latest/env/vars.sh"
source "${XPU_ONEAPI_PATH}/umf/latest/env/vars.sh"
source "${XPU_ONEAPI_PATH}/ccl/latest/env/vars.sh"
source "${XPU_ONEAPI_PATH}/mpi/latest/env/vars.sh"
icpx --version
sycl-ls
247 changes: 247 additions & 0 deletions .github/workflows/_bisect_search.yml
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename this file to bisect_search.yml directly (drop the leading underscore).

Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
# Manually triggered workflow: every parameter below is supplied through the
# workflow_dispatch form.
name: Bisect Search

on:
  workflow_dispatch:
    inputs:
      runner:
        required: true
        type: string
        default: 'pvc_rolling'
        description: Test node
      search_commits:
        required: true
        type: string
        default: ''
        description: Target commits, such as 'pytorch=old/new,xpu-ops=old/new'
      search_check:
        type: string
        default: ''
        description: Test case type, 'performance, accuracy, <ut_regressions/ut_extended/ut_xpu> or others'
      search_case:
        required: true
        type: string
        default: ''
        description: Test case, such as 'python xxx.py or pytest -k xxx'
      search_criteria:
        type: string
        default: '0.1'
        description: Criteria for performance check, default is 10%
      oneapi:
        type: string
        default: '2025.1.3'
        description: oneAPI DLE to install, '2025.1.3', '2025.2.0' or an offline.sh url; fill 'installed' to use the one on host
      python:
        type: string
        default: '3.10'
        description: Python version

permissions: read-all

jobs:
  # Runs on the requested runner label and publishes the concrete runner name
  # plus the uid / render gid that the container job is started with.
  get_runner:
    runs-on: ${{ inputs.runner }}
    outputs:
      test_host: ${{ steps.runner-info.outputs.test_host }}
      test_user: ${{ steps.runner-info.outputs.test_user }}
      test_group: ${{ steps.runner-info.outputs.test_group }}
    steps:
      - name: Get runner info
        id: runner-info
        run: |
          # get test runner
          echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT}
          echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT}
          echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT}
          # show host info
          cat /etc/os-release
          uname -a
          source /opt/intel/oneapi/setvars.sh
          sycl-ls
          dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev'
      - name: Cleanup workspace
        if: ${{ always() }}
        run: |
          # clean docker cache
          docker stop $(docker ps -aq) || true
          docker system prune -af || true
          # clean files
          ls -al
          # -exec keeps names containing whitespace intact, unlike find | xargs
          sudo find . -mindepth 1 -maxdepth 1 -exec rm -rf {} +

  # Builds the old and the new commit, and bisects between them only when
  # their test status differs.
  bisect-search:
    needs: get_runner
    runs-on: ${{ needs.get_runner.outputs.test_host }}
    container:
      image: mengfeili/intel-pvc-driver:1146-1136
      volumes:
        # Mounted at the same path so ${{ github.workspace }} paths used in
        # the run scripts below resolve inside the container.
        - ${{ github.workspace }}:${{ github.workspace }}
      options: >-
        --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g
        -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }}
    env:
      AGENT_TOOLSDIRECTORY: /tmp/_tools
      HF_HOME: /tmp/.cache/huggingface
      TORCH_HOME: /tmp/.cache/torch
      GH_TOKEN: ${{ github.token }}
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      # User inputs are exposed as environment variables and read from there
      # in the run scripts; they are never expanded into the script text.
      SEARCH_COMMITS: ${{ inputs.search_commits }}
      SEARCH_CHECK: ${{ inputs.search_check }}
      SEARCH_CASE: ${{ inputs.search_case }}
      SEARCH_CRITERIA: ${{ inputs.search_criteria }}
      TORCH_XPU_ARCH_LIST: pvc
      USE_XCCL: 0
      USE_KINETO: 0
    defaults:
      run:
        shell: bash -xe {0}
    steps:
      - name: Cleanup workspace
        run: |
          ls -al
          sudo find . -mindepth 1 -maxdepth 1 -exec rm -rf {} +
          sudo rm -rf /tmp/_tools
      - name: Setup python-${{ inputs.python }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ inputs.python }}
      - name: Check runner
        run: |
          hostname && whoami && id
          clinfo --list
          gcc -v && g++ -v
          which python && which pip
          python -V
          pip install -U pip wheel setuptools
          pip list
          uname -a
          dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev'
          pip install cmake ninja pandas psutil scipy requests pybind11
          mkdir gs-logs gs-search
          echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv
      - name: Install oneAPI DLE
        if: ${{ inputs.oneapi != 'installed' }}
        env:
          ONEAPI_INPUT: ${{ inputs.oneapi }}
        run: |
          rm -rf ~/intel ~/.intel /tmp/intel
          # Known versions map to their offline installer; anything else is
          # taken as the installer url itself.
          case "${ONEAPI_INPUT}" in
            2025.1.3)
              ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/3435dc45-055e-4f7a-86b1-779931772404/intel-deep-learning-essentials-2025.1.3.7_offline.sh"
              ;;
            2025.2.0)
              ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/49d38360-b403-4b06-9104-86fa8d886e6d/intel-deep-learning-essentials-2025.2.0.558_offline.sh"
              ;;
            *)
              ONEAPI_URL="${ONEAPI_INPUT}"
              ;;
          esac
          wget -q -O oneapi.sh "${ONEAPI_URL}"
          bash oneapi.sh -a -s --eula accept --action install --install-dir /tmp/intel/oneapi
          echo "XPU_ONEAPI_PATH=/tmp/intel/oneapi" >> ${GITHUB_ENV}
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
        with:
          path: gs-scripts
      - name: Prepare source code
        run: |
          git clone https://github.com/pytorch/pytorch gs-pytorch
          LATEST_PT_COMMIT="$(git -C gs-pytorch rev-parse HEAD)"
          git clone https://github.com/intel/torch-xpu-ops gs-torch-xpu-ops
          LATEST_XPU_COMMIT="$(git -C gs-torch-xpu-ops rev-parse HEAD)"
          echo "LATEST_PT_COMMIT=${LATEST_PT_COMMIT}" >> ${GITHUB_ENV}
          echo "LATEST_XPU_COMMIT=${LATEST_XPU_COMMIT}" >> ${GITHUB_ENV}
      - name: Prepare test env
        run: |
          pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
          if [[ "${SEARCH_CASE}" == *"benchmarks/dynamo/huggingface.py"* ]];then
            pip install transformers==4.44.2
          elif [[ "${SEARCH_CASE}" == *"benchmarks/dynamo/timm_models.py"* ]];then
            pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14
            pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch)
          elif [[ "${SEARCH_CASE}" == *"benchmarks/dynamo/torchbench.py"* ]];then
            # the model is the word that follows --only in the test case
            model_name="$(printf '%s\n' "${SEARCH_CASE}" |sed 's+.*\--only *++;s/ .*//')"
            git clone https://github.com/pytorch/benchmark gs-benchmark
            cd gs-benchmark
            echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV}
            python install.py ${model_name}
          else
            pip install -r gs-pytorch/.ci/docker/requirements-ci.txt
          fi
          pip uninstall -y torch && pip uninstall -y torch
      - name: Bisect search pytorch
        if: ${{ contains(inputs.search_commits, 'pytorch') }}
        run: |
          pytorch_commits="$(printf '%s\n' "${SEARCH_COMMITS}" |sed 's+.*pytorch=++;s+,.*++')"
          old_commit="$(echo "${pytorch_commits}" |awk -F '/' '{print $1}')"
          new_commit="$(echo "${pytorch_commits}" |awk -F '/' '{print $2}')"
          old_status=0
          ${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
            --WORKSPACE="${{ github.workspace }}/gs-search" \
            --PYTORCH_VERSION="${old_commit}" \
            --TORCH_XPU_OPS_VERSION="${LATEST_XPU_COMMIT}" \
            > ${{ github.workspace }}/gs-logs/search-${old_commit}-${LATEST_XPU_COMMIT}.log 2>&1 || old_status=$?
          old_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
          # third column of the old commit's result row is its Perf value
          export SEARCH_GOOD_VALUE="$(echo "${old_result}" |awk -F, '{print $3}')"
          new_status=0
          ${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
            --WORKSPACE="${{ github.workspace }}/gs-search" \
            --PYTORCH_VERSION="${new_commit}" \
            --TORCH_XPU_OPS_VERSION="${LATEST_XPU_COMMIT}" \
            > ${{ github.workspace }}/gs-logs/search-${new_commit}-${LATEST_XPU_COMMIT}.log 2>&1 || new_status=$?
          new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
          if [ "${old_status}" != "${new_status}" ];then
            cd gs-pytorch
            git reset --hard
            # copy the scripts into the checkout being bisected
            rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/
            git bisect start ${new_commit} ${old_commit}
            git bisect run ./gs-scripts/.github/scripts/bisect_search.sh \
              --WORKSPACE="${{ github.workspace }}/gs-search" \
              --PYTORCH_VERSION="search" \
              --TORCH_XPU_OPS_VERSION="${LATEST_XPU_COMMIT}" \
              2>&1 |tee ${{ github.workspace }}/gs-logs/bisect-pytorch.log
            git bisect log |tee ${{ github.workspace }}/gs-logs/result-pytorch.log
          else
            echo "Checked and no regression !"
          fi
      - name: Bisect search torch-xpu-ops
        if: ${{ contains(inputs.search_commits, 'xpu-ops') }}
        run: |
          xpu_ops_commits="$(printf '%s\n' "${SEARCH_COMMITS}" |sed 's+.*xpu-ops=++;s+,.*++')"
          old_commit="$(echo "${xpu_ops_commits}" |awk -F '/' '{print $1}')"
          new_commit="$(echo "${xpu_ops_commits}" |awk -F '/' '{print $2}')"
          old_status=0
          ${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
            --WORKSPACE="${{ github.workspace }}/gs-search" \
            --PYTORCH_VERSION="${LATEST_PT_COMMIT}" \
            --TORCH_XPU_OPS_VERSION="${old_commit}" \
            > ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${old_commit}.log 2>&1 || old_status=$?
          old_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
          # third column of the old commit's result row is its Perf value
          export SEARCH_GOOD_VALUE="$(echo "${old_result}" |awk -F, '{print $3}')"
          new_status=0
          ${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
            --WORKSPACE="${{ github.workspace }}/gs-search" \
            --PYTORCH_VERSION="${LATEST_PT_COMMIT}" \
            --TORCH_XPU_OPS_VERSION="${new_commit}" \
            > ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${new_commit}.log 2>&1 || new_status=$?
          new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
          if [ "${old_status}" != "${new_status}" ];then
            # the commits being bisected live in the torch-xpu-ops clone
            cd gs-torch-xpu-ops
            git reset --hard
            # copy the scripts into the checkout being bisected
            rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/
            git bisect start ${new_commit} ${old_commit}
            git bisect run ./gs-scripts/.github/scripts/bisect_search.sh \
              --WORKSPACE="${{ github.workspace }}/gs-search" \
              --PYTORCH_VERSION="${LATEST_PT_COMMIT}" \
              --TORCH_XPU_OPS_VERSION="search" \
              2>&1 |tee ${{ github.workspace }}/gs-logs/bisect-torch-xpu-ops.log
            git bisect log |tee ${{ github.workspace }}/gs-logs/result-torch-xpu-ops.log
          else
            echo "Checked and no regression !"
          fi
      - name: Summary
        run: |
          cat gs-logs/summary.csv |tee -a ${GITHUB_STEP_SUMMARY}
          for result_log in $(find gs-logs -name "result-*.log")
          do
            echo -e "\n\n\n${result_log}" |tee -a ${GITHUB_STEP_SUMMARY}
            cat ${result_log} |tee -a ${GITHUB_STEP_SUMMARY}
          done
      - name: Upload Logs
        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: bisect-search
          path: ${{ github.workspace }}/gs-logs
1 change: 1 addition & 0 deletions .github/workflows/_linux_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ jobs:
source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh
# gcc 11
source /opt/rh/gcc-toolset-11/enable
export USE_XCCL=1
${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \
--WORKSPACE="${{ github.workspace }}" \
--PYTORCH_REPO="${PYTORCH_REPO}" \
Expand Down