disable attentions #86

Workflow file for this run

# .github/workflows/windows-cuda.yml at d77e574
#
# Builds ONNX Runtime on a Windows runner with the CUDA and DML options
# switched on (onnxruntime_USE_CUDA / onnxruntime_USE_DML), uploads the
# install tree as a workflow artifact and publishes it as a pre-release.

name: Build (Windows, CUDA)

on: [push, workflow_dispatch]

jobs:
  build-windows:
    runs-on: windows-2025

    # The Release step publishes a release (and its tag) with the workflow
    # token, which needs write access to the repository contents.
    permissions:
      contents: write

    # Steps run under cmd unless they set their own `shell:`.
    defaults:
      run:
        shell: cmd

    steps:
      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          submodules: recursive
          # master
          ref: '4754a1d64e5920a715b0396906f339e6c15742a0'
          # Full history, so `git describe` in "Get description" can reach
          # a tag.
          fetch-depth: 0

      # Wrap fpA_intB_gemm_profiler.cc in `#if USE_FPA_INTB_GEMM ... #endif`.
      # The command has to live in a block scalar: inside a plain YAML scalar
      # " #" starts a comment and would cut the command off after "(echo".
      # Backslash separators are used because cmd built-ins treat "/" as a
      # switch prefix.
      - name: Fix
        run: |
          set SRC=onnxruntime\contrib_ops\cuda\llm\fpA_intB_gemm_profiler.cc
          (echo #if USE_FPA_INTB_GEMM & type %SRC% & echo #endif) > temp && move /y temp %SRC%

      - name: Setup MSVC
        uses: ilammy/msvc-dev-cmd@v1

      - name: Setup Ninja
        run: pip install ninja

      # Cache the CUDA toolkit directory so the installer below only runs
      # when the cache misses.
      - name: Cache CUDA
        id: cache-cuda
        uses: actions/cache@v4
        with:
          path: 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA'
          key: ${{ runner.os }}-cuda-12.9.1

      # Silent (-s) install of the listed CUDA 12.9 components only.
      - name: Setup CUDA
        if: steps.cache-cuda.outputs.cache-hit != 'true'
        run: |
          curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.9.1/network_installers/cuda_12.9.1_windows_network.exe
          cuda_installer.exe -s nvcc_12.9 cudart_12.9 cublas_dev_12.9 cufft_dev_12.9 curand_dev_12.9 cusparse_dev_12.9 cupti_12.9 thrust_12.9 nvtx_12.9

      # These are POSIX commands, so run them under bash rather than the
      # default cmd shell (where the built-in `mkdir -p cudnn` would create a
      # directory literally named "-p"). The result is ./cudnn/include and
      # ./cudnn/lib, which Configure passes as onnxruntime_CUDNN_HOME.
      - name: Download cuDNN inference library
        shell: bash
        run: |
          curl -LJ https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.12.0.46_cuda12-archive.zip -o cudnn.zip
          unzip cudnn.zip
          mkdir -p cudnn
          cp -r cudnn-windows-*/include cudnn/ -v
          cp -r cudnn-windows-*/lib cudnn/ -v

      # Folded scalar: the lines below are joined with single spaces into one
      # cmd command line, so every line must stay at the same indentation.
      # %CUDA_PATH% and %cd% are expanded by cmd when the step runs.
      - name: Configure
        run: >-
          cmake -S cmake -B build -G Ninja -Wno-dev -LA
          -D CMAKE_BUILD_TYPE=Release
          -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded -D ONNX_USE_MSVC_STATIC_RUNTIME=ON -D ABSL_MSVC_STATIC_RUNTIME=ON
          -D onnxruntime_BUILD_UNIT_TESTS=OFF -D onnxruntime_BUILD_SHARED_LIB=ON
          -D onnxruntime_ENABLE_LTO=ON
          -D onnxruntime_USE_FLASH_ATTENTION=OFF -D onnxruntime_USE_LEAN_ATTENTION=OFF
          -D onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION=OFF -D onnxruntime_USE_FPA_INTB_GEMM=OFF
          -D onnxruntime_ENABLE_CPU_FP16_OPS=OFF -D onnxruntime_USE_AVX=ON
          -D onnxruntime_USE_CUDA=ON -D onnxruntime_CUDA_HOME="%CUDA_PATH%" -D onnxruntime_NVCC_THREADS=1
          -D onnxruntime_USE_CUDA_NHWC_OPS=ON
          -D onnxruntime_CUDNN_HOME="%cd%\cudnn"
          -D onnxruntime_ENABLE_NVTX_PROFILE=OFF
          -D onnxruntime_USE_DML=ON
          -D CMAKE_CUDA_ARCHITECTURES="75-real;86-real;89-real;120-real"
        env:
          CUDA_PATH: 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9'

      - name: Build
        run: cmake --build build --verbose

      - name: Install
        run: cmake --install build --prefix onnxruntime-gpu

      # `ls` is not a cmd built-in, so list the install tree from bash.
      - name: Show
        shell: bash
        run: ls -R onnxruntime-gpu

      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          name: onnxruntime-windows-cuda
          retention-days: 1
          path: onnxruntime-gpu

      - name: Package
        shell: pwsh
        run: Compress-Archive onnxruntime-gpu -DestinationPath onnxruntime-gpu-win64.zip

      # Export the `git describe` string and a UTC timestamp (yymmdd-HHMM)
      # to later steps through GITHUB_ENV.
      - name: Get description
        shell: bash
        run: |
          echo "ORT_VERSION=$(git describe)" >> "$GITHUB_ENV"
          echo "TIME=$(date -u +'%y%m%d-%H%M')" >> "$GITHUB_ENV"

      # Publish the zip as a pre-release tagged <ORT_VERSION>-<TIME>.
      - name: Release
        uses: softprops/action-gh-release@v2
        with:
          files: onnxruntime-gpu-win64.zip
          name: Build ${{ env.ORT_VERSION }}
          tag_name: ${{ env.ORT_VERSION }}-${{ env.TIME }}
          prerelease: true

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

disable attentions #86

Workflow file

disable attentions #86

Uh oh!

Workflow file for this run