Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ jobs:
-c rapidsai
-c nvidia
python=3.13
cccl-python
cudf
cupy
cuda-version=${{ matrix.cuda-version }}
Expand Down
3 changes: 2 additions & 1 deletion requirements-test-gpu.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# The CI installs cudf and cupy using conda
# The CI installs cudf, cupy, and cuda-cccl using conda
# If you are using this file manually uncomment the following lines and
# set the cuda version matching your system (currently supporting CUDA 12 and 13).
# cudf-cu12
# cupy-cuda12x
# cuda-cccl[cu12]
fsspec>=2022.11.0
numba>=0.60
numba-cuda
Expand Down
64 changes: 60 additions & 4 deletions src/awkward/_backends/cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from awkward._backends.backend import Backend, KernelKeyType
from awkward._backends.dispatch import register_backend
from awkward._kernels import CupyKernel, NumpyKernel
from awkward._kernels import CudaComputeKernel, CupyKernel, NumpyKernel
from awkward._nplikes.cupy import Cupy
from awkward._nplikes.numpy import Numpy
from awkward._nplikes.numpy_like import NumpyMetadata
Expand All @@ -27,13 +27,69 @@ def nplike(self) -> Cupy:
def __init__(self):
self._cupy = Cupy.instance()

def __getitem__(self, index: KernelKeyType) -> CupyKernel | NumpyKernel:
def __getitem__(
self, index: KernelKeyType
) -> CudaComputeKernel | CupyKernel | NumpyKernel:
from awkward._connect import cuda
from awkward._connect.cuda import _compute as cuda_compute

kernel_name = index[0] if index else ""

# Try CuPy kernels first (primary implementation)
cupy = cuda.import_cupy("Awkward Arrays with CUDA")
_cuda_kernels = cuda.initialize_cuda_kernels(cupy)
func = _cuda_kernels[index]

if func is not None:
# CuPy kernel exists, use it
return CupyKernel(func, index)
else:
raise AssertionError(f"CuPyKernel not found: {index!r}")

# CuPy kernel not found, try cuda.compute as fallback
if self._supports_cuda_compute(kernel_name):
if cuda_compute.is_available():
# Return CudaComputeKernel for supported operations
compute_impl = self._get_cuda_compute_impl(kernel_name)
if compute_impl is not None:
return CudaComputeKernel(compute_impl, index)
else:
# cuda.compute is needed but not available
raise NotImplementedError(
f"Operation '{kernel_name}' on CUDA backend requires cuda.compute library "
f"(no CuPy kernel available). "
f"Please install cuda.compute or use the CPU backend: "
f"ak.to_backend(array, 'cpu')"
)

# Neither CuPy kernel nor cuda.compute implementation found
raise AssertionError(
f"Operation '{kernel_name}' is not supported on CUDA backend. "
f"CuPy kernel not found: {index!r}"
)

def _supports_cuda_compute(self, kernel_name: str) -> bool:
"""
Check if the given kernel operation is supported by cuda.compute.

Currently supports:
- awkward_sort
- awkward_argsort (future)
"""
# For now, we only support sort operations
return kernel_name in ("awkward_sort",)

def _get_cuda_compute_impl(self, kernel_name: str):
"""
Get the cuda.compute implementation for a kernel operation.

Args:
kernel_name: Name of the kernel operation (e.g., "awkward_sort")

Returns:
Callable implementing the operation, or None if not supported
"""
from awkward._connect.cuda import _compute as cuda_compute

if kernel_name == "awkward_sort":
return cuda_compute.segmented_sort

return None
65 changes: 65 additions & 0 deletions src/awkward/_connect/cuda/_compute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE

from __future__ import annotations

from awkward._nplikes.cupy import Cupy

# Cache for cuda.compute availability
_cuda_compute_available: bool | None = None


def is_available() -> bool:
global _cuda_compute_available

if _cuda_compute_available is not None:
return _cuda_compute_available

try:
import cuda.compute # noqa: F401

_cuda_compute_available = True
except ImportError:
_cuda_compute_available = False

return _cuda_compute_available


def segmented_sort(
toptr,
fromptr,
length,
offsets,
offsetslength,
parentslength,
ascending,
stable,
):
from cuda.compute import SortOrder, segmented_sort

cupy_nplike = Cupy.instance()
cp = cupy_nplike._module

# Ensure offsets are int64 as expected by segmented_sort
if offsets.dtype != cp.int64:
offsets = offsets.astype(cp.int64)

num_segments = offsetslength - 1
num_items = int(offsets[-1]) if len(offsets) > 0 else 0

start_offsets = offsets[:-1]
end_offsets = offsets[1:]

order = SortOrder.ASCENDING if ascending else SortOrder.DESCENDING

segmented_sort(
fromptr, # d_in_keys
toptr, # d_out_keys
None, # d_in_values (not sorting values, just keys)
None, # d_out_values
num_items, # num_items
num_segments, # num_segments
start_offsets, # start_offsets_in
end_offsets, # end_offsets_in
order, # order (ASCENDING or DESCENDING)
None, # stream (use default stream)
)
21 changes: 20 additions & 1 deletion src/awkward/_kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from awkward._nplikes.typetracer import try_touch_data
from awkward._typing import Protocol, TypeAlias

KernelKeyType: TypeAlias = tuple # Tuple[str, Unpack[Tuple[metadata.dtype, ...]]]
# Tuple[str, Unpack[Tuple[metadata.dtype, ...]]]
KernelKeyType: TypeAlias = tuple


numpy = Numpy.instance()
Expand Down Expand Up @@ -215,6 +216,24 @@ def __call__(self, *args) -> None:
self._impl(grid, blocks, args)


class CudaComputeKernel(BaseKernel):
"""
Kernel implementation using cuda.compute library.

When the CUDA backend is used, this kernel is used for operations
that have ``cuda.compute`` implementations. For other operations,
the ``CupyKernel`` is used.
"""

def __init__(self, impl: Callable[..., Any], key: KernelKeyType):
super().__init__(impl, key)
self._cupy = Cupy.instance()

def __call__(self, *args) -> None:
args = maybe_materialize(*args)
return self._impl(*args)


class TypeTracerKernelError(KernelError):
def __init__(self):
self.str = None
Expand Down
9 changes: 5 additions & 4 deletions studies/cccl/_segment_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,8 @@ def segmented_select(
num_segments = len(d_in_segments) - 1

cond = numba.cuda.jit(cond)
# Apply select to get the data and indices where condition is true

# Step 1: Apply select to get the data and indices where condition is true

def select_predicate(pair):
return cond(pair[0])
Expand All @@ -253,13 +254,13 @@ def select_predicate(pair):
d_indices_out = d_indices_out[:total_selected]
d_selected_indices = d_indices_out[:total_selected]

# Step 3: Use searchsorted to count selected items per segment
# Step 2: Use searchsorted to count selected items per segment
# Use side='left' to count elements strictly less than each offset boundary
positions = cp.searchsorted(
d_selected_indices, d_in_segments, side='left')
d_counts = (positions[1:] - positions[:-1]).astype(cp.uint64)

# Step 4: Use exclusive scan to compute output segment start offsets
# Step 3: Use exclusive scan to compute output segment start offsets
exclusive_scan(
d_counts,
d_out_segments[:-1],
Expand All @@ -269,7 +270,7 @@ def select_predicate(pair):
stream,
)

# Step 5: Set the final offset to the total count
# Set the final offset to the total count
d_out_segments[-1] = total_selected
return total_selected

Expand Down
8 changes: 3 additions & 5 deletions tests-cuda/test_3459_virtualarray_with_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,6 @@ def test_numpyarray_nanargmax(numpyarray, virtual_numpyarray):
assert virtual_numpyarray.is_all_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_numpyarray_sort(numpyarray, virtual_numpyarray):
assert not virtual_numpyarray.is_any_materialized
assert ak.array_equal(
Expand Down Expand Up @@ -1227,7 +1226,6 @@ def test_listoffsetarray_nanargmax(numpy_like):
assert virtual_array.is_all_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_listoffsetarray_sort(listoffsetarray, virtual_listoffsetarray):
assert not virtual_listoffsetarray.is_any_materialized
assert ak.array_equal(
Expand Down Expand Up @@ -2259,7 +2257,9 @@ def test_listarray_nanargmax(numpy_like):
assert virtual_array.is_all_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
@pytest.mark.xfail(
reason="ListArray.to_ListOffsetArray64 fails with virtual arrays on CUDA"
)
def test_listarray_sort(listarray, virtual_listarray):
assert not virtual_listarray.is_any_materialized
assert ak.array_equal(
Expand Down Expand Up @@ -3359,7 +3359,6 @@ def test_recordarray_argmax_y_field(recordarray, virtual_recordarray):
assert virtual_recordarray.is_any_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_recordarray_sort_x_field(recordarray, virtual_recordarray):
# Test sort on the x field (ListOffsetArray)
assert not virtual_recordarray.is_any_materialized
Expand All @@ -3373,7 +3372,6 @@ def test_recordarray_sort_x_field(recordarray, virtual_recordarray):
assert virtual_recordarray.is_any_materialized


@pytest.mark.xfail(reason="awkward_sort is not implemented")
def test_recordarray_sort_y_field(recordarray, virtual_recordarray):
# Test sort on the y field (NumpyArray)
assert not virtual_recordarray.is_any_materialized
Expand Down
Loading
Loading