Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions examples/sparse_tensor.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
experimental::CSR::print();
experimental::CSC::print();
experimental::DCSR::print();
experimental::DIA::print();
experimental::SkewDIA::print();
experimental::DIAI::print();
experimental::DIAJ::print();
experimental::SkewDIAI::print();
experimental::SkewDIAJ::print();
experimental::BSR<2, 2>::print(); // 2x2 blocks
experimental::COO4::print(); // 4-dim tensor in COO
experimental::CSF5::print(); // 5-dim tensor in CSF
Expand Down Expand Up @@ -195,8 +197,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
auto doffsets = make_tensor<int>({3});
dvals.SetVals({-1, -1, -1, -1, -1, 0, 4, 4, 4, 4, 4, 4, 0, 1, 1, 1, 1, 1});
doffsets.SetVals({-1, 0, 1});
auto Adia = experimental::make_tensor_dia(dvals, doffsets, {6, 6});
print(Adia);
auto AdiaJ = experimental::make_tensor_dia<experimental::DIA_INDEX_J>(dvals, doffsets, {6, 6});
print(AdiaJ);

//
// Perform a direct SpMV. This is also the correct way of performing
Expand All @@ -205,7 +207,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
auto V = make_tensor<float>({6});
auto R = make_tensor<float>({6});
V.SetVals({1, 2, 3, 4, 5, 6});
(R = matvec(Adia, V)).run(exec);
(R = matvec(AdiaJ, V)).run(exec);
print(R);

MATX_EXIT_HANDLER();
Expand Down
16 changes: 10 additions & 6 deletions include/matx/core/make_sparse_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,14 @@ auto make_zero_tensor_csc(const index_t (&shape)[2],

// Constructs a sparse matrix in DIA format directly from the values and the
// offset vectors. For an m x n matrix, this format uses a linearized storage
// where each diagonal has n entries, padded with zeros on the right for the
// lower triangular part and padded with zeros on the left for the upper
// triagonal part. This format is most efficient for matrices with only a
// few nonzero diagonals that are close to the main diagonal.
template <typename ValTensor, typename CrdTensor>
// where each diagonal has n entries and is accessed by index I or index J.
// For index I, diagonals are padded with zeros on the left for the lower
// triangular part and with zeros on the right for the upper triangular part;
// the padding is reversed when using index J. This format is most efficient
// for matrices with only a few nonzero diagonals that are close to the main
// diagonal.
struct DIA_INDEX_I {};
struct DIA_INDEX_J {};
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_dia(ValTensor &val, CrdTensor &off,
const index_t (&shape)[2]) {
using VAL = typename ValTensor::value_type;
Expand All @@ -224,7 +227,8 @@ auto make_tensor_dia(ValTensor &val, CrdTensor &off,
matxMemorySpace_t space = GetPointerKind(val.GetStorage().data());
auto tp = makeDefaultNonOwningZeroStorage<POS>(2, space);
setVal(tp.data() + 1, static_cast<POS>(val.Size(0)), space);
// Construct DIA.
// Construct DIA-I/J.
using DIA = std::conditional_t<std::is_same_v<IDX, DIA_INDEX_I>, DIAI, DIAJ>;
return sparse_tensor_t<VAL, CRD, POS, DIA>(
shape, val.GetStorage(),
{makeDefaultNonOwningEmptyStorage<CRD>(), off.GetStorage()},
Expand Down
21 changes: 18 additions & 3 deletions include/matx/core/sparse_tensor_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,18 @@ template <int D, class... S> class SparseTensorFormat {
return false;
}

static constexpr bool isDIA() {
static constexpr bool isDIAI() {
if constexpr (LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::op == LvlOp::Sub && type0::Expr::di == 1 &&
type0::Expr::cj == 0 && type0::Type::isCompressed() &&
type1::Expr::isId(0) && type1::Type::isRange();
}
return false;
}

static constexpr bool isDIAJ() {
if constexpr (LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
Expand Down Expand Up @@ -337,9 +348,13 @@ using DCSC =
SparseTensorFormat<2, LvlSpec<D1, Compressed>, LvlSpec<D0, Compressed>>;
using CROW = SparseTensorFormat<2, LvlSpec<D0, Compressed>, LvlSpec<D1, Dense>>;
using CCOL = SparseTensorFormat<2, LvlSpec<D1, Compressed>, LvlSpec<D0, Dense>>;
using DIA =
using DIAI =
SparseTensorFormat<2, LvlSpec<Sub<1, 0>, Compressed>, LvlSpec<D0, Range>>;
using DIAJ =
SparseTensorFormat<2, LvlSpec<Sub<1, 0>, Compressed>, LvlSpec<D1, Range>>;
using SkewDIA =
using SkewDIAI =
SparseTensorFormat<2, LvlSpec<Add<1, 0>, Compressed>, LvlSpec<D0, Range>>;
using SkewDIAJ =
SparseTensorFormat<2, LvlSpec<Add<1, 0>, Compressed>, LvlSpec<D1, Range>>;

// Sparse Block Matrices.
Expand Down
27 changes: 22 additions & 5 deletions include/matx/kernels/matvec.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,32 @@

namespace matx {

// Kernel that performs SpMV for an m x n DIA matrix.
// Kernel that performs SpMV for an m x n DIA-I matrix.
template <typename VAL, typename CRD>
__global__ void dia_spmv_kernel(VAL *A, CRD *diags, uint64_t numD, VAL *B,
VAL *C, uint64_t m, uint64_t n) {
__global__ void diai_spmv_kernel(VAL *A, CRD *diags, uint64_t numDiags, VAL *B,
VAL *C, uint64_t m, uint64_t n) {
uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < m) {
VAL acc = 0.0;
for (uint64_t d = 0; d < numD; d++) { // numD-DIA SpMV
int64_t j = i + diags[d]; // signed
for (uint64_t d = 0; d < numDiags; d++) {
int64_t j = i + diags[d]; // signed
if (0 <= j && j < n) {
acc += A[d * m + i] * B[j];
}
}
C[i] = acc;
}
}

// Kernel that performs SpMV for an m x n DIA-J matrix.
template <typename VAL, typename CRD>
__global__ void diaj_spmv_kernel(VAL *A, CRD *diags, uint64_t numDiags, VAL *B,
VAL *C, uint64_t m, uint64_t n) {
uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < m) {
VAL acc = 0.0;
for (uint64_t d = 0; d < numDiags; d++) {
int64_t j = i + diags[d]; // signed
if (0 <= j && j < n) {
acc += A[d * n + j] * B[j];
}
Expand Down
10 changes: 7 additions & 3 deletions include/matx/transforms/matmul/matvec_cusparse.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ void sparse_matvec_impl(TensorTypeC &C, const TensorTypeA &a,
MATX_ASSERT(b.Stride(RANKB - 1) == 1 && c.Stride(RANKC - 1) == 1,
matxInvalidParameter);

if constexpr (atype::Format::isDIA()) {
if constexpr (atype::Format::isDIAI() || atype::Format::isDIAJ()) {

// Fall back to a hand-written kernel for DIA format, since
// this format is not supported in cuSPARSE. The hand-written
Expand All @@ -325,8 +325,12 @@ void sparse_matvec_impl(TensorTypeC &C, const TensorTypeA &a,
uint32_t THREADS = static_cast<uint32_t>(std::min(m, 1024LU));
uint32_t BATCHES = static_cast<uint32_t>(
cuda::std::ceil(static_cast<double>(m) / THREADS));
dia_spmv_kernel<<<BATCHES, THREADS, 0, stream>>>(AD, diags, numD, BD, CD, m,
n);
if constexpr (atype::Format::isDIAI())
diai_spmv_kernel<<<BATCHES, THREADS, 0, stream>>>(AD, diags, numD, BD, CD,
m, n);
else
diaj_spmv_kernel<<<BATCHES, THREADS, 0, stream>>>(AD, diags, numD, BD, CD,
m, n);
#endif

} else {
Expand Down