Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions examples/sparse_tensor.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
experimental::CSR::print();
experimental::CSC::print();
experimental::DCSR::print();
experimental::DIA::print();
experimental::SkewDIA::print();
experimental::DIAI::print();
experimental::DIAJ::print();
experimental::SkewDIAI::print();
experimental::SkewDIAJ::print();
experimental::BSR<2, 2>::print(); // 2x2 blocks
experimental::COO4::print(); // 4-dim tensor in COO
experimental::CSF5::print(); // 5-dim tensor in CSF
Expand Down Expand Up @@ -195,8 +197,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
auto doffsets = make_tensor<int>({3});
dvals.SetVals({-1, -1, -1, -1, -1, 0, 4, 4, 4, 4, 4, 4, 0, 1, 1, 1, 1, 1});
doffsets.SetVals({-1, 0, 1});
auto Adia = experimental::make_tensor_dia(dvals, doffsets, {6, 6});
print(Adia);
auto AdiaJ = experimental::make_tensor_dia<experimental::DIA_INDEX_J>(dvals, doffsets, {6, 6});
print(AdiaJ);

//
// Perform a direct SpMV. This is also the correct way of performing
Expand All @@ -205,7 +207,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
auto V = make_tensor<float>({6});
auto R = make_tensor<float>({6});
V.SetVals({1, 2, 3, 4, 5, 6});
(R = matvec(Adia, V)).run(exec);
(R = matvec(AdiaJ, V)).run(exec);
print(R);

MATX_EXIT_HANDLER();
Expand Down
16 changes: 10 additions & 6 deletions include/matx/core/make_sparse_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,14 @@ auto make_zero_tensor_csc(const index_t (&shape)[2],

// Constructs a sparse matrix in DIA format directly from the values and the
// offset vectors. For an m x n matrix, this format uses a linearized storage
// where each diagonal has n entries, padded with zeros on the right for the
// lower triangular part and padded with zeros on the left for the upper
// triagonal part. This format is most efficient for matrices with only a
// few nonzero diagonals that are close to the main diagonal.
template <typename ValTensor, typename CrdTensor>
// where each diagonal has n entries and is accessed by index I or index J.
// For index I, diagonals are padded with zeros on the left for the lower
// triangular part and with zeros on the right for the upper triangular part;
// the padding is reversed when using index J. This format is most efficient
// for matrices with only a few nonzero diagonals that are close to the main
// diagonal.
struct DIA_INDEX_I {};
struct DIA_INDEX_J {};
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_dia(ValTensor &val, CrdTensor &off,
const index_t (&shape)[2]) {
using VAL = typename ValTensor::value_type;
Expand All @@ -224,7 +227,8 @@ auto make_tensor_dia(ValTensor &val, CrdTensor &off,
matxMemorySpace_t space = GetPointerKind(val.GetStorage().data());
auto tp = makeDefaultNonOwningZeroStorage<POS>(2, space);
setVal(tp.data() + 1, static_cast<POS>(val.Size(0)), space);
// Construct DIA.
// Construct DIA-I/J.
using DIA = std::conditional_t<std::is_same_v<IDX, DIA_INDEX_I>, DIAI, DIAJ>;
return sparse_tensor_t<VAL, CRD, POS, DIA>(
shape, val.GetStorage(),
{makeDefaultNonOwningEmptyStorage<CRD>(), off.GetStorage()},
Expand Down
21 changes: 18 additions & 3 deletions include/matx/core/sparse_tensor_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,18 @@ template <int D, class... S> class SparseTensorFormat {
return false;
}

static constexpr bool isDIA() {
static constexpr bool isDIAI() {
if constexpr (LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::op == LvlOp::Sub && type0::Expr::di == 1 &&
type0::Expr::cj == 0 && type0::Type::isCompressed() &&
type1::Expr::isId(0) && type1::Type::isRange();
}
return false;
}

static constexpr bool isDIAJ() {
if constexpr (LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
Expand Down Expand Up @@ -337,9 +348,13 @@ using DCSC =
SparseTensorFormat<2, LvlSpec<D1, Compressed>, LvlSpec<D0, Compressed>>;
using CROW = SparseTensorFormat<2, LvlSpec<D0, Compressed>, LvlSpec<D1, Dense>>;
using CCOL = SparseTensorFormat<2, LvlSpec<D1, Compressed>, LvlSpec<D0, Dense>>;
using DIA =
using DIAI =
SparseTensorFormat<2, LvlSpec<Sub<1, 0>, Compressed>, LvlSpec<D0, Range>>;
using DIAJ =
SparseTensorFormat<2, LvlSpec<Sub<1, 0>, Compressed>, LvlSpec<D1, Range>>;
using SkewDIA =
using SkewDIAI =
SparseTensorFormat<2, LvlSpec<Add<1, 0>, Compressed>, LvlSpec<D0, Range>>;
using SkewDIAJ =
SparseTensorFormat<2, LvlSpec<Add<1, 0>, Compressed>, LvlSpec<D1, Range>>;

// Sparse Block Matrices.
Expand Down
27 changes: 22 additions & 5 deletions include/matx/kernels/matvec.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,32 @@

namespace matx {

// Kernel that performs SpMV for an m x n DIA matrix.
// Kernel that performs SpMV for an m x n DIA-I matrix.
template <typename VAL, typename CRD>
__global__ void dia_spmv_kernel(VAL *A, CRD *diags, uint64_t numD, VAL *B,
VAL *C, uint64_t m, uint64_t n) {
__global__ void diai_spmv_kernel(VAL *A, CRD *diags, uint64_t numDiags, VAL *B,
VAL *C, uint64_t m, uint64_t n) {
uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < m) {
VAL acc = 0.0;
for (uint64_t d = 0; d < numD; d++) { // numD-DIA SpMV
int64_t j = i + diags[d]; // signed
for (uint64_t d = 0; d < numDiags; d++) {
int64_t j = i + diags[d]; // signed
if (0 <= j && j < n) {
acc += A[d * m + i] * B[j];
}
}
C[i] = acc;
}
}

// Kernel that performs SpMV for an m x n DIA-J matrix.
template <typename VAL, typename CRD>
__global__ void diaj_spmv_kernel(VAL *A, CRD *diags, uint64_t numDiags, VAL *B,
VAL *C, uint64_t m, uint64_t n) {
uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < m) {
VAL acc = 0.0;
for (uint64_t d = 0; d < numDiags; d++) {
int64_t j = i + diags[d]; // signed
if (0 <= j && j < n) {
acc += A[d * n + j] * B[j];
}
Expand Down
10 changes: 7 additions & 3 deletions include/matx/transforms/matmul/matvec_cusparse.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ void sparse_matvec_impl(TensorTypeC &C, const TensorTypeA &a,
MATX_ASSERT(b.Stride(RANKB - 1) == 1 && c.Stride(RANKC - 1) == 1,
matxInvalidParameter);

if constexpr (atype::Format::isDIA()) {
if constexpr (atype::Format::isDIAI() || atype::Format::isDIAJ()) {

// Fall back to a hand-written kernel for DIA format, since
// this format is not supported in cuSPARSE. The hand-written
Expand All @@ -325,8 +325,12 @@ void sparse_matvec_impl(TensorTypeC &C, const TensorTypeA &a,
uint32_t THREADS = static_cast<uint32_t>(std::min(m, 1024LU));
uint32_t BATCHES = static_cast<uint32_t>(
cuda::std::ceil(static_cast<double>(m) / THREADS));
dia_spmv_kernel<<<BATCHES, THREADS, 0, stream>>>(AD, diags, numD, BD, CD, m,
n);
if constexpr (atype::Format::isDIAI())
diai_spmv_kernel<<<BATCHES, THREADS, 0, stream>>>(AD, diags, numD, BD, CD,
m, n);
else
diaj_spmv_kernel<<<BATCHES, THREADS, 0, stream>>>(AD, diags, numD, BD, CD,
m, n);
#endif

} else {
Expand Down