27 changes: 20 additions & 7 deletions docs_input/basics/sparse_tensor.rst
@@ -95,7 +95,7 @@ matvec, matmul, and solve::
(Acsr = sparse2sparse(Acoo)).run(exec);
(V = matvec(Acoo, W)).run(exec); // only Sparse-Matrix x Vector (SpMV)
(C = matmul(Acoo, B)).run(exec); // only Sparse-Matrix x Matrix (SpMM)
(X = solve(Acsr, Y)).run(exec); // only on CSR or tri-DIA format
(X = solve(Acsr, Y)).run(exec); // only on CSR or (batched) tri-DIA format

We expect the assortment of supported sparse operations and storage
formats to grow if the experimental implementation is well-received.
@@ -139,15 +139,28 @@ to construct COO, CSR, CSC, and DIA are provided::

// Constructs a sparse matrix in DIA format directly from the values and the
// offset vectors. For an m x n matrix, this format uses a linearized storage
// where each diagonal has n entries and is accessed by index I or index J.
// For index I, diagonals padded with zeros on the left for the lower triangular
// part and padded with zeros on the right for the upper triangular part. This
// is vv. when using index J. This format is most efficient for matrices with
// only a few nonzero diagonals that are close to the main diagonal.
// where each diagonal has m or n entries and is accessed by either index I or
// index J, respectively. For index I, diagonals are padded with zeros on the
// left for the lower triangular part and padded with zeros on the right for
// the upper triangular part; the reverse holds when using index J. This
// format is most efficient for matrices with only a few nonzero diagonals
// that are close to the main diagonal.
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_dia(ValTensor &val,
CrdTensor &off,
const index_t (&shape)[2]) {
const index_t (&shape)[2]);
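
// For illustration, a minimal sketch of building a 4 x 4 tri-diagonal matrix
// in DIA-I format (mirroring examples/sparse_tensor.cu; the integer type
// chosen for the offsets tensor here is an assumption):
//
//   auto vals = make_tensor<float>({3 * 4});
//   auto offs = make_tensor<int>({3});
//   offs.SetVals({-1, 0, 1});  // offsets sorted lower to upper
//   // sub-diagonal padded on the left, super-diagonal padded on the right
//   vals.SetVals({0, -1, -1, -1, 4, 4, 4, 4, 1, 1, 1, 0});
//   auto Adia = experimental::make_tensor_dia<experimental::DIA_INDEX_I>(
//       vals, offs, {4, 4});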

// Constructs a sparse tensor in uniform batched DIA format directly from
// the values and the offset vectors. For a b x m x n tensor, this format
// effectively stores b matrices of size m x n in DIA format, using a uniform
// nonzero structure for each batch (non-uniform formats are possible as well).
// All diagonals are stored consecutively in linearized format, sorted lower
// to upper, with all diagonals at a given offset appearing consecutively
// across the batches. With DIA(b,i,j) indexing, the diagonals can be
// accessed by either index I or index J.
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_uniform_batched_dia(ValTensor &val,
CrdTensor &off,
const index_t (&shape)[2]);
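
// For illustration, a minimal sketch with two 4 x 4 tri-diagonal batches
// (mirroring examples/sparse_tensor.cu; the integer type chosen for the
// offsets tensor here is an assumption):
//
//   auto bvals = make_tensor<float>({2 * 3 * 4});
//   auto offs = make_tensor<int>({3});
//   offs.SetVals({-1, 0, 1});
//   // per offset: batch 0's padded diagonal first, then batch 1's
//   bvals.SetVals({0, -1, -1, -1,  0, -2, -2, -2, 10, 10, 10, 10,
//                  20, 20, 20, 20, -1, -1, -1, 0, -2, -2, -2, 0});
//   auto Abatched = experimental::make_tensor_uniform_batched_dia<
//       experimental::DIA_INDEX_I>(bvals, offs, {2, 4, 4});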

Matx Implementation of the UST Type
-----------------------------------
29 changes: 29 additions & 0 deletions examples/sparse_tensor.cu
@@ -216,6 +216,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
// Perform a direct solve. This is only supported for a tri-diagonal
// matrix in DIA-I format where the rhs is overwritten with the answer.
//
// | 4 1 0 0 0 0 | | 1 7 | | 6 36 |
// | -1 4 1 0 0 0 | | 2 8 | | 10 34 |
// | 0 -1 4 1 0 0 | x | 3 9 | = | 14 38 |
// | 0 0 -1 4 1 0 | | 4 10 | | 18 42 |
// | 0 0 0 -1 4 1 | | 5 11 | | 22 46 |
// | 0 0 0 0 -1 4 | | 6 12 | | 19 37 |
//
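// (Diagonals are stored lower to upper; with DIA-I the sub-diagonal is padded
// with a leading zero and the super-diagonal with a trailing zero, giving the
// 3 x 6 linearized values below.)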
dvals.SetVals({0, -1, -1, -1, -1, -1, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 0});
auto AdiaI = experimental::make_tensor_dia<experimental::DIA_INDEX_I>(
dvals, doffsets, {6, 6});
@@ -224,5 +231,27 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
(Rhs = solve(AdiaI, Rhs)).run(exec);
print(Rhs);

//
// Perform a batched direct solve. This is only supported for a uniform
// batched tri-diagonal matrix in DIA-I format, where the rhs vectors are
// overwritten with the answers.
//
// batch0 batch1
// | 10 -1 0 0 | | 20 -2 0 0 | x | 1 1 1 1 | 1 1 1 1 |
// | -1 10 -1 0 | | -2 20 -2 0 |
// | 0 -1 10 -1 | | 0 -2 20 -2 | = | 9 8 8 9 | 18 16 16 18 |
// | 0 0 -1 10 | | 0 0 -2 20 |
//
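// (For each offset, sorted lower to upper, the padded diagonal of batch 0 is
// stored first, followed by that of batch 1: sub-diagonals {0,-1,-1,-1} and
// {0,-2,-2,-2}, then the main diagonals, then the super-diagonals
// {-1,-1,-1,0} and {-2,-2,-2,0}.)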
auto bdvals = make_tensor<float>({2 * 3 * 4});
bdvals.SetVals({0, -1, -1, -1, 0, -2, -2, -2, 10, 10, 10, 10,
20, 20, 20, 20, -1, -1, -1, 0, -2, -2, -2, 0});
auto AbatcheddiaI =
experimental::make_tensor_uniform_batched_dia<experimental::DIA_INDEX_I>(
bdvals, doffsets, {2, 4, 4});
auto L = make_tensor<float>({2 * 4});
L.SetVals({9, 8, 8, 9, 18, 16, 16, 18});
(L = solve(AbatcheddiaI, L)).run(exec);
print(L);

MATX_EXIT_HANDLER();
}
67 changes: 59 additions & 8 deletions include/matx/core/make_sparse_tensor.h
@@ -83,11 +83,15 @@ __MATX_INLINE__ static auto makeDefaultNonOwningEmptyStorage() {
// MatX implements a universal sparse tensor type that uses a tensor format
// DSL (Domain Specific Language) to describe a vast space of storage formats.
// This file provides a number of convenience factory methods that construct
// sparse tensors in well-known storage formats, like COO, CSR, and CSC,
// sparse tensors in well-known storage formats, like COO, CSR, CSC, and DIA,
// directly from the constituent buffers. More factory methods can easily be
// added as the need arises.
//

// Indexing options.
struct DIA_INDEX_I {};
struct DIA_INDEX_J {};

// Constructs a sparse matrix in COO format directly from the values and
// the two coordinates vectors. The entries should be sorted by row, then
// column. Duplicate entries should not occur. Explicit zeros may be stored.
@@ -204,13 +208,12 @@ auto make_zero_tensor_csc(const index_t (&shape)[2],

// Constructs a sparse matrix in DIA format directly from the values and the
// offset vectors. For an m x n matrix, this format uses a linearized storage
// where each diagonal has n entries and is accessed by index I or index J.
// For index I, diagonals padded with zeros on the left for the lower triangular
// part and padded with zeros on the right for the upper triagonal part. This
// is vv. when using index J. This format is most efficient for matrices with
// only a few nonzero diagonals that are close to the main diagonal.
struct DIA_INDEX_I {};
struct DIA_INDEX_J {};
// where each diagonal has m or n entries and is accessed by either index I or
// index J, respectively. For index I, diagonals are padded with zeros on the
// left for the lower triangular part and padded with zeros on the right for
// the upper triangular part; the reverse holds when using index J. This
// format is most efficient for matrices with only a few nonzero diagonals
// that are close to the main diagonal.
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_dia(ValTensor &val, CrdTensor &off,
const index_t (&shape)[2]) {
@@ -220,6 +223,13 @@ auto make_tensor_dia(ValTensor &val, CrdTensor &off,
// Proper structure.
MATX_STATIC_ASSERT_STR(ValTensor::Rank() == 1 && CrdTensor::Rank() == 1,
matxInvalidParameter, "data arrays should be rank-1");
if constexpr (std::is_same_v<IDX, DIA_INDEX_I>) {
MATX_ASSERT_STR(val.Size(0) == shape[0] * off.Size(0), matxInvalidParameter,
"data arrays should contain all diagonals (by row index)");
} else {
MATX_ASSERT_STR(val.Size(0) == shape[1] * off.Size(0), matxInvalidParameter,
"data arrays should contain all diagonals (by col index)");
}
// Note that the DIA API typically does not involve positions.
// However, under the formal DSL specifications, the top level
// compression should set up pos[0] = {0, #diags}. This is done
@@ -235,5 +245,46 @@ auto make_tensor_dia(ValTensor &val, CrdTensor &off,
{tp, makeDefaultNonOwningEmptyStorage<POS>()});
}

// Constructs a sparse tensor in uniform batched DIA format directly from
// the values and the offset vectors. For a b x m x n tensor, this format
// effectively stores b matrices of size m x n in DIA format, using a uniform
// nonzero structure for each batch (non-uniform formats are possible as well).
// All diagonals are stored consecutively in linearized format, sorted lower
// to upper, with all diagonals at a given offset appearing consecutively
// across the batches. With DIA(b,i,j) indexing, the diagonals can be
// accessed by either index I or index J.
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_uniform_batched_dia(ValTensor &val, CrdTensor &off,
const index_t (&shape)[3]) {
using VAL = typename ValTensor::value_type;
using CRD = typename CrdTensor::value_type;
using POS = index_t;
// Proper structure.
MATX_STATIC_ASSERT_STR(ValTensor::Rank() == 1 && CrdTensor::Rank() == 1,
matxInvalidParameter, "data arrays should be rank-1");
if constexpr (std::is_same_v<IDX, DIA_INDEX_I>) {
MATX_ASSERT_STR(val.Size(0) == shape[0] * shape[1] * off.Size(0),
matxInvalidParameter,
"data arrays should contain all diagonals (by row index)");
} else {
MATX_ASSERT_STR(val.Size(0) == shape[0] * shape[2] * off.Size(0),
matxInvalidParameter,
"data arrays should contain all diagonals (by col index)");
}
// Note that the DIA API typically does not involve positions.
// However, under the formal DSL specifications, the top level
// compression should set up pos[0] = {0, #diags}. This is done
// here, using the same memory space as the other data.
matxMemorySpace_t space = GetPointerKind(val.GetStorage().data());
auto tp = makeDefaultNonOwningZeroStorage<POS>(2, space);
setVal(tp.data() + 1, static_cast<POS>(off.Size(0)), space);
// Construct Batched DIA-I/J.
using DIA = std::conditional_t<std::is_same_v<IDX, DIA_INDEX_I>,
BatchedDIAIUniform, BatchedDIAJUniform>;
return sparse_tensor_t<VAL, CRD, POS, DIA>(
shape, val.GetStorage(),
{off.GetStorage(), makeDefaultNonOwningEmptyStorage<CRD>()},
{tp, makeDefaultNonOwningEmptyStorage<POS>()});
}

} // namespace experimental
} // namespace matx
67 changes: 49 additions & 18 deletions include/matx/core/sparse_tensor_format.h
@@ -44,6 +44,8 @@ namespace experimental {
//
// TODO: more elegant to have Var(i) and Const(c) in type "syntax"
//
// TODO: Add/Sub inversion semantics restricted to j+i,j-i and last-level range
//
enum class LvlOp { Id, Add, Sub, Div, Mod };
template <LvlOp O, int I, int J = 0> class LvlExpr {
public:
@@ -150,15 +152,15 @@ template <int D, class... S> class SparseTensorFormat {
static_assert(DIM <= LVL);

static constexpr bool isSpVec() {
if constexpr (LVL == 1) {
if constexpr (DIM == 1 && LVL == 1) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
return type0::Expr::isId(0) && type0::Type::isCompressed();
}
return false;
}

static constexpr bool isCOO() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::isId(0) && type0::Type::isCompressedNU() &&
@@ -168,7 +170,7 @@
}

static constexpr bool isCSR() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::isId(0) && type0::Type::isDense() &&
@@ -178,7 +180,7 @@
}

static constexpr bool isCSC() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::isId(1) && type0::Type::isDense() &&
@@ -188,7 +190,7 @@
}

static constexpr bool isDIAI() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::op == LvlOp::Sub && type0::Expr::di == 1 &&
@@ -199,7 +201,7 @@
}

static constexpr bool isDIAJ() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::op == LvlOp::Sub && type0::Expr::di == 1 &&
@@ -209,6 +211,19 @@
return false;
}

static constexpr bool isBatchedDIAIUniform() {
if constexpr (DIM == 3 && LVL == 3) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
using type2 = cuda::std::tuple_element_t<2, LvlSpecs>;
return type0::Expr::op == LvlOp::Sub && type0::Expr::di == 2 &&
type0::Expr::cj == 1 && type0::Type::isCompressed() &&
type1::Expr::isId(0) && type1::Type::isDense() &&
type2::Expr::isId(1) && type2::Type::isRange();
}
return false;
}

template <bool SZ, class CRD, int L = 0>
__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ static void
dim2lvl(const CRD *dims, CRD *lvls) {
@@ -259,20 +274,20 @@ template <int D, class... S> class SparseTensorFormat {
if constexpr (ftype::Expr::op == LvlOp::Id) {
dims[ftype::Expr::di] = lvls[L];
} else if constexpr (ftype::Expr::op == LvlOp::Add) {
using ntype = cuda::std::tuple_element_t<L + 1, LvlSpecs>;
static_assert(ntype::Expr::op == LvlOp::Id && ntype::Type::isRange());
if constexpr (ftype::Expr::cj == ntype::Expr::di) {
dims[ftype::Expr::di] = lvls[L] - lvls[L + 1];
} else if constexpr (ftype::Expr::di == ntype::Expr::di) {
dims[ftype::Expr::cj] = lvls[L] - lvls[L + 1];
using ltype = cuda::std::tuple_element_t<LVL - 1, LvlSpecs>;
static_assert(ltype::Expr::op == LvlOp::Id && ltype::Type::isRange());
if constexpr (ftype::Expr::cj == ltype::Expr::di) {
dims[ftype::Expr::di] = lvls[L] - lvls[LVL - 1];
} else if constexpr (ftype::Expr::di == ltype::Expr::di) {
dims[ftype::Expr::cj] = lvls[L] - lvls[LVL - 1];
}
} else if constexpr (ftype::Expr::op == LvlOp::Sub) {
using ntype = cuda::std::tuple_element_t<L + 1, LvlSpecs>;
static_assert(ntype::Expr::op == LvlOp::Id && ntype::Type::isRange());
if constexpr (ftype::Expr::cj == ntype::Expr::di) {
dims[ftype::Expr::di] = lvls[L] + lvls[L + 1];
} else if constexpr (ftype::Expr::di == ntype::Expr::di) {
dims[ftype::Expr::cj] = lvls[L + 1] - lvls[L];
using ltype = cuda::std::tuple_element_t<LVL - 1, LvlSpecs>;
static_assert(ltype::Expr::op == LvlOp::Id && ltype::Type::isRange());
if constexpr (ftype::Expr::cj == ltype::Expr::di) {
dims[ftype::Expr::di] = lvls[L] + lvls[LVL - 1];
} else if constexpr (ftype::Expr::di == ltype::Expr::di) {
dims[ftype::Expr::cj] = lvls[LVL - 1] - lvls[L];
}
} else if constexpr (ftype::Expr::op == LvlOp::Div) {
dims[ftype::Expr::di] = lvls[L] * ftype::Expr::cj;
@@ -393,6 +408,22 @@ using CSF3 =
SparseTensorFormat<3, LvlSpec<D0, Compressed>, LvlSpec<D1, Compressed>,
LvlSpec<D2, Compressed>>;

// Experimental batched DIA formats. Placing the batch *after* the offset
// compression ensures that corresponding diagonals of the batches are stored
// consecutively, but forces a uniform nonzero structure across all batches.
using BatchedDIAINonUniform =
SparseTensorFormat<3, LvlSpec<D0, Dense>, LvlSpec<Sub<2, 1>, Compressed>,
LvlSpec<D1, Range>>;
using BatchedDIAJNonUniform =
SparseTensorFormat<3, LvlSpec<D0, Dense>, LvlSpec<Sub<2, 1>, Compressed>,
LvlSpec<D2, Range>>;
using BatchedDIAIUniform =
SparseTensorFormat<3, LvlSpec<Sub<2, 1>, Compressed>, LvlSpec<D0, Dense>,
LvlSpec<D1, Range>>;
using BatchedDIAJUniform =
SparseTensorFormat<3, LvlSpec<Sub<2, 1>, Compressed>, LvlSpec<D0, Dense>,
LvlSpec<D2, Range>>;
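
// For example, with BatchedDIAIUniform a nonzero at tensor coordinate
// (b, i, j) is stored at level coordinates (j - i, b, i): the diagonal
// offset is compressed, the batch is dense, and the row index forms the
// trailing range (correspondingly (j - i, b, j) for BatchedDIAJUniform).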

// Sparse Block 3-D Tensors.
template <int M, int N, int K>
using BSR3 = SparseTensorFormat<
2 changes: 2 additions & 0 deletions include/matx/operators/solve.h
@@ -108,6 +108,8 @@ class SolveOp : public BaseOp<SolveOp<OpA, OpB>> {
if constexpr (is_sparse_tensor_v<OpA>) {
if constexpr (OpA::Format::isDIAI() || OpA::Format::isDIAJ()) {
sparse_dia_solve_impl(cuda::std::get<0>(out), a_, b_, ex);
} else if constexpr (OpA::Format::isBatchedDIAIUniform()) {
sparse_batched_dia_solve_impl(cuda::std::get<0>(out), a_, b_, ex);
} else {
#ifdef MATX_EN_CUDSS
sparse_solve_impl(cuda::std::get<0>(out), a_, b_, ex);