27 changes: 20 additions & 7 deletions docs_input/basics/sparse_tensor.rst
@@ -95,7 +95,7 @@ matvec, matmul, and solve::
(Acsr = sparse2sparse(Acoo)).run(exec);
(V = matvec(Acoo, W)).run(exec); // only Sparse-Matrix x Vector (SpMV)
(C = matmul(Acoo, B)).run(exec); // only Sparse-Matrix x Matrix (SpMM)
(X = solve(Acsr, Y)).run(exec); // only on CSR or tri-DIA format
(X = solve(Acsr, Y)).run(exec); // only on CSR or (batched) tri-DIA format

We expect the assortment of supported sparse operations and storage
formats to grow if the experimental implementation is well-received.
@@ -139,15 +139,28 @@ to construct COO, CSR, CSC, and DIA are provided::

// Constructs a sparse matrix in DIA format directly from the values and the
// offset vectors. For an m x n matrix, this format uses a linearized storage
// where each diagonal has n entries and is accessed by index I or index J.
// For index I, diagonals padded with zeros on the left for the lower triangular
// part and padded with zeros on the right for the upper triangular part. This
// is vv. when using index J. This format is most efficient for matrices with
// only a few nonzero diagonals that are close to the main diagonal.
// where each diagonal has m or n entries and is accessed by either index I or
// index J, respectively. For index I, diagonals are padded with zeros on the
// left for the lower triangular part and padded with zeros on the right for
// the upper triangular part; the reverse holds when using index J. This
// format is most efficient for matrices with only a few nonzero diagonals
// that are close to the main diagonal.
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_dia(ValTensor &val,
CrdTensor &off,
const index_t (&shape)[2]) {
const index_t (&shape)[2]);
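
// For illustration, a minimal sketch of building a 4 x 4 tri-diagonal matrix
// in DIA-I format (mirroring examples/sparse_tensor.cu; the integer type
// chosen for the offsets tensor here is an assumption):
//
//   auto vals = make_tensor<float>({3 * 4});
//   auto offs = make_tensor<int>({3});
//   offs.SetVals({-1, 0, 1});  // offsets sorted lower to upper
//   // sub-diagonal padded on the left, super-diagonal padded on the right
//   vals.SetVals({0, -1, -1, -1, 4, 4, 4, 4, 1, 1, 1, 0});
//   auto Adia = experimental::make_tensor_dia<experimental::DIA_INDEX_I>(
//       vals, offs, {4, 4});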

// Constructs a sparse tensor in uniform batched DIA format directly from
// the values and the offset vectors. For a b x m x n tensor, this format
// effectively stores b matrices of size m x n in DIA format, using a uniform
// nonzero structure for each batch (non-uniform formats are possible as well).
// All diagonals are stored consecutively in linearized format, sorted lower
// to upper, with all diagonals at a given offset appearing consecutively
// across the batches. With DIA(b,i,j) indexing, the diagonals can be
// accessed by either index I or index J.
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_uniform_batched_dia(ValTensor &val,
CrdTensor &off,
const index_t (&shape)[2]);
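
// For illustration, a minimal sketch with two 4 x 4 tri-diagonal batches
// (mirroring examples/sparse_tensor.cu; the integer type chosen for the
// offsets tensor here is an assumption):
//
//   auto bvals = make_tensor<float>({2 * 3 * 4});
//   auto offs = make_tensor<int>({3});
//   offs.SetVals({-1, 0, 1});
//   // per offset: batch 0's padded diagonal first, then batch 1's
//   bvals.SetVals({0, -1, -1, -1,  0, -2, -2, -2, 10, 10, 10, 10,
//                  20, 20, 20, 20, -1, -1, -1, 0, -2, -2, -2, 0});
//   auto Abatched = experimental::make_tensor_uniform_batched_dia<
//       experimental::DIA_INDEX_I>(bvals, offs, {2, 4, 4});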

Matx Implementation of the UST Type
-----------------------------------
29 changes: 29 additions & 0 deletions examples/sparse_tensor.cu
@@ -216,6 +216,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
// Perform a direct solve. This is only supported for a tri-diagonal
// matrix in DIA-I format where the rhs is overwritten with the answer.
//
// | 4 1 0 0 0 0 | | 1 7 | | 6 36 |
// | -1 4 1 0 0 0 | | 2 8 | | 10 34 |
// | 0 -1 4 1 0 0 | x | 3 9 | = | 14 38 |
// | 0 0 -1 4 1 0 | | 4 10 | | 18 42 |
// | 0 0 0 -1 4 1 | | 5 11 | | 22 46 |
// | 0 0 0 0 -1 4 | | 6 12 | | 19 37 |
//
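// (Diagonals are stored lower to upper; with DIA-I the sub-diagonal is padded
// with a leading zero and the super-diagonal with a trailing zero, giving the
// 3 x 6 linearized values below.)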
dvals.SetVals({0, -1, -1, -1, -1, -1, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 0});
auto AdiaI = experimental::make_tensor_dia<experimental::DIA_INDEX_I>(
dvals, doffsets, {6, 6});
@@ -224,5 +231,27 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {
(Rhs = solve(AdiaI, Rhs)).run(exec);
print(Rhs);

//
// Perform a batched direct solve. This is only supported for a uniform
// batched tri-diagonal matrix in DIA-I format, where the rhs vectors are
// overwritten with the answers.
//
// batch0 batch1
// | 10 -1 0 0 | | 20 -2 0 0 | x | 1 1 1 1 | 1 1 1 1 |
// | -1 10 -1 0 | | -2 20 -2 0 |
// | 0 -1 10 -1 | | 0 -2 20 -2 | = | 9 8 8 9 | 18 16 16 18 |
// | 0 0 -1 10 | | 0 0 -2 20 |
//
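// (For each offset, sorted lower to upper, the padded diagonal of batch 0 is
// stored first, followed by that of batch 1: sub-diagonals {0,-1,-1,-1} and
// {0,-2,-2,-2}, then the main diagonals, then the super-diagonals
// {-1,-1,-1,0} and {-2,-2,-2,0}.)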
auto bdvals = make_tensor<float>({2 * 3 * 4});
bdvals.SetVals({0, -1, -1, -1, 0, -2, -2, -2, 10, 10, 10, 10,
20, 20, 20, 20, -1, -1, -1, 0, -2, -2, -2, 0});
auto AbatcheddiaI =
experimental::make_tensor_uniform_batched_dia<experimental::DIA_INDEX_I>(
bdvals, doffsets, {2, 4, 4});
auto L = make_tensor<float>({2 * 4});
L.SetVals({9, 8, 8, 9, 18, 16, 16, 18});
(L = solve(AbatcheddiaI, L)).run(exec);
print(L);

MATX_EXIT_HANDLER();
}
67 changes: 59 additions & 8 deletions include/matx/core/make_sparse_tensor.h
@@ -83,11 +83,15 @@ __MATX_INLINE__ static auto makeDefaultNonOwningEmptyStorage() {
// MatX implements a universal sparse tensor type that uses a tensor format
// DSL (Domain Specific Language) to describe a vast space of storage formats.
// This file provides a number of convenience factory methods that construct
// sparse tensors in well-known storage formats, like COO, CSR, and CSC,
// sparse tensors in well-known storage formats, like COO, CSR, CSC, and DIA,
// directly from the constituent buffers. More factory methods can easily be
// added as the need arises.
//

// Indexing options.
struct DIA_INDEX_I {};
struct DIA_INDEX_J {};

// Constructs a sparse matrix in COO format directly from the values and
// the two coordinates vectors. The entries should be sorted by row, then
// column. Duplicate entries should not occur. Explicit zeros may be stored.
@@ -204,13 +208,12 @@ auto make_zero_tensor_csc(const index_t (&shape)[2],

// Constructs a sparse matrix in DIA format directly from the values and the
// offset vectors. For an m x n matrix, this format uses a linearized storage
// where each diagonal has n entries and is accessed by index I or index J.
// For index I, diagonals padded with zeros on the left for the lower triangular
// part and padded with zeros on the right for the upper triagonal part. This
// is vv. when using index J. This format is most efficient for matrices with
// only a few nonzero diagonals that are close to the main diagonal.
struct DIA_INDEX_I {};
struct DIA_INDEX_J {};
// where each diagonal has m or n entries and is accessed by either index I or
// index J, respectively. For index I, diagonals are padded with zeros on the
// left for the lower triangular part and padded with zeros on the right for
// the upper triangular part; the reverse holds when using index J. This
// format is most efficient for matrices with only a few nonzero diagonals
// that are close to the main diagonal.
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_dia(ValTensor &val, CrdTensor &off,
const index_t (&shape)[2]) {
@@ -220,6 +223,13 @@ auto make_tensor_dia(ValTensor &val, CrdTensor &off,
// Proper structure.
MATX_STATIC_ASSERT_STR(ValTensor::Rank() == 1 && CrdTensor::Rank() == 1,
matxInvalidParameter, "data arrays should be rank-1");
if constexpr (std::is_same_v<IDX, DIA_INDEX_I>) {
MATX_ASSERT_STR(val.Size(0) == shape[0] * off.Size(0), matxInvalidParameter,
"data arrays should contain all diagonals (by row index)");
} else {
MATX_ASSERT_STR(val.Size(0) == shape[1] * off.Size(0), matxInvalidParameter,
"data arrays should contain all diagonals (by col index)");
}
// Note that the DIA API typically does not involve positions.
// However, under the formal DSL specifications, the top level
// compression should set up pos[0] = {0, #diags}. This is done
@@ -235,5 +245,46 @@ auto make_tensor_dia(ValTensor &val, CrdTensor &off,
{tp, makeDefaultNonOwningEmptyStorage<POS>()});
}

// Constructs a sparse tensor in uniform batched DIA format directly from
// the values and the offset vectors. For a b x m x n tensor, this format
// effectively stores b matrices of size m x n in DIA format, using a uniform
// nonzero structure for each batch (non-uniform formats are possible as well).
// All diagonals are stored consecutively in linearized format, sorted lower
// to upper, with all diagonals at a given offset appearing consecutively
// across the batches. With DIA(b,i,j) indexing, the diagonals can be
// accessed by either index I or index J.
template <typename IDX, typename ValTensor, typename CrdTensor>
auto make_tensor_uniform_batched_dia(ValTensor &val, CrdTensor &off,
const index_t (&shape)[3]) {
using VAL = typename ValTensor::value_type;
using CRD = typename CrdTensor::value_type;
using POS = index_t;
// Proper structure.
MATX_STATIC_ASSERT_STR(ValTensor::Rank() == 1 && CrdTensor::Rank() == 1,
matxInvalidParameter, "data arrays should be rank-1");
if constexpr (std::is_same_v<IDX, DIA_INDEX_I>) {
MATX_ASSERT_STR(val.Size(0) == shape[0] * shape[1] * off.Size(0),
matxInvalidParameter,
"data arrays should contain all diagonals (by row index)");
} else {
MATX_ASSERT_STR(val.Size(0) == shape[0] * shape[2] * off.Size(0),
matxInvalidParameter,
"data arrays should contain all diagonals (by col index)");
}
// Note that the DIA API typically does not involve positions.
// However, under the formal DSL specifications, the top level
// compression should set up pos[0] = {0, #diags}. This is done
// here, using the same memory space as the other data.
matxMemorySpace_t space = GetPointerKind(val.GetStorage().data());
auto tp = makeDefaultNonOwningZeroStorage<POS>(2, space);
setVal(tp.data() + 1, static_cast<POS>(off.Size(0)), space);
// Construct Batched DIA-I/J.
using DIA = std::conditional_t<std::is_same_v<IDX, DIA_INDEX_I>,
BatchedDIAIUniform, BatchedDIAJUniform>;
return sparse_tensor_t<VAL, CRD, POS, DIA>(
shape, val.GetStorage(),
{off.GetStorage(), makeDefaultNonOwningEmptyStorage<CRD>()},
{tp, makeDefaultNonOwningEmptyStorage<POS>()});
}

} // namespace experimental
} // namespace matx
67 changes: 49 additions & 18 deletions include/matx/core/sparse_tensor_format.h
@@ -44,6 +44,8 @@ namespace experimental {
//
// TODO: more elegant to have Var(i) and Const(c) in type "syntax"
//
// TODO: Add/Sub inversion semantics restricted to j+i,j-i and last-level range
//
enum class LvlOp { Id, Add, Sub, Div, Mod };
template <LvlOp O, int I, int J = 0> class LvlExpr {
public:
@@ -150,15 +152,15 @@ template <int D, class... S> class SparseTensorFormat {
static_assert(DIM <= LVL);

static constexpr bool isSpVec() {
if constexpr (LVL == 1) {
if constexpr (DIM == 1 && LVL == 1) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
return type0::Expr::isId(0) && type0::Type::isCompressed();
}
return false;
}

static constexpr bool isCOO() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::isId(0) && type0::Type::isCompressedNU() &&
@@ -168,7 +170,7 @@
}

static constexpr bool isCSR() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::isId(0) && type0::Type::isDense() &&
@@ -178,7 +180,7 @@
}

static constexpr bool isCSC() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::isId(1) && type0::Type::isDense() &&
@@ -188,7 +190,7 @@
}

static constexpr bool isDIAI() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::op == LvlOp::Sub && type0::Expr::di == 1 &&
@@ -199,7 +201,7 @@
}

static constexpr bool isDIAJ() {
if constexpr (LVL == 2) {
if constexpr (DIM == 2 && LVL == 2) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
return type0::Expr::op == LvlOp::Sub && type0::Expr::di == 1 &&
@@ -209,6 +211,19 @@
return false;
}

static constexpr bool isBatchedDIAIUniform() {
if constexpr (DIM == 3 && LVL == 3) {
using type0 = cuda::std::tuple_element_t<0, LvlSpecs>;
using type1 = cuda::std::tuple_element_t<1, LvlSpecs>;
using type2 = cuda::std::tuple_element_t<2, LvlSpecs>;
return type0::Expr::op == LvlOp::Sub && type0::Expr::di == 2 &&
type0::Expr::cj == 1 && type0::Type::isCompressed() &&
type1::Expr::isId(0) && type1::Type::isDense() &&
type2::Expr::isId(1) && type2::Type::isRange();
}
return false;
}

template <bool SZ, class CRD, int L = 0>
__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ static void
dim2lvl(const CRD *dims, CRD *lvls) {
@@ -259,20 +274,20 @@ template <int D, class... S> class SparseTensorFormat {
if constexpr (ftype::Expr::op == LvlOp::Id) {
dims[ftype::Expr::di] = lvls[L];
} else if constexpr (ftype::Expr::op == LvlOp::Add) {
using ntype = cuda::std::tuple_element_t<L + 1, LvlSpecs>;
static_assert(ntype::Expr::op == LvlOp::Id && ntype::Type::isRange());
if constexpr (ftype::Expr::cj == ntype::Expr::di) {
dims[ftype::Expr::di] = lvls[L] - lvls[L + 1];
} else if constexpr (ftype::Expr::di == ntype::Expr::di) {
dims[ftype::Expr::cj] = lvls[L] - lvls[L + 1];
using ltype = cuda::std::tuple_element_t<LVL - 1, LvlSpecs>;
static_assert(ltype::Expr::op == LvlOp::Id && ltype::Type::isRange());
if constexpr (ftype::Expr::cj == ltype::Expr::di) {
dims[ftype::Expr::di] = lvls[L] - lvls[LVL - 1];
} else if constexpr (ftype::Expr::di == ltype::Expr::di) {
dims[ftype::Expr::cj] = lvls[L] - lvls[LVL - 1];
}
} else if constexpr (ftype::Expr::op == LvlOp::Sub) {
using ntype = cuda::std::tuple_element_t<L + 1, LvlSpecs>;
static_assert(ntype::Expr::op == LvlOp::Id && ntype::Type::isRange());
if constexpr (ftype::Expr::cj == ntype::Expr::di) {
dims[ftype::Expr::di] = lvls[L] + lvls[L + 1];
} else if constexpr (ftype::Expr::di == ntype::Expr::di) {
dims[ftype::Expr::cj] = lvls[L + 1] - lvls[L];
using ltype = cuda::std::tuple_element_t<LVL - 1, LvlSpecs>;
static_assert(ltype::Expr::op == LvlOp::Id && ltype::Type::isRange());
if constexpr (ftype::Expr::cj == ltype::Expr::di) {
dims[ftype::Expr::di] = lvls[L] + lvls[LVL - 1];
} else if constexpr (ftype::Expr::di == ltype::Expr::di) {
dims[ftype::Expr::cj] = lvls[LVL - 1] - lvls[L];
}
} else if constexpr (ftype::Expr::op == LvlOp::Div) {
dims[ftype::Expr::di] = lvls[L] * ftype::Expr::cj;
@@ -393,6 +408,22 @@ using CSF3 =
SparseTensorFormat<3, LvlSpec<D0, Compressed>, LvlSpec<D1, Compressed>,
LvlSpec<D2, Compressed>>;

// Experimental batched DIA formats. Placing the batch *after* the offset
// compression ensures that corresponding diagonals of the batches are stored
// consecutively, but forces a uniform nonzero structure across all batches.
using BatchedDIAINonUniform =
SparseTensorFormat<3, LvlSpec<D0, Dense>, LvlSpec<Sub<2, 1>, Compressed>,
LvlSpec<D1, Range>>;
using BatchedDIAJNonUniform =
SparseTensorFormat<3, LvlSpec<D0, Dense>, LvlSpec<Sub<2, 1>, Compressed>,
LvlSpec<D2, Range>>;
using BatchedDIAIUniform =
SparseTensorFormat<3, LvlSpec<Sub<2, 1>, Compressed>, LvlSpec<D0, Dense>,
LvlSpec<D1, Range>>;
using BatchedDIAJUniform =
SparseTensorFormat<3, LvlSpec<Sub<2, 1>, Compressed>, LvlSpec<D0, Dense>,
LvlSpec<D2, Range>>;
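
// For example, with BatchedDIAIUniform a nonzero at tensor coordinate
// (b, i, j) is stored at level coordinates (j - i, b, i): the diagonal
// offset is compressed, the batch is dense, and the row index forms the
// trailing range (correspondingly (j - i, b, j) for BatchedDIAJUniform).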

// Sparse Block 3-D Tensors.
template <int M, int N, int K>
using BSR3 = SparseTensorFormat<
2 changes: 2 additions & 0 deletions include/matx/operators/solve.h
@@ -108,6 +108,8 @@ class SolveOp : public BaseOp<SolveOp<OpA, OpB>> {
if constexpr (is_sparse_tensor_v<OpA>) {
if constexpr (OpA::Format::isDIAI() || OpA::Format::isDIAJ()) {
sparse_dia_solve_impl(cuda::std::get<0>(out), a_, b_, ex);
} else if constexpr (OpA::Format::isBatchedDIAIUniform()) {
sparse_batched_dia_solve_impl(cuda::std::get<0>(out), a_, b_, ex);
} else {
#ifdef MATX_EN_CUDSS
sparse_solve_impl(cuda::std::get<0>(out), a_, b_, ex);