252 changes: 116 additions & 136 deletions include/matx/transforms/qr.h
@@ -43,145 +43,157 @@ namespace matx {

namespace detail {

// internal QR implementation that takes its workspace memory via the API call
template<typename QType, typename RType, typename AType,
typename NType, typename VMType, typename HType, typename QNType, typename RNType>
void qr_internal(QType Q, RType R, AType A,
NType N, VMType VM, HType H, QNType QN, RNType RN, cudaStream_t stream) {
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
template<typename AType>
inline auto qr_internal_workspace(const AType &A, cudaStream_t stream) {
using ATypeS = typename AType::scalar_type;
const int RANK = AType::Rank();

static_assert(A.Rank() >= 2);
static_assert(Q.Rank() == A.Rank());
static_assert(R.Rank() == A.Rank());
index_t m = A.Size(RANK-2);
index_t n = A.Size(RANK-1);

MATX_ASSERT_STR(A.Rank() == Q.Rank(), matxInvalidDim, "qr: A and Q must have the same rank");
MATX_ASSERT_STR(A.Rank() == R.Rank(), matxInvalidDim, "qr: A and R must have the same rank");
std::array<index_t, RANK-1> uShape;
for(int i = 0; i < RANK-2; i++) {
uShape[i] = A.Size(i);
}
uShape[RANK-2] = A.Size(RANK-1);

auto QShape = A.Shape();
QShape[RANK-1] = m;

auto Qin = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto wwt = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto u = make_tensor<ATypeS>(uShape, MATX_ASYNC_DEVICE_MEMORY, stream);

return std::make_tuple(Qin, wwt, u);
}
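// Workspace contents, inferred from the code above (shapes shown for a 2-D m x n input A;
// higher-rank inputs add leading batch dims):
//   Qin : m x m  scratch copy of Q; one of its rows is also aliased as the Householder vector v
//   wwt : m x m  buffer for w * w^H; one of its entries is aliased as the scalar norm N
//   u   : n      vector holding 2 * R^H * w for the rank-1 update of R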

using ATypeS = typename AType::scalar_type;
using NTypeS = typename inner_op_type_t<ATypeS>::type;
const int RANK = AType::Rank();
template<typename QType, typename RType, typename AType, typename WType>
inline void qr_internal(QType &Q, RType &R, const AType &A, WType workspace, cudaStream_t stream) {
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)

index_t m = A.Size(RANK-2);
index_t n = A.Size(RANK-1);
index_t k = std::min(m,n);
if(m<=n) k--; // these matrices have one less update since the diagonal ends on the bottom of the matrix
static_assert(A.Rank() >= 2);
static_assert(Q.Rank() == A.Rank());
static_assert(R.Rank() == A.Rank());

// Create Identity matrix
auto E = eye<ATypeS>({m, m});
MATX_ASSERT_STR(A.Rank() == Q.Rank(), matxInvalidDim, "qr: A and Q must have the same rank");
MATX_ASSERT_STR(A.Rank() == R.Rank(), matxInvalidDim, "qr: A and R must have the same rank");

// Clone over batch Dims
auto ECShape = Q.Shape();
ECShape[RANK-1] = matxKeepDim;
ECShape[RANK-2] = matxKeepDim;
using ATypeS = typename AType::scalar_type;
using NTypeS = typename inner_op_type_t<ATypeS>::type;
const int RANK = AType::Rank();

auto I = clone<RANK>(E, ECShape);
index_t m = A.Size(RANK-2);
index_t n = A.Size(RANK-1);
index_t k = std::min(m,n);
if(m<=n) k--; // these matrices have one less update since the diagonal ends on the bottom of the matrix

auto ci = N; // alias
auto Qin = std::get<0>(workspace);
auto wwt = std::get<1>(workspace);
auto u = std::get<2>(workspace);

// Initialize Q
(Q = I).run(stream);
(R = A).run(stream);
static_assert(Qin.Rank() == Q.Rank());
static_assert(wwt.Rank() == Q.Rank());
static_assert(u.Rank() == Q.Rank()-1);

// setup slices
std::array<index_t, RANK> mSliceB, mSliceE; // matrix slice starting at i,i
mSliceB.fill(0); mSliceE.fill(matxEnd);
// Create Identity matrix
auto E = eye<ATypeS>({m, m});

std::array<index_t, RANK> qSliceB, qSliceE; // matrix slice starting at 0,i
qSliceB.fill(0); qSliceE.fill(matxEnd);
// Clone over batch Dims
auto ECShape = Q.Shape();
ECShape[RANK-1] = matxKeepDim;
ECShape[RANK-2] = matxKeepDim;

std::array<index_t, RANK> xSliceB, xSliceE; // vector slice starting at i,i
xSliceB.fill(0); xSliceE.fill(matxEnd);
xSliceE[RANK-1] = matxDropDim; // drop last dim to make a vector
auto I = clone<RANK>(E, ECShape);

std::array<index_t, RANK> vSliceB, vSliceE; // vector slice starting at i,0?
vSliceB.fill(0); vSliceE.fill(matxEnd);
vSliceE[RANK-1] = matxDropDim; // drop last dim to make a vector
// Initialize Q
(Q = I).run(stream);
(R = A).run(stream);

std::array<index_t, RANK> vmSliceB, vmSliceE; // vector slice as a matrix starting at i
vmSliceB.fill(0); vmSliceE.fill(matxEnd);
// we will slice X directly from R.
std::array<index_t, RANK> xSliceB, xSliceE;
xSliceB.fill(0); xSliceE.fill(matxEnd);
xSliceE[RANK-1] = matxDropDim; // drop last dim to make a vector

// N cloned across x.
std::array<index_t, RANK-1> ncShape;
ncShape.fill(matxKeepDim);

// clone ci across hi
std::array<index_t, RANK> cicShape;
cicShape.fill(matxKeepDim);
// v is of size m x 1. Instead of allocating additional memory we will just reuse a row of Qin
std::array<index_t, RANK> vSliceB, vSliceE;
vSliceB.fill(0); vSliceE.fill(matxEnd);
// select a single row of Qin to alias as v
vSliceE[RANK-2] = matxDropDim;
auto v = slice<RANK-1>(Qin, vSliceB, vSliceE);
auto xz = v; // alias

for(int i = 0 ; i < k ; i++) {

// update top left corner of slice
mSliceB[RANK-2] = i;
mSliceB[RANK-1] = i;
// N is of size 1. Instead of allocating additional memory we will just reuse an entry of wwt
std::array<index_t, RANK> nSliceB, nSliceE;
nSliceB.fill(0); nSliceE.fill(matxEnd);
// drop both trailing dims to select a single entry of wwt to alias as N
nSliceE[RANK-2] = matxDropDim;
nSliceE[RANK-1] = matxDropDim;

// update q slice
qSliceB[RANK-1] = i;
auto N = slice<RANK-2>(wwt, nSliceB, nSliceE);

// update v/vm slices to start at i
xSliceB[RANK-2] = i;
xSliceB[RANK-1] = i;
vmSliceB[RANK-2] = i;
vSliceB[RANK-2] = i;
// clone N to rank RANK-1, adding a dim of size m so it broadcasts across the length-m column vectors
std::array<index_t, RANK-1> ncShape;
ncShape.fill(matxKeepDim);
ncShape[RANK-2] = m;
auto nc = clone<RANK-1>(N,ncShape);

// matrix slices
auto qi = slice<RANK>(Q, qSliceB, qSliceE);
auto qn = slice<RANK>(QN, qSliceB, qSliceE);
auto ri = slice<RANK>(R, mSliceB, mSliceE);
auto rn = slice<RANK>(RN, mSliceB, mSliceE);
auto hi = slice<RANK>(H, mSliceB, mSliceE);
auto vm = slice<RANK>(VM, vmSliceB, vmSliceE);
std::array<index_t, RANK> umShape;
umShape.fill(matxKeepDim);
umShape[RANK-1] = 1;

// vector slices
auto v = slice<RANK-1>(VM, vSliceB, vSliceE);
auto x = slice<RANK-1>(R, xSliceB, xSliceE);
auto um = clone<RANK>(u, umShape);
auto vm = clone<RANK>(v, umShape);

// aliasing some memory here to share storage and provide clarity in the code below
auto s = N; // alias
auto sc = nc; // alias
auto w = v; // alias
auto wm = vm; // alias

//update clone shape
ncShape[RANK-2] = m-i;
auto nc = clone<RANK-1>(N,ncShape);
for(int i = 0 ; i < k ; i++) {

// update cloned ci shape
cicShape[RANK-2] = m - i;
cicShape[RANK-1] = m - i;
auto cic = clone<RANK>(ci, cicShape);
// slice off a column of R and alias as x
xSliceB[RANK-1] = i;
auto x = slice<RANK-1>(R, xSliceB, xSliceE);

// create identity matrix and clone across full rank
auto Ei = eye<ATypeS>({m-i,m-i});
auto Ii = clone<RANK>(Ei, ECShape);
// zero out the entries of x above row i so they don't contribute to the reflector
(xz = (index(x.Rank()-1) >= i) * x).run(stream);

(N = sum(norm(x))).run(stream);
(N = sqrt(N)).run(stream);
// compute L2 norm without sqrt.
(N = sum(norm(xz))).run(stream);
//(N = sqrt(N)).run(stream); // sqrt folded into next op

// copy x into v and apply signed addition of nc
(IFELSE( (index(v.Rank()-1) == 0),
v = x + sign(x, ATypeS(1) ) * nc,
v = x)).run(stream);
(v = xz + (index(v.Rank()-1) == i) * sign(xz) * sqrt(nc)).run(stream);
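// The line above builds the Householder vector: v = xz with sign(x_i) * ||xz|| added at entry i
// (the sqrt of the squared norm N is folded in here via sqrt(nc)).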

(ci = sum(norm(v))).run(stream);
auto r = x; // alias: this column of R shares storage with x

(ci = NTypeS(2) / ci).run(stream);
(s = sum(norm(v))).run(stream);
//(s = sqrt(s)).run(stream); // sqrt folded into next op

matmul_impl(hi, vm, conj(transpose_matrix(vm)), stream);
// IFELSE to avoid nans when dividing by zero
(IFELSE(sc != NTypeS(0),
w = (v / sqrt(sc)),
w = NTypeS(0))).run(stream);

(hi = Ii - cic * hi).run(stream);
(u = matvec(conj(transpose_matrix(R)), w, 2 , 0)).run(stream);
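// u = 2 * R^H * w (alpha = 2, beta = 0); the matmul below then applies
// R <- R - w * u^H = (I - 2 w w^H) R as a single fused rank-1 update.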

// update panel of r
matmul_impl(rn, hi, ri, stream);
// TODO replace with outer API
matmul_impl(R, wm, conj(transpose_matrix(um)), stream, -1, 1);

// update panel of q
matmul_impl(qn, qi, hi, stream);
// entries below the diagonal should be numerically zero; zero them explicitly to avoid accumulating FP error.
(IF(index(x.Rank()-1) > i, r = ATypeS(0)) ).run(stream);

// deep copy required (can't swap)
// copy current panels into output matrix
(ri = rn).run(stream);
(qi = qn).run(stream);

// R & Q now contain latest
}
// TODO replace with outer API
matmul_impl(wwt, wm, conj(transpose_matrix(wm)), stream);

// zero out lower triangular part of R
(IF(index(RANK-1) < index(RANK-2), R = 0)).run(stream);
(Qin = Q).run(stream); // save input
matmul_impl(Q, Qin, wwt, stream, -2, 1);

}
}
} // end namespace detail
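
For readers tracking the refactor: the loop above still performs classic Householder QR, only with the reflector applied through fused GEMM/GEMV updates instead of explicitly forming each H matrix as the previous version did. A per-iteration sketch for column i (notation mine, not taken from the diff):

    v  = x with entries above row i zeroed, plus sign(x_i) * ||x|| at entry i
    w  = v / ||v||                      (H_i = I - 2 w w^H is never formed)
    R <- H_i R = R - w (2 R^H w)^H      (matvec into u, then matmul with alpha = -1, beta = 1)
    Q <- Q H_i = Q - 2 (Q w) w^H        (wwt = w w^H, then matmul with alpha = -2, beta = 1)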

/**
@@ -204,7 +216,7 @@ namespace detail {
* CUDA stream
*/
template<typename QType, typename RType, typename AType>
void qr_impl(QType Q, RType R, AType A, cudaStream_t stream) {
inline void qr_impl(QType &Q, RType &R, const AType &A, cudaStream_t stream) {
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)

static_assert(A.Rank() >= 2);
@@ -214,40 +226,8 @@ void qr_impl(QType Q, RType R, AType A, cudaStream_t stream) {
MATX_ASSERT_STR(A.Rank() == Q.Rank(), matxInvalidDim, "qr: A and Q must have the same rank");
MATX_ASSERT_STR(A.Rank() == R.Rank(), matxInvalidDim, "qr: A and R must have the same rank");

using ATypeS = typename AType::scalar_type;
using NTypeS = typename inner_op_type_t<ATypeS>::type;
const int RANK = AType::Rank();

index_t m = A.Size(RANK-2);
index_t n = A.Size(RANK-1);
index_t k = std::min(m,n);
if(m<=n) k--; // these matrices have one less update since the diagonal ends on the bottom of the matrix

std::array<index_t, RANK-2> NShape;
for(int i = 0; i < RANK-2; i++) {
NShape[i] = A.Size(i);
}

std::array<index_t, RANK> VMShape;
for(int i = 0; i < RANK-1; i++) {
VMShape[i] = A.Size(i);
}
VMShape[RANK-1] = 1;

std::array<index_t, RANK> HShape;
for(int i = 0; i < RANK-2; i++) {
HShape[i] = A.Size(i);
}
HShape[RANK-2] = m;
HShape[RANK-1] = m;

auto N = make_tensor<NTypeS>(NShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto VM = make_tensor<ATypeS>(VMShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto H = make_tensor<ATypeS>(HShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto QN = make_tensor<ATypeS>(Q.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
auto RN = make_tensor<ATypeS>(R.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);

qr_internal(Q,R,A,N,VM,H,QN,RN,stream);
auto workspace = qr_internal_workspace(A, stream);
qr_internal(Q,R,A,workspace,stream);
}

} // end namespace matx
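
For orientation, a minimal caller-side sketch of the refactored flow; the shapes, element type, iteration count, and stream handling below are illustrative assumptions rather than part of this diff. The workspace is allocated once by qr_internal_workspace and reused across qr_internal calls, which is exactly what qr_impl itself now does.

    #include "matx.h"

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Illustrative sizes only: an 8 x 4 input gives an 8 x 8 Q and an 8 x 4 R.
      auto A = matx::make_tensor<float>({8, 4});
      auto Q = matx::make_tensor<float>({8, 8});
      auto R = matx::make_tensor<float>({8, 4});

      // Allocate the Qin/wwt/u scratch tensors once, then factor repeatedly
      // without any per-call allocations.
      auto ws = matx::detail::qr_internal_workspace(A, stream);
      for (int iter = 0; iter < 10; iter++) {
        // ... refresh the contents of A here ...
        matx::detail::qr_internal(Q, R, A, ws, stream);
      }

      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);
      return 0;
    }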