diff --git a/include/matx/transforms/qr.h b/include/matx/transforms/qr.h
index 74950418e..78ac85683 100644
--- a/include/matx/transforms/qr.h
+++ b/include/matx/transforms/qr.h
@@ -43,145 +43,157 @@ namespace matx {
 
 namespace detail {
 
-  // internal qr implementation which takes memory in api call
-  template<typename QType, typename RType, typename AType,
-           typename NType, typename VMType, typename HType, typename QNType, typename RNType>
-  void qr_internal(QType Q, RType R, AType A,
-                   NType N, VMType VM, HType H, QNType QN, RNType RN, cudaStream_t stream) {
-    MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
+  template<typename AType>
+  inline auto qr_internal_workspace(const AType &A, cudaStream_t stream) {
+    using ATypeS = typename AType::scalar_type;
+    const int RANK = AType::Rank();
 
-    static_assert(A.Rank() >= 2);
-    static_assert(Q.Rank() == A.Rank());
-    static_assert(R.Rank() == A.Rank());
+    index_t m = A.Size(RANK-2);
+    index_t n = A.Size(RANK-1);
 
-    MATX_ASSERT_STR(A.Rank() == Q.Rank(), matxInvalidDim, "qr: A and Q must have the same rank");
-    MATX_ASSERT_STR(A.Rank() == R.Rank(), matxInvalidDim, "qr: A and R must have the same rank");
+    std::array<index_t, RANK-1> uShape;
+    for(int i = 0; i < RANK-2; i++) {
+      uShape[i] = A.Size(i);
+    }
+    uShape[RANK-2] = A.Size(RANK-1);
+
+    auto QShape = A.Shape();
+    QShape[RANK-1] = m;
+
+    auto Qin = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+    auto wwt = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+    auto u = make_tensor<ATypeS>(uShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+
+    return std::make_tuple(Qin, wwt, u);
+  }
 
-    using ATypeS = typename AType::scalar_type;
-    using NTypeS = typename inner_op_type_t<ATypeS>::type;
-    const int RANK = AType::Rank();
+  template<typename QType, typename RType, typename AType, typename WType>
+  inline void qr_internal(QType &Q, RType &R, const AType &A, WType workspace, cudaStream_t stream) {
+    MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
-    index_t m = A.Size(RANK-2);
-    index_t n = A.Size(RANK-1);
-    index_t k = std::min(m,n);
-    if(m<=n) k--; // these matrices have one less update since the diagonal ends on the bottom of the matrix
+    static_assert(A.Rank() >= 2);
+    static_assert(Q.Rank() == A.Rank());
+    static_assert(R.Rank() == A.Rank());
 
-    // Create Identity matrix
-    auto E = eye<ATypeS>({m, m});
+    MATX_ASSERT_STR(A.Rank() == Q.Rank(), matxInvalidDim, "qr: A and Q must have the same rank");
+    MATX_ASSERT_STR(A.Rank() == R.Rank(), matxInvalidDim, "qr: A and R must have the same rank");
 
-    // Clone over batch Dims
-    auto ECShape = Q.Shape();
-    ECShape[RANK-1] = matxKeepDim;
-    ECShape[RANK-2] = matxKeepDim;
+    using ATypeS = typename AType::scalar_type;
+    using NTypeS = typename inner_op_type_t<ATypeS>::type;
+    const int RANK = AType::Rank();
 
-    auto I = clone<RANK>(E, ECShape);
+    index_t m = A.Size(RANK-2);
+    index_t n = A.Size(RANK-1);
+    index_t k = std::min(m,n);
+    if(m<=n) k--; // these matrices have one less update since the diagonal ends on the bottom of the matrix
 
-    auto ci = N; // alias
+    auto Qin = std::get<0>(workspace);
+    auto wwt = std::get<1>(workspace);
+    auto u = std::get<2>(workspace);
 
-    // Inititalize Q
-    (Q = I).run(stream);
-    (R = A).run(stream);
+    static_assert(Qin.Rank() == Q.Rank());
+    static_assert(wwt.Rank() == Q.Rank());
+    static_assert(u.Rank() == Q.Rank()-1);
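+
+    // workspace layout (see qr_internal_workspace): Qin and wwt are
+    // (batch..., m, m) scratch matrices and u is a (batch..., n) scratch vector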
-    // setup slices
-    std::array<index_t, RANK> mSliceB, mSliceE;   // matrix slice starting at i,i
-    mSliceB.fill(0); mSliceE.fill(matxEnd);
+    // Create Identity matrix
+    auto E = eye<ATypeS>({m, m});
 
-    std::array<index_t, RANK> qSliceB, qSliceE;   // matrix slice starting at 0,i
-    qSliceB.fill(0); qSliceE.fill(matxEnd);
+    // Clone over batch Dims
+    auto ECShape = Q.Shape();
+    ECShape[RANK-1] = matxKeepDim;
+    ECShape[RANK-2] = matxKeepDim;
 
-    std::array<index_t, RANK> xSliceB, xSliceE;   // vector slice starting at i,i
-    xSliceB.fill(0); xSliceE.fill(matxEnd);
-    xSliceE[RANK-1] = matxDropDim; // drop last dim to make a vector
+    auto I = clone<RANK>(E, ECShape);
 
-    std::array<index_t, RANK> vSliceB, vSliceE;   // vector slice starting at i,0?
-    vSliceB.fill(0); vSliceE.fill(matxEnd);
-    vSliceE[RANK-1] = matxDropDim; // drop last dim to make a vector
+    // Initialize Q
+    (Q = I).run(stream);
+    (R = A).run(stream);
 
-    std::array<index_t, RANK> vmSliceB, vmSliceE; // vector slice as a matrix starting at i
-    vmSliceB.fill(0); vmSliceE.fill(matxEnd);
+    // we will slice X directly from R.
+    std::array<index_t, RANK> xSliceB, xSliceE;
+    xSliceB.fill(0); xSliceE.fill(matxEnd);
+    xSliceE[RANK-1] = matxDropDim; // drop last dim to make a vector
 
-    // N cloned across x.
-    std::array<index_t, RANK-1> ncShape;
-    ncShape.fill(matxKeepDim);
-    // clone ci across hi
-    std::array<index_t, RANK> cicShape;
-    cicShape.fill(matxKeepDim);
+    // v is of size m x 1. Instead of allocating additional memory we will just reuse a row of Qin
+    std::array<index_t, RANK> vSliceB, vSliceE;
+    vSliceB.fill(0); vSliceE.fill(matxEnd);
+    // select a single row of Qin to alias as v
+    vSliceE[RANK-2] = matxDropDim;
+    auto v = slice(Qin, vSliceB, vSliceE);
+    auto xz = v; // alias
 
-    for(int i = 0 ; i < k ; i++) {
-      // update top left corner of slice
-      mSliceB[RANK-2] = i;
-      mSliceB[RANK-1] = i;
+    // N is of size 1. Instead of allocating additional memory we will just reuse an entry of wwt
+    std::array<index_t, RANK> nSliceB, nSliceE;
+    nSliceB.fill(0); nSliceE.fill(matxEnd);
+    // select a single entry of wwt to alias as N
+    nSliceE[RANK-2] = matxDropDim;
+    nSliceE[RANK-1] = matxDropDim;
 
-      // update q slice
-      qSliceB[RANK-1] = i;
+    auto N = slice(wwt, nSliceB, nSliceE);
 
-      // update v/vm slices to start at i
-      xSliceB[RANK-2] = i;
-      xSliceB[RANK-1] = i;
-      vmSliceB[RANK-2] = i;
-      vSliceB[RANK-2] = i;
+    // N cloned with RANK-2 of size m.
+    std::array<index_t, RANK-1> ncShape;
+    ncShape.fill(matxKeepDim);
+    ncShape[RANK-2] = m;
+    auto nc = clone<RANK-1>(N, ncShape);
 
-      // matrix slices
-      auto qi = slice(Q, qSliceB, qSliceE);
-      auto qn = slice(QN, qSliceB, qSliceE);
-      auto ri = slice(R, mSliceB, mSliceE);
-      auto rn = slice(RN, mSliceB, mSliceE);
-      auto hi = slice(H, mSliceB, mSliceE);
-      auto vm = slice(VM, vmSliceB, vmSliceE);
+    std::array<index_t, RANK> umShape;
+    umShape.fill(matxKeepDim);
+    umShape[RANK-1] = 1;
 
-      // vector slices
-      auto v = slice(VM, vSliceB, vSliceE);
-      auto x = slice(R, xSliceB, xSliceE);
+    auto um = clone<RANK>(u, umShape);
+    auto vm = clone<RANK>(v, umShape);
+
+    // aliasing some memory here to share storage and provide clarity in the code below
+    auto s = N;   // alias
+    auto sc = nc; // alias
+    auto w = v;   // alias
+    auto wm = vm; // alias
 
-      //update clone shape
-      ncShape[RANK-2] = m-i;
-      auto nc = clone<RANK-1>(N, ncShape);
+    for(int i = 0 ; i < k ; i++) {
 
-      // update cloned ci shape
-      cicShape[RANK-2] = m - i;
-      cicShape[RANK-1] = m - i;
-      auto cic = clone<RANK>(ci, cicShape);
+      // slice off a column of R and alias as x
+      xSliceB[RANK-1] = i;
+      auto x = slice(R, xSliceB, xSliceE);
 
-      // create identity matrix and clone across full rank
-      auto Ei = eye<ATypeS>({m-i, m-i});
-      auto Ii = clone<RANK>(Ei, ECShape);
+      // operator which zeros out values above current index in matrix
+      (xz = (index(x.Rank()-1) >= i) * x).run(stream);
 
-      (N = sum(norm(x))).run(stream);
-      (N = sqrt(N)).run(stream);
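+
+      // Householder reflector: v = xz + sign(xz[i]) * ||xz|| * e_i, so that
+      // H = I - 2*w*w^H (with w = v / ||v||) zeros column i of R below the diagonal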
+      // compute L2 norm without sqrt.
+      (N = sum(norm(xz))).run(stream);
+      //(N = sqrt(N)).run(stream); // sqrt folded into next op
 
-      // copy x into v and apply signed addition of nc
-      (IFELSE( (index(v.Rank()-1) == 0),
-               v = x + sign(x, ATypeS(1)) * nc,
-               v = x)).run(stream);
+      (v = xz + (index(v.Rank()-1) == i) * sign(xz) * sqrt(nc)).run(stream);
 
-      (ci = sum(norm(v))).run(stream);
+      auto r = x; // alias: this column of R is the same view as x
 
-      (ci = NTypeS(2) / ci).run(stream);
+      (s = sum(norm(v))).run(stream);
+      //(s = sqrt(s)).run(stream); // sqrt folded into next op
 
-      matmul_impl(hi, vm, conj(transpose_matrix(vm)), stream);
+      // IFELSE to avoid nans when dividing by zero
+      (IFELSE(sc != NTypeS(0),
+              w = (v / sqrt(sc)),
+              w = NTypeS(0))).run(stream);
 
-      (hi = Ii - cic * hi).run(stream);
+      (u = matvec(conj(transpose_matrix(R)), w, 2, 0)).run(stream);
 
-      // update panel of r
-      matmul_impl(rn, hi, ri, stream);
+      // TODO replace with outer API
+      matmul_impl(R, wm, conj(transpose_matrix(um)), stream, -1, 1);
 
-      // update panel of q
-      matmul_impl(qn, qi, hi, stream);
+      // entries below diagonal should be numerical zero. Zero them out to avoid additional FP error.
+      (IF(index(x.Rank()-1) > i, r = ATypeS(0))).run(stream);
 
-      // deep copy required (can't swap)
-      // copy current panels into output matrix
-      (ri = rn).run(stream);
-      (qi = qn).run(stream);
-      // R & Q now contain latest
-    }
+      // TODO replace with outer API
+      matmul_impl(wwt, wm, conj(transpose_matrix(wm)), stream);
 
-    // zero out lower triangular part of R
-    (IF(index(RANK-1) < index(RANK-2), R = 0)).run(stream);
+      (Qin = Q).run(stream); // save input
+      matmul_impl(Q, Qin, wwt, stream, -2, 1);
+    }
   }
 
 } // end namespace detail
 
 /**
@@ -204,7 +216,7 @@
  * CUDA stream
 */
 template<typename QType, typename RType, typename AType>
-void qr_impl(QType Q, RType R, AType A, cudaStream_t stream) {
+inline void qr_impl(QType &Q, RType &R, const AType &A, cudaStream_t stream) {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
   static_assert(A.Rank() >= 2);
@@ -214,40 +226,8 @@ void qr_impl(QType Q, RType R, AType A, cudaStream_t stream) {
   MATX_ASSERT_STR(A.Rank() == Q.Rank(), matxInvalidDim, "qr: A and Q must have the same rank");
   MATX_ASSERT_STR(A.Rank() == R.Rank(), matxInvalidDim, "qr: A and R must have the same rank");
 
-  using ATypeS = typename AType::scalar_type;
-  using NTypeS = typename inner_op_type_t<ATypeS>::type;
-  const int RANK = AType::Rank();
-
-  index_t m = A.Size(RANK-2);
-  index_t n = A.Size(RANK-1);
-  index_t k = std::min(m,n);
-  if(m<=n) k--; // these matrices have one less update since the diagonal ends on the bottom of the matrix
-
-  std::array<index_t, RANK-2> NShape;
-  for(int i = 0; i < RANK-2; i++) {
-    NShape[i] = A.Size(i);
-  }
-
-  std::array<index_t, RANK> VMShape;
-  for(int i = 0; i < RANK-1; i++) {
-    VMShape[i] = A.Size(i);
-  }
-  VMShape[RANK-1] = 1;
-
-  std::array<index_t, RANK> HShape;
-  for(int i = 0; i < RANK-2; i++) {
-    HShape[i] = A.Size(i);
-  }
-  HShape[RANK-2] = m;
-  HShape[RANK-1] = m;
-
-  auto N = make_tensor<NTypeS>(NShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto VM = make_tensor<ATypeS>(VMShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto H = make_tensor<ATypeS>(HShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto QN = make_tensor<ATypeS>(Q.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto RN = make_tensor<ATypeS>(R.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
-
-  qr_internal(Q,R,A,N,VM,H,QN,RN,stream);
+  auto workspace = qr_internal_workspace(A, stream);
+  qr_internal(Q,R,A,workspace,stream);
 }
 
 } // end namespace matx
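With the temporaries hoisted into qr_internal_workspace, callers that factor many same-shaped batches can allocate once and reuse the workspace across calls. A minimal sketch of that pattern (the shapes, batch size, and loop count are hypothetical, and detail::qr_internal is an internal API; qr_impl remains the one-shot entry point):

    #include <matx.h>

    // hypothetical: batched QR of 16 matrices, each 8 x 4, one workspace for all calls
    void repeated_qr(cudaStream_t stream) {
      auto A = matx::make_tensor<float>({16, 8, 4});
      auto Q = matx::make_tensor<float>({16, 8, 8});  // Q is m x m per batch entry
      auto R = matx::make_tensor<float>({16, 8, 4});  // R is m x n per batch entry
      auto ws = matx::detail::qr_internal_workspace(A, stream);
      for (int step = 0; step < 5; step++) {
        // ... refresh A's contents on `stream` here ...
        matx::detail::qr_internal(Q, R, A, ws, stream);  // no per-call allocations
      }
    }

This is the same pattern svdbpi_impl uses internally below, passing qr_internal_workspace(Z, stream) into each qr_internal call inside its iteration loop.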
diff --git a/include/matx/transforms/svd.h b/include/matx/transforms/svd.h
index 698e8da07..ac25ef4fb 100644
--- a/include/matx/transforms/svd.h
+++ b/include/matx/transforms/svd.h
@@ -302,6 +302,36 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat
   }
 }
 
+template<typename AType>
+inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
+  using ATypeS = typename AType::scalar_type;
+  const int RANK = AType::Rank();
+
+  auto m = A.Size(RANK-2);  // rows
+  auto n = A.Size(RANK-1);  // cols
+  auto d = std::min(n,m);   // dim for AAT or ATA
+
+  auto ATShape = A.Shape();
+  ATShape[RANK-2] = d;
+  ATShape[RANK-1] = d;
+
+  auto QShape = A.Shape();
+  QShape[RANK-1] = d;
+  QShape[RANK-2] = d;
+
+  auto RShape = A.Shape();
+  RShape[RANK-1] = d;
+  RShape[RANK-2] = d;
+
+  // temp memory for block power iteration
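+  // AT caches the d x d Gram matrix (A^H A when m >= n, else A A^H); Z holds the
+  // power-iteration product AT * Q, and Q/R receive the QR factors of Z each iteration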
+  auto AT = make_tensor<ATypeS>(ATShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto Q = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto R = make_tensor<ATypeS>(RShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto Z = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+
+  return std::tuple(AT, Q, R, Z);
+}
+
 /**
  * Perform a SVD decomposition using the block power iteration. This version of
  * SVD works well on small n/m with large batch.
@@ -327,12 +357,12 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat
  * @param A
  *   Input tensor or operator for tensor A input with size "batches by m by n"
  * @param iterations
- *   The number of power iterations to perform for each singular value.
+ *   The number of block power iterations to perform.
  * @param stream
  *   CUDA stream
  */
 template<typename UType, typename SType, typename VTType, typename AType>
-void svdbpi_impl(UType &U, SType &S, VTType &VT, AType &A, int iterations, cudaStream_t stream) {
+inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iterations, cudaStream_t stream) {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
   static_assert(U.Rank() == A.Rank());
@@ -346,63 +376,23 @@ void svdbpi_impl(UType &U, SType &S, VTType &VT, AType &A, int iterations, cuda
   auto m = A.Size(RANK-2);  // rows
   auto n = A.Size(RANK-1);  // cols
   auto d = std::min(n,m);   // dim for AAT or ATA
- 
+
   // assert batch sizes are the same
   for(int i = 0 ; i < RANK-2; i++) {
     MATX_ASSERT_STR(U.Size(i) == A.Size(i), matxInvalidDim, "svdbpi: U and A must have the same batch sizes");
     MATX_ASSERT_STR(VT.Size(i) == A.Size(i), matxInvalidDim, "svdbpi: VT and A must have the same batch sizes");
     MATX_ASSERT_STR(S.Size(i) == A.Size(i), matxInvalidDim, "svdbpi: S and A must have the same batch sizes");
   }
- 
+
   MATX_ASSERT_STR(U.Size(RANK-2) == m, matxInvalidDim, "svdbpi: U must have Size(RANK-2) == m");
   MATX_ASSERT_STR(U.Size(RANK-1) == d, matxInvalidDim, "svdbpi: U must have Size(RANK-1) == d");
   MATX_ASSERT_STR(VT.Size(RANK-2) == d, matxInvalidDim, "svdbpi: VT must have Size(RANK-2) == d");
   MATX_ASSERT_STR(VT.Size(RANK-1) == n, matxInvalidDim, "svdbpi: VT must have Size(RANK-1) == n");
   MATX_ASSERT_STR(S.Size(RANK-2) == d, matxInvalidDim, "svdbpi: S must have Size(RANK-2) == d");
-
-
-  auto ATShape = A.Shape();
-  ATShape[RANK-2] = d;
-  ATShape[RANK-1] = d;
-
-  auto QShape = A.Shape();
-  QShape[RANK-1] = d;
-  QShape[RANK-2] = d;
-
-  auto RShape = A.Shape();
-  RShape[RANK-1] = d;
-  RShape[RANK-2] = d;
-  // temp memory for block power iteration
-  auto AT = make_tensor<ATypeS>(ATShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto Q = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto R = make_tensor<ATypeS>(RShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto Z = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-
-  std::array<index_t, RANK-2> NShape;
-  for(int i = 0; i < RANK-2; i++) {
-    NShape[i] = Z.Size(i);
-  }
-  std::array<index_t, RANK> VMShape;
-  for(int i = 0; i < RANK-1; i++) {
-    VMShape[i] = Z.Size(i);
-  }
-  VMShape[RANK-1] = 1;
-
-  std::array<index_t, RANK> HShape;
-  for(int i = 0; i < RANK-2; i++) {
-    HShape[i] = Z.Size(i);
-  }
-  HShape[RANK-2] = Z.Size(RANK-2);
-  HShape[RANK-1] = Z.Size(RANK-2);
-
-  // temp memory for qr
-  auto N = make_tensor<ATypeS>(NShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto VM = make_tensor<ATypeS>(VMShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto H = make_tensor<ATypeS>(HShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto QN = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto RN = make_tensor<ATypeS>(RShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto [AT, Q, R, Z] = svdbpi_impl_workspace(A, stream);
+  auto qr_workspace = qr_internal_workspace(Z, stream);
 
   // create spd matrix
   if ( m >= n ) {
@@ -410,43 +400,46 @@ void svdbpi_impl(UType &U, SType &S, VTType &VT, AType &A, int iterations, cuda
   } else {
     matmul_impl(AT, A, conj(transpose_matrix(A)), stream);
   }
- 
+
   auto e2 = eye<ATypeS>({d,d});
   auto cShape = A.Shape();
   cShape[RANK-1] = matxKeepDim;
   cShape[RANK-2] = matxKeepDim;
- 
+
   (Q = clone<RANK>(e2, cShape)).run(stream);
 
   for(int i = 0; i < iterations; i++) {
     matmul_impl(Z, AT, Q, stream);
-    qr_internal(Q,R,Z,N,VM,H,QN,RN,stream);
+    qr_internal(Q, R, Z, qr_workspace, stream);
   }
 
   (S = real(sqrt(diag(R)))).run(stream);
- 
+
   if( m >= n ) {
     (VT = conj(transpose_matrix(Q))).run(stream);
     matmul_impl(U, A, Q, stream);
- 
+
     auto DShape = U.Shape();
     DShape.fill(matxKeepDim);
     DShape[RANK-2] = m;
     auto D = clone<RANK>(S, DShape);
- 
-    // normalize U by singular values
-    (U = U * STypeS(1) / D).run(stream);
+
+    // normalize U by singular values
+    // IF required to avoid nans when singular value is 0
+    (IF(D != STypeS(0), U = U / D)).run(stream);
+
   } else {
     (U = Q).run(stream);
     matmul_impl(VT, conj(transpose_matrix(Q)), A, stream);
- 
+
    auto DShape = VT.Shape();
    DShape.fill(matxKeepDim);
    DShape[RANK-1] = n;
    auto D = clone<RANK>(S, DShape);
- 
+
    // normalize VT by singular values
-   (VT = VT * STypeS(1) / D).run(stream);
+   // IF required to avoid nans when singular value is 0
+   (IF(D != STypeS(0), VT = VT / D)).run(stream);
   }
 }
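The same split shows up here: all scratch allocation happens once up front via svdbpi_impl_workspace and qr_internal_workspace instead of inside the iteration loop. A hedged usage sketch (the shapes and iteration count are hypothetical, and the call assumes svdbpi_impl is reachable as matx::svdbpi_impl, mirroring qr_impl above); with m = 8 and n = 4 this gives d = min(m, n) = 4:

    #include <matx.h>

    void batched_svd(cudaStream_t stream) {
      auto A  = matx::make_tensor<float>({16, 8, 4});
      auto U  = matx::make_tensor<float>({16, 8, 4});  // m x d per batch entry
      auto S  = matx::make_tensor<float>({16, 4});     // d singular values per batch entry
      auto VT = matx::make_tensor<float>({16, 4, 4});  // d x n per batch entry
      matx::svdbpi_impl(U, S, VT, A, 10, stream);      // 10 block power iterations
    }

Note the IF guards in the new normalization steps: batches with a zero singular value now leave the corresponding columns of U or rows of VT untouched instead of producing NaNs.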