diff --git a/examples/svd_power.cu b/examples/svd_power.cu
index a07749eec..6c86167fe 100644
--- a/examples/svd_power.cu
+++ b/examples/svd_power.cu
@@ -41,12 +41,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
 {
   MATX_ENTER_HANDLER();
+#if 1
 
   //using AType = float;
   using AType = cuda::std::complex<float>;
   using SType = float;
 
   cudaStream_t stream = 0;
 
-  int batch = 1;
   int m = 5;
   int n = 4;
@@ -54,6 +54,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   int d = std::min(m,n);
   int k = d; // number of singular values to find
 
+#if 0
+  int batch = 1;
   auto A = make_tensor<AType>({batch, m, n});
   auto U = make_tensor<AType>({batch, m, k});
   auto VT = make_tensor<AType>({batch, k, n});
@@ -66,20 +68,41 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   auto UTU = make_tensor<AType>({batch, k, k});
   auto VVT = make_tensor<AType>({batch, n, n});
   auto VTV = make_tensor<AType>({batch, k, k});
+  auto x0 = random<AType>({batch, d}, NORMAL);
+
+  (A = random<AType>({batch, m, n}, NORMAL)).run(stream);
+
+#else
+  auto A = make_tensor<AType>({m, n});
+  auto U = make_tensor<AType>({m, k});
+  auto VT = make_tensor<AType>({k, n});
+  auto S = make_tensor<SType>({k});
+
+  // for correctness checking
+  auto UD = make_tensor<AType>({m, k});
+  auto UDVT = make_tensor<AType>({m, n});
+  auto UUT = make_tensor<AType>({m, m});
+  auto UTU = make_tensor<AType>({k, k});
+  auto VVT = make_tensor<AType>({n, n});
+  auto VTV = make_tensor<AType>({k, k});
+  auto x0 = random<AType>({d}, NORMAL);
+
+  (A = random<AType>({m, n}, NORMAL)).run(stream);
+
+#endif
 
   std::array<index_t, U.Rank()> Dshape;
   Dshape.fill(matxKeepDim);
   Dshape[U.Rank()-2] = m; // cloning D across
   auto D = clone(S, Dshape);
 
-  int iterations = 10;
+  float tol = (float)1e-3;
+  int iterations = 20;
+
   {
-    auto x0 = random<AType>({batch, d}, NORMAL);
     printf("iterations: %d\n", iterations);
 
-    (A = random<AType>({batch, m, n}, NORMAL)).run(stream);
-
     A.PrefetchDevice(stream);
     U.PrefetchDevice(stream);
     S.PrefetchDevice(stream);
@@ -138,6 +161,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
     printf("A-UDVT\n");
     print(A);
   }
+
   // Same as above but with svdbpi
   {
@@ -145,7 +169,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
     (S = 0).run(stream);
     (VT = 0).run(stream);
     // TODO add k
-    (mtie(U, S, VT) = svdbpi(A, iterations)).run(stream);
+    (mtie(U, S, VT) = svdbpi(A, iterations, tol)).run(stream);
     cudaDeviceSynchronize();
 
     printf("svdbpi:\n");
@@ -194,7 +218,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
     printf("A-UDVT\n");
     print(A);
   }
-
+#endif
   CUDA_CHECK_LAST_ERROR();
   MATX_EXIT_HANDLER();
 }
diff --git a/include/matx/operators/svd.h b/include/matx/operators/svd.h
index be06ce1f2..be20fa6a0 100644
--- a/include/matx/operators/svd.h
+++ b/include/matx/operators/svd.h
@@ -182,7 +182,8 @@ namespace detail {
     {
       private:
         OpA a_;
-        int iterations_;
+        int max_iters_;
+        float tol_;
 
       public:
        using matxop = bool;
@@ -191,7 +192,7 @@ namespace detail {
        using svd_xform_op = bool;
 
        __MATX_INLINE__ std::string str() const { return "svdpi(" + get_type_str(a_) + ")"; }
-       __MATX_INLINE__ SVDBPIOp(const OpA &a, int iterations) : a_(a), iterations_(iterations)
+       __MATX_INLINE__ SVDBPIOp(const OpA &a, int max_iters, float tol) : a_(a), max_iters_(max_iters), tol_(tol)
        { }
 
        // This should never be called
@@ -203,7 +204,7 @@ namespace detail {
          static_assert(is_device_executor_v<Executor>, "svdbpi() only supports the CUDA executor currently");
          static_assert(std::tuple_size_v<remove_cvref_t<Out>> == 4, "Must use mtie with 3 outputs on svdbpi(). ie: (mtie(U, S, V) = svdbpi(A))");
 
-         svdbpi_impl(std::get<0>(out), std::get<1>(out), std::get<2>(out), a_, iterations_, ex.getStream());
+         svdbpi_impl(std::get<0>(out), std::get<1>(out), std::get<2>(out), a_, max_iters_, tol_, ex.getStream());
        }
 
        static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
@@ -236,12 +237,14 @@ namespace detail {
 *
 * @param A
 *   Input tensor or operator for tensor A input with size "batches by m by n"
- * @param iterations
- *   The number of power iterations to perform for each singular value.
+ * @param max_iters
+ *   The approximate maximum number of QR iterations to perform.
+ * @param tol
+ *   The termination tolerance for the QR iteration. Setting this to 0 will skip the tolerance check.
 */
 template<typename AType>
-__MATX_INLINE__ auto svdbpi(AType &A, int iterations) {
-  return detail::SVDBPIOp(A, iterations);
+__MATX_INLINE__ auto svdbpi(AType &A, int max_iters=10, float tol=0.0f) {
+  return detail::SVDBPIOp(A, max_iters, tol);
 }
 
 }
diff --git a/include/matx/transforms/qr.h b/include/matx/transforms/qr.h
index 78ac85683..081767436 100644
--- a/include/matx/transforms/qr.h
+++ b/include/matx/transforms/qr.h
@@ -139,18 +139,10 @@ namespace detail {
     ncShape[RANK-2] = m;
     auto nc = clone(N,ncShape);
 
-    std::array<index_t, RANK> umShape;
-    umShape.fill(matxKeepDim);
-    umShape[RANK-1] = 1;
-
-    auto um = clone(u, umShape);
-    auto vm = clone(v, umShape);
-
     // aliasing some memory here to share storage and provide clarity in the code below
     auto s = N;   // alias
     auto sc = nc; // alias
     auto w = v;   // alias
-    auto wm = vm; // alias
 
     for(int i = 0 ; i < k ; i++) {
@@ -179,15 +171,12 @@ namespace detail {
       (u = matvec(conj(transpose_matrix(R)), w, 2 , 0)).run(stream);
 
-      // TODO replace with outer API
-      matmul_impl(R, wm, conj(transpose_matrix(um)), stream, -1, 1);
+      (R = outer(w, conj(u), -1, 1)).run(stream);
 
       // entries below diagonal should be numerical zero. Zero them out to avoid additional FP error.
       (IF(index(x.Rank()-1) > i, r = ATypeS(0)) ).run(stream);
-
-      // TODO replace with outer API
-      matmul_impl(wwt, wm, conj(transpose_matrix(wm)), stream);
+      (wwt = outer(w, conj(w))).run(stream);
 
       (Qin = Q).run(stream); // save input
       matmul_impl(Q, Qin, wwt, stream, -2, 1);
diff --git a/include/matx/transforms/svd.h b/include/matx/transforms/svd.h
index ac25ef4fb..118b1b607 100644
--- a/include/matx/transforms/svd.h
+++ b/include/matx/transforms/svd.h
@@ -323,13 +323,20 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
   RShape[RANK-1] = d;
   RShape[RANK-2] = d;
 
+  std::array<index_t, RANK-2> l2NormShape;
+  for(int i = 0; i < RANK-2; i++) {
+    l2NormShape[i] = A.Size(i);
+  }
+
   auto AT = make_tensor<ATypeS>(ATShape, MATX_ASYNC_DEVICE_MEMORY, stream);
   auto Q = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto Qold = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
   auto R = make_tensor<ATypeS>(RShape, MATX_ASYNC_DEVICE_MEMORY, stream);
   auto Z = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-
-  return std::tuple(AT, Q, R, Z);
+  auto l2Norm = make_tensor<float>(l2NormShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto converged = make_tensor<int>({}, MATX_ASYNC_DEVICE_MEMORY, stream);
+  return std::tuple(AT, Q, Qold, R, Z, l2Norm, converged);
 }
 
 /**
@@ -356,14 +363,16 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
 *   VT tensor or operator for right singular vectors output as VH with size "batches by min(m,n) by n"
 * @param A
 *   Input tensor or operator for tensor A input with size "batches by m by n"
- * @param iterations
- *   The number of block power iterations to perform.
+ * @param max_iters
+ *   The approximate maximum number of QR iterations to perform.
+ * @param tol
+ *   The termination tolerance for the QR iteration. Setting this to 0 will skip the tolerance check.
 * @param stream
 *   CUDA stream
 */
 template<typename UType, typename SType, typename VTType, typename AType>
-inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iterations, cudaStream_t stream) {
-  MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
+inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_iters, float tol, cudaStream_t stream) {
+  MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL);
 
   static_assert(U.Rank() == A.Rank());
   static_assert(VT.Rank() == A.Rank());
@@ -390,8 +399,15 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iter
   MATX_ASSERT_STR(VT.Size(RANK-1) == n, matxInvalidDim, "svdbpi: VT must have Size(RANK-1) == n");
   MATX_ASSERT_STR(S.Size(RANK-2) == d, matxInvalidDim, "svdbpi: S must have Size(RANK-2) == d");
 
+  int converged_host = 0;
+  cudaStream_t d2h;
+  cudaEvent_t event;
+  if (tol > 0.0f) {
+    cudaStreamCreateWithFlags(&d2h, cudaStreamNonBlocking);
+    cudaEventCreate(&event);
+  }
 
-  auto [AT, Q, R, Z] = svdbpi_impl_workspace(A, stream);
+  auto [AT, Q, Qold, R, Z, l2Norm, converged] = svdbpi_impl_workspace(A, stream);
   auto qr_workspace = qr_internal_workspace(Z, stream);
 
   // create spd matrix
@@ -406,11 +422,49 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iter
   cShape[RANK-1] = matxKeepDim;
   cShape[RANK-2] = matxKeepDim;
 
-  (Q = clone(e2, cShape)).run(stream);
+  (Qold = Q = clone(e2, cShape)).run(stream);
 
-  for(int i = 0; i < iterations; i++) {
+  // TODO multistream?
+  for(int i = 0; i < max_iters; i += 2)
+  {
+    // Double-pump each iteration so both Qold and Q are available for the tolerance check.
+    // We might take one extra iteration, but this hides the overhead of checking convergence.
     matmul_impl(Z, AT, Q, stream);
+    qr_internal(Qold, R, Z, qr_workspace, stream);
+
+    matmul_impl(Z, AT, Qold, stream);
     qr_internal(Q, R, Z, qr_workspace, stream);
+
+    if (tol != 0.0f) {
+      cudaStreamSynchronize(d2h); // wait for the previous d2h transfer to finish
+      if (converged_host) {
+        // converged, exit the loop
+        break;
+      }
+
+      // compute L2(Q-Qold)
+      // the sqrt is folded into the comparison below
+      (l2Norm = sum(norm(Q-Qold))).run(stream);
+
+      // check whether all batches have converged
+      if constexpr (RANK > 2) {
+        (converged = all(as_int(sqrt(l2Norm) < tol))).run(stream);
+      } else {
+        (converged = as_int(sqrt(l2Norm) < tol)).run(stream);
+      }
+
+      // record an event marking when converged is ready in the compute stream
+      cudaEventRecord(event, stream);
+      // make the d2h transfer wait until converged is ready
+      cudaStreamWaitEvent(d2h, event);
+
+      // Copy the convergence flag to the host. The destination is unpinned memory, so on most
+      // systems this copy cannot run asynchronously; issuing it here hides the copy/sync behind
+      // the launch latency and execution of the next iteration.
+      cudaMemcpyAsync(&converged_host, converged.Data(), sizeof(int), cudaMemcpyDeviceToHost, d2h);
+    }
   }
 
   (S = real(sqrt(diag(R)))).run(stream);
@@ -441,6 +495,11 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iter
     // IF required to avoid nans when singular value is 0
     (IF(D != STypeS(0), VT = VT / D)).run(stream);
   }
+
+  if (tol > 0.0f) {
+    cudaEventDestroy(event);
+    cudaStreamDestroy(d2h);
+  }
 }
 
 } // end namespace matx
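Usage sketch (illustrative, not part of the patch): a minimal program showing how the new max_iters/tol arguments of svdbpi() added above would be passed from user code. The batch count, matrix sizes, and tolerance are arbitrary values chosen for the example.

    #include "matx.h"

    int main() {
      using AType = cuda::std::complex<float>;
      cudaStream_t stream = 0;

      // 2 batches of 8x6 matrices: d = min(m, n) = 6 singular values per batch
      auto A  = matx::make_tensor<AType>({2, 8, 6});
      auto U  = matx::make_tensor<AType>({2, 8, 6});
      auto S  = matx::make_tensor<float>({2, 6});
      auto VT = matx::make_tensor<AType>({2, 6, 6});

      (A = matx::random<AType>({2, 8, 6}, matx::NORMAL)).run(stream);

      // Run at most 20 QR iterations, stopping early once successive iterates differ by less
      // than 1e-3 in L2 norm; tol = 0.0f (the default) keeps the fixed-iteration-count behavior.
      (matx::mtie(U, S, VT) = matx::svdbpi(A, 20, 1e-3f)).run(stream);

      cudaStreamSynchronize(stream);
      return 0;
    }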
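The early-exit path in svdbpi_impl overlaps the host-side convergence check with device work. Below is a generic CUDA sketch of that pattern, independent of MatX; the kernels and the convergence test are placeholders. The compute stream produces a device-side flag, a second non-blocking stream copies it to the host, and the host inspects the previous iteration's flag so the compute stream is never stalled.

    #include <cuda_runtime.h>

    // placeholder kernels: one solver step and a device-side convergence test
    __global__ void iterate(float *state) { *state *= 0.5f; }
    __global__ void check(const float *state, int *flag) { *flag = (*state < 1e-3f); }

    void run(float *d_state, int *d_flag, int max_iters) {
      cudaStream_t compute, d2h;
      cudaEvent_t flag_ready;
      cudaStreamCreate(&compute);
      cudaStreamCreateWithFlags(&d2h, cudaStreamNonBlocking);
      cudaEventCreate(&flag_ready);

      int h_flag = 0; // pinned memory (cudaMallocHost) would let the copy below be truly asynchronous
      for (int i = 0; i < max_iters; i++) {
        iterate<<<1, 1, 0, compute>>>(d_state);
        check<<<1, 1, 0, compute>>>(d_state, d_flag);

        cudaStreamSynchronize(d2h); // the copy issued on the previous iteration has landed
        if (h_flag) break;          // converged as of the previous iteration

        cudaEventRecord(flag_ready, compute);  // flag is valid once the compute stream reaches here
        cudaStreamWaitEvent(d2h, flag_ready);  // copy only after the flag has been written
        cudaMemcpyAsync(&h_flag, d_flag, sizeof(int), cudaMemcpyDeviceToHost, d2h);
      }

      cudaStreamSynchronize(compute);
      cudaEventDestroy(flag_ready);
      cudaStreamDestroy(d2h);
      cudaStreamDestroy(compute);
    }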