38 changes: 31 additions & 7 deletions examples/svd_power.cu
@@ -41,19 +41,21 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();

#if 1
//using AType = float;
using AType = cuda::std::complex<float>;
using SType = float;

cudaStream_t stream = 0;
int batch = 1;

int m = 5;
int n = 4;

int d = std::min(m,n);
int k = d; // number of singular values to find

#if 0
int batch = 1;
auto A = make_tensor<AType>({batch, m, n});
auto U = make_tensor<AType>({batch, m, k});
auto VT = make_tensor<AType>({batch, k, n});
@@ -66,20 +68,41 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
auto UTU = make_tensor<AType>({batch, k, k});
auto VVT = make_tensor<AType>({batch, n, n});
auto VTV = make_tensor<AType>({batch, k, k});
auto x0 = random<float>({batch, d}, NORMAL);

(A = random<float>({batch, m, n}, NORMAL)).run(stream);

#else
auto A = make_tensor<AType>({m, n});
auto U = make_tensor<AType>({m, k});
auto VT = make_tensor<AType>({k, n});
auto S = make_tensor<SType>({k});

// for correctness checking
auto UD = make_tensor<AType>({m, k});
auto UDVT = make_tensor<AType>({m, n});
auto UUT = make_tensor<AType>({m, m});
auto UTU = make_tensor<AType>({k, k});
auto VVT = make_tensor<AType>({n, n});
auto VTV = make_tensor<AType>({k, k});
auto x0 = random<float>({d}, NORMAL);

(A = random<float>({m, n}, NORMAL)).run(stream);

#endif
std::array<index_t, U.Rank()> Dshape;
Dshape.fill(matxKeepDim);
Dshape[U.Rank()-2] = m;
// clone S across the row dimension to form D (a broadcast of diag(S))
auto D = clone<U.Rank()>(S, Dshape);

int iterations = 10;
float tol = (float)1e-3;
int iterations = 20;

{
auto x0 = random<float>({batch, d}, NORMAL);

printf("iterations: %d\n", iterations);

(A = random<float>({batch, m, n}, NORMAL)).run(stream);

A.PrefetchDevice(stream);
U.PrefetchDevice(stream);
S.PrefetchDevice(stream);
@@ -138,14 +161,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
printf("A-UDVT\n");
print(A);
}

// Same as above but with svdbpi
{

(U = 0).run(stream);
(S = 0).run(stream);
(VT = 0).run(stream);
// TODO add k
(mtie(U, S, VT) = svdbpi(A, iterations)).run(stream);
(mtie(U, S, VT) = svdbpi(A, iterations, tol)).run(stream);

cudaDeviceSynchronize();
printf("svdbpi:\n");
@@ -194,7 +218,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
printf("A-UDVT\n");
print(A);
}

#endif
CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
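
Note (editor's sketch, not part of the diff): the correctness-check tensors declared above are typically filled along these lines. D is S cloned across the row dimension, so the elementwise product U * D scales column j of U by S(j); matmul() is assumed here to be the MatX matmul operator (the example itself may use matmul_impl instead).

(UD = U * D).run(stream);                                    // U * diag(S)
(UDVT = matmul(UD, VT)).run(stream);                         // reconstruct A ~= U * diag(S) * V^H
(UTU = matmul(conj(transpose_matrix(U)), U)).run(stream);    // should be close to identity
(VTV = matmul(VT, conj(transpose_matrix(VT)))).run(stream);  // should be close to identity
print(UDVT);                                                 // compare against the residual the example prints
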
17 changes: 10 additions & 7 deletions include/matx/operators/svd.h
@@ -182,7 +182,8 @@ namespace detail {
{
private:
OpA a_;
int iterations_;
int max_iters_;
float tol_;

public:
using matxop = bool;
@@ -191,7 +192,7 @@ namespace detail {
using svd_xform_op = bool;

__MATX_INLINE__ std::string str() const { return "svdpi(" + get_type_str(a_) + ")"; }
__MATX_INLINE__ SVDBPIOp(const OpA &a, int iterations) : a_(a), iterations_(iterations)
__MATX_INLINE__ SVDBPIOp(const OpA &a, int max_iters, float tol) : a_(a), max_iters_(max_iters), tol_(tol)
{ }

// This should never be called
@@ -203,7 +204,7 @@ namespace detail {
static_assert(is_device_executor_v<Executor>, "svdbpi() only supports the CUDA executor currently");
static_assert(std::tuple_size_v<remove_cvref_t<Out>> == 4, "Must use mtie with 3 outputs on svdbpi(). ie: (mtie(U, S, V) = svdbpi(A))");

svdbpi_impl(std::get<0>(out), std::get<1>(out), std::get<2>(out), a_, iterations_, ex.getStream());
svdbpi_impl(std::get<0>(out), std::get<1>(out), std::get<2>(out), a_, max_iters_, tol_, ex.getStream());
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
@@ -236,12 +237,14 @@ namespace detail {
*
* @param A
* Input tensor or operator for tensor A input with size "batches by m by n"
* @param iterations
* The number of power iterations to perform for each singular value.
* @param max_iters
* The approximate maximum number of QR iterations to perform.
* @param tol
* The termination tolerance for the QR iteration. Setting this to 0 will skip the tolerance check.
*/
template<typename AType>
__MATX_INLINE__ auto svdbpi(AType &A, int iterations) {
return detail::SVDBPIOp(A, iterations);
__MATX_INLINE__ auto svdbpi(AType &A, int max_iters=10, float tol=0.0f) {
return detail::SVDBPIOp(A, max_iters, tol);
}

}
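
For reference, a minimal usage sketch of the updated operator (not part of the diff). The defaults come from the new signature above: max_iters = 10 and tol = 0.0f, where a zero tolerance disables the convergence check. The sketch assumes `using namespace matx;` and an existing CUDA stream, as in examples/svd_power.cu.

int batch = 1, m = 5, n = 4;
int k = std::min(m, n);
auto A  = make_tensor<cuda::std::complex<float>>({batch, m, n});
auto U  = make_tensor<cuda::std::complex<float>>({batch, m, k});
auto S  = make_tensor<float>({batch, k});
auto VT = make_tensor<cuda::std::complex<float>>({batch, k, n});
(A = random<float>({batch, m, n}, NORMAL)).run(stream);
// run at most ~20 QR iterations, stopping early once L2(Q - Qold) < 1e-3 for every batch
(mtie(U, S, VT) = svdbpi(A, 20, 1e-3f)).run(stream);
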
15 changes: 2 additions & 13 deletions include/matx/transforms/qr.h
@@ -139,18 +139,10 @@ namespace detail {
ncShape[RANK-2] = m;
auto nc = clone<RANK-1>(N,ncShape);

std::array<index_t, RANK> umShape;
umShape.fill(matxKeepDim);
umShape[RANK-1] = 1;

auto um = clone<RANK>(u, umShape);
auto vm = clone<RANK>(v, umShape);

// aliasing some memory here to share storage and provide clarity in the code below
auto s = N; // alias
auto sc = nc; // alias
auto w = v; // alias
auto wm = vm; // alias

for(int i = 0 ; i < k ; i++) {

@@ -179,15 +171,12 @@

(u = matvec(conj(transpose_matrix(R)), w, 2 , 0)).run(stream);

// TODO replace with outer API
matmul_impl(R, wm, conj(transpose_matrix(um)), stream, -1, 1);
(R = outer(w, conj(u), -1, 1)).run(stream);

// entries below diagonal should be numerical zero. Zero them out to avoid additional FP error.
(IF(index(x.Rank()-1) > i, r = ATypeS(0)) ).run(stream);


// TODO replace with outer API
matmul_impl(wwt, wm, conj(transpose_matrix(wm)), stream);
(wwt = outer(w, conj(w))).run(stream);

(Qin = Q).run(stream); // save input
matmul_impl(Q, Qin, wwt, stream, -2, 1);
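
Note (editor's sketch, not part of the diff): both replacements above express a rank-1 update directly on the vectors via outer(), instead of a matmul against the {m,1}/{n,1} clones (um, vm, wm) deleted in the first hunk. Under the alpha/beta convention used in the diff, the two forms compute the same result:

// rank-1 update R <- 1*R + (-1) * w * u^H, matching matmul_impl(R, wm, conj(transpose_matrix(um)), stream, -1, 1)
(R = outer(w, conj(u), -1, 1)).run(stream);
// plain outer product wwt = w * w^H (alpha = 1, beta = 0 assumed as the defaults)
(wwt = outer(w, conj(w))).run(stream);
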
77 changes: 68 additions & 9 deletions include/matx/transforms/svd.h
@@ -323,13 +323,20 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
RShape[RANK-1] = d;
RShape[RANK-2] = d;

std::array<index_t,RANK-2> l2NormShape;
for(int i=0;i<RANK-2;i++) {
l2NormShape[i] = A.Size(i);
}

// temp memory for block power iteration
auto AT = make_tensor<ATypeS>(ATShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto Q = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto Qold = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto R = make_tensor<ATypeS>(RShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto Z = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);

return std::tuple(AT, Q, R, Z);
auto l2Norm = make_tensor<float>(l2NormShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto converged = make_tensor<int>({}, MATX_ASYNC_DEVICE_MEMORY, stream);
return std::tuple(AT, Q, Qold, R, Z, l2Norm, converged);
}

/**
@@ -356,14 +363,16 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
* VT tensor or operator for right singular vectors output as VH with size "batches by min(m,n) by n"
* @param A
* Input tensor or operator for tensor A input with size "batches by m by n"
* @param iterations
* The number of block power iterations to perform.
* @param max_iters
* The approximate maximum number of QR iterations to perform.
* @param tol
* The termination tolerance for the QR iteration. Setting this to 0 will skip the tolerance check.
* @param stream
* CUDA stream
*/
template<typename UType, typename SType, typename VTType, typename AType>
inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iterations, cudaStream_t stream) {
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_iters, float tol, cudaStream_t stream) {
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL);

static_assert(U.Rank() == A.Rank());
static_assert(VT.Rank() == A.Rank());
@@ -390,8 +399,15 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iter
MATX_ASSERT_STR(VT.Size(RANK-1) == n, matxInvalidDim, "svdbpi: VT must have Size(RANK-1) == n");
MATX_ASSERT_STR(S.Size(RANK-2) == d, matxInvalidDim, "svdbpi: S must have Size(RANK-2) == d");

int converged_host = false;
cudaStream_t d2h;
cudaEvent_t event;
if(tol>0.0f) {
cudaStreamCreateWithFlags(&d2h,cudaStreamNonBlocking);
cudaEventCreate(&event);
}

auto [AT, Q, R, Z] = svdbpi_impl_workspace(A, stream);
auto [AT, Q, Qold, R, Z, l2Norm, converged] = svdbpi_impl_workspace(A, stream);
auto qr_workspace = qr_internal_workspace(Z, stream);

// create spd matrix
@@ -406,11 +422,49 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iter
cShape[RANK-1] = matxKeepDim;
cShape[RANK-2] = matxKeepDim;

(Q = clone<RANK>(e2, cShape)).run(stream);
(Qold = Q = clone<RANK>(e2, cShape)).run(stream);

for(int i = 0; i < iterations; i++) {
// TODO multistream?
for(int i = 0; i < max_iters; i+=2)
{

// Double pump this iteration so we have both Qold and Q for tolerance checking.
// We might take an extra iteration, but this amortizes the overhead associated with checking convergence.
matmul_impl(Z, AT, Q, stream);
qr_internal(Qold, R, Z, qr_workspace, stream);

matmul_impl(Z, AT, Qold, stream);
qr_internal(Q, R, Z, qr_workspace, stream);

if(tol!=0.0f) {

cudaStreamSynchronize(d2h); // wait for d2h transfer to finish
if(converged_host == true) {
// if converged, exit the loop early
break;
}

// compute L2(Q - Qold); the sqrt is folded into the convergence check below
(l2Norm = sum(norm(Q-Qold))).run(stream);

// compute if all batches have converged
if constexpr (RANK > 2) {
(converged = all(as_int(sqrt(l2Norm) < tol))).run(stream);
} else {
(converged = as_int(sqrt(l2Norm) < tol)).run(stream);
}

// record an event in the compute stream marking when converged is ready
cudaEventRecord(event, stream);
// make the d2h stream wait until converged has been written
cudaStreamWaitEvent(d2h, event);

// copy convergence criteria to host.
// The destination is unpinned host memory, so on most systems this copy cannot run asynchronously.
// We issue it here to hide the copy/sync behind the launch latency and execution of the next iteration.
cudaMemcpyAsync(&converged_host, converged.Data(), sizeof(int), cudaMemcpyDeviceToHost, d2h);
}
}

(S = real(sqrt(diag(R)))).run(stream);
@@ -441,6 +495,11 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iter
// IF required to avoid nans when singular value is 0
(IF(D != STypeS(0), VT = VT / D)).run(stream);
}

if(tol>0.0f) {
cudaEventDestroy(event);
cudaStreamDestroy(d2h);
}
}

} // end namespace matx
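
Editor's note (not part of the diff): the convergence path added above combines three pieces: the double-pumped loop so both Qold and Q exist each pass, a device-side flag reduced from L2(Q - Qold) < tol, and a copy of that flag on a separate non-blocking stream that the host only inspects one iteration later. Below is a stripped-down standalone CUDA sketch of the stream/event pattern, with placeholder kernels (fake_iteration, write_flag) standing in for the MatX matmul/QR work and the reduction.

#include <cuda_runtime.h>
#include <cstdio>

// stand-ins for the real work: the iteration kernels and the reduction that
// writes a 0/1 convergence flag on the device
__global__ void fake_iteration() { }
__global__ void write_flag(int *flag, int iter) { *flag = (iter >= 6); }

int main() {
  int flag_host = 0;
  const int max_iters = 20;
  int *flag_dev = nullptr;
  cudaStream_t compute, d2h;
  cudaEvent_t flag_ready;
  cudaMalloc(&flag_dev, sizeof(int));
  cudaStreamCreate(&compute);
  cudaStreamCreateWithFlags(&d2h, cudaStreamNonBlocking);
  cudaEventCreate(&flag_ready);

  for (int i = 0; i < max_iters; i += 2) {
    fake_iteration<<<1, 1, 0, compute>>>();           // the two "double pumped" steps in the real code
    cudaStreamSynchronize(d2h);                       // wait for the copy issued last iteration
    if (flag_host) break;                             // converged one iteration ago; exit now
    write_flag<<<1, 1, 0, compute>>>(flag_dev, i);    // flag = all batches below tolerance
    cudaEventRecord(flag_ready, compute);             // flag_dev is valid at this point in `compute`
    cudaStreamWaitEvent(d2h, flag_ready, 0);          // the copy must not start before the flag is written
    cudaMemcpyAsync(&flag_host, flag_dev, sizeof(int),
                    cudaMemcpyDeviceToHost, d2h);     // pageable destination, so not fully async on most systems
  }

  cudaStreamSynchronize(compute);
  printf("converged flag: %d\n", flag_host);
  cudaEventDestroy(flag_ready);
  cudaStreamDestroy(d2h);
  cudaStreamDestroy(compute);
  cudaFree(flag_dev);
  return 0;
}
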