38 changes: 31 additions & 7 deletions examples/svd_power.cu
@@ -41,19 +41,21 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();

#if 1
//using AType = float;
using AType = cuda::std::complex<float>;
using SType = float;

cudaStream_t stream = 0;
int batch = 1;

int m = 5;
int n = 4;

int d = std::min(m,n);
int k = d; // number of singular values to find

#if 0
int batch = 1;
auto A = make_tensor<AType>({batch, m, n});
auto U = make_tensor<AType>({batch, m, k});
auto VT = make_tensor<AType>({batch, k, n});
@@ -66,20 +68,41 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
auto UTU = make_tensor<AType>({batch, k, k});
auto VVT = make_tensor<AType>({batch, n, n});
auto VTV = make_tensor<AType>({batch, k, k});
auto x0 = random<float>({batch, d}, NORMAL);

(A = random<float>({batch, m, n}, NORMAL)).run(stream);

#else
auto A = make_tensor<AType>({m, n});
auto U = make_tensor<AType>({m, k});
auto VT = make_tensor<AType>({k, n});
auto S = make_tensor<SType>({k});

// for correctness checking
auto UD = make_tensor<AType>({m, k});
auto UDVT = make_tensor<AType>({m, n});
auto UUT = make_tensor<AType>({m, m});
auto UTU = make_tensor<AType>({k, k});
auto VVT = make_tensor<AType>({n, n});
auto VTV = make_tensor<AType>({k, k});
auto x0 = random<float>({d}, NORMAL);

(A = random<float>({m, n}, NORMAL)).run(stream);

#endif
std::array<index_t, U.Rank()> Dshape;
Dshape.fill(matxKeepDim);
Dshape[U.Rank()-2] = m;
// clone S across the row dimension to form D (a broadcast of diag(S))
auto D = clone<U.Rank()>(S, Dshape);

int iterations = 10;
float tol = (float)1e-3;
int iterations = 20;

{
auto x0 = random<float>({batch, d}, NORMAL);

printf("iterations: %d\n", iterations);

(A = random<float>({batch, m, n}, NORMAL)).run(stream);

A.PrefetchDevice(stream);
U.PrefetchDevice(stream);
S.PrefetchDevice(stream);
@@ -138,14 +161,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
printf("A-UDVT\n");
print(A);
}

// Same as above but with svdbpi
{

(U = 0).run(stream);
(S = 0).run(stream);
(VT = 0).run(stream);
// TODO add k
(mtie(U, S, VT) = svdbpi(A, iterations)).run(stream);
(mtie(U, S, VT) = svdbpi(A, iterations, tol)).run(stream);

cudaDeviceSynchronize();
printf("svdbpi:\n");
@@ -194,7 +218,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
printf("A-UDVT\n");
print(A);
}

#endif
CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
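
Note (editor's sketch, not part of the diff): the correctness-check tensors declared above are typically filled along these lines. D is S cloned across the row dimension, so the elementwise product U * D scales column j of U by S(j); matmul() is assumed here to be the MatX matmul operator (the example itself may use matmul_impl instead).

(UD = U * D).run(stream);                                    // U * diag(S)
(UDVT = matmul(UD, VT)).run(stream);                         // reconstruct A ~= U * diag(S) * V^H
(UTU = matmul(conj(transpose_matrix(U)), U)).run(stream);    // should be close to identity
(VTV = matmul(VT, conj(transpose_matrix(VT)))).run(stream);  // should be close to identity
print(UDVT);                                                 // compare against the residual the example prints
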
17 changes: 10 additions & 7 deletions include/matx/operators/svd.h
@@ -182,7 +182,8 @@ namespace detail {
{
private:
OpA a_;
int iterations_;
int max_iters_;
float tol_;

public:
using matxop = bool;
@@ -191,7 +192,7 @@ namespace detail {
using svd_xform_op = bool;

__MATX_INLINE__ std::string str() const { return "svdpi(" + get_type_str(a_) + ")"; }
__MATX_INLINE__ SVDBPIOp(const OpA &a, int iterations) : a_(a), iterations_(iterations)
__MATX_INLINE__ SVDBPIOp(const OpA &a, int max_iters, float tol) : a_(a), max_iters_(max_iters), tol_(tol)
{ }

// This should never be called
@@ -203,7 +204,7 @@ namespace detail {
static_assert(is_device_executor_v<Executor>, "svdbpi() only supports the CUDA executor currently");
static_assert(std::tuple_size_v<remove_cvref_t<Out>> == 4, "Must use mtie with 3 outputs on svdbpi(). ie: (mtie(U, S, V) = svdbpi(A))");

svdbpi_impl(std::get<0>(out), std::get<1>(out), std::get<2>(out), a_, iterations_, ex.getStream());
svdbpi_impl(std::get<0>(out), std::get<1>(out), std::get<2>(out), a_, max_iters_, tol_, ex.getStream());
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
@@ -236,12 +237,14 @@ namespace detail {
*
* @param A
* Input tensor or operator for tensor A input with size "batches by m by n"
* @param iterations
* The number of power iterations to perform for each singular value.
* @param max_iters
* The approximate maximum number of QR iterations to perform.
* @param tol
* The termination tolerance for the QR iteration. Setting this to 0 will skip the tolerance check.
*/
template<typename AType>
__MATX_INLINE__ auto svdbpi(AType &A, int iterations) {
return detail::SVDBPIOp(A, iterations);
__MATX_INLINE__ auto svdbpi(AType &A, int max_iters=10, float tol=0.0f) {
return detail::SVDBPIOp(A, max_iters, tol);
}

}
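
For reference, a minimal usage sketch of the updated operator (not part of the diff). The defaults come from the new signature above: max_iters = 10 and tol = 0.0f, where a zero tolerance disables the convergence check. The sketch assumes `using namespace matx;` and an existing CUDA stream, as in examples/svd_power.cu.

int batch = 1, m = 5, n = 4;
int k = std::min(m, n);
auto A  = make_tensor<cuda::std::complex<float>>({batch, m, n});
auto U  = make_tensor<cuda::std::complex<float>>({batch, m, k});
auto S  = make_tensor<float>({batch, k});
auto VT = make_tensor<cuda::std::complex<float>>({batch, k, n});
(A = random<float>({batch, m, n}, NORMAL)).run(stream);
// run at most ~20 QR iterations, stopping early once L2(Q - Qold) < 1e-3 for every batch
(mtie(U, S, VT) = svdbpi(A, 20, 1e-3f)).run(stream);
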
15 changes: 2 additions & 13 deletions include/matx/transforms/qr.h
@@ -139,18 +139,10 @@ namespace detail {
ncShape[RANK-2] = m;
auto nc = clone<RANK-1>(N,ncShape);

std::array<index_t, RANK> umShape;
umShape.fill(matxKeepDim);
umShape[RANK-1] = 1;

auto um = clone<RANK>(u, umShape);
auto vm = clone<RANK>(v, umShape);

// aliasing some memory here to share storage and provide clarity in the code below
auto s = N; // alias
auto sc = nc; // alias
auto w = v; // alias
auto wm = vm; // alias

for(int i = 0 ; i < k ; i++) {

@@ -179,15 +171,12 @@

(u = matvec(conj(transpose_matrix(R)), w, 2 , 0)).run(stream);

// TODO replace with outer API
matmul_impl(R, wm, conj(transpose_matrix(um)), stream, -1, 1);
(R = outer(w, conj(u), -1, 1)).run(stream);

// entries below diagonal should be numerical zero. Zero them out to avoid additional FP error.
(IF(index(x.Rank()-1) > i, r = ATypeS(0)) ).run(stream);


// TODO replace with outer API
matmul_impl(wwt, wm, conj(transpose_matrix(wm)), stream);
(wwt = outer(w, conj(w))).run(stream);

(Qin = Q).run(stream); // save input
matmul_impl(Q, Qin, wwt, stream, -2, 1);
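
Note (editor's sketch, not part of the diff): both replacements above express a rank-1 update directly on the vectors via outer(), instead of a matmul against the {m,1}/{n,1} clones (um, vm, wm) deleted in the first hunk. Under the alpha/beta convention used in the diff, the two forms compute the same result:

// rank-1 update R <- 1*R + (-1) * w * u^H, matching matmul_impl(R, wm, conj(transpose_matrix(um)), stream, -1, 1)
(R = outer(w, conj(u), -1, 1)).run(stream);
// plain outer product wwt = w * w^H (alpha = 1, beta = 0 assumed as the defaults)
(wwt = outer(w, conj(w))).run(stream);
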
77 changes: 68 additions & 9 deletions include/matx/transforms/svd.h
@@ -323,13 +323,20 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
RShape[RANK-1] = d;
RShape[RANK-2] = d;

std::array<index_t,RANK-2> l2NormShape;
for(int i=0;i<RANK-2;i++) {
l2NormShape[i] = A.Size(i);
}

// temp memory for block power iteration
auto AT = make_tensor<ATypeS>(ATShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto Q = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto Qold = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto R = make_tensor<ATypeS>(RShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto Z = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);

return std::tuple(AT, Q, R, Z);
auto l2Norm = make_tensor<float>(l2NormShape, MATX_ASYNC_DEVICE_MEMORY, stream);
auto converged = make_tensor<int>({}, MATX_ASYNC_DEVICE_MEMORY, stream);
return std::tuple(AT, Q, Qold, R, Z, l2Norm, converged);
}

/**
@@ -356,14 +363,16 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
* VT tensor or operator for right singular vectors output as VH with size "batches by min(m,n) by n"
* @param A
* Input tensor or operator for tensor A input with size "batches by m by n"
* @param iterations
* The number of block power iterations to perform.
* @param max_iters
* The approximate maximum number of QR iterations to perform.
* @param tol
* The termination tolerance for the QR iteration. Setting this to 0 will skip the tolerance check.
* @param stream
* CUDA stream
*/
template<typename UType, typename SType, typename VTType, typename AType>
inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iterations, cudaStream_t stream) {
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_iters, float tol, cudaStream_t stream) {
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL);

static_assert(U.Rank() == A.Rank());
static_assert(VT.Rank() == A.Rank());
@@ -390,8 +399,15 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iter
MATX_ASSERT_STR(VT.Size(RANK-1) == n, matxInvalidDim, "svdbpi: VT must have Size(RANK-1) == n");
MATX_ASSERT_STR(S.Size(RANK-2) == d, matxInvalidDim, "svdbpi: S must have Size(RANK-2) == d");

int converged_host = false;
cudaStream_t d2h;
cudaEvent_t event;
if(tol>0.0f) {
cudaStreamCreateWithFlags(&d2h,cudaStreamNonBlocking);
cudaEventCreate(&event);
}

auto [AT, Q, R, Z] = svdbpi_impl_workspace(A, stream);
auto [AT, Q, Qold, R, Z, l2Norm, converged] = svdbpi_impl_workspace(A, stream);
auto qr_workspace = qr_internal_workspace(Z, stream);

// create spd matrix
@@ -406,11 +422,49 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iter
cShape[RANK-1] = matxKeepDim;
cShape[RANK-2] = matxKeepDim;

(Q = clone<RANK>(e2, cShape)).run(stream);
(Qold = Q = clone<RANK>(e2, cShape)).run(stream);

for(int i = 0; i < iterations; i++) {
// TODO multistream?
for(int i = 0; i < max_iters; i+=2)
{

// Double pump this iteration so we have both Qold and Q for tolerance checking.
// We might take an extra iteration, but this amortizes the overhead associated with checking convergence.
matmul_impl(Z, AT, Q, stream);
qr_internal(Qold, R, Z, qr_workspace, stream);

matmul_impl(Z, AT, Qold, stream);
qr_internal(Q, R, Z, qr_workspace, stream);

if(tol!=0.0f) {

cudaStreamSynchronize(d2h); // wait for d2h transfer to finish
if(converged_host == true) {
// if converged, exit the loop early
break;
}

// compute L2(Q - Qold); the sqrt is folded into the convergence check below
(l2Norm = sum(norm(Q-Qold))).run(stream);

// compute if all batches have converged
if constexpr (RANK > 2) {
(converged = all(as_int(sqrt(l2Norm) < tol))).run(stream);
} else {
(converged = as_int(sqrt(l2Norm) < tol)).run(stream);
}

// record an event in the compute stream marking when converged is ready
cudaEventRecord(event, stream);
// make the d2h stream wait until converged has been written
cudaStreamWaitEvent(d2h, event);

// copy convergence criteria to host.
// The destination is unpinned host memory, so on most systems this copy cannot run asynchronously.
// We issue it here to hide the copy/sync behind the launch latency and execution of the next iteration.
cudaMemcpyAsync(&converged_host, converged.Data(), sizeof(int), cudaMemcpyDeviceToHost, d2h);
}
}

(S = real(sqrt(diag(R)))).run(stream);
@@ -441,6 +495,11 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iter
// IF required to avoid nans when singular value is 0
(IF(D != STypeS(0), VT = VT / D)).run(stream);
}

if(tol>0.0f) {
cudaEventDestroy(event);
cudaStreamDestroy(d2h);
}
}

} // end namespace matx
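
Editor's note (not part of the diff): the convergence path added above combines three pieces: the double-pumped loop so both Qold and Q exist each pass, a device-side flag reduced from L2(Q - Qold) < tol, and a copy of that flag on a separate non-blocking stream that the host only inspects one iteration later. Below is a stripped-down standalone CUDA sketch of the stream/event pattern, with placeholder kernels (fake_iteration, write_flag) standing in for the MatX matmul/QR work and the reduction.

#include <cuda_runtime.h>
#include <cstdio>

// stand-ins for the real work: the iteration kernels and the reduction that
// writes a 0/1 convergence flag on the device
__global__ void fake_iteration() { }
__global__ void write_flag(int *flag, int iter) { *flag = (iter >= 6); }

int main() {
  int flag_host = 0;
  const int max_iters = 20;
  int *flag_dev = nullptr;
  cudaStream_t compute, d2h;
  cudaEvent_t flag_ready;
  cudaMalloc(&flag_dev, sizeof(int));
  cudaStreamCreate(&compute);
  cudaStreamCreateWithFlags(&d2h, cudaStreamNonBlocking);
  cudaEventCreate(&flag_ready);

  for (int i = 0; i < max_iters; i += 2) {
    fake_iteration<<<1, 1, 0, compute>>>();           // the two "double pumped" steps in the real code
    cudaStreamSynchronize(d2h);                       // wait for the copy issued last iteration
    if (flag_host) break;                             // converged one iteration ago; exit now
    write_flag<<<1, 1, 0, compute>>>(flag_dev, i);    // flag = all batches below tolerance
    cudaEventRecord(flag_ready, compute);             // flag_dev is valid at this point in `compute`
    cudaStreamWaitEvent(d2h, flag_ready, 0);          // the copy must not start before the flag is written
    cudaMemcpyAsync(&flag_host, flag_dev, sizeof(int),
                    cudaMemcpyDeviceToHost, d2h);     // pageable destination, so not fully async on most systems
  }

  cudaStreamSynchronize(compute);
  printf("converged flag: %d\n", flag_host);
  cudaEventDestroy(flag_ready);
  cudaStreamDestroy(d2h);
  cudaStreamDestroy(compute);
  cudaFree(flag_dev);
  return 0;
}
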