diff --git a/include/matx/transforms/qr.h b/include/matx/transforms/qr.h
index 74950418e..78ac85683 100644
--- a/include/matx/transforms/qr.h
+++ b/include/matx/transforms/qr.h
@@ -43,145 +43,157 @@ namespace matx {
 
 namespace detail {
 
-  // internal qr implementation which takes memory in api call
-  template<typename QType, typename RType, typename AType,
-           typename NType, typename VMType, typename HType, typename QNType, typename RNType>
-  void qr_internal(QType Q, RType R, AType A,
-                   NType N, VMType VM, HType H, QNType QN, RNType RN, cudaStream_t stream) {
-    MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
+  template<typename AType>
+  inline auto qr_internal_workspace(const AType &A, cudaStream_t stream) {
+    using ATypeS = typename AType::scalar_type;
+    const int RANK = AType::Rank();
 
-    static_assert(A.Rank() >= 2);
-    static_assert(Q.Rank() == A.Rank());
-    static_assert(R.Rank() == A.Rank());
+    index_t m = A.Size(RANK-2);
+    index_t n = A.Size(RANK-1);
 
-    MATX_ASSERT_STR(A.Rank() == Q.Rank(), matxInvalidDim, "qr: A and Q must have the same rank");
-    MATX_ASSERT_STR(A.Rank() == R.Rank(), matxInvalidDim, "qr: A and R must have the same rank");
+    std::array<index_t, RANK-1> uShape;
+    for(int i = 0; i < RANK-2; i++) {
+      uShape[i] = A.Size(i);
+    }
+    uShape[RANK-2] = A.Size(RANK-1);
+
+    auto QShape = A.Shape();
+    QShape[RANK-1] = m;
+
+    auto Qin = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+    auto wwt = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+    auto u = make_tensor<ATypeS>(uShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+
+    return std::make_tuple(Qin, wwt, u);
+  }
 
-    using ATypeS = typename AType::scalar_type;
-    using NTypeS = typename inner_op_type_t<ATypeS>::type;
-    const int RANK = AType::Rank();
+  template<typename QType, typename RType, typename AType, typename WType>
+  inline void qr_internal(QType &Q, RType &R, const AType &A, WType workspace, cudaStream_t stream) {
+    MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
-    index_t m = A.Size(RANK-2);
-    index_t n = A.Size(RANK-1);
-    index_t k = std::min(m,n);
-    if(m<=n) k--; // these matrices have one less update since the diagonal ends on the bottom of the matrix
+    static_assert(A.Rank() >= 2);
+    static_assert(Q.Rank() == A.Rank());
+    static_assert(R.Rank() == A.Rank());
 
-    // Create Identity matrix
-    auto E = eye<ATypeS>({m, m});
+    MATX_ASSERT_STR(A.Rank() == Q.Rank(), matxInvalidDim, "qr: A and Q must have the same rank");
+    MATX_ASSERT_STR(A.Rank() == R.Rank(), matxInvalidDim, "qr: A and R must have the same rank");
 
-    // Clone over batch Dims
-    auto ECShape = Q.Shape();
-    ECShape[RANK-1] = matxKeepDim;
-    ECShape[RANK-2] = matxKeepDim;
+    using ATypeS = typename AType::scalar_type;
+    using NTypeS = typename inner_op_type_t<ATypeS>::type;
+    const int RANK = AType::Rank();
 
-    auto I = clone<RANK>(E, ECShape);
+    index_t m = A.Size(RANK-2);
+    index_t n = A.Size(RANK-1);
+    index_t k = std::min(m,n);
+    if(m<=n) k--; // these matrices have one less update since the diagonal ends on the bottom of the matrix
 
-    auto ci = N; // alias
+    auto Qin = std::get<0>(workspace);
+    auto wwt = std::get<1>(workspace);
+    auto u = std::get<2>(workspace);
 
-    // Inititalize Q
-    (Q = I).run(stream);
-    (R = A).run(stream);
+    static_assert(Qin.Rank() == Q.Rank());
+    static_assert(wwt.Rank() == Q.Rank());
+    static_assert(u.Rank() == Q.Rank()-1);
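+
+    // workspace layout (see qr_internal_workspace): Qin and wwt are
+    // (batch..., m, m) scratch matrices and u is a (batch..., n) scratch vector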
-    // setup slices
-    std::array<index_t, RANK> mSliceB, mSliceE;   // matrix slice starting at i,i
-    mSliceB.fill(0); mSliceE.fill(matxEnd);
+    // Create Identity matrix
+    auto E = eye<ATypeS>({m, m});
 
-    std::array<index_t, RANK> qSliceB, qSliceE;   // matrix slice starting at 0,i
-    qSliceB.fill(0); qSliceE.fill(matxEnd);
+    // Clone over batch Dims
+    auto ECShape = Q.Shape();
+    ECShape[RANK-1] = matxKeepDim;
+    ECShape[RANK-2] = matxKeepDim;
 
-    std::array<index_t, RANK> xSliceB, xSliceE;   // vector slice starting at i,i
-    xSliceB.fill(0); xSliceE.fill(matxEnd);
-    xSliceE[RANK-1] = matxDropDim; // drop last dim to make a vector
+    auto I = clone<RANK>(E, ECShape);
 
-    std::array<index_t, RANK> vSliceB, vSliceE;   // vector slice starting at i,0?
-    vSliceB.fill(0); vSliceE.fill(matxEnd);
-    vSliceE[RANK-1] = matxDropDim; // drop last dim to make a vector
+    // Initialize Q
+    (Q = I).run(stream);
+    (R = A).run(stream);
 
-    std::array<index_t, RANK> vmSliceB, vmSliceE; // vector slice as a matrix starting at i
-    vmSliceB.fill(0); vmSliceE.fill(matxEnd);
+    // we will slice X directly from R.
+    std::array<index_t, RANK> xSliceB, xSliceE;
+    xSliceB.fill(0); xSliceE.fill(matxEnd);
+    xSliceE[RANK-1] = matxDropDim; // drop last dim to make a vector
 
-    // N cloned across x.
-    std::array<index_t, RANK-1> ncShape;
-    ncShape.fill(matxKeepDim);
-    // clone ci across hi
-    std::array<index_t, RANK> cicShape;
-    cicShape.fill(matxKeepDim);
+    // v is of size m x 1. Instead of allocating additional memory we will just reuse a row of Qin
+    std::array<index_t, RANK> vSliceB, vSliceE;
+    vSliceB.fill(0); vSliceE.fill(matxEnd);
+    // select a single row of Qin to alias as v
+    vSliceE[RANK-2] = matxDropDim;
+    auto v = slice(Qin, vSliceB, vSliceE);
+    auto xz = v; // alias
 
-    for(int i = 0 ; i < k ; i++) {
-      // update top left corner of slice
-      mSliceB[RANK-2] = i;
-      mSliceB[RANK-1] = i;
+    // N is of size 1. Instead of allocating additional memory we will just reuse an entry of wwt
+    std::array<index_t, RANK> nSliceB, nSliceE;
+    nSliceB.fill(0); nSliceE.fill(matxEnd);
+    // select a single entry of wwt to alias as N
+    nSliceE[RANK-2] = matxDropDim;
+    nSliceE[RANK-1] = matxDropDim;
 
-      // update q slice
-      qSliceB[RANK-1] = i;
+    auto N = slice(wwt, nSliceB, nSliceE);
 
-      // update v/vm slices to start at i
-      xSliceB[RANK-2] = i;
-      xSliceB[RANK-1] = i;
-      vmSliceB[RANK-2] = i;
-      vSliceB[RANK-2] = i;
+    // N cloned with RANK-2 of size m.
+    std::array<index_t, RANK-1> ncShape;
+    ncShape.fill(matxKeepDim);
+    ncShape[RANK-2] = m;
+    auto nc = clone<RANK-1>(N, ncShape);
 
-      // matrix slices
-      auto qi = slice(Q, qSliceB, qSliceE);
-      auto qn = slice(QN, qSliceB, qSliceE);
-      auto ri = slice(R, mSliceB, mSliceE);
-      auto rn = slice(RN, mSliceB, mSliceE);
-      auto hi = slice(H, mSliceB, mSliceE);
-      auto vm = slice(VM, vmSliceB, vmSliceE);
+    std::array<index_t, RANK> umShape;
+    umShape.fill(matxKeepDim);
+    umShape[RANK-1] = 1;
 
-      // vector slices
-      auto v = slice(VM, vSliceB, vSliceE);
-      auto x = slice(R, xSliceB, xSliceE);
+    auto um = clone<RANK>(u, umShape);
+    auto vm = clone<RANK>(v, umShape);
+
+    // aliasing some memory here to share storage and provide clarity in the code below
+    auto s = N;   // alias
+    auto sc = nc; // alias
+    auto w = v;   // alias
+    auto wm = vm; // alias
 
-      //update clone shape
-      ncShape[RANK-2] = m-i;
-      auto nc = clone<RANK-1>(N, ncShape);
+    for(int i = 0 ; i < k ; i++) {
 
-      // update cloned ci shape
-      cicShape[RANK-2] = m - i;
-      cicShape[RANK-1] = m - i;
-      auto cic = clone<RANK>(ci, cicShape);
+      // slice off a column of R and alias as x
+      xSliceB[RANK-1] = i;
+      auto x = slice(R, xSliceB, xSliceE);
 
-      // create identity matrix and clone across full rank
-      auto Ei = eye<ATypeS>({m-i, m-i});
-      auto Ii = clone<RANK>(Ei, ECShape);
+      // operator which zeros out values above current index in matrix
+      (xz = (index(x.Rank()-1) >= i) * x).run(stream);
 
-      (N = sum(norm(x))).run(stream);
-      (N = sqrt(N)).run(stream);
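+
+      // Householder reflector: v = xz + sign(xz[i]) * ||xz|| * e_i, so that
+      // H = I - 2*w*w^H (with w = v / ||v||) zeros column i of R below the diagonal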
+      // compute L2 norm without sqrt.
+      (N = sum(norm(xz))).run(stream);
+      //(N = sqrt(N)).run(stream); // sqrt folded into next op
 
-      // copy x into v and apply signed addition of nc
-      (IFELSE( (index(v.Rank()-1) == 0),
-               v = x + sign(x, ATypeS(1)) * nc,
-               v = x)).run(stream);
+      (v = xz + (index(v.Rank()-1) == i) * sign(xz) * sqrt(nc)).run(stream);
 
-      (ci = sum(norm(v))).run(stream);
+      auto r = x; // alias: this column of R is the same view as x
 
-      (ci = NTypeS(2) / ci).run(stream);
+      (s = sum(norm(v))).run(stream);
+      //(s = sqrt(s)).run(stream); // sqrt folded into next op
 
-      matmul_impl(hi, vm, conj(transpose_matrix(vm)), stream);
+      // IFELSE to avoid nans when dividing by zero
+      (IFELSE(sc != NTypeS(0),
+              w = (v / sqrt(sc)),
+              w = NTypeS(0))).run(stream);
 
-      (hi = Ii - cic * hi).run(stream);
+      (u = matvec(conj(transpose_matrix(R)), w, 2, 0)).run(stream);
 
-      // update panel of r
-      matmul_impl(rn, hi, ri, stream);
+      // TODO replace with outer API
+      matmul_impl(R, wm, conj(transpose_matrix(um)), stream, -1, 1);
 
-      // update panel of q
-      matmul_impl(qn, qi, hi, stream);
+      // entries below diagonal should be numerical zero. Zero them out to avoid additional FP error.
+      (IF(index(x.Rank()-1) > i, r = ATypeS(0))).run(stream);
 
-      // deep copy required (can't swap)
-      // copy current panels into output matrix
-      (ri = rn).run(stream);
-      (qi = qn).run(stream);
-      // R & Q now contain latest
-    }
+      // TODO replace with outer API
+      matmul_impl(wwt, wm, conj(transpose_matrix(wm)), stream);
 
-    // zero out lower triangular part of R
-    (IF(index(RANK-1) < index(RANK-2), R = 0)).run(stream);
+      (Qin = Q).run(stream); // save input
+      matmul_impl(Q, Qin, wwt, stream, -2, 1);
+    }
   }
 
 } // end namespace detail
 
 /**
@@ -204,7 +216,7 @@
  * CUDA stream
 */
 template<typename QType, typename RType, typename AType>
-void qr_impl(QType Q, RType R, AType A, cudaStream_t stream) {
+inline void qr_impl(QType &Q, RType &R, const AType &A, cudaStream_t stream) {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
   static_assert(A.Rank() >= 2);
@@ -214,40 +226,8 @@ void qr_impl(QType Q, RType R, AType A, cudaStream_t stream) {
   MATX_ASSERT_STR(A.Rank() == Q.Rank(), matxInvalidDim, "qr: A and Q must have the same rank");
   MATX_ASSERT_STR(A.Rank() == R.Rank(), matxInvalidDim, "qr: A and R must have the same rank");
 
-  using ATypeS = typename AType::scalar_type;
-  using NTypeS = typename inner_op_type_t<ATypeS>::type;
-  const int RANK = AType::Rank();
-
-  index_t m = A.Size(RANK-2);
-  index_t n = A.Size(RANK-1);
-  index_t k = std::min(m,n);
-  if(m<=n) k--; // these matrices have one less update since the diagonal ends on the bottom of the matrix
-
-  std::array<index_t, RANK-2> NShape;
-  for(int i = 0; i < RANK-2; i++) {
-    NShape[i] = A.Size(i);
-  }
-
-  std::array<index_t, RANK> VMShape;
-  for(int i = 0; i < RANK-1; i++) {
-    VMShape[i] = A.Size(i);
-  }
-  VMShape[RANK-1] = 1;
-
-  std::array<index_t, RANK> HShape;
-  for(int i = 0; i < RANK-2; i++) {
-    HShape[i] = A.Size(i);
-  }
-  HShape[RANK-2] = m;
-  HShape[RANK-1] = m;
-
-  auto N = make_tensor<NTypeS>(NShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto VM = make_tensor<ATypeS>(VMShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto H = make_tensor<ATypeS>(HShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto QN = make_tensor<ATypeS>(Q.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto RN = make_tensor<ATypeS>(R.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
-
-  qr_internal(Q,R,A,N,VM,H,QN,RN,stream);
+  auto workspace = qr_internal_workspace(A, stream);
+  qr_internal(Q,R,A,workspace,stream);
 }
 
 } // end namespace matx
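With the temporaries hoisted into qr_internal_workspace, callers that factor many same-shaped batches can allocate once and reuse the workspace across calls. A minimal sketch of that pattern (the shapes, batch size, and loop count are hypothetical, and detail::qr_internal is an internal API; qr_impl remains the one-shot entry point):

    #include <matx.h>

    // hypothetical: batched QR of 16 matrices, each 8 x 4, one workspace for all calls
    void repeated_qr(cudaStream_t stream) {
      auto A = matx::make_tensor<float>({16, 8, 4});
      auto Q = matx::make_tensor<float>({16, 8, 8});  // Q is m x m per batch entry
      auto R = matx::make_tensor<float>({16, 8, 4});  // R is m x n per batch entry
      auto ws = matx::detail::qr_internal_workspace(A, stream);
      for (int step = 0; step < 5; step++) {
        // ... refresh A's contents on `stream` here ...
        matx::detail::qr_internal(Q, R, A, ws, stream);  // no per-call allocations
      }
    }

This is the same pattern svdbpi_impl uses internally below, passing qr_internal_workspace(Z, stream) into each qr_internal call inside its iteration loop.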
diff --git a/include/matx/transforms/svd.h b/include/matx/transforms/svd.h
index 698e8da07..ac25ef4fb 100644
--- a/include/matx/transforms/svd.h
+++ b/include/matx/transforms/svd.h
@@ -302,6 +302,36 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat
   }
 }
 
+template<typename AType>
+inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
+  using ATypeS = typename AType::scalar_type;
+  const int RANK = AType::Rank();
+
+  auto m = A.Size(RANK-2);  // rows
+  auto n = A.Size(RANK-1);  // cols
+  auto d = std::min(n,m);   // dim for AAT or ATA
+
+  auto ATShape = A.Shape();
+  ATShape[RANK-2] = d;
+  ATShape[RANK-1] = d;
+
+  auto QShape = A.Shape();
+  QShape[RANK-1] = d;
+  QShape[RANK-2] = d;
+
+  auto RShape = A.Shape();
+  RShape[RANK-1] = d;
+  RShape[RANK-2] = d;
+
+  // temp memory for block power iteration
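+  // AT caches the d x d Gram matrix (A^H A when m >= n, else A A^H); Z holds the
+  // power-iteration product AT * Q, and Q/R receive the QR factors of Z each iteration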
+  auto AT = make_tensor<ATypeS>(ATShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto Q = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto R = make_tensor<ATypeS>(RShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto Z = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+
+  return std::tuple(AT, Q, R, Z);
+}
+
 /**
  * Perform a SVD decomposition using the block power iteration. This version of
  * SVD works well on small n/m with large batch.
@@ -327,12 +357,12 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat
  * @param A
  *   Input tensor or operator for tensor A input with size "batches by m by n"
  * @param iterations
- *   The number of power iterations to perform for each singular value.
+ *   The number of block power iterations to perform.
  * @param stream
  *   CUDA stream
  */
 template<typename UType, typename SType, typename VTType, typename AType>
-void svdbpi_impl(UType &U, SType &S, VTType &VT, AType &A, int iterations, cudaStream_t stream) {
+inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int iterations, cudaStream_t stream) {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
   static_assert(U.Rank() == A.Rank());
@@ -346,63 +376,23 @@ void svdbpi_impl(UType &U, SType &S, VTType &VT, AType &A, int iterations, cuda
   auto m = A.Size(RANK-2);  // rows
   auto n = A.Size(RANK-1);  // cols
   auto d = std::min(n,m);   // dim for AAT or ATA
- 
+
   // assert batch sizes are the same
   for(int i = 0 ; i < RANK-2; i++) {
     MATX_ASSERT_STR(U.Size(i) == A.Size(i), matxInvalidDim, "svdbpi: U and A must have the same batch sizes");
     MATX_ASSERT_STR(VT.Size(i) == A.Size(i), matxInvalidDim, "svdbpi: VT and A must have the same batch sizes");
     MATX_ASSERT_STR(S.Size(i) == A.Size(i), matxInvalidDim, "svdbpi: S and A must have the same batch sizes");
   }
- 
+
   MATX_ASSERT_STR(U.Size(RANK-2) == m, matxInvalidDim, "svdbpi: U must have Size(RANK-2) == m");
   MATX_ASSERT_STR(U.Size(RANK-1) == d, matxInvalidDim, "svdbpi: U must have Size(RANK-1) == d");
   MATX_ASSERT_STR(VT.Size(RANK-2) == d, matxInvalidDim, "svdbpi: VT must have Size(RANK-2) == d");
   MATX_ASSERT_STR(VT.Size(RANK-1) == n, matxInvalidDim, "svdbpi: VT must have Size(RANK-1) == n");
   MATX_ASSERT_STR(S.Size(RANK-2) == d, matxInvalidDim, "svdbpi: S must have Size(RANK-2) == d");
-
-
-  auto ATShape = A.Shape();
-  ATShape[RANK-2] = d;
-  ATShape[RANK-1] = d;
-
-  auto QShape = A.Shape();
-  QShape[RANK-1] = d;
-  QShape[RANK-2] = d;
-
-  auto RShape = A.Shape();
-  RShape[RANK-1] = d;
-  RShape[RANK-2] = d;
-  // temp memory for block power iteration
-  auto AT = make_tensor<ATypeS>(ATShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto Q = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto R = make_tensor<ATypeS>(RShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto Z = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-
-  std::array<index_t, RANK-2> NShape;
-  for(int i = 0; i < RANK-2; i++) {
-    NShape[i] = Z.Size(i);
-  }
-  std::array<index_t, RANK> VMShape;
-  for(int i = 0; i < RANK-1; i++) {
-    VMShape[i] = Z.Size(i);
-  }
-  VMShape[RANK-1] = 1;
-
-  std::array<index_t, RANK> HShape;
-  for(int i = 0; i < RANK-2; i++) {
-    HShape[i] = Z.Size(i);
-  }
-  HShape[RANK-2] = Z.Size(RANK-2);
-  HShape[RANK-1] = Z.Size(RANK-2);
-
-  // temp memory for qr
-  auto N = make_tensor<ATypeS>(NShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto VM = make_tensor<ATypeS>(VMShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto H = make_tensor<ATypeS>(HShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto QN = make_tensor<ATypeS>(QShape, MATX_ASYNC_DEVICE_MEMORY, stream);
-  auto RN = make_tensor<ATypeS>(RShape, MATX_ASYNC_DEVICE_MEMORY, stream);
+  auto [AT, Q, R, Z] = svdbpi_impl_workspace(A, stream);
+  auto qr_workspace = qr_internal_workspace(Z, stream);
 
   // create spd matrix
   if ( m >= n ) {
@@ -410,43 +400,46 @@ void svdbpi_impl(UType &U, SType &S, VTType &VT, AType &A, int iterations, cuda
   } else {
     matmul_impl(AT, A, conj(transpose_matrix(A)), stream);
   }
- 
+
   auto e2 = eye<ATypeS>({d,d});
   auto cShape = A.Shape();
   cShape[RANK-1] = matxKeepDim;
   cShape[RANK-2] = matxKeepDim;
- 
+
   (Q = clone<RANK>(e2, cShape)).run(stream);
 
   for(int i = 0; i < iterations; i++) {
     matmul_impl(Z, AT, Q, stream);
-    qr_internal(Q,R,Z,N,VM,H,QN,RN,stream);
+    qr_internal(Q, R, Z, qr_workspace, stream);
   }
 
   (S = real(sqrt(diag(R)))).run(stream);
- 
+
   if( m >= n ) {
     (VT = conj(transpose_matrix(Q))).run(stream);
     matmul_impl(U, A, Q, stream);
- 
+
     auto DShape = U.Shape();
     DShape.fill(matxKeepDim);
     DShape[RANK-2] = m;
     auto D = clone<RANK>(S, DShape);
- 
-    // normalize U by singular values
-    (U = U * STypeS(1) / D).run(stream);
+
+    // normalize U by singular values
+    // IF required to avoid nans when singular value is 0
+    (IF(D != STypeS(0), U = U / D)).run(stream);
+
   } else {
     (U = Q).run(stream);
     matmul_impl(VT, conj(transpose_matrix(Q)), A, stream);
- 
+
    auto DShape = VT.Shape();
    DShape.fill(matxKeepDim);
    DShape[RANK-1] = n;
    auto D = clone<RANK>(S, DShape);
- 
+
    // normalize VT by singular values
-   (VT = VT * STypeS(1) / D).run(stream);
+   // IF required to avoid nans when singular value is 0
+   (IF(D != STypeS(0), VT = VT / D)).run(stream);
   }
 }
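The same split shows up here: all scratch allocation happens once up front via svdbpi_impl_workspace and qr_internal_workspace instead of inside the iteration loop. A hedged usage sketch (the shapes and iteration count are hypothetical, and the call assumes svdbpi_impl is reachable as matx::svdbpi_impl, mirroring qr_impl above); with m = 8 and n = 4 this gives d = min(m, n) = 4:

    #include <matx.h>

    void batched_svd(cudaStream_t stream) {
      auto A  = matx::make_tensor<float>({16, 8, 4});
      auto U  = matx::make_tensor<float>({16, 8, 4});  // m x d per batch entry
      auto S  = matx::make_tensor<float>({16, 4});     // d singular values per batch entry
      auto VT = matx::make_tensor<float>({16, 4, 4});  // d x n per batch entry
      matx::svdbpi_impl(U, S, VT, A, 10, stream);      // 10 block power iterations
    }

Note the IF guards in the new normalization steps: batches with a zero singular value now leave the corresponding columns of U or rows of VT untouched instead of producing NaNs.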