diff --git a/docs_input/api/math/complex/norm.rst b/docs_input/api/math/complex/norm.rst deleted file mode 100644 index e34ebfb6b..000000000 --- a/docs_input/api/math/complex/norm.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. _norm_func: - -norm -==== - -Square of the magnitude of a complex number - -.. doxygenfunction:: matx::norm(Op t) - -Examples -~~~~~~~~ - -.. literalinclude:: ../../../../test/00_operators/OperatorTests.cu - :language: cpp - :start-after: example-begin norm-test-1 - :end-before: example-end norm-test-1 - :dedent: - diff --git a/docs_input/notebooks/04_radar_pipeline.ipynb b/docs_input/notebooks/04_radar_pipeline.ipynb index eeb840961..622f73e4a 100644 --- a/docs_input/notebooks/04_radar_pipeline.ipynb +++ b/docs_input/notebooks/04_radar_pipeline.ipynb @@ -133,7 +133,7 @@ "In this case we're applying a Hamming window to our partial waveform view. `hamming` is a generator function that generates Hamming window values at each point defined in the tensor. Next, we compute the L2 norm of the partial waveform. The L2 norm is done in two steps currently: an I^2 + Q^2 reduction, followed by a square root on the output of the reduction:\n", "\n", "```c++\n", - " sum(norms, norm(waveformPart), stream);\n", + " sum(norms, abs2(waveformPart), stream);\n", " exec(norms, sqrt(norms), stream);\n", "```\n", "\n", @@ -245,10 +245,10 @@ "## CFAR Detection\n", "The last step in the pipeline is the constant false alarm rate (CFAR) detection. CFAR detection is broadly used to filter observible signals from noise by setting a threshold for observation. A filter mask was created in the constructor to represent the \"field of view\" that we are looking for a target in. By describing the field of view, we can differentiate what parts of the signal we believe are signal power and noise power. \n", "\n", - "CFAR detection begins by taking the signal power of the last stage by summing the squares of all complex numbers (I^2 + Q^2). This is done by using the MatX `norm` operator:\n", + "CFAR detection begins by taking the signal power of the last stage by summing the squares of all complex numbers (I^2 + Q^2). This is done by using the MatX `abs2` operator:\n", "\n", "```c++\n", - "exec(xdPow, norm(cfarIn), stream);\n", + "exec(xdPow, abs2(cfarIn), stream);\n", "```\n", "\n", "xdPow now contains the sum of the squares of each element. Using the computed power per cell, we apply the CFAR mask that was computed in the constructor. 
The mask is applied using a 2D convolution from the MatX `conv2d` function:\n", diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu index 9afb0a0fd..00198bd5a 100644 --- a/examples/fft_conv.cu +++ b/examples/fft_conv.cu @@ -172,4 +172,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); -} +} \ No newline at end of file diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index de08ffd5c..ecaaa08c8 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -253,7 +253,7 @@ class RadarPipeline { (waveformPart = waveformPart * hamming<0>({waveformLength})).run(exec); // compute L2 norm - (norms = sum(norm(waveformPart))).run(exec); + (norms = sum(abs2(waveformPart))).run(exec); (norms = sqrt(norms)).run(exec); (waveformPart = waveformPart / norms).run(exec); @@ -358,7 +358,7 @@ class RadarPipeline { */ void CFARDetections() { - (xPow = norm(tpcView)).run(exec); + (xPow = abs2(tpcView)).run(exec); // Estimate the background average power in each cell // background_averages = conv2(Xpow, mask, 'same') ./ norm; diff --git a/include/matx/core/half_complex.h b/include/matx/core/half_complex.h index a5f0383a9..b074ddec3 100644 --- a/include/matx/core/half_complex.h +++ b/include/matx/core/half_complex.h @@ -740,14 +740,14 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T atan2(const T &x, const T &y) } /** - * @brief Norm operator + * @brief Squared absolute value operator * * @tparam T Underlying type * @param x Value of input * @return Result of operation */ template -__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T norm(const matxHalfComplex &x) +__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T abs2(const matxHalfComplex &x) { if (isinf(x.real())) return static_cast(cuda::std::abs(static_cast(x.real()))); diff --git a/include/matx/core/operator_utils.h b/include/matx/core/operator_utils.h index 5dfbc4d0f..da4afa1b4 100644 --- a/include/matx/core/operator_utils.h +++ b/include/matx/core/operator_utils.h @@ -42,7 +42,7 @@ namespace matx { __MATX_HOST__ __MATX_INLINE__ auto ReduceOutput(Func &&func, OutputOp &&out, InputOp &&in, BeginIter &&bi, EndIter &&ei) { if constexpr (remove_cvref_t::Rank() <= 1 && is_tensor_view_v) { if (out.IsContiguous()) { - if constexpr(ConvertType) { + if constexpr(ConvertType) { return func( in, reinterpret_cast::scalar_type> *>(out.Data()), bi, @@ -64,7 +64,7 @@ namespace matx { template __MATX_HOST__ __MATX_INLINE__ auto ReduceInput(Func &&func, OutputOp &&out, InputOp &&in) { - typename detail::base_type_t in_base = in; + typename detail::base_type_t in_base = in; if constexpr (in_base.Rank() < 2 && is_tensor_view_v) { if (in_base.IsContiguous()) { if constexpr (ConvertType) { @@ -89,8 +89,6 @@ namespace matx { auto collapsed = matx::lcollapse::Rank()>(rcollapse::Rank() - remove_cvref_t::Rank()>(in_base)); const auto &iter = matx::RandomOperatorIterator{collapsed}; - - return ReduceOutput(std::forward(func), std::forward(out), iter, BeginOffset{iter}, EndOffset{iter}); } @@ -116,4 +114,21 @@ namespace matx { return shape; } + + namespace detail { + // Used inside of transforms to allocate temporary output + template + __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::scalar_type **ptr) { + const auto ttl_size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies()) * sizeof(*ptr); + if constexpr (is_cuda_executor_v) { + 
matxAlloc((void**)ptr, ttl_size, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); + make_tensor(tensor, *ptr, shape); + } + else { + matxAlloc((void**)ptr, ttl_size, MATX_HOST_MEMORY); + make_tensor(tensor, *ptr, shape); + } + } + } }; diff --git a/include/matx/core/pybind.h b/include/matx/core/pybind.h index 0a42d6b24..2eb6b3525 100644 --- a/include/matx/core/pybind.h +++ b/include/matx/core/pybind.h @@ -336,35 +336,40 @@ class MatXPybind { using ntype = matx_convert_complex_type; auto ften = pybind11::array_t(np_ten); - for (index_t s1 = 0; s1 < ten.Size(0); s1++) { - if constexpr (RANK > 1) { - for (index_t s2 = 0; s2 < ten.Size(1); s2++) { - if constexpr (RANK > 2) { - for (index_t s3 = 0; s3 < ten.Size(2); s3++) { - if constexpr (RANK > 3) { - for (index_t s4 = 0; s4 < ten.Size(3); s4++) { - if constexpr (RANK > 4) { - for (index_t s5 = 0; s5 < ten.Size(4); s5++) { - ten(s1, s2, s3, s4, s5) = ConvertComplex(ften.at(s1, s2, s3, s4, s5)); + if constexpr (RANK == 0) { + ten() = ConvertComplex(ften.at()); + } + else { + for (index_t s1 = 0; s1 < ten.Size(0); s1++) { + if constexpr (RANK > 1) { + for (index_t s2 = 0; s2 < ten.Size(1); s2++) { + if constexpr (RANK > 2) { + for (index_t s3 = 0; s3 < ten.Size(2); s3++) { + if constexpr (RANK > 3) { + for (index_t s4 = 0; s4 < ten.Size(3); s4++) { + if constexpr (RANK > 4) { + for (index_t s5 = 0; s5 < ten.Size(4); s5++) { + ten(s1, s2, s3, s4, s5) = ConvertComplex(ften.at(s1, s2, s3, s4, s5)); + } + } + else { + ten(s1, s2, s3, s4) = ConvertComplex(ften.at(s1, s2, s3, s4)); } - } - else { - ten(s1, s2, s3, s4) = ConvertComplex(ften.at(s1, s2, s3, s4)); } } - } - else { - ten(s1, s2, s3) = ConvertComplex(ften.at(s1, s2, s3)); + else { + ten(s1, s2, s3) = ConvertComplex(ften.at(s1, s2, s3)); + } } } - } - else { - ten(s1, s2) = ConvertComplex(ften.at(s1, s2)); + else { + ten(s1, s2) = ConvertComplex(ften.at(s1, s2)); + } } } - } - else { - ten(s1) = ConvertComplex(ften.at(s1)); + else { + ten(s1) = ConvertComplex(ften.at(s1)); + } } } } diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index e3bae4d1f..5b6f3dea5 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -78,6 +78,7 @@ class tensor_impl_t { using shape_type = typename Desc::shape_type; using stride_type = typename Desc::stride_type; using matxoplvalue = bool; + using self_type = tensor_impl_t; // Type specifier for signaling this is a matx operation using matxop = bool; @@ -231,6 +232,12 @@ class tensor_impl_t { { } + __MATX_HOST__ void Shallow(const self_type &rhs) noexcept + { + ldata_ = rhs.ldata_; + desc_ = rhs.desc_; + } + /** * Lazy assignment operator=. 
Used to create a "set" object for deferred * execution on a device @@ -811,7 +818,7 @@ class tensor_impl_t { * * @return data pointer */ - auto Data() const noexcept { + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Data() const noexcept { return ldata_; } diff --git a/include/matx/core/tie.h b/include/matx/core/tie.h index c7ca3566f..fdb2d0d90 100644 --- a/include/matx/core/tie.h +++ b/include/matx/core/tie.h @@ -94,6 +94,11 @@ struct mtie : public BaseOp>{ template __MATX_INLINE__ void Exec(Executor &&ex) { + // Run the PreRun on the inner type to avoid allocation but allow transforms using MatX operators + // to do any setup needed + if constexpr (sizeof...(Ts) == 2) { + cuda::std::get(ts_).InnerPreRun(NoShape{}, std::forward(ex)); + } cuda::std::get(ts_).Exec(ts_, std::forward(ex)); } diff --git a/include/matx/operators/all.h b/include/matx/operators/all.h index 0a59dc55b..0407f53f1 100644 --- a/include/matx/operators/all.h +++ b/include/matx/operators/all.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,29 +81,30 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } - template - __MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept - { - if constexpr (is_matx_op()) { - a_.PostRun(std::forward(shape), std::forward(ex)); - } - } + template + __MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept + { + if constexpr (is_matx_op()) { + a_.PostRun(std::forward(shape), std::forward(ex)); + } + } constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const { diff --git a/include/matx/operators/ambgfun.h b/include/matx/operators/ambgfun.h index e9531857f..551ab3298 100644 --- a/include/matx/operators/ambgfun.h +++ b/include/matx/operators/ambgfun.h @@ -50,7 +50,8 @@ namespace matx AMBGFunCutType_t cut_; float cut_val_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, 2> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -111,19 +112,23 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { x_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { y_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - 
make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/any.h b/include/matx/operators/any.h index 2c3d469dd..a2600346d 100644 --- a/include/matx/operators/any.h +++ b/include/matx/operators/any.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/cgsolve.h b/include/matx/operators/cgsolve.h index 1dea4f7c9..fd7ba3cc7 100644 --- a/include/matx/operators/cgsolve.h +++ b/include/matx/operators/cgsolve.h @@ -49,7 +49,8 @@ namespace matx double tol_; int max_iters_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable typename OpA::scalar_type *ptr; public: using matxop = bool; @@ -92,7 +93,7 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -100,11 +101,15 @@ namespace matx if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/channelize_poly.h b/include/matx/operators/channelize_poly.h index 69255442e..d9e81301d 100644 --- a/include/matx/operators/channelize_poly.h +++ b/include/matx/operators/channelize_poly.h @@ -55,7 +55,8 @@ namespace detail { index_t num_channels_; index_t decimation_factor_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; public: using matxop = bool; @@ -95,7 +96,7 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, 
Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -103,11 +104,15 @@ namespace detail { if constexpr (is_matx_op()) { f_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/chol.h b/include/matx/operators/chol.h index 99afee14f..82efde6c2 100644 --- a/include/matx/operators/chol.h +++ b/include/matx/operators/chol.h @@ -73,11 +73,17 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } + + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); if constexpr (is_cuda_executor_v) { make_tensor(tmp_out_, a_.Shape(), MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); diff --git a/include/matx/operators/conv.h b/include/matx/operators/conv.h index 800c69c32..e9833eb0e 100644 --- a/include/matx/operators/conv.h +++ b/include/matx/operators/conv.h @@ -53,7 +53,8 @@ namespace matx matxConvCorrMethod_t method_; PermDims perm_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; static constexpr int MAX_MIN_DIMENSION_DIRECT = 1024; @@ -160,19 +161,23 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } @@ -237,7 +242,8 @@ namespace detail { matxConvCorrMode_t mode_; PermDims perm_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; public: using matxop = bool; @@ -330,7 +336,7 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -338,11 +344,15 @@ namespace detail { if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + 
} - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/corr.h b/include/matx/operators/corr.h index 36a42038d..224e65bd5 100644 --- a/include/matx/operators/corr.h +++ b/include/matx/operators/corr.h @@ -53,7 +53,8 @@ namespace matx matxConvCorrMethod_t method_; PermDims perm_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; public: using matxop = bool; @@ -146,7 +147,7 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -154,11 +155,15 @@ namespace matx if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/cov.h b/include/matx/operators/cov.h index e0ecb3145..a2f00cefd 100644 --- a/include/matx/operators/cov.h +++ b/include/matx/operators/cov.h @@ -46,7 +46,8 @@ namespace matx private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -88,15 +89,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/cumsum.h b/include/matx/operators/cumsum.h index 6a672be74..a872e7630 100644 --- a/include/matx/operators/cumsum.h +++ b/include/matx/operators/cumsum.h @@ -48,7 +48,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -79,15 +80,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void 
InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/det.h b/include/matx/operators/det.h index 20f85ece0..a74411746 100644 --- a/include/matx/operators/det.h +++ b/include/matx/operators/det.h @@ -44,7 +44,8 @@ namespace detail { { private: OpA a_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -72,18 +73,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, a_.Shape(), MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, a_.Shape(), MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), a_.Shape(), &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/eig.h b/include/matx/operators/eig.h index 33b3d6d32..7cf9cf0f4 100644 --- a/include/matx/operators/eig.h +++ b/include/matx/operators/eig.h @@ -49,7 +49,6 @@ namespace detail { OpA a_; cusolverEigMode_t jobz_; cublasFillMode_t uplo_; - matx::tensor_t tmp_out_; public: using matxop = bool; diff --git a/include/matx/operators/einsum.h b/include/matx/operators/einsum.h index f91406f30..982fcfdb4 100644 --- a/include/matx/operators/einsum.h +++ b/include/matx/operators/einsum.h @@ -78,6 +78,12 @@ namespace detail { return matxNoRank; } + template + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept + { + // Maybe do something here later if we take operators as input + } + template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/fft.h b/include/matx/operators/fft.h index 1c86540e1..89cd0cbc7 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -53,9 +53,12 @@ namespace matx FFTType type_; FFTNorm norm_; cuda::std::array out_dims_; - mutable matx::tensor_t, + using ttype = std::conditional_t, typename OpA::scalar_type, - typename scalar_to_complex::ctype>, OpA::Rank()> tmp_out_; + typename scalar_to_complex::ctype>; + // This should be tensor_impl_t, but need to work around issues with temp types returned in fft + mutable matx::tensor_t tmp_out_; + mutable ttype *ptr; public: using matxop = bool; @@ -161,18 +164,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void 
InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else if constexpr (is_host_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MALLOC_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } @@ -299,9 +303,12 @@ namespace matx FFTType type_; FFTNorm norm_; cuda::std::array out_dims_; - mutable matx::tensor_t, + using ttype = std::conditional_t, typename OpA::scalar_type, - typename scalar_to_complex::ctype>, OpA::Rank()> tmp_out_; + typename scalar_to_complex::ctype>; + // This should be tensor_impl_t, but need to work around issues with temp types returned in fft + mutable matx::tensor_t tmp_out_; + mutable ttype *ptr; public: using matxop = bool; @@ -368,18 +375,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else if constexpr (is_host_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MALLOC_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/filter.h b/include/matx/operators/filter.h index f20ca2039..1c2f0041d 100644 --- a/include/matx/operators/filter.h +++ b/include/matx/operators/filter.h @@ -50,7 +50,8 @@ namespace detail { cuda::std::array h_rec_; cuda::std::array h_nonrec_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -87,15 +88,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/hist.h b/include/matx/operators/hist.h index a0c5ca87a..6060b61dc 100644 --- a/include/matx/operators/hist.h +++ b/include/matx/operators/hist.h @@ -50,7 +50,8 @@ namespace detail { typename OpA::scalar_type lower_; typename 
OpA::scalar_type upper_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable int *ptr; public: using matxop = bool; @@ -83,15 +84,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/inverse.h b/include/matx/operators/inverse.h index da463621b..21c3fb56b 100644 --- a/include/matx/operators/inverse.h +++ b/include/matx/operators/inverse.h @@ -46,7 +46,8 @@ namespace detail { { private: OpA a_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -79,18 +80,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } - - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, a_.Shape(), MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, a_.Shape(), MATX_HOST_MEMORY); - } + } + } + + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), a_.Shape(), &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/lu.h b/include/matx/operators/lu.h index aba5b0ddd..16dd5146d 100644 --- a/include/matx/operators/lu.h +++ b/include/matx/operators/lu.h @@ -44,7 +44,8 @@ namespace detail { { private: OpA a_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; diff --git a/include/matx/operators/matmul.h b/include/matx/operators/matmul.h index 185e9eec0..1ea6cce44 100644 --- a/include/matx/operators/matmul.h +++ b/include/matx/operators/matmul.h @@ -51,7 +51,8 @@ namespace matx PermDims perm_; static constexpr int out_rank = cuda::std::max(OpA::Rank(), OpB::Rank()); cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, out_rank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -116,19 +117,23 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), 
std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/matvec.h b/include/matx/operators/matvec.h index 16bdcc4e5..15fe1fa33 100644 --- a/include/matx/operators/matvec.h +++ b/include/matx/operators/matvec.h @@ -50,7 +50,8 @@ namespace matx float beta_; static constexpr int RANK = remove_cvref_t::Rank(); cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, RANK> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -93,7 +94,7 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -101,11 +102,15 @@ namespace matx if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/max.h b/include/matx/operators/max.h index 0792861f5..69aee65c1 100644 --- a/include/matx/operators/max.h +++ b/include/matx/operators/max.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } @@ -131,7 +133,7 @@ namespace detail { template __MATX_INLINE__ auto max(const InType &in, const int (&dims)[D]) { - static_assert(D < InType::Rank(), "reduction dimensions must be <= Rank of input"); + static_assert(D <= InType::Rank(), "reduction dimensions must be <= Rank of input"); auto perm = detail::getPermuteDims(dims); auto permop = permute(in, perm); @@ -142,7 +144,7 @@ template [[deprecated("Use max() instead of rmax() for 
reductions")]] __MATX_INLINE__ auto rmax(const InType &in, const int (&dims)[D]) { - static_assert(D < InType::Rank(), "reduction dimensions must be <= Rank of input"); + static_assert(D <= InType::Rank(), "reduction dimensions must be <= Rank of input"); auto perm = detail::getPermuteDims(dims); auto permop = permute(in, perm); diff --git a/include/matx/operators/mean.h b/include/matx/operators/mean.h index 37d1e73e5..2cf22955a 100644 --- a/include/matx/operators/mean.h +++ b/include/matx/operators/mean.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/median.h b/include/matx/operators/median.h index 866cb7fcb..cce0a4672 100644 --- a/include/matx/operators/median.h +++ b/include/matx/operators/median.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/min.h b/include/matx/operators/min.h index aad175ce6..71049914e 100644 --- a/include/matx/operators/min.h +++ b/include/matx/operators/min.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void 
InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/norm.h b/include/matx/operators/norm.h new file mode 100644 index 000000000..546584baa --- /dev/null +++ b/include/matx/operators/norm.h @@ -0,0 +1,201 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + +#pragma once + + +#include "matx/core/type_utils.h" +#include "matx/operators/base_operator.h" +#include "matx/transforms/norm.h" + +namespace matx +{ + namespace detail { + template + class NormOp : public BaseOp> + { + private: + using out_type = typename inner_op_type_t::scalar_type>::type; + OpA a_; + NormOrder order_; + static constexpr int ORank = std::is_same_v ? 
OpA::Rank() - 1 : OpA::Rank() - 2; + cuda::std::array out_dims_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; + + public: + using matxop = bool; + using scalar_type = out_type; + using matx_transform_op = bool; + using norm_xform_op = bool; + using matx_inner_op_impl = bool; // Indicates this operator uses matx operators for its implementation + + __MATX_INLINE__ std::string str() const { + if constexpr (std::is_same_v) { + return "vector_norm()"; + } + else { + return "matrix_norm"; + } + } + __MATX_INLINE__ NormOp(const OpA &op, NormOrder order) : a_(op), order_(order) { + if constexpr (std::is_same_v) { + MATX_ASSERT_STR(order == NormOrder::NONE || order == NormOrder::L1 || order == NormOrder::L2, matxInvalidParameter, + "Invalid norm order used for vector mode"); + } + + for (int r = 0; r < ORank; r++) { + out_dims_[r] = a_.Size(r); + } + } + + + template + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const { + return tmp_out_(indices...); + }; + + template + void Exec(Out &&out, Executor &&ex) const { + norm_impl(cuda::std::get<0>(out), a_, order_, ex); + } + + static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() + { + return ORank; + } + + template + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { + if constexpr (is_matx_op()) { + a_.PreRun(std::forward(shape), std::forward(ex)); + } + } + + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); + + Exec(std::make_tuple(tmp_out_), std::forward(ex)); + } + + template + __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept + { + if constexpr (is_matx_op()) { + a_.PostRun(std::forward(shape), std::forward(ex)); + } + } + + constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const + { + return out_dims_[dim]; + } + }; + } + + + /** + * @brief Compute a vector norm + * + * Computes various types of matrix and vector norms based on the order + * + * @tparam Op Type of input values to evaluate + * @param op Input values to evaluate + * @param order Order of norm + * @return norm operator + */ + template + __MATX_INLINE__ auto vector_norm(const Op &op, + NormOrder order = NormOrder::NONE) { + return detail::NormOp(op, order); + } + + + /** + * @brief Compute a vector norm + * + * Computes various types of vector norms based on the order + * + * @tparam Op Type of input values to evaluate + * @param op Input values to evaluate + * @param dims Dimensions to perform norm over + * @param order Order of norm + * @return norm operator + */ + template + __MATX_INLINE__ auto vector_norm(const Op &op, const int (&dims)[D], + NormOrder order = NormOrder::NONE) { + auto perm = detail::getPermuteDims(dims); + auto permop = permute(op, perm); + return detail::NormOp(permop, order); + } + + /** + * @brief Compute a matrix norm + * + * Computes various types of matrix and matrix norms based on the order + * + * @tparam Op Type of input values to evaluate + * @param op Input values to evaluate + * @param order Order of norm + * @return norm operator + */ + template + __MATX_INLINE__ auto matrix_norm(const Op &op, + NormOrder order = NormOrder::NONE) { + return detail::NormOp(op, order); + } + + + /** + * @brief Compute a matrix norm + * + 
* Computes various types of matrix norms based on the order + * + * @tparam Op Type of input values to evaluate + * @param op Input values to evaluate + * @param dims Dimensions to perform norm over + * @param order Order of norm + * @return norm operator + */ + template + __MATX_INLINE__ auto matrix_norm(const Op &op, const int (&dims)[D], + NormOrder order = NormOrder::NONE) { + auto perm = detail::getPermuteDims(dims); + auto permop = permute(op, perm); + return detail::NormOp(permop, order); + } +} // end namespace matx diff --git a/include/matx/operators/operators.h b/include/matx/operators/operators.h index 5a249d748..52482ed9c 100644 --- a/include/matx/operators/operators.h +++ b/include/matx/operators/operators.h @@ -76,6 +76,7 @@ #include "matx/operators/lu.h" #include "matx/operators/matmul.h" #include "matx/operators/matvec.h" +#include "matx/operators/norm.h" #include "matx/operators/outer.h" #include "matx/operators/overlap.h" #include "matx/operators/percentile.h" diff --git a/include/matx/operators/outer.h b/include/matx/operators/outer.h index 9e4122755..3ef2d9f41 100644 --- a/include/matx/operators/outer.h +++ b/include/matx/operators/outer.h @@ -50,7 +50,8 @@ namespace matx float beta_; static constexpr int RANK = cuda::std::max(remove_cvref_t::Rank(), remove_cvref_t::Rank()) + 1; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, RANK> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -102,7 +103,7 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -110,11 +111,15 @@ namespace matx if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/percentile.h b/include/matx/operators/percentile.h index 58edf5a3f..c404850b0 100644 --- a/include/matx/operators/percentile.h +++ b/include/matx/operators/percentile.h @@ -49,7 +49,8 @@ namespace detail { uint32_t q_; PercentileMethod method_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + 
InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/prod.h b/include/matx/operators/prod.h index a23fdff56..df08bc113 100644 --- a/include/matx/operators/prod.h +++ b/include/matx/operators/prod.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -77,21 +78,22 @@ namespace detail { static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return ORank; - } + } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/pwelch.h b/include/matx/operators/pwelch.h index a4b4ce283..a255fc8b7 100644 --- a/include/matx/operators/pwelch.h +++ b/include/matx/operators/pwelch.h @@ -51,7 +51,8 @@ namespace matx index_t noverlap_; index_t nfft_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, 1> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -95,19 +96,23 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { - x_.PreRun(Shape(x_), std::forward(ex)); - } + x_.PreRun(std::forward(shape), std::forward(ex)); + } if constexpr (is_matx_op()) { w_.PreRun(Shape(w_), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/reduce.h b/include/matx/operators/reduce.h index 53f73f426..b3c0b8dcb 100644 --- a/include/matx/operators/reduce.h +++ b/include/matx/operators/reduce.h @@ -50,7 +50,8 @@ namespace matx ReductionOp reduction_op_; bool init_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -97,15 +98,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void 
InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/resample_poly.h b/include/matx/operators/resample_poly.h index ff101b810..7fe63ef67 100644 --- a/include/matx/operators/resample_poly.h +++ b/include/matx/operators/resample_poly.h @@ -53,7 +53,8 @@ namespace detail { index_t up_; index_t down_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; public: using matxop = bool; @@ -92,20 +93,24 @@ namespace detail { return OpA::Rank(); } + template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept - { + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { f_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/scalar_ops.h b/include/matx/operators/scalar_ops.h index 60ffa838e..310623ce7 100644 --- a/include/matx/operators/scalar_ops.h +++ b/include/matx/operators/scalar_ops.h @@ -202,7 +202,7 @@ MATX_UNARY_OP_GEN(log10, Log10); MATX_UNARY_OP_GEN(log2, Log2); MATX_UNARY_OP_GEN(log, Log); MATX_UNARY_OP_GEN(abs, Abs); -MATX_UNARY_OP_GEN(norm, Norm); + // Trigonometric functions template static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto _internal_sin(T v1) diff --git a/include/matx/operators/softmax.h b/include/matx/operators/softmax.h index 179e65bcb..f8617233a 100644 --- a/include/matx/operators/softmax.h +++ b/include/matx/operators/softmax.h @@ -47,7 +47,8 @@ namespace matx OpA a_; PermDims perm_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -94,15 +95,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, 
&ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/sort.h b/include/matx/operators/sort.h index 0728e6f99..5817cab4f 100644 --- a/include/matx/operators/sort.h +++ b/include/matx/operators/sort.h @@ -49,7 +49,8 @@ namespace detail { OpA a_; SortDirection_t dir_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/stdd.h b/include/matx/operators/stdd.h index 50161bb19..94c6faff4 100644 --- a/include/matx/operators/stdd.h +++ b/include/matx/operators/stdd.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -61,7 +62,7 @@ namespace detail { __MATX_INLINE__ StddOp(OpA a) : a_(a) { for (int r = 0; r < ORank; r++) { out_dims_[r] = a_.Size(r); - } + } }; template @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } @@ -131,7 +133,7 @@ namespace detail { template __MATX_INLINE__ auto stdd(const InType &in, const int (&dims)[D]) { - static_assert(D < InType::Rank(), "reduction dimensions must be <= Rank of input"); + static_assert(D <= InType::Rank(), "reduction dimensions must be <= Rank of input"); auto perm = detail::getPermuteDims(dims); auto permop = permute(in, perm); diff --git a/include/matx/operators/sum.h b/include/matx/operators/sum.h index 387b37355..36e32ad27 100644 --- a/include/matx/operators/sum.h +++ b/include/matx/operators/sum.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + 
mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -58,7 +59,7 @@ namespace detail { using sum_xform_op = bool; __MATX_INLINE__ std::string str() const { return "sum(" + get_type_str(a_) + ")"; } - __MATX_INLINE__ SumOp(OpA a) : a_(a) { + __MATX_INLINE__ SumOp(const OpA &a) : a_(a) { for (int r = 0; r < ORank; r++) { out_dims_[r] = a_.Size(r); } @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/trace.h b/include/matx/operators/trace.h index 864f62c58..53da7669d 100644 --- a/include/matx/operators/trace.h +++ b/include/matx/operators/trace.h @@ -47,7 +47,8 @@ namespace detail { { private: OpA a_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, 0> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -75,18 +76,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), {}, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/unary_operators.h b/include/matx/operators/unary_operators.h index ce8c86cfe..5f3c4d56b 100644 --- a/include/matx/operators/unary_operators.h +++ b/include/matx/operators/unary_operators.h @@ -175,13 +175,6 @@ namespace matx */ Op conj(Op t) {} - /** - * Compute the squared magnitude of every element in the tensor - * @param t - * Tensor or operator input - */ - Op norm(Op t) {} - /** * Compute absolute value of every element in the tensor. 
For complex numbers * this returns the magnitude, or sqrt(x^2+y^2) @@ -387,7 +380,6 @@ namespace matx } } #endif - DEFINE_UNARY_OP(norm, detail::NormOp); DEFINE_UNARY_OP(abs, detail::AbsOp); DEFINE_UNARY_OP(abs2, detail::Abs2Op); DEFINE_UNARY_OP(sin, detail::SinOp); diff --git a/include/matx/operators/var.h b/include/matx/operators/var.h index 04e3abbdd..e1771ddd6 100644 --- a/include/matx/operators/var.h +++ b/include/matx/operators/var.h @@ -50,7 +50,8 @@ namespace detail { OpA a_; int ddof_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -81,18 +82,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/transforms/ambgfun.h b/include/matx/transforms/ambgfun.h index f83007bb9..6c45e0401 100644 --- a/include/matx/transforms/ambgfun.h +++ b/include/matx/transforms/ambgfun.h @@ -177,7 +177,7 @@ void ambgfun_impl(AMFTensor &amf, XTensor &x, auto x_normdiv_v = make_tensor(x.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream); auto x_norm_v = make_tensor({}, MATX_ASYNC_DEVICE_MEMORY, stream); - (x_norm_v = sum(norm(x))).run(stream); + (x_norm_v = sum(abs2(x))).run(stream); (x_norm_v = sqrt(x_norm_v)).run(stream); (x_normdiv_v = x / x_norm_v).run(stream); @@ -188,7 +188,7 @@ void ambgfun_impl(AMFTensor &amf, XTensor &x, y_normdiv_v.Shallow(make_tensor(y_normdiv_v.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream)); auto y_norm_v = make_tensor({}, MATX_ASYNC_DEVICE_MEMORY, stream); - (y_norm_v = sum(norm(ry))).run(stream); + (y_norm_v = sum(abs2(ry))).run(stream); (y_normdiv_v = ry / y_norm_v).run(stream); } diff --git a/include/matx/transforms/cub.h b/include/matx/transforms/cub.h index 7f7e6fa8f..bb8f54126 100644 --- a/include/matx/transforms/cub.h +++ b/include/matx/transforms/cub.h @@ -870,7 +870,9 @@ inline void ExecSort(OutputTensor &a_out, auto ft = [&](auto &&in, auto &&out, [[maybe_unused]] auto &&unused1, [[maybe_unused]] auto &&unused2) { return cub::DeviceReduce::Max(d_temp, temp_storage_bytes, in, out, static_cast(TotalSize(in_base)), stream); }; + auto rv = ReduceInput(ft, out_base, in_base); + MATX_ASSERT_STR_EXP(rv, cudaSuccess, matxCudaError, "Error in cub::DeviceReduce::Max"); } #endif diff --git a/include/matx/transforms/norm.h b/include/matx/transforms/norm.h new file mode 100644 index 000000000..291eaa97d --- /dev/null +++ b/include/matx/transforms/norm.h @@ -0,0 +1,91 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + + +#pragma once + +#include +#include +#include + +#include "matx/core/error.h" +#include "matx/core/nvtx.h" +#include "matx/core/tensor.h" + +namespace matx { + +enum class NormOrder { + NONE, + L1, + L2, + FROB +}; + +namespace detail { + struct NormTypeVector{}; + struct NormTypeMatrix{}; +}; + + +template +__MATX_INLINE__ void norm_impl(OutputOp out, const InputOp &in, + NormOrder order, Executor &&exec) +{ + MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) + + if constexpr (std::is_same_v) { + if (order == NormOrder::NONE || order == NormOrder::L2) { + (out = sqrt(sum(abs2(in), {InputOp::Rank() - 1}))).run(exec); + } + else if (order == NormOrder::L1) { + (out = sum(abs(in), {InputOp::Rank() - 1})).run(exec); + } + else { + MATX_ASSERT_STR(false, matxInvalidParameter, "Invalid order type for vector norm"); + } + } + else { + if (order == NormOrder::NONE || order == NormOrder::FROB) { + (out = sqrt(sum(abs2(in), {InputOp::Rank() - 2, InputOp::Rank() - 1}))).run(exec); + } + else if (order == NormOrder::L1) { + (out = max(sum(abs(in), {InputOp::Rank() - 2}), {InputOp::Rank() - 2})).run(exec); + } + else { + MATX_ASSERT_STR(false, matxInvalidParameter, "Invalid order type for matrix norm"); + } + } +} + + + +} // end namespace matx diff --git a/include/matx/transforms/qr.h b/include/matx/transforms/qr.h index 891610a5e..fe3a3de89 100644 --- a/include/matx/transforms/qr.h +++ b/include/matx/transforms/qr.h @@ -154,14 +154,14 @@ namespace detail { (xz = (index(x.Rank()-1) >= i) * x).run(stream); // compute L2 norm without sqrt. 
- (N = sum(norm(xz))).run(stream); + (N = sum(abs2(xz))).run(stream); //(N = sqrt(N)).run(stream); // sqrt folded into next op (v = xz + (index(v.Rank()-1) == i) * sign(xz) * sqrt(nc)).run(stream); auto r = x; // alias column of R happens to be the same as x - (s = sum(norm(v))).run(stream); + (s = sum(abs2(v))).run(stream); //(s = sqrt(s)).run(stream); // sqrt folded into next op // IFELSE to avoid nans when dividing by zero diff --git a/include/matx/transforms/reduce.h b/include/matx/transforms/reduce.h index c244fe433..f7d7df7a8 100644 --- a/include/matx/transforms/reduce.h +++ b/include/matx/transforms/reduce.h @@ -1897,7 +1897,8 @@ void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, [[maybe_unused]] H } else { for (index_t b = 0; b < lin.Size(0); b++) { - *(lout + b) = std::accumulate(lin + lbegin[b], lin + lend[b], static_cast(0)); + auto f = std::accumulate(lin + lbegin[b], lin + lend[b], static_cast(0)); + *(lout + b) = f; } } }; diff --git a/include/matx/transforms/svd.h b/include/matx/transforms/svd.h index 864f824c5..69d03e274 100644 --- a/include/matx/transforms/svd.h +++ b/include/matx/transforms/svd.h @@ -185,10 +185,10 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat // normalize x at each iteration to avoid instability // first compute sum of squares, norm will work for complex and real #if 0 - sum(s, norm(x), stream); + sum(s, abs2(x), stream); #else //WAR cub not supporting strided output - (sums = sum(norm(x))).run(stream); + (sums = sum(abs2(x))).run(stream); (s = sums).run(stream); #endif @@ -242,10 -242,10 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat // compute singular value as L2 norm of v // first compute sum of squares, norm will work for complex and real #if 0 - sum(s, norm(v), stream); + sum(s, abs2(v), stream); #else //WAR cub not supporting strided output - (sums = sum(norm(v))).run(stream); + (sums = sum(abs2(v))).run(stream); (s = sums).run(stream);; #endif (s = sqrt(s)).run(stream); @@ -269,10 +269,10 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat // compute singular value as L2 norm of v // first compute sum of squares, norm will work for complex and real #if 0 - sum(s, norm(u), stream); + sum(s, abs2(u), stream); #else //WAR cub not supporting strided output - (sums = sum(norm(u))).run(stream); + (sums = sum(abs2(u))).run(stream); (s = sums).run(stream);; #endif (s = sqrt(s)).run(stream); @@ -445,7 +445,7 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_ //compute L2(Q-Qold) // sqrt folded into next operation - (l2Norm = sum(norm(Q-Qold))).run(stream); + (l2Norm = sum(abs2(Q-Qold))).run(stream); // compute if all batches have converged if constexpr (RANK > 2) { diff --git a/test/00_operators/OperatorTests.cu b/test/00_operators/OperatorTests.cu index 3afe9c826..4786540a7 100644 --- a/test/00_operators/OperatorTests.cu +++ b/test/00_operators/OperatorTests.cu @@ -2203,13 +2203,8 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, OperatorFuncs) exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_conj(c))); - // abs and norm take a complex and output a floating point value + // abs takes a complex and outputs a floating point value auto tdd0 = make_tensor({}); - // example-begin norm-test-1 - (tdd0 = norm(tiv0)).run(exec); - // example-end norm-test-1 - exec.sync(); - EXPECT_TRUE(MatXUtils::MatXTypeCompare(tdd0(), detail::_internal_norm(c))); // example-begin abs-test-1 (tdd0 = 
abs(tiv0)).run(exec); diff --git a/test/00_transform/Norm.cu b/test/00_transform/Norm.cu new file mode 100644 index 000000000..f6a9306a6 --- /dev/null +++ b/test/00_transform/Norm.cu @@ -0,0 +1,147 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+///////////////////////////////////////////////////////////////////////////////// + +#include "assert.h" +#include "matx.h" +#include "test_types.h" +#include "utilities.h" +#include "gtest/gtest.h" + +using namespace matx; + +constexpr index_t a_len = 16; + +template +class NormTest : public ::testing::Test { + using GTestType = std::tuple_element_t<0, T>; + using GExecType = std::tuple_element_t<1, T>; +protected: + void SetUp() override + { + CheckTestTypeSupport(); + pb = std::make_unique(); + + // Half precision needs a bit more tolerance when compared to fp32 + if constexpr (is_complex_half_v || is_matx_half_v) { + thresh = 0.5f; + } + } + + void TearDown() { pb.reset(); } + GExecType exec{}; + std::unique_ptr pb; + tensor_t in_v{{a_len}}; + tensor_t in_m{{a_len, a_len}}; + tensor_t out_v{{}}; + tensor_t out_m{{}}; + float thresh = 0.01f; +}; + + +template +class NormTestFloatTypes + : public NormTest { +}; + + + +TYPED_TEST_SUITE(NormTestFloatTypes, MatXTypesFloatNonComplexAllExecs); + + +TYPED_TEST(NormTestFloatTypes, VectorL1) +{ + MATX_ENTER_HANDLER(); + using TestType = std::tuple_element_t<0, TypeParam>; + this->pb->template InitTVGenerator("00_transforms", "norm_operators", {a_len}); + this->pb->RunTVGenerator("vector_l1"); + this->pb->NumpyToTensorView(this->in_v, "in_v"); + this->pb->NumpyToTensorView(this->out_v, "out_v"); + // example-begin vector-norm-test-1 + (this->out_v = vector_norm(this->in_v, NormOrder::L1)).run(this->exec); + // example-end vector-norm-test-1 + + MATX_TEST_ASSERT_COMPARE(this->pb, this->out_v, "out_v", this->thresh); + + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(NormTestFloatTypes, VectorL2) +{ + MATX_ENTER_HANDLER(); + using TestType = std::tuple_element_t<0, TypeParam>; + this->pb->template InitTVGenerator("00_transforms", "norm_operators", {a_len}); + this->pb->RunTVGenerator("vector_l2"); + this->pb->NumpyToTensorView(this->in_v, "in_v"); + this->pb->NumpyToTensorView(this->out_v, "out_v"); + // example-begin vector-norm-test-2 + (this->out_v = vector_norm(this->in_v, NormOrder::L2)).run(this->exec); + // example-end vector-norm-test-2 + + MATX_TEST_ASSERT_COMPARE(this->pb, this->out_v, "out_v", this->thresh); + + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(NormTestFloatTypes, MatrixL1) +{ + MATX_ENTER_HANDLER(); + using TestType = std::tuple_element_t<0, TypeParam>; + this->pb->template InitTVGenerator("00_transforms", "norm_operators", {a_len, a_len}); + this->pb->RunTVGenerator("matrix_l1"); + this->pb->NumpyToTensorView(this->in_m, "in_m"); + this->pb->NumpyToTensorView(this->out_m, "out_m"); + // example-begin matrix-norm-test-1 + (this->out_m = matrix_norm(this->in_m, NormOrder::L1)).run(this->exec); + // example-end matrix-norm-test-1 + + MATX_TEST_ASSERT_COMPARE(this->pb, this->out_m, "out_m", this->thresh); + + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(NormTestFloatTypes, MatrixL2) +{ + MATX_ENTER_HANDLER(); + using TestType = std::tuple_element_t<0, TypeParam>; + this->pb->template InitTVGenerator("00_transforms", "norm_operators", {a_len, a_len}); + this->pb->RunTVGenerator("matrix_frob"); + this->pb->NumpyToTensorView(this->in_m, "in_m"); + this->pb->NumpyToTensorView(this->out_m, "out_m"); + // example-begin matrix-norm-test-2 + (this->out_m = matrix_norm(this->in_m, NormOrder::FROB)).run(this->exec); + // example-end matrix-norm-test-2 + + MATX_TEST_ASSERT_COMPARE(this->pb, this->out_m, "out_m", this->thresh); + + MATX_EXIT_HANDLER(); +} + diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2ca1b8d74..d1bbf9d03 100644 --- 
a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,6 +17,7 @@ set (test_sources 00_transform/Copy.cu 00_transform/Cov.cu 00_transform/FFT.cu + 00_transform/Norm.cu 00_transform/ResamplePoly.cu 00_transform/Solve.cu 00_solver/Cholesky.cu diff --git a/test/test_vectors/generators/00_transforms.py b/test/test_vectors/generators/00_transforms.py index 2782a943d..5bc7d236c 100755 --- a/test/test_vectors/generators/00_transforms.py +++ b/test/test_vectors/generators/00_transforms.py @@ -392,3 +392,37 @@ def run(self) -> Dict[str, np.ndarray]: self.res['bc'][b] = np.outer(self.res['ba'][b], self.res['bb'][b]) return self.res + +class norm_operators: + def __init__(self, dtype: str, size: List[int]): + self.size = size + self.dtype = dtype + np.random.seed(1234) + + def vector_l2(self) -> Dict[str, np.ndarray]: + seq = matx_common.randn_ndarray((self.size[0],), self.dtype) + return { + 'in_v': seq, + 'out_v': np.linalg.norm(seq, 2) + } + + def vector_l1(self) -> Dict[str, np.ndarray]: + seq = matx_common.randn_ndarray((self.size[0],), self.dtype) + return { + 'in_v': seq, + 'out_v': np.linalg.norm(seq, 1) + } + + def matrix_frob(self) -> Dict[str, np.ndarray]: + seq = matx_common.randn_ndarray((self.size[0],self.size[1]), self.dtype) + return { + 'in_m': seq, + 'out_m': np.linalg.norm(seq, 'fro') + } + + def matrix_l1(self) -> Dict[str, np.ndarray]: + seq = matx_common.randn_ndarray((self.size[0],self.size[1]), self.dtype) + return { + 'in_m': seq, + 'out_m': np.linalg.norm(seq, 1) + } \ No newline at end of file
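For orientation, here is a minimal sketch of how the API introduced by this patch would be called from application code. `abs2()` and `NormOrder` appear directly in the changes above; the `vector_norm`/`matrix_norm` operator wrappers are not shown in this excerpt, so their signatures are inferred from the new tests in `test/00_transform/Norm.cu`. Tensor shapes, the host-side fills, and the final `printf` are illustrative only, and the example assumes default managed-memory tensors and a CUDA executor.

```c++
#include <cstdio>
#include "matx.h"

using namespace matx;

int main() {
  cudaExecutor exec{};

  auto x  = make_tensor<cuda::std::complex<float>>({16});
  auto p  = make_tensor<float>({16});
  auto nv = make_tensor<float>({});   // 0-D outputs hold the scalar norm results
  auto m  = make_tensor<float>({4, 4});
  auto nm = make_tensor<float>({});

  // Illustrative host-side initialization (default tensors are managed memory)
  for (index_t i = 0; i < x.Size(0); i++) {
    x(i) = cuda::std::complex<float>{static_cast<float>(i), 0.5f};
  }
  for (index_t i = 0; i < m.Size(0); i++) {
    for (index_t j = 0; j < m.Size(1); j++) {
      m(i, j) = static_cast<float>(i + j);
    }
  }

  // Elementwise squared magnitude: abs2() is the new name for the old norm() operator
  (p = abs2(x)).run(exec);                         // I^2 + Q^2 per element

  // Reduction-style norms; names follow the new tests in test/00_transform/Norm.cu
  (nv = vector_norm(p, NormOrder::L2)).run(exec);  // L2 norm of a rank-1 operator
  (nm = matrix_norm(m, NormOrder::FROB)).run(exec);// Frobenius norm of a rank-2 operator

  exec.sync();
  printf("L2(p) = %f  Frobenius(m) = %f\n", nv(), nm());
  return 0;
}
```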