diff --git a/CMakeLists.txt b/CMakeLists.txt index 733ca6836..83e074706 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,10 +185,10 @@ if (MATX_NVTX_FLAGS) target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS) endif() if (MATX_BUILD_32_BIT) - set(INT_TYPE "lp64") - target_compile_definitions(matx INTERFACE INDEX_32_BIT) + set(MATX_NVPL_INT_TYPE "lp64") + target_compile_definitions(matx INTERFACE MATX_INDEX_32_BIT) else() - set(INT_TYPE "ilp64") + set(MATX_NVPL_INT_TYPE "ilp64") endif() # Host support @@ -211,13 +211,13 @@ if (MATX_EN_NVPL OR MATX_EN_X86_FFTW OR MATX_EN_BLIS OR MATX_EN_OPENBLAS) endif() if (MATX_EN_NVPL) - message(STATUS "Enabling NVPL library support for ARM CPUs with ${INT_TYPE} interface") + message(STATUS "Enabling NVPL library support for ARM CPUs with ${MATX_NVPL_INT_TYPE} interface") find_package(nvpl REQUIRED COMPONENTS fft blas lapack HINTS ${blas_DIR}) if (NOT MATX_BUILD_32_BIT) target_compile_definitions(matx INTERFACE NVPL_ILP64) endif() target_compile_definitions(matx INTERFACE NVPL_LAPACK_COMPLEX_CUSTOM) - target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${INT_TYPE}_omp nvpl::lapack_${INT_TYPE}_omp) + target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${MATX_NVPL_INT_TYPE}_omp nvpl::lapack_${MATX_NVPL_INT_TYPE}_omp) target_compile_definitions(matx INTERFACE MATX_EN_NVPL) else() # FFTW diff --git a/examples/black_scholes.cu b/examples/black_scholes.cu index 1938cab28..2a8ab47a2 100644 --- a/examples/black_scholes.cu +++ b/examples/black_scholes.cu @@ -46,8 +46,8 @@ using namespace matx; * instructions. While caching helps, this can have a slight performance impact when compared to native CUDA * kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some * boilerplate code around the original expression. This custom operator can then be used either alone or inside - * other arithmetic expressions, and only a single load is issues for each tensor. - * + * other arithmetic expressions, and only a single load is issued for each tensor. + * * This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and * shows the performance difference.
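A minimal sketch of the custom-operator pattern that comment describes (not part of this change; ScaleShiftOp and the Size/Rank boilerplate are made-up/approximate, loosely mirroring the BlackScholes operator in this example). The point is only that the whole expression lives inside operator()(index_t), so each input element is loaded once and the intermediate math stays in registers:

template <class Out, class In>
class ScaleShiftOp : public matx::BaseOp<ScaleShiftOp<Out, In>> {
public:
  ScaleShiftOp(Out out, In in, float a, float b) : out_(out), in_(in), a_(a), b_(b) {}

  __MATX_DEVICE__ __MATX_HOST__ void operator()(matx::index_t idx) {
    // One load of in_(idx); the rest of the expression is register math.
    out_(idx) = a_ * in_(idx) + b_;
  }

  __MATX_HOST__ __MATX_DEVICE__ matx::index_t Size(uint32_t i) const { return out_.Size(i); }
  static constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return Out::Rank(); }

private:
  Out out_;
  In in_;
  float a_, b_;
};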
*/ @@ -76,7 +76,7 @@ public: auto d2 = d1 - VsqrtT; auto cdf_d1 = normcdf(d1); auto cdf_d2 = normcdf(d2); - auto expRT = exp(-1 * r * T); + auto expRT = exp(-1 * r * T); out_(idx) = S * cdf_d1 - K * expRT * cdf_d2; } @@ -87,12 +87,12 @@ public: /* Arithmetic expression */ template -void compute_black_scholes_matx(tensor_t& K, - tensor_t& S, - tensor_t& V, - tensor_t& r, - tensor_t& T, - tensor_t& output, +void compute_black_scholes_matx(tensor_t& K, + tensor_t& S, + tensor_t& V, + tensor_t& r, + tensor_t& T, + tensor_t& output, cudaExecutor& exec) { auto VsqrtT = V * sqrt(T); @@ -100,7 +100,7 @@ void compute_black_scholes_matx(tensor_t& K, auto d2 = d1 - VsqrtT; auto cdf_d1 = normcdf(d1); auto cdf_d2 = normcdf(d2); - auto expRT = exp(-1 * r * T); + auto expRT = exp(-1 * r * T); (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec); } @@ -120,13 +120,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) tensor_t V_tensor{{input_size}}; tensor_t r_tensor{{input_size}}; tensor_t T_tensor{{input_size}}; - tensor_t output_tensor{{input_size}}; + tensor_t output_tensor{{input_size}}; cudaStream_t stream; cudaStreamCreate(&stream); cudaExecutor exec{stream}; - compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -154,11 +154,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventElapsedTime(&time_ms, start, stop); printf("Time with custom operator = %.2fms per iteration\n", - time_ms / num_iterations); + time_ms / num_iterations); cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu index 9027d634d..cba23f02d 100644 --- a/examples/cgsolve.cu +++ b/examples/cgsolve.cu @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // example-end sync-test-1 printf ("max l2 norm: %f\n", (float)sqrt(maxn())); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/channelize_poly_bench.cu b/examples/channelize_poly_bench.cu index ac38367a4..48a53eb4f 100644 --- a/examples/channelize_poly_bench.cu +++ b/examples/channelize_poly_bench.cu @@ -101,7 +101,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop } cudaEventRecord(stop, stream); exec.sync(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); cudaEventElapsedTime(&elapsed_ms, start, stop); const double avg_elapsed_us = (static_cast(elapsed_ms)/NUM_ITERATIONS)*1.0e3; @@ -112,7 +112,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop printf("\n"); } - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); cudaEventDestroy(start); cudaEventDestroy(stop); diff --git a/examples/conv2d.cu b/examples/conv2d.cu index 71c648b1b..d4387581f 100644 --- a/examples/conv2d.cu +++ b/examples/conv2d.cu @@ -39,23 +39,23 @@ using namespace matx; int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); - + index_t iN = 4; index_t iM = 6; - + index_t fN = 4; index_t fM = 2; - + auto in = make_tensor({iN,iM}); auto filter = make_tensor({fN,fM}); - + in.SetVals({ {1,2,3,4,5,6}, {5,4,3,2,1,0}, {3,4,5,6,7,8}, {1,2,3,4,5,6}, }); - filter.SetVals({ {1,2}, + filter.SetVals({ {1,2}, {3,4}, {5,6}, 
{7,8}}); @@ -73,9 +73,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t oM = iM - fM + 1; auto mode = MATX_C_MODE_VALID; #endif - + auto out = make_tensor({oN,oM}); - + (out = conv2d(in, filter, mode)).run(); printf("in:\n"); @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) print(out); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/convolution.cu b/examples/convolution.cu index f45b625ad..ac41759bb 100644 --- a/examples/convolution.cu +++ b/examples/convolution.cu @@ -93,7 +93,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (uint32_t i = 0; i < iterations; i++) { (outView = conv1d(inView, filterView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); } - + cudaEventRecord(stop, stream); exec.sync(); @@ -149,6 +149,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) matxPrintMemoryStatistics(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/eigenExample.cu b/examples/eigenExample.cu index d204c4be7..e2ec26535 100644 --- a/examples/eigenExample.cu +++ b/examples/eigenExample.cu @@ -44,6 +44,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { + MATX_ENTER_HANDLER(); + int dimX = 3; int dimY = 3; @@ -53,9 +55,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) ////////////// Eigen Test Data Setup ////////////// /////////////////////////////////////////////////////////////////////////////// #ifdef USE_EIGEN - + typedef Eigen::Matrix MatrixXdRowMajor; // define a custom type that is aligned to MatX row-Major. - + Eigen::MatrixXd a(dimX, dimY); MatrixXdRowMajor b(dimX, dimY); Eigen::RowVectorXd rowVec(dimX); @@ -88,7 +90,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) complexMatrix(0, 1) = std::complex(2.0, 3.0); complexMatrix(1, 0) = std::complex(3.0, 4.0); complexMatrix(1, 1) = std::complex(4.0, 5.0); - + #else std::cout <<"!!!!!!!!! Eigen NOT USED in Test !!!!!!!!!" 
<< std ::endl; // provide data in tensors if eigen is not used @@ -140,18 +142,18 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // // Data Mapping Example - // -#ifdef USE_EIGEN + // +#ifdef USE_EIGEN std::cout << "=================== Data Map Example ===================" << std::endl; double *raw_data; // memory could be any type of allocation, but choosing to use managed memory so it's valid on the host and device (this does cost performance) - cudaMallocManaged((void**)&raw_data, dimX*dimY * sizeof(double)); - + cudaMallocManaged((void**)&raw_data, dimX*dimY * sizeof(double)); + for(int i=0; i < dimX * dimY; i++) { raw_data[i] = 0.1 + i * 0.1; } - + // map user memory into Eigen Matrix Eigen::Map mappedMatrix(raw_data, dimX, dimY); std::cout << "Eigen Mapped Data :\n" << mappedMatrix << std::endl; @@ -159,24 +161,24 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // map user memory into Eigen Matrix auto mappedTensor = matx::make_tensor(raw_data, {dimX, dimY}, false); // create MatX tensor with non-owning user allocated memory matx::print(mappedTensor); - - // modify the data from each of the references + + // modify the data from each of the references raw_data[4] = 117; mappedMatrix(0,1) = 42; mappedTensor(2,1) = 87; - + // print modified data std::cout << "Eigen Mapped Data After Modified :\n" << mappedMatrix << std::endl; matx::print(mappedTensor); -#endif +#endif // // Basic Indexing // std::cout << "=================== Indexing ===================" << std::endl; #ifdef USE_EIGEN - std::cout << "eigen a(1,2) = " << a(1,2) << std::endl; -#endif + std::cout << "eigen a(1,2) = " << a(1,2) << std::endl; +#endif std::cout << "MatX a(1,2) = " << aTensor(1,2) << std::endl; @@ -186,9 +188,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Addition ===================" << std::endl; #ifdef USE_EIGEN - Eigen::MatrixXd addResult = a + b; - std::cout << "A + B = \n" << addResult << std::endl; -#endif + Eigen::MatrixXd addResult = a + b; + std::cout << "A + B = \n" << addResult << std::endl; +#endif auto addTensor = aTensor + bTensor; matx::print(addTensor); @@ -199,9 +201,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Element-Wise Multiply ===================" << std::endl; #ifdef USE_EIGEN - Eigen::MatrixXd elementWise = a.cwiseProduct(b); - std::cout << "A .* B = \n" << elementWise << std::endl; -#endif + Eigen::MatrixXd elementWise = a.cwiseProduct(b); + std::cout << "A .* B = \n" << elementWise << std::endl; +#endif auto elementWiseTensor = aTensor*bTensor; matx::print(elementWiseTensor); @@ -212,9 +214,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Element-Wise Division ===================" << std::endl; #ifdef USE_EIGEN - Eigen::MatrixXd divResult = a.cwiseQuotient(b); - std::cout << "A / B = \n" << divResult << std::endl; -#endif + Eigen::MatrixXd divResult = a.cwiseQuotient(b); + std::cout << "A / B = \n" << divResult << std::endl; +#endif auto divResultTensor = aTensor / bTensor; matx::print(divResultTensor); @@ -225,9 +227,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Continuous Slice ===================" << std::endl; #ifdef USE_EIGEN - Eigen::Matrix2d aSlice = a.block(0, 0, 2, 2); - std::cout << "A Sliced: \n" << aSlice << std::endl; -#endif + Eigen::Matrix2d aSlice = a.block(0, 0, 2, 2); + std::cout 
<< "A Sliced: \n" << aSlice << std::endl; +#endif auto aSliceTensor = matx::slice<2>(aTensor,{0,0},{2,2}); matx::print(aSliceTensor); @@ -238,7 +240,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Strided Slice ===================" << std::endl; #ifdef USE_EIGEN - std::cout << "Original matrix10x10:\n" << matrix10x10 << "\n\n"; + std::cout << "Original matrix10x10:\n" << matrix10x10 << "\n\n"; // Define the starting point, number of elements to select, and strides for both rows and columns // int startRow = 0, startCol = 0; // Starting index for rows and columns // int rowStride = 3, colStride = 2; // Stride along rows and columns @@ -246,14 +248,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // int numCols = 3; // Grab every third item until the 8th item (0, 3, 6) // Create a Map with Stride to access the elements - Eigen::Map> - strided(matrix10x10.data() + 0 * matrix10x10.outerStride() + 0, - 5, 3, - Eigen::Stride(3 * matrix10x10.outerStride(), 2)); + Eigen::Map> + strided(matrix10x10.data() + 0 * matrix10x10.outerStride() + 0, + 5, 3, + Eigen::Stride(3 * matrix10x10.outerStride(), 2)); // Print the strided matrix10x10 - std::cout << "Strided matrix10x10:\n" << strided << "\n"; -#endif + std::cout << "Strided matrix10x10:\n" << strided << "\n"; +#endif auto slicedMat = matx::slice(matTensor10x10, {0,0}, {matx::matxEnd,9}, {2,3}); matx::print(slicedMat); @@ -265,9 +267,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) std::cout << "=================== Clone ===================" << std::endl; #ifdef USE_EIGEN // Use the replicate function to create a 5x5 matrix by replicating the 1x5 matrix - Eigen::MatrixXd mat = rowVec.replicate(3, 1); + Eigen::MatrixXd mat = rowVec.replicate(3, 1); std::cout << "1D Cloned to 2D \n" << mat << std::endl; -#endif +#endif auto cloned3Tensor = matx::clone<2>(tensor1D, {3, matx::matxKeepDim}); matx::print(cloned3Tensor); @@ -278,9 +280,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Slice Row ===================" << std::endl; #ifdef USE_EIGEN - Eigen::RowVector3d row = a.row(1); + Eigen::RowVector3d row = a.row(1); std::cout << "Sliced Row \n" << row << std::endl; -#endif +#endif auto rowSlice = matx::slice<1>(aTensor, {1, 0}, {matx::matxDropDim, matx::matxEnd}); matx::print(rowSlice); @@ -291,14 +293,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Permute Rows ===================" << std::endl; #ifdef USE_EIGEN - std::cout << "Original Matrix:\n" << a << std::endl; + std::cout << "Original Matrix:\n" << a << std::endl; // Define a permutation a - Eigen::PermutationMatrix<3> perm; + Eigen::PermutationMatrix<3> perm; perm.indices() << 2, 1, 0; // This permutation swaps the first and third rows // Apply the permutation to the rows Eigen::Matrix3d permutedMatrix = perm * a; std::cout << "Permuted Matrix (Rows):\n" << permutedMatrix << std::endl; -#endif +#endif // Define a permutation a auto permVec = matx::make_tensor({dimX}); @@ -323,12 +325,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Get Real Values ===================" << std::endl; #ifdef USE_EIGEN - std::cout << "Original Complex Matrix:\n" << complexMatrix << std::endl; + std::cout << "Original Complex Matrix:\n" << complexMatrix << std::endl; // Extract and output the real part of the complex matrix - Eigen::Matrix 
realMatrix = complexMatrix.real(); - std::cout << "Real Part of Matrix:\n" << realMatrix << std::endl; -#endif + Eigen::Matrix realMatrix = complexMatrix.real(); + std::cout << "Real Part of Matrix:\n" << realMatrix << std::endl; +#endif auto realTensor = matx::real(complexTensor); matx::print(realTensor); @@ -339,9 +341,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Matrix Multiply ===================" << std::endl; #ifdef USE_EIGEN - Eigen::MatrixXd multResult = a * b; - std::cout << "A * B = \n" << multResult << std::endl; -#endif + Eigen::MatrixXd multResult = a * b; + std::cout << "A * B = \n" << multResult << std::endl; +#endif auto multResultTensor=matmul(aTensor,bTensor); matx::print(multResultTensor); @@ -354,7 +356,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #ifdef USE_EIGEN // Eigen::MatrixXd inverseMatrix = a.inverse(); // current bug where .run(exec) in inverse is ambiguous, so cannot be used with MatX // std::cout << "Inverse of the Real Part:\n" << inverseMatrix << std::endl; // current bug where .run(exec) in inverse is ambiguous, so cannot be used with MatX -#endif +#endif auto invTensor = matx::inv(aTensor); matx::print(invTensor); @@ -384,5 +386,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // // Unsupported by eigen + MATX_CUDA_CHECK_LAST_ERROR(); + MATX_EXIT_HANDLER(); return 0; } diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu index 9833ea83f..c6361481c 100644 --- a/examples/fft_conv.cu +++ b/examples/fft_conv.cu @@ -64,7 +64,7 @@ using namespace matx; * For smaller signal sizes, the FFT convolution typically performs worse since * there is some buffer and 3 FFT operations (2 for FFT of signal and filter, * and 1 IFFT after the multiply) that causes the setup time to dominate. - * + * * Note that the conv1d() operator has a mode to perform FFT-based convolution * automatically. 
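Condensed from the fft_conv.cu code further down in this diff (the element-wise multiply statement of the separate path is not visible here, so treat that line as approximate), the paths being timed are:

// Separate statements: every line is its own launch, with intermediate results
// written back between steps.
(sig_freq  = fft(sig_time,  filtered_size)).run(exec);
(filt_freq = fft(filt_time, filtered_size)).run(exec);
(sig_freq  = sig_freq * filt_freq).run(exec);   // element-wise multiply (approximate)
(sig_freq  = ifft(sig_freq)).run(exec);         // IFFT in-place

// Fused form: a single statement; the example times this against the separate path above.
(sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec);

// Time-domain reference used for verification.
(time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec);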
* @@ -82,11 +82,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) float fused_ms; constexpr int iterations = 100; cudaStream_t stream; - cudaStreamCreate(&stream); + cudaStreamCreate(&stream); cudaEvent_t start, stop; cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaExecutor exec{stream}; + cudaEventCreate(&stop); + cudaExecutor exec{stream}; // Create time domain buffers auto sig_time = make_tensor({batches, signal_size}); @@ -118,7 +118,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (int i = 0; i < iterations; i++) { if (i == 1) { cudaEventRecord(start, stream); - } + } (sig_freq = fft(sig_time, filtered_size)).run(exec); (filt_freq = fft(filt_time, filtered_size)).run(exec); @@ -126,12 +126,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // IFFT in-place (sig_freq = ifft(sig_freq)).run(exec); - + } cudaEventRecord(stop, stream); exec.sync(); - cudaEventElapsedTime(&separate_ms, start, stop); + cudaEventElapsedTime(&separate_ms, start, stop); for (int i = 0; i < iterations; i++) { if (i == 1) { @@ -139,10 +139,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); } - + cudaEventRecord(stop, stream); exec.sync(); - cudaEventElapsedTime(&fused_ms, start, stop); + cudaEventElapsedTime(&fused_ms, start, stop); printf("FFT runtimes for separate = %.2f ms, fused = %.2f ms\n", separate_ms/(iterations-1), fused_ms/(iterations-1)); @@ -153,7 +153,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); exec.sync(); - + // Compare signals for (index_t b = 0; b < batches; b++) { for (index_t i = 0; i < filtered_size; i++) { @@ -170,6 +170,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) std::cout << "Verification successful" << std::endl; - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } \ No newline at end of file diff --git a/examples/mvdr_beamformer.cu b/examples/mvdr_beamformer.cu index 07fc97ebf..c1271c3b9 100644 --- a/examples/mvdr_beamformer.cu +++ b/examples/mvdr_beamformer.cu @@ -92,6 +92,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } \ No newline at end of file diff --git a/examples/print_styles.cu b/examples/print_styles.cu index d76ec9f80..19cf10078 100644 --- a/examples/print_styles.cu +++ b/examples/print_styles.cu @@ -58,6 +58,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {-13, 13}, {-14, 14}, {-15, 15}, {-16, 16} }); + A1.set_name("A1 Matrix"); auto A2 = reshape(A1, {4,4}); auto A3 = reshape(A1, {2,2,4}); auto A4 = reshape(A1, {2,2,2,2}); @@ -83,7 +84,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) print(A4); // example-end print-example-1 - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); return 0; } diff --git a/examples/pwelch.cu b/examples/pwelch.cu index ea2679180..f3873900b 100644 --- a/examples/pwelch.cu +++ b/examples/pwelch.cu @@ -97,7 +97,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) print(Pxx); printf("PWelchOp avg runtime = %.3f ms\n", exec_time_ms / num_iterations); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); return 
0; } diff --git a/examples/qr.cu b/examples/qr.cu index 2953dabcf..6115bde50 100644 --- a/examples/qr.cu +++ b/examples/qr.cu @@ -43,14 +43,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) //using AType = double; using AType = cuda::std::complex; - + cudaStream_t stream = 0; cudaExecutor exec{stream}; - int batch = 1; + int batch = 1; int m = 4; int n = 5; - + auto A = make_tensor({batch, m, n}); auto QR = make_tensor({batch, m, n}); auto QTQ = make_tensor({batch, m, m}); @@ -73,13 +73,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (QR = matmul(Q, R)).run(exec); (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(exec); exec.sync(); - + printf("Q:\n"); print(Q); printf("R:\n"); print(R); printf("QTQ:\n"); print(QTQ); printf("QR:\n"); print(QR); printf("A:\n"); print(A); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/recursive_filter.cu b/examples/recursive_filter.cu index 9023bc2d4..fbeedec4d 100644 --- a/examples/recursive_filter.cu +++ b/examples/recursive_filter.cu @@ -43,7 +43,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) using complex = cuda::std::complex; cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); + cudaGetDeviceProperties(&prop, 0); if (prop.sharedMemPerBlock < 40000) { printf("Recursive filter example requires at least 40KB of shared memory to run. Exiting."); @@ -139,6 +139,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) matxPrintMemoryStatistics(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/resample.cu b/examples/resample.cu index c7c8b6bf9..bb97d166e 100644 --- a/examples/resample.cu +++ b/examples/resample.cu @@ -97,6 +97,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); -} \ No newline at end of file +} diff --git a/examples/resample_poly_bench.cu b/examples/resample_poly_bench.cu index 6debd4311..11a9b2e48 100644 --- a/examples/resample_poly_bench.cu +++ b/examples/resample_poly_bench.cu @@ -159,17 +159,17 @@ void ResamplePolyBench() } cudaEventRecord(stop, stream); exec.sync(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); cudaEventElapsedTime(&elapsed_ms, start, stop); const double gflops = static_cast(num_batches*(2*filter_len_per_phase-1)*output_len) / 1.0e9; const double avg_elapsed_us = (static_cast(elapsed_ms)/NUM_ITERATIONS)*1.0e3; - printf("Batches: %5" INDEX_T_FMT " FilterLen: %5" INDEX_T_FMT " InputLen: %9" INDEX_T_FMT " OutputLen: %8" INDEX_T_FMT - " Up/Down: %4" INDEX_T_FMT "/%4" INDEX_T_FMT " Elapsed Usecs: %12.1f GFLOPS: %10.3f\n", + printf("Batches: %5" MATX_INDEX_T_FMT " FilterLen: %5" MATX_INDEX_T_FMT " InputLen: %9" MATX_INDEX_T_FMT " OutputLen: %8" MATX_INDEX_T_FMT + " Up/Down: %4" MATX_INDEX_T_FMT "/%4" MATX_INDEX_T_FMT " Elapsed Usecs: %12.1f GFLOPS: %10.3f\n", num_batches, filter_len, input_len, output_len, up, down, avg_elapsed_us, gflops/(avg_elapsed_us/1.0e6)); } - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); cudaEventDestroy(start); cudaEventDestroy(stop); diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index a89cb1b1f..126ded2b5 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -43,7 +43,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char 
**argv) uint32_t iterations = 100; constexpr int num_streams = 1; cudaGraph_t graphs[num_streams]; - cudaGraphExec_t instances[num_streams]; + cudaGraphExec_t instances[num_streams]; using complex = cuda::std::complex; RadarPipeline *pipelines[num_streams]; @@ -55,10 +55,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // cuda stream to place work in cudaStream_t streams[num_streams]; - + // manually set to log all NVTX levels MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL ); - + // create some events for timing cudaEvent_t starts[num_streams]; cudaEvent_t stops[num_streams]; @@ -67,13 +67,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventCreate(&starts[s]); cudaEventCreate(&stops[s]); cudaStreamCreate(&streams[s]); - + MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1) printf("Initializing data structures for stream %d...\n", s); pipelines[s] = new RadarPipeline(numPulses, numSamples, waveformLength, numChannels, streams[s]); MATX_NVTX_END_RANGE(1) - pipelines[s]->sync(); + pipelines[s]->sync(); } MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) @@ -82,20 +82,20 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto run_pipeline = [&](int s) { MATX_NVTX_START_RANGE("PulseCompression", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 21) pipelines[s]->PulseCompression(); - MATX_NVTX_END_RANGE(21) - + MATX_NVTX_END_RANGE(21) + MATX_NVTX_START_RANGE("ThreePulseCanceller", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 22) pipelines[s]->ThreePulseCanceller(); MATX_NVTX_END_RANGE(22) - + MATX_NVTX_START_RANGE("DopplerProcessing", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 23) pipelines[s]->DopplerProcessing(); MATX_NVTX_END_RANGE(23) - + MATX_NVTX_START_RANGE("CFARDetections", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 24) pipelines[s]->CFARDetections(); MATX_NVTX_END_RANGE(24) - }; + }; // Warmup for (int s = 0; s < num_streams; s++) { @@ -107,10 +107,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal); run_pipeline(s); cudaStreamEndCapture(streams[s], &graphs[s]); - cudaGraphInstantiate(&instances[s], graphs[s], NULL, NULL, 0); + cudaGraphInstantiate(&instances[s], graphs[s], NULL, NULL, 0); } } - + for (uint32_t i = 0; i < iterations; i++) { for (int s = 0; s < num_streams; s++) { if (i == 1) { @@ -131,7 +131,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) pipelines[s]->sync(); } MATX_NVTX_END_RANGE(2) - + MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) float time_ms; cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]); @@ -150,7 +150,7 @@ for (int s = 0; s < num_streams; s++) { } cudaDeviceSynchronize(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); matxPrintMemoryStatistics(); diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index 8d566b325..79b066a07 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -154,6 +154,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/spectrogram_graph.cu b/examples/spectrogram_graph.cu index 6f0583c8f..38d7c5d2e 100644 --- a/examples/spectrogram_graph.cu +++ b/examples/spectrogram_graph.cu @@ -143,7 +143,7 @@ int main([[maybe_unused]] int 
argc, [[maybe_unused]] char **argv) viz::contour(time, freqs, Sxx); #else printf("Not outputting plot since visualizations disabled\n"); -#endif +#endif } } @@ -164,6 +164,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/spherical_harmonics.cu b/examples/spherical_harmonics.cu index 35d650f5f..48ddcdcd3 100644 --- a/examples/spherical_harmonics.cu +++ b/examples/spherical_harmonics.cu @@ -64,7 +64,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto [phi, theta] = meshgrid(az, col); auto Plm = lcollapse<3>(legendre(l, m, cos(theta))); - + ValueType a = (2*l+1)*factorial(l-m); ValueType b = 4*M_PI*factorial(l+m); ValueType C = cuda::std::sqrt(a/b); @@ -85,6 +85,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #if MATX_ENABLE_VIZ matx::viz::surf(X, Y, Z, "test-viz.html"); #endif - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/svd_power.cu b/examples/svd_power.cu index 451c8f8a2..ca7f4b2c6 100644 --- a/examples/svd_power.cu +++ b/examples/svd_power.cu @@ -56,7 +56,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) int k = d; // number of singular values to find #if 0 - int batch = 1; + int batch = 1; auto A = make_tensor({batch, m, n}); auto U = make_tensor({batch, m, k}); auto VT = make_tensor({batch, k, n}); @@ -70,7 +70,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto VVT = make_tensor({batch, n, n}); auto VTV = make_tensor({batch, k, k}); auto x0 = random({batch, d}, NORMAL); - + (A = random({batch, m, n}, NORMAL)).run(exec); #else @@ -87,7 +87,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto VVT = make_tensor({n, n}); auto VTV = make_tensor({k, k}); auto x0 = random({d}, NORMAL); - + (A = random({m, n}, NORMAL)).run(exec); #endif @@ -99,7 +99,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) float tol = (float)1e-3; int iterations = 20; - + { printf("iterations: %d\n", iterations); @@ -157,7 +157,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) printf("A-UDVT\n"); print(A); } - + // Same as above but with svdbpi { @@ -215,6 +215,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) print(A); } #endif - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/include/matx.h b/include/matx.h index d783c2481..4114a03ff 100644 --- a/include/matx.h +++ b/include/matx.h @@ -58,6 +58,3 @@ namespace matx { using fcomplex = cuda::std::complex; using dcomplex = cuda::std::complex; } - -#define TEST_VECTOR_PATH "generated/" - diff --git a/include/matx/core/defines.h b/include/matx/core/defines.h index cdfaafa9e..c4cfde1ba 100644 --- a/include/matx/core/defines.h +++ b/include/matx/core/defines.h @@ -37,12 +37,12 @@ namespace matx { -#ifdef INDEX_32_BIT +#ifdef MATX_INDEX_32_BIT using index_t = int32_t; - #define INDEX_T_FMT "d" -#else + #define MATX_INDEX_T_FMT "d" +#else using index_t = long long int; - #define INDEX_T_FMT "lld" + #define MATX_INDEX_T_FMT "lld" #endif #ifdef __CUDACC__ @@ -53,39 +53,39 @@ namespace matx { #define __MATX_DEVICE__ __device__ #endif -#ifdef __GNUC__ +#ifdef __GNUC__ #define __MATX_INLINE__ __attribute__((always_inline)) inline #elif __CUDACC__ - #define __MATX_INLINE__ __forceinline__ + #define 
__MATX_INLINE__ __forceinline__ #else #define __MATX_INLINE__ inline #endif -#define STRINGIFY(x) #x -#define TOSTRING(x) STRINGIFY(x) +#define MATX_STRINGIFY(x) #x +#define MATX_TOSTRING(x) MATX_STRINGIFY(x) #if defined(__clang__ ) - #define IGNORE_WARNING_PUSH_GCC(WARN_MSG) - #define IGNORE_WARNING_POP_GCC + #define MATX_IGNORE_WARNING_PUSH_GCC(WARN_MSG) + #define MATX_IGNORE_WARNING_POP_GCC - #define IGNORE_WARNING_PUSH_CLANG(WARN_MSG) \ + #define MATX_IGNORE_WARNING_PUSH_CLANG(WARN_MSG) \ _Pragma("clang diagnostic push") \ - _Pragma(TOSTRING(clang diagnostic ignored WARN_MSG)) + _Pragma(MATX_TOSTRING(clang diagnostic ignored WARN_MSG)) - #define IGNORE_WARNING_POP_CLANG \ + #define MATX_IGNORE_WARNING_POP_CLANG \ _Pragma("clang diagnostic pop") #elif defined(__GNUC__) - #define IGNORE_WARNING_PUSH_CLANG(WARN_MSG) - #define IGNORE_WARNING_POP_CLANG + #define MATX_IGNORE_WARNING_PUSH_CLANG(WARN_MSG) + #define MATX_IGNORE_WARNING_POP_CLANG - #define IGNORE_WARNING_PUSH_GCC(WARN_MSG) \ + #define MATX_IGNORE_WARNING_PUSH_GCC(WARN_MSG) \ _Pragma("GCC diagnostic push") \ - _Pragma(TOSTRING(GCC diagnostic ignored WARN_MSG)) + _Pragma(MATX_TOSTRING(GCC diagnostic ignored WARN_MSG)) - #define IGNORE_WARNING_POP_GCC \ + #define MATX_IGNORE_WARNING_POP_GCC \ _Pragma("GCC diagnostic pop") -#endif +#endif // std::ceil is not constexpr until C++23 #define MATX_ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S)) diff --git a/include/matx/core/error.h b/include/matx/core/error.h index 58b32d11d..9185db16f 100644 --- a/include/matx/core/error.h +++ b/include/matx/core/error.h @@ -46,7 +46,7 @@ namespace matx /** * @brief MatX error codes - * + * */ enum matxError_t { @@ -121,11 +121,11 @@ namespace matx /** * @brief Throw an exception and print a stack trace - * - * @param error - * @param s - * @param file - * @param line + * + * @param error + * @param s + * @param file + * @param line */ matxException(matxError_t error, const char *s, const char *file, int line) : e(error) @@ -190,8 +190,8 @@ namespace matx std::cout << #a ": " << str << "(" << tmp << " != " << expected << ")\n";\ MATX_THROW(error, ""); \ } \ - } - + } + #else #define MATX_ASSERT(a, error) {} #define MATX_ASSERT_STR(a, error, str) {} @@ -216,14 +216,14 @@ namespace matx } // Macro for checking cuda errors following a cuda launch or api call -#define CUDA_CHECK_LAST_ERROR() \ +#define MATX_CUDA_CHECK_LAST_ERROR() \ { \ const auto e = cudaGetLastError(); \ MATX_CUDA_CHECK(e); \ } // This macro asserts compatible dimensions of current class to an operator. -#define ASSERT_COMPATIBLE_OP_SIZES(op) \ +#define MATX_ASSERT_COMPATIBLE_OP_SIZES(op) \ if constexpr (Rank() > 0) { \ bool compatible = true; \ _Pragma("unroll") \ @@ -249,6 +249,6 @@ namespace matx std::cerr << ")" << std::endl; \ MATX_THROW(matxInvalidSize, "Incompatible operator sizes"); \ } \ - } + } } // end namespace matx diff --git a/include/matx/core/print.h b/include/matx/core/print.h index 2a5483f61..d7c7ee51f 100644 --- a/include/matx/core/print.h +++ b/include/matx/core/print.h @@ -27,7 +27,7 @@ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
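For code that consumes these headers the renames are mechanical; a small usage sketch with the new MATX_-prefixed spellings (function and tensor names here are hypothetical):

#include <matx.h>
using namespace matx;

void scale_and_check(tensor_t<float, 1> &t, cudaStream_t stream) {
  // index_t is 32- or 64-bit depending on MATX_INDEX_32_BIT, so print it with
  // the matching format macro (formerly INDEX_T_FMT).
  index_t n = t.Size(0);
  printf("len = %" MATX_INDEX_T_FMT "\n", n);

  (t = t * 2.0f).run(cudaExecutor{stream});

  // Formerly CUDA_CHECK_LAST_ERROR(): checks cudaGetLastError() through MATX_CUDA_CHECK.
  MATX_CUDA_CHECK_LAST_ERROR();
}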
///////////////////////////////////////////////////////////////////////////////// #include @@ -136,11 +136,15 @@ namespace matx { template void PrintShapeImpl(const Op& op, FILE *fp) { - std::string type = (is_tensor_view_v) ? "Tensor" : "Operator"; + if (is_tensor_view_v) { + fprintf(fp, "%s: ",op.str().c_str()); + } + + std::string type = (is_tensor_view_v) ? "Tensor" : "Operator"; fprintf(fp, "%s{%s} Rank: %d, Sizes:[", type.c_str(), detail::GetTensorTypeString().c_str(), op.Rank()); for (index_t dimIdx = 0; dimIdx < op.Rank(); dimIdx++) { - fprintf(fp, "%" INDEX_T_FMT, op.Size(static_cast(dimIdx)) ); + fprintf(fp, "%" MATX_INDEX_T_FMT, op.Size(static_cast(dimIdx)) ); if( dimIdx < (op.Rank() - 1) ) fprintf(fp, ", "); } @@ -152,7 +156,7 @@ namespace matx { { for (index_t dimIdx = 0; dimIdx < (op.Rank() ); dimIdx++ ) { - fprintf(fp, "%" INDEX_T_FMT, op.Stride(static_cast(dimIdx)) ); + fprintf(fp, "%" MATX_INDEX_T_FMT, op.Stride(static_cast(dimIdx)) ); if( dimIdx < (op.Rank() - 1) ) { fprintf(fp, ","); @@ -161,8 +165,8 @@ namespace matx { } } - fprintf(fp, "]\n"); - } + fprintf(fp, "]\n"); + } /** @@ -195,7 +199,7 @@ namespace matx { } } if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "%06" INDEX_T_FMT ": ", _k); + fprintf(fp, "%06" MATX_INDEX_T_FMT ": ", _k); } PrintVal(fp, op.operator()(_k)); if (_k == (op.Size(0)-1)) { @@ -223,7 +227,7 @@ namespace matx { for (index_t _l = 0; _l < ((l == 0) ? op.Size(1) : l); _l++) { if (_l == 0) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "%06" INDEX_T_FMT ": ", _k); + fprintf(fp, "%06" MATX_INDEX_T_FMT ": ", _k); } else if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_PYTHON) { if (_k == 0) { @@ -284,7 +288,7 @@ namespace matx { } } if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "[%06" INDEX_T_FMT ",:,:]\n", _j); + fprintf(fp, "[%06" MATX_INDEX_T_FMT ",:,:]\n", _j); } for (index_t _k = 0; _k < ((k == 0) ? op.Size(1) : k); _k++) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_MLAB) { @@ -311,7 +315,7 @@ namespace matx { for (index_t _l = 0; _l < ((l == 0) ? op.Size(2) : l); _l++) { if (_l == 0) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "%06" INDEX_T_FMT ": ", _k); + fprintf(fp, "%06" MATX_INDEX_T_FMT ": ", _k); } else if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_PYTHON) { if (_k == 0) { @@ -395,7 +399,7 @@ namespace matx { } } if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "[%06" INDEX_T_FMT ",%06" INDEX_T_FMT ",:,:]\n", _i, _j); + fprintf(fp, "[%06" MATX_INDEX_T_FMT ",%06" MATX_INDEX_T_FMT ",:,:]\n", _i, _j); } for (index_t _k = 0; _k < ((k == 0) ? op.Size(2) : k); _k++) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_MLAB) { @@ -422,7 +426,7 @@ namespace matx { for (index_t _l = 0; _l < ((l == 0) ? op.Size(3) : l); _l++) { if (_l == 0) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "%06" INDEX_T_FMT ": ", _k); + fprintf(fp, "%06" MATX_INDEX_T_FMT ": ", _k); } else if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_PYTHON) { if (_k == 0) { @@ -506,7 +510,7 @@ namespace matx { PrintData(fp, tmpv, dims...); } #endif - } + } /** @@ -596,7 +600,7 @@ namespace matx { template void print_shape(const T& op) { detail::PrintShapeImpl(op, stdout); - } + } /** * @brief print a tensor's values to output file stream @@ -685,19 +689,19 @@ namespace matx { /** * @brief Print a tensor's all values to stdout * - * This form of `print()` is a specialization for 0D tensors. + * This form of `print()` is a specialization for 0D tensors. 
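With str() now returning a settable name_ (the default is built from the rank and a short type string) and PrintShapeImpl prefixing that name for tensor views, the print_styles.cu change boils down to this usage (output wording approximate):

auto A1 = make_tensor<float>({16, 2});
A1.set_name("A1 Matrix");
print(A1);        // shape header now starts with "A1 Matrix: Tensor{...}"
print_shape(A1);  // same prefix, since both paths go through PrintShapeImpl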
* * @tparam Op Operator input type * @param op Operator input */ - template = true> - void print(const Op &op) + void print(const Op &op) { fprint(stdout, op); } - #endif // not DOXYGEN_ONLY + #endif // not DOXYGEN_ONLY /** * @brief Set the print() precision for floating point values @@ -733,6 +737,6 @@ namespace matx { */ __MATX_INLINE__ __MATX_HOST__ enum PrintFormatType get_print_format_type() { return PRINT_FORMAT_TYPE; - } + } } // End namespace matx diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h index cf4d5f224..be5ffecea 100644 --- a/include/matx/core/tensor.h +++ b/include/matx/core/tensor.h @@ -156,8 +156,13 @@ class tensor_t : public detail::tensor_impl_t { __MATX_INLINE__ ~tensor_t() = default; - __MATX_INLINE__ const std::string str() const { - return std::string("T") + std::to_string(RANK) + "_" + detail::to_short_str(); + const std::string str() const { + return name_; + } + + void set_name(std::string name) + { + name_ = name; } /** @@ -1332,7 +1337,7 @@ class tensor_t : public detail::tensor_impl_t { * more dimensions of a tensor. This includes completely dropping an unwanted * dimension, or simply taking a piece of a wanted dimension. Slice() is very * similar to indexing operations in both Python and MATLAB. - * + * * *NOTE* Users should not call Slice() directly anymore. Use the slice() operator instead. * * @param firsts @@ -1511,6 +1516,7 @@ class tensor_t : public detail::tensor_impl_t { private: Storage storage_; + std::string name_ = std::string("tensor_") + std::to_string(RANK) + "_" + detail::to_short_str(); }; diff --git a/include/matx/core/tensor_desc.h b/include/matx/core/tensor_desc.h index aa0a8eef7..5487ed9cd 100644 --- a/include/matx/core/tensor_desc.h +++ b/include/matx/core/tensor_desc.h @@ -46,12 +46,12 @@ namespace matx { * @tparam ShapeContainer type of sizes * @tparam StrideContainer type of strides */ -template +template class tensor_desc_t { public: template using self_type = tensor_desc_t; - + using shape_container = ShapeContainer; using stride_container = StrideContainer; using shape_type = typename ShapeContainer::value_type; ///< Type trait of shape type @@ -65,17 +65,17 @@ class tensor_desc_t { /** * @brief Default move constructor - */ + */ __MATX_INLINE__ tensor_desc_t(tensor_desc_t &&) = default; /** * @brief Default const copy assignment constructor - */ + */ __MATX_INLINE__ tensor_desc_t& operator=(const tensor_desc_t&) = default; /** * @brief Default copy assignment constructor - */ + */ __MATX_INLINE__ tensor_desc_t& operator=(tensor_desc_t&&) = default; /** Swaps two raw_pointer_buffers @@ -87,18 +87,18 @@ class tensor_desc_t { * @param rhs * Right argument */ - friend void swap( tensor_desc_t &lhs, + friend void swap( tensor_desc_t &lhs, tensor_desc_t &rhs) noexcept { using std::swap; swap(lhs.shape_, rhs.shape_); swap(lhs.stride_, rhs.stride_); - } + } /** * @brief Construct a tensor_desc_t from a generic shape and stride - * + * * @tparam S Unused * @param shape Shape object * @param stride Stride object @@ -110,35 +110,35 @@ class tensor_desc_t { MATX_ASSERT_STR(shape.size() == stride.size(), matxInvalidDim, "Size and stride array sizes must match"); MATX_ASSERT_STR(shape.size() == RANK, matxInvalidDim, - "Rank parameter must match array size"); + "Rank parameter must match array size"); } /** * @brief Construct a tensor_desc_t for a 0D tensor - * + * */ __MATX_INLINE__ __MATX_HOST__ tensor_desc_t() { - } + } /** * @brief Constructor with just shape for non-C-style arrays - * + * * @tparam S2 Unused - * @param 
shape + * @param shape * Shape of tensor */ template ::type> && !is_matx_descriptor_v::type>, bool> = true> - __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(S2 &&shape) + __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(S2 &&shape) { InitFromShape(std::forward(shape)); } /** * @brief Constructor with just shape for C-style arrays - * - * @tparam M + * + * @tparam M * Unused - * @param shape + * @param shape * Shape of tensor */ template @@ -146,20 +146,20 @@ class tensor_desc_t { { // Construct a new cuda::std::array. Slower, but saves duplication cuda::std::array tshape; - std::move(std::begin(shape), std::end(shape), tshape.begin()); + std::move(std::begin(shape), std::end(shape), tshape.begin()); InitFromShape(std::move(tshape)); - } + } /** * @brief Constructor with perfect-forwarded shape and C array of strides - * - * @param shape + * + * @param shape * Shape of tensor * @param strides * Strides of tensor */ template , bool> = true> - __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(S2 &&shape, const stride_type (&strides)[RANK]) : + __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(S2 &&shape, const stride_type (&strides)[RANK]) : shape_(std::forward(shape)) { for (int i = 0; i < RANK; i++) { MATX_ASSERT_STR(*(shape.begin() + i) > 0, matxInvalidSize, @@ -170,26 +170,26 @@ class tensor_desc_t { /** * @brief Constructor with perfect-forwarded shape and C array of strides - * - * @param shape + * + * @param shape * Shape of tensor * @param strides * Strides of tensor */ template , bool> = true> - __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(const shape_type (&shape)[RANK], StrideContainer &&strides) : + __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(const shape_type (&shape)[RANK], StrideContainer &&strides) : stride_(std::forward(strides)) { for (int i = 0; i < RANK; i++) { MATX_ASSERT_STR(shape[i] > 0, matxInvalidSize, "Must specify size larger than 0 for each dimension"); *(shape_.begin() + i) = shape[i]; } - } + } /** * @brief Constructor with C-style array shape and strides - * - * @param shape + * + * @param shape * Shape of tensor * @param strides * Strides of tensor @@ -201,7 +201,7 @@ class tensor_desc_t { *(stride_.begin() + i) = strides[i]; *(shape_.begin() + i) = shape[i]; } - } + } /** * Check if a descriptor is contiguous in memory for all elements in the view @@ -233,16 +233,16 @@ class tensor_desc_t { constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto TotalSize() const noexcept { // The stride_type is expected to be able to hold this without overflowing - stride_type size = 1; + stride_type size = 1; for (int i = 0; i < RANK; i++) { size *= Size(i); } return size; - } + } /** * @brief Initialize descriptor from existing shape - * + * * @tparam S2 Shape type * @param shape Shape object */ @@ -262,51 +262,51 @@ class tensor_desc_t { #pragma unroll for (int i = RANK - 2; i >= 0; i--) { *(stride_.begin() + i) = Stride(i+1) * Size(i+1); - } - - } + } + + } /** * @brief Set the Size object - * + * * @param dim Dimension to size * @param size Size to set dimension to - * + * */ void __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ SetSize(int dim, shape_type size) { *(shape_.begin() + dim) = size; } /** * @brief Return size of descriptor on a single dimension - * + * * @param dim Dimension to retrieve * @return Size of dimension */ - constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Size([[maybe_unused]] int dim) const noexcept { + constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Size([[maybe_unused]] int dim) const noexcept { if constexpr (RANK == 0) { return 
static_cast(1); } // gcc 14.1 incorrectly reports shape_ as uninitialized in some contexts -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") - return *(shape_.begin() + dim); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") + return *(shape_.begin() + dim); +MATX_IGNORE_WARNING_POP_GCC } /** * @brief Return strides contaienr of descriptor - * + * * @return Strides container */ - auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Strides() const { + auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Strides() const { return stride_; } /** * @brief Return stride of descriptor on a single dimension - * + * * @param dim Dimension to retrieve * @return Stride of dimension */ - auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Stride([[maybe_unused]] int dim) const { + auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Stride([[maybe_unused]] int dim) const { if constexpr (RANK == 0) { return static_cast(0); } @@ -315,25 +315,25 @@ IGNORE_WARNING_POP_GCC and clone(). It appears there's no valid code path that would cause this to be unitialized, so we're ignoring the warning in this one spot. gcc also incorrectly reports: error: array subscript 3 is outside array bounds of. This is impossible in the case it's reporting - since it comes from a clone where the loop inside of clone() is a compile-time constant of 2, + since it comes from a clone where the loop inside of clone() is a compile-time constant of 2, so it can never count up to 3. */ -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") -IGNORE_WARNING_PUSH_GCC("-Warray-bounds") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Warray-bounds") return *(stride_.begin() + dim); -IGNORE_WARNING_POP_GCC -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } /** * @brief Return shape object - * - * @return Shape object + * + * @return Shape object */ auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Shape() const { return shape_; } /** * @brief Get rank of descriptor - * + * * @return Rank of descriptor */ static auto constexpr Rank() { return RANK; } @@ -345,11 +345,11 @@ IGNORE_WARNING_POP_GCC /** * @brief Tensor descriptor for compile-time descriptors - * + * * @tparam I First size * @tparam Is Parameter pack of sizes */ -template +template class static_tensor_desc_t { public: using shape_container = cuda::std::array; ///< Type trait of shape type @@ -372,7 +372,7 @@ class static_tensor_desc_t { /** * @brief Get size of dimension - * + * * @param dim Dimension to retrieve * @return Size of dimension */ @@ -380,48 +380,48 @@ class static_tensor_desc_t { /** * @brief Get stride of dimension - * + * * @param dim Dimension to retrieve * @return Stride of dimension - */ + */ static constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Stride(int dim) { return stride_[dim]; } /** * @brief Return strides contaienr of descriptor - * + * * @return Strides container */ - static constexpr auto Strides() { + static constexpr auto Strides() { return stride_; - } + } /** * @brief Get rank of descriptor - * + * * @return Descriptor rank - */ + */ static constexpr int Rank() { return shape_.size(); } /** * @brief Get underlying shape object - * + * * @return Shape object - */ + */ static constexpr auto Shape() { return shape_; } /** * @brief Get total size of descriptor - * + * * @return Product of all sizes - */ + */ static constexpr auto TotalSize() { return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies()); - } + } private: static 
constexpr auto make_shape(){ return cuda::std::array{I, Is...}; - } + } static constexpr auto make_strides(){ cuda::std::array m{}; @@ -435,7 +435,7 @@ class static_tensor_desc_t { } static constexpr shape_container shape_ = make_shape(); - static constexpr stride_container stride_ = make_strides(); + static constexpr stride_container stride_ = make_strides(); }; /** @@ -490,7 +490,7 @@ using tensor_desc_cr_disi_dist = tensor_desc_cr_ds_t; * * @tparam RANK Rank of shape */ -#ifdef INDEX_32_BIT +#ifdef MATX_INDEX_32_BIT template using DefaultDescriptor = tensor_desc_cr_ds_32_32_t; #else diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index 77dd0dbd3..6c00062bc 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -92,7 +92,7 @@ class tensor_impl_t { const std::string str() const { - return std::string("T") + std::to_string(RANK) + "_" + to_short_str(); + return std::string("tensor_impl_") + std::to_string(RANK) + "_" + to_short_str(); } /** Swaps two raw_pointer_buffers @@ -210,14 +210,14 @@ class tensor_impl_t { * Data type */ // gcc 14.1 incorrectly reports desc as uninitialized in some contexts -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") template ::type>, bool> = true> - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, - DescriptorType &&desc) + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, + DescriptorType &&desc) : ldata_(ldata), desc_{std::forward(desc)} { } -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC /** * Constructor for creating a view with only a descriptor @@ -663,7 +663,7 @@ IGNORE_WARNING_POP_GCC MATX_ASSERT_STR(first < end, matxInvalidSize, "Slice must be at least one element long"); [[maybe_unused]] typename Desc::stride_type stride_mult; - + if constexpr (std::is_same_v) { stride_mult = 1; } @@ -724,7 +724,7 @@ IGNORE_WARNING_POP_GCC MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) return Slice(firsts, ends, detail::NoStride{}); - } + } template @@ -761,7 +761,7 @@ IGNORE_WARNING_POP_GCC tensor_desc_t new_desc{std::move(n), std::move(s)}; return new_desc; } - + template __MATX_INLINE__ auto Clone(const cuda::std::array &clones) const @@ -857,7 +857,7 @@ IGNORE_WARNING_POP_GCC } else { return false; } - } + } /** * Set the size of a dimension @@ -887,9 +887,9 @@ IGNORE_WARNING_POP_GCC template __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ stride_type GetVal([[maybe_unused]] cuda::std::tuple tup) { if constexpr (I < sizeof...(Is)) { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") return GetVal(tup) + cuda::std::get(tup)*this->desc_.Stride(I); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } else { return 0; @@ -899,9 +899,9 @@ IGNORE_WARNING_POP_GCC template __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ stride_type GetValC([[maybe_unused]] const cuda::std::tuple tup) const { if constexpr (I < sizeof...(Is)) { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") return GetValC(tup) + cuda::std::get(tup)*this->desc_.Stride(I); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } else { return 0; diff --git a/include/matx/core/utils.h b/include/matx/core/utils.h index 50bda7bb0..80f377029 100644 --- a/include/matx/core/utils.h +++ b/include/matx/core/utils.h @@ -38,12 +38,13 @@ #include "matx/core/defines.h" #include "matx/core/error.h" -#define HOPPER_CC 9 -#define AMPERE_CC 8 
-#define VOLTA_CC 7 -#define PASCAL_CC 6 - namespace matx { + + constexpr int HOPPER_CC = 9; + constexpr int AMPERE_CC = 8; + constexpr int VOLTA_CC = 7; + constexpr int PASCAL_CC = 6; + namespace detail { __MATX_INLINE__ int GetDeviceAttr(cudaDeviceAttr attr) { int val; @@ -85,7 +86,7 @@ bool SizesMatch(const Op1 &op1, const Op2 &op2) { template __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 &y, const T3 &z) { // CUDA 12.6 with gcc 13 is reporting a parsing bug with the expression below. Use an alternative form. - // using T4 = decltype(x*y+z); + // using T4 = decltype(x*y+z); using T4 = std::invoke_result_t{}), decltype(std::multiplies<>{}(x, y)), decltype(z)>; if constexpr (is_complex_v && !is_complex_half_v) { @@ -150,7 +151,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 & __half2 ari = make_half2(X.x, X.y); // negate and swap supported in hardware sm_8.6+ __half2 air = make_half2(X.y, __hneg(X.x)); - // broadcast supported in hardware + // broadcast supported in hardware __half2 brr = make_half2(Y.x, Y.x); // broadcast supported in hardware __half2 bii = make_half2(Y.y, Y.y); @@ -158,9 +159,9 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 & __half2 d; // HFMA2 RD, RA.H1_H0, RB.H1_H1, RC.H1_H0 - d = __hfma2(ari, brr, c); + d = __hfma2(ari, brr, c); // HFMA2 RD, RB.H0_H0, -RA.H0_NH1, RC.H1_H0 - d = __hfma2(bii, -air, d); + d = __hfma2(bii, -air, d); return T4(d.x, d.y); #endif diff --git a/include/matx/executors/host.h b/include/matx/executors/host.h index d8219ae9f..e7ad02698 100644 --- a/include/matx/executors/host.h +++ b/include/matx/executors/host.h @@ -39,7 +39,7 @@ #ifdef MATX_EN_OMP #include #endif -namespace matx +namespace matx { // Matches current Linux max @@ -66,14 +66,14 @@ struct HostExecParams { private: int threads_; - cpu_set_t cpu_set_; + cpu_set_t cpu_set_ {0}; }; /** * @brief Executor for running an operator on a single or multi-threaded host - * + * * @tparam MODE Threading policy - * + * */ template class HostExecutor { @@ -106,13 +106,13 @@ class HostExecutor { /** * @brief Synchronize the host executor's threads. - * + * */ void sync() {} /** * @brief Execute an operator - * + * * @tparam Op Operator type * @param op Operator to execute */ @@ -120,7 +120,7 @@ class HostExecutor { void Exec(const Op &op) const noexcept { if constexpr (Op::Rank() == 0) { op(); - } + } else { index_t size = TotalSize(op); #ifdef MATX_EN_OMP @@ -130,17 +130,17 @@ class HostExecutor { auto idx = GetIdxFromAbs(op, i); cuda::std::apply([&](auto... args) { return op(args...); - }, idx); + }, idx); } } else #endif - { + { for (index_t i = 0; i < size; i++) { auto idx = GetIdxFromAbs(op, i); cuda::std::apply([&](auto... 
args) { return op(args...); - }, idx); - } + }, idx); + } } } } diff --git a/include/matx/generators/range.h b/include/matx/generators/range.h index 492fe6c0a..0a87c7f37 100644 --- a/include/matx/generators/range.h +++ b/include/matx/generators/range.h @@ -55,14 +55,14 @@ namespace matx __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ T operator()(index_t idx) const { if constexpr (is_matx_half_v) { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") return first_ + T(static_cast((float)idx) * step_); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } else { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") return first_ + T(static_cast(idx) * step_); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } } }; diff --git a/include/matx/kernels/conv.cuh b/include/matx/kernels/conv.cuh index b914dd295..eaa11566b 100644 --- a/include/matx/kernels/conv.cuh +++ b/include/matx/kernels/conv.cuh @@ -13,10 +13,14 @@ #include "matx/core/type_utils.h" #include "matx/core/tensor_utils.h" -#define CONV1D_ELEMENTS_PER_BLOCK 512 namespace matx { +namespace matx_conv1d_detail { + constexpr size_t CONV1D_ELEMENTS_PER_BLOCK = 512; +}; +using namespace matx_conv1d_detail; + typedef enum { MATX_C_MODE_FULL, // Default. Keep all elements of ramp up/down MATX_C_MODE_SAME, // Only keep elements where entire filter was present @@ -28,7 +32,7 @@ typedef enum { MATX_C_METHOD_FFT } matxConvCorrMethod_t; -#ifdef __CUDACC__ +#ifdef __CUDACC__ template __launch_bounds__(THREADS) __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, @@ -37,11 +41,11 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, { /* strategy: - 1 thread per EPT outputs. + 1 thread per EPT outputs. Each block produces EPT * THREADS outputs Full convolution is computed and results are windowed down based on the request Filter is fully loaded into shared memory - Chunk of signal is loaded into shared memory with filter_len pandding on the negative side. + Chunk of signal is loaded into shared memory with filter_len pandding on the negative side. If out of range then we pad with zeros. */ static_assert(InType::Rank() == FilterType::Rank()); @@ -93,14 +97,14 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, __syncthreads(); // load signal, pad extra elements with zeros - for (int32_t lidx = threadIdx.x, gidx = chunk_idx * CONV1D_ELEMENTS_PER_BLOCK - filter_len + 1 + threadIdx.x; - gidx < static_cast((chunk_idx+1) * CONV1D_ELEMENTS_PER_BLOCK) ; + for (int32_t lidx = threadIdx.x, gidx = chunk_idx * CONV1D_ELEMENTS_PER_BLOCK - filter_len + 1 + threadIdx.x; + gidx < static_cast((chunk_idx+1) * CONV1D_ELEMENTS_PER_BLOCK) ; gidx += THREADS, lidx += THREADS) { // some elements may be out of range. We set their values to 0. intype_strip val(0); - if( gidx >= 0 && gidx < signal_len) { + if( gidx >= 0 && gidx < signal_len) { bdims[Rank - 1] = gidx; cuda::std::apply([&val, d_in](auto &&...args) { val = d_in.operator()(args...); @@ -113,8 +117,8 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, // wait for signal to load __syncthreads(); - // register array for output data - outtype_strip oval[EPT] = {0}; + // register array for output data + outtype_strip oval[EPT] = {0}; // Below will use pointer modification instead of offsets to change IMADS into IADS. IMADS go through FMA pipe. 
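To put rough numbers on that tiling strategy (illustrative only: float data, a 129-tap filter, 128 threads per block; the real kernel's element type and launch shape may differ):

constexpr size_t elements_per_block = 512;   // matx_conv1d_detail::CONV1D_ELEMENTS_PER_BLOCK
constexpr size_t threads_per_block  = 128;   // hypothetical THREADS
constexpr size_t ept        = elements_per_block / threads_per_block;        // 4 outputs per thread
constexpr size_t filter_len = 129;                                           // hypothetical filter length
// Signal tile: 512 outputs plus (filter_len - 1) elements of left padding.
constexpr size_t signal_tile_elems = elements_per_block + filter_len - 1;    // 640
constexpr size_t smem_bytes = (signal_tile_elems + filter_len) * sizeof(float);  // ~3.0 KB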
@@ -178,10 +182,10 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, int32_t gidx = idx - start; if(idx >= start && idx <= stop) { - bdims[Rank - 1] = gidx; + bdims[Rank - 1] = gidx; cuda::std::apply([&](auto &&...args) { d_out.operator()(args...) = oval[i]; - }, bdims); + }, bdims); } } } // end chunk loop @@ -205,12 +209,12 @@ struct ShmBuffer2D { __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ T &operator()(index_t y, index_t x) noexcept { return *(ptr + y * X_LEN + x); - } + } T *ptr; }; -template = 0 && x < i1M && y >=0 && y < i1N) { // Signal Dims @@ -321,7 +325,7 @@ __global__ void Conv2D(OutType d_out, InType1 d_in1, InType2 d_in2, bdims[Rank - 1] = x; cuda::std::apply([&](auto &&...args) { val = d_in1.operator()(args...); }, bdims); } - + // store in shared s_signal(ii,jj) = val; } @@ -336,7 +340,7 @@ __global__ void Conv2D(OutType d_out, InType1 d_in1, InType2 d_in2, for (int mm = 0; mm < FILTER_SHARED_CHUNK_X; mm+=FILTER_REG_CHUNK_X) { #pragma unroll for (int nn = 0; nn < FILTER_SHARED_CHUNK_Y; nn+=FILTER_REG_CHUNK_Y) { - + // Copy chunk from shared memory in to registers #pragma unroll @@ -348,19 +352,19 @@ __global__ void Conv2D(OutType d_out, InType1 d_in1, InType2 d_in2, } - // convolution loop: convolve filter and signal. + // convolution loop: convolve filter and signal. // Keep signal in registers as much as possible by shifting. #pragma unroll for (int m = 0; m < FILTER_REG_CHUNK_X; m++) { - + #pragma unroll for (int n = 0; n < FILTER_REG_CHUNK_Y; n++) { - + in2type i2 = r_filter[n][m]; // if FILTER_REG_CHUNK_X > 1 then we need to reload i1 every m loop - if( nn == 0 || - (FILTER_REG_CHUNK_X > 1 && n == 0)) { + if( nn == 0 || + (FILTER_REG_CHUNK_X > 1 && n == 0)) { // load ILPY signal points #pragma unroll for(int u = 0; u < ILPY; u++) { @@ -395,7 +399,7 @@ __global__ void Conv2D(OutType d_out, InType1 d_in1, InType2 d_in2, if(i + u < oN && j < oM) { bdims[Rank - 2] = i + u; bdims[Rank - 1] = j; - cuda::std::apply([&](auto &&...args) { d_out.operator()(args...) = sum[u]; }, bdims); + cuda::std::apply([&](auto &&...args) { d_out.operator()(args...) = sum[u]; }, bdims); } } diff --git a/include/matx/kernels/filter.cuh b/include/matx/kernels/filter.cuh index fcd41cca0..a40ccef68 100644 --- a/include/matx/kernels/filter.cuh +++ b/include/matx/kernels/filter.cuh @@ -8,34 +8,31 @@ #include #include -#define MAX_BATCHES 10000 -#define BLOCK_SIZE_RECURSIVE 1024 -#define CORR_COLS BLOCK_SIZE_RECURSIVE -#define MAX_BLOCKS_PER_BATCH 1000 -#define RECURSIVE_VALS_PER_THREAD 8 -#define MAX_NON_RECURSIVE_COEFFS 4 -#define MAX_RECURSIVE_COEFFS 4 -#define WARP_SIZE 32 -#define COMPLEX_TYPE cuComplex -#define RECURSIVE_CHUNK_SIZE (BLOCK_SIZE_RECURSIVE * RECURSIVE_VALS_PER_THREAD) -#define MAX_SIGNAL_LEN_PER_BATCH \ - (BLOCK_SIZE_RECURSIVE * RECURSIVE_VALS_PER_THREAD * MAX_BLOCKS_PER_BATCH) - -#define COMPLEX_TYPE cuComplex - -// cuda::std::max/min isn't working on template value parameters -#define MAX(a, b) ((a) < (b) ? (b) : (a)) -#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) - namespace matx { +namespace detail_filter { + constexpr size_t MAX_BATCHES = 10000; + constexpr size_t BLOCK_SIZE_RECURSIVE = 1024; + constexpr size_t CORR_COLS = BLOCK_SIZE_RECURSIVE; + constexpr size_t MAX_BLOCKS_PER_BATCH = 1000; + constexpr size_t RECURSIVE_VALS_PER_THREAD = 8; + constexpr size_t MAX_NON_RECURSIVE_COEFFS = 4; + constexpr size_t MAX_RECURSIVE_COEFFS = 4; + constexpr size_t WARP_SIZE = 32; + using COMPLEX_TYPE = cuComplex; + constexpr size_t RECURSIVE_CHUNK_SIZE = BLOCK_SIZE_RECURSIVE * RECURSIVE_VALS_PER_THREAD; + constexpr size_t MAX_SIGNAL_LEN_PER_BATCH = + (BLOCK_SIZE_RECURSIVE * RECURSIVE_VALS_PER_THREAD * MAX_BLOCKS_PER_BATCH); +}; +using namespace detail_filter; + typedef enum { STATUS_FLAG_INCOMPLETE = 0, STATUS_FLAG_PARTIAL_COMPLETE = 1, STATUS_FLAG_FULL_COMPLETE = 2, } STATUS_FLAGS; -#ifdef __CUDACC__ +#ifdef __CUDACC__ // Chunk ID assignment used for atomic incrementing between blocks static __device__ uint32_t cid_assign[MAX_BATCHES] = {0}; @@ -54,7 +51,7 @@ __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter( __shared__ intype_strip s_exch[1 + (1 + BLOCK_SIZE_RECURSIVE) * - MAX(num_non_recursive - 1, + cuda::std::max(num_non_recursive - 1, num_recursive)]; // Data exchange between threads __shared__ uint32_t s_chunk_id; __shared__ FilterType @@ -64,7 +61,7 @@ __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter( // since nvcc doesn't like that intype_strip tmp[RECURSIVE_VALS_PER_THREAD]; intype_strip vals[RECURSIVE_VALS_PER_THREAD]; - intype_strip r_nonr[MAX(MAX_NON_RECURSIVE_COEFFS, MAX_RECURSIVE_COEFFS)]; + intype_strip r_nonr[cuda::std::max(MAX_NON_RECURSIVE_COEFFS, MAX_RECURSIVE_COEFFS)]; const uint32_t lane = threadIdx.x & 31; const uint32_t warp_id = threadIdx.x / WARP_SIZE; // const index_t batch_offset = blockIdx.y * len; @@ -259,7 +256,7 @@ __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter( for (uint32_t r = 0; r < RECURSIVE_VALS_PER_THREAD; r++) { // Load all of the values we need from other threads in the warp #pragma unroll - for (int32_t rec = 0; rec < MIN(num_recursive, wl); rec++) { + for (int32_t rec = 0; rec < cuda::std::min(num_recursive, static_cast(wl)); rec++) { if constexpr (is_cuda_complex_v) { *reinterpret_cast(&tmp[rec + 1]) = __shfl_sync(~0, *reinterpret_cast(&vals[r]), @@ -274,7 +271,7 @@ __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter( len) { // Make sure this value is within bounds of the signal // Now apply those values #pragma unroll - for (int32_t rec = 0; rec < MIN(num_recursive, wl); rec++) { + for (int32_t rec = 0; rec < cuda::std::min(num_recursive, static_cast(wl)); rec++) { if constexpr (is_cuda_complex_v) { vals[r] = cuCaddf(vals[r], diff --git a/include/matx/kernels/transpose.cuh b/include/matx/kernels/transpose.cuh index f1a5adb55..b49066aa0 100644 --- a/include/matx/kernels/transpose.cuh +++ b/include/matx/kernels/transpose.cuh @@ -13,9 +13,13 @@ namespace matx { -#ifdef __CUDACC__ -// Tile dims for one block -#define TILE_DIM 32 +#ifdef __CUDACC__ + +namespace matx_transpose_detail { + // Tile dims for one block + constexpr size_t TILE_DIM = 32; +} +using namespace matx_transpose_detail; /* Out of place. Adapted from: https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/. 
Works @@ -26,7 +30,7 @@ __global__ void transpose_kernel_oop(OutputTensor out, { using T = typename OutputTensor::value_type; constexpr int RANK = OutputTensor::Rank(); - + extern __shared__ float tile[]; // Need to swap complex types also, so cast when needed T *shm_tile = reinterpret_cast(&tile[0]); diff --git a/include/matx/operators/binary_operators.h b/include/matx/operators/binary_operators.h index 49635ccd0..c6c0eea1b 100644 --- a/include/matx/operators/binary_operators.h +++ b/include/matx/operators/binary_operators.h @@ -36,7 +36,7 @@ #include "matx/operators/base_operator.h" #include "matx/operators/scalar_ops.h" -#define DEFINE_BINARY_OP(FUNCTION, TENSOR_OP) \ +#define MATX_DEFINE_BINARY_OP(FUNCTION, TENSOR_OP) \ template () or \ is_matx_op()>> \ @@ -54,7 +54,7 @@ namespace matx { /** * @brief Utility operator for multiplying scalars by a complex value - * + * * @tparam T Complex type * @tparam S Scalar type * @param n Scalar value @@ -72,7 +72,7 @@ namespace matx /** * @brief Utility operator for multiplying scalars by a complex value - * + * * @tparam T Complex type * @tparam S Scalar type * @param n Scalar value @@ -112,8 +112,8 @@ namespace matx { if constexpr (Rank() > 0) { - ASSERT_COMPATIBLE_OP_SIZES(in1_); - ASSERT_COMPATIBLE_OP_SIZES(in2_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(in1_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(in2_); } } @@ -130,8 +130,8 @@ namespace matx { return cuda::std::apply([&](auto &&...args) { return this->operator()(args...); - }, idx); - } + }, idx); + } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() @@ -159,7 +159,7 @@ namespace matx } template - __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { in1_.PostRun(std::forward(shape), std::forward(ex)); @@ -168,7 +168,7 @@ namespace matx if constexpr (is_matx_op()) { in2_.PostRun(std::forward(shape), std::forward(ex)); } - } + } }; } @@ -353,7 +353,7 @@ namespace matx * @param t2 * RHS tensor or operator input */ - Op operator|(Op t, Op t2) {} + Op operator|(Op t, Op t2) {} /** * Compute t ^ t2 (bitwise XOR) of two operators or tensors @@ -362,7 +362,7 @@ namespace matx * @param t2 * RHS tensor or operator input */ - Op operator^(Op t, Op t2) {} + Op operator^(Op t, Op t2) {} /** * Compute the arctangent of two inputs @@ -371,31 +371,31 @@ namespace matx * @param t2 * Y value of input */ - Op atan2(Op t, Op t2) {} + Op atan2(Op t, Op t2) {} #else - DEFINE_BINARY_OP(operator+, detail::AddOp); - DEFINE_BINARY_OP(operator-, detail::SubOp); - DEFINE_BINARY_OP(operator*, detail::MulOp); - DEFINE_BINARY_OP(mul, detail::MulOp); - DEFINE_BINARY_OP(operator/, detail::DivOp); - DEFINE_BINARY_OP(operator%, detail::ModOp); - DEFINE_BINARY_OP(fmod, detail::FModOp); - DEFINE_BINARY_OP(operator|, detail::OrOp); - DEFINE_BINARY_OP(operator&, detail::AndOp); - DEFINE_BINARY_OP(operator^, detail::XorOp); - DEFINE_BINARY_OP(pow, detail::PowOp); - DEFINE_BINARY_OP(max, detail::MaximumOp); - DEFINE_BINARY_OP(atan2, detail::Atan2Op); - DEFINE_BINARY_OP(min, detail::MinimumOp); - DEFINE_BINARY_OP(operator<, detail::LTOp); - DEFINE_BINARY_OP(operator>, detail::GTOp); - DEFINE_BINARY_OP(operator<=, detail::LTEOp); - DEFINE_BINARY_OP(operator>=, detail::GTEOp); - DEFINE_BINARY_OP(operator==, detail::EQOp); - DEFINE_BINARY_OP(operator!=, detail::NEOp); - DEFINE_BINARY_OP(operator&&, detail::AndAndOp); - DEFINE_BINARY_OP(operator||, detail::OrOrOp); - 
DEFINE_UNARY_OP(operator!, detail::NotOp); + MATX_DEFINE_BINARY_OP(operator+, detail::AddOp); + MATX_DEFINE_BINARY_OP(operator-, detail::SubOp); + MATX_DEFINE_BINARY_OP(operator*, detail::MulOp); + MATX_DEFINE_BINARY_OP(mul, detail::MulOp); + MATX_DEFINE_BINARY_OP(operator/, detail::DivOp); + MATX_DEFINE_BINARY_OP(operator%, detail::ModOp); + MATX_DEFINE_BINARY_OP(fmod, detail::FModOp); + MATX_DEFINE_BINARY_OP(operator|, detail::OrOp); + MATX_DEFINE_BINARY_OP(operator&, detail::AndOp); + MATX_DEFINE_BINARY_OP(operator^, detail::XorOp); + MATX_DEFINE_BINARY_OP(pow, detail::PowOp); + MATX_DEFINE_BINARY_OP(max, detail::MaximumOp); + MATX_DEFINE_BINARY_OP(atan2, detail::Atan2Op); + MATX_DEFINE_BINARY_OP(min, detail::MinimumOp); + MATX_DEFINE_BINARY_OP(operator<, detail::LTOp); + MATX_DEFINE_BINARY_OP(operator>, detail::GTOp); + MATX_DEFINE_BINARY_OP(operator<=, detail::LTEOp); + MATX_DEFINE_BINARY_OP(operator>=, detail::GTEOp); + MATX_DEFINE_BINARY_OP(operator==, detail::EQOp); + MATX_DEFINE_BINARY_OP(operator!=, detail::NEOp); + MATX_DEFINE_BINARY_OP(operator&&, detail::AndAndOp); + MATX_DEFINE_BINARY_OP(operator||, detail::OrOrOp); + MATX_DEFINE_UNARY_OP(operator!, detail::NotOp); #endif } // end namespace matx diff --git a/include/matx/operators/cart2sph.h b/include/matx/operators/cart2sph.h index d57aa238e..15dca39e4 100644 --- a/include/matx/operators/cart2sph.h +++ b/include/matx/operators/cart2sph.h @@ -54,18 +54,18 @@ namespace matx using matxop = bool; using value_type = typename T1::value_type; - __MATX_INLINE__ std::string str() const { return "cart2sph(" + get_type_str(x_) + + __MATX_INLINE__ std::string str() const { return "cart2sph(" + get_type_str(x_) + "," + get_type_str(y_) + "," + get_type_str(z_) + ")"; } __MATX_INLINE__ Cart2SphOp(const T1 &x, const T2 &y, const T3 &z) : x_(x), y_(y), z_(z) { - ASSERT_COMPATIBLE_OP_SIZES(x); - ASSERT_COMPATIBLE_OP_SIZES(y); - ASSERT_COMPATIBLE_OP_SIZES(z); + MATX_ASSERT_COMPATIBLE_OP_SIZES(x); + MATX_ASSERT_COMPATIBLE_OP_SIZES(y); + MATX_ASSERT_COMPATIBLE_OP_SIZES(z); } template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... 
indices) const { auto x = get_value(x_, indices...); auto y = get_value(y_, indices...); @@ -78,7 +78,7 @@ namespace matx } else { // r return _internal_sqrt(x * x + y * y + z * z); } - } + } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { @@ -90,7 +90,7 @@ namespace matx index_t size1 = get_expanded_size(x_, dim); index_t size2 = get_expanded_size(y_, dim); index_t size3 = get_expanded_size(z_, dim); - return detail::matx_max(size1, size2, size3); + return detail::matx_max(size1, size2, size3); } template @@ -123,7 +123,7 @@ namespace matx if constexpr (is_matx_op()) { z_.PostRun(std::forward(shape), std::forward(ex)); } - } + } }; } /** @@ -139,7 +139,7 @@ namespace matx * * @param x * Operator defining x - * + * * @param y * Operator defining y * @@ -152,7 +152,7 @@ namespace matx template auto __MATX_INLINE__ cart2sph(const T1 &x, const T2 &y, const T3 &z) { - return cuda::std::tuple{ + return cuda::std::tuple{ detail::Cart2SphOp(x, y, z), detail::Cart2SphOp(x, y, z), detail::Cart2SphOp(x, y, z)}; diff --git a/include/matx/operators/clone.h b/include/matx/operators/clone.h index 85c614d86..b99bd83af 100644 --- a/include/matx/operators/clone.h +++ b/include/matx/operators/clone.h @@ -58,7 +58,7 @@ namespace matx static_assert(T::Rank() < CRank, "Cloning rank must be higher than input operator rank"); const index_t num_keep = std::count_if(shape.begin(), shape.end(), [](index_t i) { return i == matxKeepDim; }); - MATX_ASSERT_STR(num_keep == T::Rank(), matxInvalidParameter, + MATX_ASSERT_STR(num_keep == T::Rank(), matxInvalidParameter, "Number of matxKeepDim in a clone must match input operator rank"); // create gather list @@ -69,9 +69,9 @@ namespace matx sizes_[i] = op_.Size(d); // gcc incorrectly shows an invalid access to array element [1] in a unit test here. This is not // possible based on runtime checks we have. Disable the warning temporarily. -IGNORE_WARNING_PUSH_GCC("-Warray-bounds") +MATX_IGNORE_WARNING_PUSH_GCC("-Warray-bounds") dims_[d++] = i; -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } else { sizes_[i] = shape[i]; } @@ -87,11 +87,11 @@ IGNORE_WARNING_POP_GCC template static __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) get_impl(Op&& op, const Dims &dims, Is... indices) - { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") + { +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") cuda::std::array sind{indices...}; cuda::std::array gind; -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC // gather indices for(int i = 0; i < T::Rank(); i++) { diff --git a/include/matx/operators/diag.h b/include/matx/operators/diag.h index 08c737643..6eab9669d 100644 --- a/include/matx/operators/diag.h +++ b/include/matx/operators/diag.h @@ -59,11 +59,11 @@ namespace matx using value_type = typename T1::value_type; __MATX_INLINE__ std::string str() const { return "diag(" + op_.str() + ")"; } - + __MATX_INLINE__ DiagOp(const T1 &op, index_t k) : op_(op), k_(k) { } template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... 
indices) const { static_assert(RANK != 0, "Cannot make get diagonals from 0D tensor"); using tt = cuda::std::tuple_element_t<0, cuda::std::tuple>; @@ -79,26 +79,26 @@ namespace matx } else { static_assert(sizeof...(Is) == RANK - 1, "Diagonal operator must have one fewer op() index than rank of operator"); - + // Offset either the rows or columns by k_, depending on if it's negative if (k_ < 0) { cuda::std::array tmp{indices...}; tmp[RANK - 1] = pp_get(indices...); //cuda::std::get(tup) = pp_get(indices...) ; -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") tmp[RANK - 2] -= k_; //cuda::std::get(tup) = cuda::std::get(tup) - k_; -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC return get_value(op_, tmp); } else { cuda::std::array tmp{indices...}; //auto tup = cuda::std::make_tuple(indices..., static_cast(0)); -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") tmp[RANK - 1] = pp_get(indices...) + k_; //cuda::std::get(tup) = pp_get(indices...) + k_; -IGNORE_WARNING_POP_GCC - return get_value(op_, tmp); +MATX_IGNORE_WARNING_POP_GCC + return get_value(op_, tmp); } } } @@ -166,13 +166,13 @@ IGNORE_WARNING_POP_GCC * Diagonal to pull (0 is the main diagonal). Only used for 2D tensors and above */ #ifdef DOXYGEN_ONLY - auto __MATX_INLINE__ diag(const T1 &t, index_t k = 0) { + auto __MATX_INLINE__ diag(const T1 &t, index_t k = 0) { #else template (), bool> = true> - auto __MATX_INLINE__ diag(T1 t, index_t k = 0) { -#endif - MATX_ASSERT_STR(T1::Rank() != 1 || k == 0, matxInvalidParameter, + auto __MATX_INLINE__ diag(T1 t, index_t k = 0) { +#endif + MATX_ASSERT_STR(T1::Rank() != 1 || k == 0, matxInvalidParameter, "k parameter in diag() can only be used for 2D tensors and above"); - return detail::DiagOp(t, k); - } + return detail::DiagOp(t, k); + } } // end namespace matx diff --git a/include/matx/operators/if.h b/include/matx/operators/if.h index 08c923aeb..f0b858c48 100644 --- a/include/matx/operators/if.h +++ b/include/matx/operators/if.h @@ -63,10 +63,10 @@ namespace matx __MATX_INLINE__ std::string str() const { return "if(" + cond_.str() + ") then {" + op_.str() + "}"; } /** * @brief Constructor for an IF statement - * + * * @param cond Condition to perform the IF/ELSE on * @param op Operator if conditional branch is true - */ + */ __MATX_INLINE__ IFOP(const T1 &cond, const T2 &op) : cond_(cond), op_(op) { static_assert((!is_tensor_view_v), @@ -83,17 +83,17 @@ namespace matx { index_t size1 = detail::get_expanded_size(cond_, i); index_t size2 = detail::get_expanded_size(op_, i); - size_[i] = detail::matx_max(size1, size2); + size_[i] = detail::matx_max(size1, size2); } - - ASSERT_COMPATIBLE_OP_SIZES(op_); - ASSERT_COMPATIBLE_OP_SIZES(cond_); + + MATX_ASSERT_COMPATIBLE_OP_SIZES(op_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(cond_); } } /** * @brief Operator() for getting values of an if operator - * + * * @tparam Is Index types * @param indices Index values */ @@ -102,13 +102,13 @@ namespace matx if (get_value(cond_, indices...)) { get_value(op_, indices...); } - } + } /** * @brief Rank of IF operator - * + * * @return Rank - */ + */ static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return detail::matx_max(detail::get_rank(), detail::get_rank()); @@ -116,10 +116,10 @@ namespace matx /** * @brief Size of dimension of operator - * + * * @param dim Dimension to get size of - * @return Size of dimension - */ + * @return Size of dimension + */ constexpr index_t 
__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Size(int dim) const { return size_[dim]; @@ -135,7 +135,7 @@ namespace matx if constexpr (is_matx_op()) { cond_.PreRun(std::forward(shape), std::forward(ex)); - } + } } template @@ -143,12 +143,12 @@ namespace matx { if constexpr (is_matx_op()) { op_.PostRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { cond_.PostRun(std::forward(shape), std::forward(ex)); } - } + } }; /** @@ -159,7 +159,7 @@ namespace matx * operator must be defined for the particular type. For example, operator< on * two integers is okay, but the same operator on two complex numbers will give * a compiler error. - * + * * @param t1 op1 * * @param t2 op2 diff --git a/include/matx/operators/ifelse.h b/include/matx/operators/ifelse.h index 43d7f0466..649aa726c 100644 --- a/include/matx/operators/ifelse.h +++ b/include/matx/operators/ifelse.h @@ -56,24 +56,24 @@ namespace matx private: typename detail::base_type_t cond_; typename detail::base_type_t op1_; - typename detail::base_type_t op2_; + typename detail::base_type_t op2_; cuda::std::array(), detail::get_rank(), detail::get_rank())> size_; public: using value_type = void; ///< Scalar type for type extraction - __MATX_INLINE__ std::string str() const { - return "if(" + detail::get_type_str(cond_) + ") then {" + detail::get_type_str(op1_) + "} else {" + detail::get_type_str(op2_) + "}"; + __MATX_INLINE__ std::string str() const { + return "if(" + detail::get_type_str(cond_) + ") then {" + detail::get_type_str(op1_) + "} else {" + detail::get_type_str(op2_) + "}"; } /** * @brief Constructor for an IFELSE statement - * + * * @param cond Condition to perform the IF/ELSE on * @param op1 Operator if conditional branch is true * @param op2 Operator if conditional branch is false */ - __MATX_INLINE__ IFELSE(const C1 &cond, const T1 &op1, const T2 &op2) : + __MATX_INLINE__ IFELSE(const C1 &cond, const T1 &op1, const T2 &op2) : cond_(cond), op1_(op1), op2_(op2) { static_assert((!is_tensor_view_v && !is_tensor_view_v), @@ -97,14 +97,14 @@ namespace matx } } - ASSERT_COMPATIBLE_OP_SIZES(op1_); - ASSERT_COMPATIBLE_OP_SIZES(op2_); - ASSERT_COMPATIBLE_OP_SIZES(cond_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op1_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op2_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(cond_); } /** * @brief Operator() for getting values of an if/else - * + * * @tparam Is Index types * @param indices Index values */ @@ -116,11 +116,11 @@ namespace matx else { get_value(op2_, indices...); } - } + } /** * @brief Rank of IF/ELSE operator - * + * * @return Rank */ static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() @@ -137,11 +137,11 @@ namespace matx if constexpr (is_matx_op()) { op2_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { cond_.PreRun(std::forward(shape), std::forward(ex)); - } + } } template @@ -153,18 +153,18 @@ namespace matx if constexpr (is_matx_op()) { op2_.PostRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { cond_.PostRun(std::forward(shape), std::forward(ex)); } - } + } /** * @brief Size of dimension of operator - * + * * @param dim Dimension to get size of - * @return Size of dimension + * @return Size of dimension */ constexpr index_t __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Size(int dim) const { diff --git a/include/matx/operators/isclose.h b/include/matx/operators/isclose.h index 53f8a3b75..c5f922c38 100644 --- a/include/matx/operators/isclose.h +++ b/include/matx/operators/isclose.h @@ -41,7 
+41,7 @@ namespace matx { namespace detail { - template + template class IsCloseOp : public BaseOp> { public: @@ -52,19 +52,19 @@ namespace matx __MATX_INLINE__ std::string str() const { return "isclose()"; } - __MATX_INLINE__ IsCloseOp(const Op1 &op1, const Op2 &op2, double rtol, double atol) : - op1_(op1), op2_(op2), rtol_(static_cast(rtol)), atol_(static_cast(atol)) + __MATX_INLINE__ IsCloseOp(const Op1 &op1, const Op2 &op2, double rtol, double atol) : + op1_(op1), op2_(op2), rtol_(static_cast(rtol)), atol_(static_cast(atol)) { static_assert(Op1::Rank() == Op2::Rank(), "Operator ranks must match in isclose()"); - ASSERT_COMPATIBLE_OP_SIZES(op1); - ASSERT_COMPATIBLE_OP_SIZES(op2); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op1); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op2); } template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ int operator()([[maybe_unused]] Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ int operator()([[maybe_unused]] Is... indices) const { - return static_cast(detail::_internal_abs(get_value(op1_, indices...) - get_value(op2_, indices...)) <= + return static_cast(detail::_internal_abs(get_value(op1_, indices...) - get_value(op2_, indices...)) <= static_cast(atol_) + static_cast(rtol_) * detail::_internal_abs(get_value(op2_, indices...))); } @@ -89,7 +89,7 @@ namespace matx if constexpr (is_matx_op()) { op2_.PreRun(std::forward(shape), std::forward(ex)); - } + } } template @@ -102,7 +102,7 @@ namespace matx if constexpr (is_matx_op()) { op2_.PostRun(std::forward(shape), std::forward(ex)); } - } + } private: typename detail::base_type_t op1_; @@ -116,9 +116,9 @@ namespace matx /** * @brief Returns an integer tensor where an element is 1 if: * abs(op1 - op2) <= atol + rtol * abs(op2) - * + * * or 0 otherwise - * + * * @tparam Op1 First operator type * @tparam Op2 Second operator type * @param op1 First operator diff --git a/include/matx/operators/permute.h b/include/matx/operators/permute.h index cebb62dc3..0732b3aec 100644 --- a/include/matx/operators/permute.h +++ b/include/matx/operators/permute.h @@ -84,30 +84,30 @@ namespace matx // convert variadic type to tuple so we can read/update cuda::std::array inds{indices...}; -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") cuda::std::array ind; -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC #if 0 //This causes register spills but might be faster if Rank is large #pragma unroll - for(int32_t i = 0; i < Rank(); i++) { + for(int32_t i = 0; i < Rank(); i++) { ind[dims_[i]] = inds[i]; } #else #pragma unroll // use double loop to avoid register spills - for(int32_t i = 0; i < Rank(); i++) { + for(int32_t i = 0; i < Rank(); i++) { #pragma unroll - for(int32_t j = 0; j < Rank(); j++) { + for(int32_t j = 0; j < Rank(); j++) { if(dims[j] == i) { ind[i] = inds[j]; - } + } } } -#endif +#endif return get_value(cuda::std::forward(op), ind); - } + } template __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... 
indices) const diff --git a/include/matx/operators/set.h b/include/matx/operators/set.h index 898857061..d74d47a5b 100644 --- a/include/matx/operators/set.h +++ b/include/matx/operators/set.h @@ -101,7 +101,7 @@ class set : public BaseOp> { // set() is a placeholder when using mtie() for multiple return types, so we don't need to check compatible // sizes if constexpr (!is_mtie()) { - ASSERT_COMPATIBLE_OP_SIZES(op); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op); } } @@ -152,18 +152,18 @@ class set : public BaseOp> { { if constexpr (is_matx_op()) { out_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { op_.PreRun(std::forward(shape), std::forward(ex)); } } template - __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { out_.PostRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { op_.PostRun(std::forward(shape), std::forward(ex)); } diff --git a/include/matx/operators/shift.h b/include/matx/operators/shift.h index 4685e9e0d..cebab1acc 100644 --- a/include/matx/operators/shift.h +++ b/include/matx/operators/shift.h @@ -40,7 +40,7 @@ namespace matx { /** * Shifts the indexing of an operator to move the array forward or backward by the - * shift amount. + * shift amount. * * ShiftOp allows adjusting the relative view of a tensor to start at a * new offset. This may be useful to cut off part of a tensor that is @@ -68,27 +68,27 @@ namespace matx #pragma unroll for (int i = 0; i < Rank(); i++) { index_t size1 = detail::get_expanded_size(op_, i); - index_t size2 = detail::get_expanded_size(shift_, i); + index_t size2 = detail::get_expanded_size(shift_, i); sizes_[i] = detail::matx_max(size1,size2); } - ASSERT_COMPATIBLE_OP_SIZES(shift_); - ASSERT_COMPATIBLE_OP_SIZES(op_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(shift_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op_); } template static __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) get_impl( - Op&& op, - const Sizes &sizes, - ShiftType shiftin, + Op&& op, + const Sizes &sizes, + ShiftType shiftin, Is... indices) - { + { cuda::std::array idx{indices...}; index_t shift = -get_value(shiftin, indices...); shift = (shift + idx[DIM]) % sizes[DIM]; - if (shift < 0) { + if (shift < 0) { shift += sizes[DIM]; } @@ -96,12 +96,12 @@ namespace matx return get_value(cuda::std::forward(op), idx); } - + template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const { return get_impl(cuda::std::as_const(op_), sizes_, shift_, indices...); - } + } template __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... 
indices) @@ -123,7 +123,7 @@ namespace matx if constexpr (is_matx_op()) { op_.PostRun(std::forward(shape), std::forward(ex)); } - } + } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { @@ -137,24 +137,24 @@ namespace matx ~ShiftOp() = default; ShiftOp(const ShiftOp &rhs) = default; - __MATX_INLINE__ auto operator=(const self_type &rhs) { - return set(*this, rhs); - } + __MATX_INLINE__ auto operator=(const self_type &rhs) { + return set(*this, rhs); + } - template - __MATX_INLINE__ auto operator=(const R &rhs) { + template + __MATX_INLINE__ auto operator=(const R &rhs) { if constexpr (is_matx_transform_op()) { return mtie(*this, rhs); } - else { - return set(*this, rhs); + else { + return set(*this, rhs); } } private: typename detail::base_type_t op_; cuda::std::array sizes_; - typename detail::base_type_t shift_; + typename detail::base_type_t shift_; }; } /** @@ -206,7 +206,7 @@ namespace matx * * @param s * Amount to shift forward - * + * * @param shifts * list of shift amounts * @returns diff --git a/include/matx/operators/sph2cart.h b/include/matx/operators/sph2cart.h index cd25c0139..bba6a41e4 100644 --- a/include/matx/operators/sph2cart.h +++ b/include/matx/operators/sph2cart.h @@ -54,18 +54,18 @@ namespace matx using matxop = bool; using value_type = typename T1::value_type; - __MATX_INLINE__ std::string str() const { return "sph2cart(" + get_type_str(theta_) + + __MATX_INLINE__ std::string str() const { return "sph2cart(" + get_type_str(theta_) + "," + get_type_str(phi_) + "," + get_type_str(r_) + ")"; } __MATX_INLINE__ Sph2CartOp(const T1 &theta, const T2 &phi, const T3 &r) : theta_(theta), phi_(phi), r_(r) { - ASSERT_COMPATIBLE_OP_SIZES(theta); - ASSERT_COMPATIBLE_OP_SIZES(phi); - ASSERT_COMPATIBLE_OP_SIZES(r); + MATX_ASSERT_COMPATIBLE_OP_SIZES(theta); + MATX_ASSERT_COMPATIBLE_OP_SIZES(phi); + MATX_ASSERT_COMPATIBLE_OP_SIZES(r); } template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... 
indices) const { [[maybe_unused]] auto theta = get_value(theta_, indices...); [[maybe_unused]] auto phi = get_value(phi_, indices...); @@ -122,7 +122,7 @@ namespace matx index_t size1 = get_expanded_size(theta_, dim); index_t size2 = get_expanded_size(phi_, dim); index_t size3 = get_expanded_size(r_, dim); - return detail::matx_max(size1, size2, size3); + return detail::matx_max(size1, size2, size3); } }; } @@ -140,7 +140,7 @@ namespace matx * * @param theta * Operator defining theta - * + * * @param phi * Operator defining phi * @@ -153,7 +153,7 @@ namespace matx template auto __MATX_INLINE__ sph2cart(const T1 &theta, const T2 &phi, const T3 &r) { - return cuda::std::tuple{ + return cuda::std::tuple{ detail::Sph2CartOp(theta, phi, r), detail::Sph2CartOp(theta, phi, r), detail::Sph2CartOp(theta, phi, r)}; diff --git a/include/matx/operators/unary_operators.h b/include/matx/operators/unary_operators.h index e0a3d9c71..dbf1afdb0 100644 --- a/include/matx/operators/unary_operators.h +++ b/include/matx/operators/unary_operators.h @@ -37,15 +37,15 @@ #include "matx/operators/scalar_ops.h" #include "matx/operators/base_operator.h" -#define DEFINE_UNARY_OP(FUNCTION, TENSOR_OP) \ +#define MATX_DEFINE_UNARY_OP(FUNCTION, TENSOR_OP) \ template ()>> \ - [[nodiscard]] __MATX_INLINE__ auto FUNCTION(const I1 &i1) \ + [[nodiscard]] __MATX_INLINE__ auto FUNCTION(const I1 &i1) \ { \ - using I1Type = extract_value_type_t; \ + using I1Type = extract_value_type_t; \ using Op = TENSOR_OP; \ - const typename detail::base_type_t &base = i1; \ - return detail::matxUnaryOp(base, Op()); \ + const typename detail::base_type_t &base = i1; \ + return detail::matxUnaryOp(base, Op()); \ } @@ -83,8 +83,8 @@ namespace matx { return cuda::std::apply([&](auto &&...args) { return this->operator()(args...); - }, idx); - } + }, idx); + } template ...>, bool> = true> __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... 
indices) const @@ -110,7 +110,7 @@ namespace matx } template - __MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept + __MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept { in1_.PostRun(std::forward(shape), std::forward(ex)); } @@ -337,52 +337,52 @@ namespace matx * @param t * Input operator */ - Op real(Op t) {} + Op real(Op t) {} /** * Return imaginary components of an operator * @param t * Input operator */ - Op imag(Op t) {} + Op imag(Op t) {} /** * Returns a truth value if operator value is NaN * @param t * Input operator */ - Op isnan(Op t) {} + Op isnan(Op t) {} /** * Returns a truth value if operator value is infinite * @param x * Input operator */ - Op isinf( Op x) {} + Op isinf( Op x) {} /** * Returns values from the standard normal cumulative distribution function * @param x * Input operator */ - Op normcdf( Op x) {} + Op normcdf( Op x) {} #else - DEFINE_UNARY_OP(sqrt, detail::SqrtOp); - DEFINE_UNARY_OP(csqrt, detail::CsqrtOp); - DEFINE_UNARY_OP(rsqrt, detail::RSqrtOp); - DEFINE_UNARY_OP(exp, detail::ExpOp); - DEFINE_UNARY_OP(expj, detail::ExpjOp); - DEFINE_UNARY_OP(log10, detail::Log10Op); - DEFINE_UNARY_OP(log2, detail::Log2Op); - DEFINE_UNARY_OP(log, detail::LogOp); - DEFINE_UNARY_OP(loge, detail::LogOp); + MATX_DEFINE_UNARY_OP(sqrt, detail::SqrtOp); + MATX_DEFINE_UNARY_OP(csqrt, detail::CsqrtOp); + MATX_DEFINE_UNARY_OP(rsqrt, detail::RSqrtOp); + MATX_DEFINE_UNARY_OP(exp, detail::ExpOp); + MATX_DEFINE_UNARY_OP(expj, detail::ExpjOp); + MATX_DEFINE_UNARY_OP(log10, detail::Log10Op); + MATX_DEFINE_UNARY_OP(log2, detail::Log2Op); + MATX_DEFINE_UNARY_OP(log, detail::LogOp); + MATX_DEFINE_UNARY_OP(loge, detail::LogOp); #if 0 - DEFINE_UNARY_OP(conj, detail::ConjOp); + MATX_DEFINE_UNARY_OP(conj, detail::ConjOp); #else // implementing without a macro so we can optimize conj(real) - template ()>> + template ()>> [[nodiscard]] __MATX_INLINE__ auto conj(I1 i1) { using I1Type = extract_value_type_t; if constexpr (is_complex_v) { @@ -395,31 +395,31 @@ namespace matx } } #endif - DEFINE_UNARY_OP(abs, detail::AbsOp); - DEFINE_UNARY_OP(abs2, detail::Abs2Op); - DEFINE_UNARY_OP(sin, detail::SinOp); - DEFINE_UNARY_OP(cos, detail::CosOp); - DEFINE_UNARY_OP(tan, detail::TanOp); - DEFINE_UNARY_OP(asin, detail::AsinOp); - DEFINE_UNARY_OP(acos, detail::AcosOp); - DEFINE_UNARY_OP(atan, detail::AtanOp); - DEFINE_UNARY_OP(sinh, detail::SinhOp); - DEFINE_UNARY_OP(cosh, detail::CoshOp); - DEFINE_UNARY_OP(tanh, detail::TanhOp); - DEFINE_UNARY_OP(asinh, detail::AsinhOp); - DEFINE_UNARY_OP(acosh, detail::AcoshOp); - DEFINE_UNARY_OP(atanh, detail::AtanhOp); - DEFINE_UNARY_OP(angle, detail::AngleOp); - DEFINE_UNARY_OP(floor, detail::FloorOp); - DEFINE_UNARY_OP(ceil, detail::CeilOp); - DEFINE_UNARY_OP(round, detail::RoundOp); - DEFINE_UNARY_OP(normcdf, detail::NormCdfOp); + MATX_DEFINE_UNARY_OP(abs, detail::AbsOp); + MATX_DEFINE_UNARY_OP(abs2, detail::Abs2Op); + MATX_DEFINE_UNARY_OP(sin, detail::SinOp); + MATX_DEFINE_UNARY_OP(cos, detail::CosOp); + MATX_DEFINE_UNARY_OP(tan, detail::TanOp); + MATX_DEFINE_UNARY_OP(asin, detail::AsinOp); + MATX_DEFINE_UNARY_OP(acos, detail::AcosOp); + MATX_DEFINE_UNARY_OP(atan, detail::AtanOp); + MATX_DEFINE_UNARY_OP(sinh, detail::SinhOp); + MATX_DEFINE_UNARY_OP(cosh, detail::CoshOp); + MATX_DEFINE_UNARY_OP(tanh, detail::TanhOp); + MATX_DEFINE_UNARY_OP(asinh, detail::AsinhOp); + MATX_DEFINE_UNARY_OP(acosh, detail::AcoshOp); + MATX_DEFINE_UNARY_OP(atanh, 
detail::AtanhOp); + MATX_DEFINE_UNARY_OP(angle, detail::AngleOp); + MATX_DEFINE_UNARY_OP(floor, detail::FloorOp); + MATX_DEFINE_UNARY_OP(ceil, detail::CeilOp); + MATX_DEFINE_UNARY_OP(round, detail::RoundOp); + MATX_DEFINE_UNARY_OP(normcdf, detail::NormCdfOp); #if 0 - DEFINE_UNARY_OP(real, detail::RealOp); + MATX_DEFINE_UNARY_OP(real, detail::RealOp); #else // implementing without a macro so we can optimize away real on a real operator - template ()>> + template ()>> [[nodiscard]] __MATX_INLINE__ auto real(I1 i1) { using I1Type = extract_value_type_t; if constexpr (is_complex_v) { @@ -432,10 +432,10 @@ namespace matx } } #endif - DEFINE_UNARY_OP(imag, detail::ImagOp); - DEFINE_UNARY_OP(operator-, detail::SubNegOp ); - DEFINE_UNARY_OP(isnan, detail::IsNanOp); - DEFINE_UNARY_OP(isinf, detail::IsInfOp); + MATX_DEFINE_UNARY_OP(imag, detail::ImagOp); + MATX_DEFINE_UNARY_OP(operator-, detail::SubNegOp ); + MATX_DEFINE_UNARY_OP(isnan, detail::IsNanOp); + MATX_DEFINE_UNARY_OP(isinf, detail::IsInfOp); #endif } // end namespace matx diff --git a/include/matx/transforms/eig/eig_lapack.h b/include/matx/transforms/eig/eig_lapack.h index 0b56948ab..0340e43ba 100644 --- a/include/matx/transforms/eig/eig_lapack.h +++ b/include/matx/transforms/eig/eig_lapack.h @@ -138,7 +138,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t { syevd_dispatch("V", ¶ms.uplo, ¶ms.n, nullptr, ¶ms.n, nullptr, &work_query, &this->lwork, &rwork_query, &this->lrwork, &iwork_query, &this->liwork, &info); - + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, ("Parameter " + std::to_string(-info) + " had an illegal value in LAPACK syevd workspace query").c_str()); @@ -205,7 +205,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t { reinterpret_cast(this->rwork), &this->lrwork, reinterpret_cast(this->iwork), &this->liwork, &info); - MATX_ASSERT_STR_EXP(info, 0, matxSolverError, + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, (std::to_string(info) + " off-diagonal elements of an intermediate tridiagonal form did not converge to zero in LAPACK syevd").c_str()); } } @@ -231,7 +231,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t { const lapack_int_t* liwork_in, lapack_int_t* info) { // TODO: remove warning suppression once syevd is optimized in NVPL LAPACK -IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") +MATX_IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") if constexpr (std::is_same_v) { LAPACK_CALL(ssyevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, iwork_in, liwork_in, info); } else if constexpr (std::is_same_v) { @@ -241,7 +241,7 @@ IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") } else if constexpr (std::is_same_v>) { LAPACK_CALL(zheevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, rwork_in, lrwork_in, iwork_in, liwork_in, info); } -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } std::vector batch_w_ptrs; diff --git a/include/matx/transforms/matmul/matmul_cblas.h b/include/matx/transforms/matmul/matmul_cblas.h index e8e9a6aa9..1a661a908 100644 --- a/include/matx/transforms/matmul/matmul_cblas.h +++ b/include/matx/transforms/matmul/matmul_cblas.h @@ -354,7 +354,7 @@ __MATX_INLINE__ void matmul_exec(TensorTypeC &c, #elif defined(MATX_EN_BLIS) bli_thread_set_num_threads(exec.GetNumThreads()); #endif - + total_iter *= params.batch; for (size_t iter = 0; iter < total_iter; iter++) { // Get pointers into A/B/C for this round @@ -397,7 +397,7 @@ __MATX_INLINE__ void matmul_exec(TensorTypeC &c, } template -__MATX_INLINE__ void matmul_dispatch(TensorTypeC &c, +__MATX_INLINE__ void matmul_dispatch(TensorTypeC &c, const 
TensorTypeA &a, const TensorTypeB &b, const float alpha, @@ -452,7 +452,7 @@ __MATX_INLINE__ auto getCBLASSupportedTensor( const Op &in) { return true; } }; - + return GetSupportedTensor(in, support_func, MATX_HOST_MALLOC_MEMORY); } diff --git a/include/matx/transforms/solver_common.h b/include/matx/transforms/solver_common.h index 3a78225aa..51599d1a3 100644 --- a/include/matx/transforms/solver_common.h +++ b/include/matx/transforms/solver_common.h @@ -55,7 +55,7 @@ namespace matx { #define LAPACK_CALL(fn) LAPACK_##fn #else using lapack_int_t = index_t; -#endif +#endif /* Parameter enums */ @@ -159,18 +159,18 @@ enum class BatchType { /** * @brief Sets batch pointers for a batched tensor of arbitrary rank. - * + * * Clears the given batch pointers vector and then populates it * with pointers to the data of the tensor for batched operations. * Handles both batched matrices and vectors. - * + * * @tparam BTYPE * Whether the input is a batch of matrices or vectors * @tparam TensorType * Type of input tensor a * @tparam PointerType * Tensor value type - * + * * @param a * The tensor for which batch pointers are to be set. * @param batch_ptrs @@ -182,7 +182,7 @@ __MATX_INLINE__ void SetBatchPointers(const TensorType &a, std::vector 0) { matxAlloc(&h_workspace, batches * hspace, MATX_HOST_MEMORY); - } + } } } @@ -303,15 +303,15 @@ class matxDnCUDASolver_t { /** * Dense LAPACK base class that all dense host solver types inherit common methods * and structures from. Depending on the decomposition, it may require different - * types of workspace arrays. + * types of workspace arrays. * * @tparam ValueType * Input tensor type - * + * */ template class matxDnHostSolver_t { - + public: matxDnHostSolver_t() { diff --git a/include/matx/transforms/svd/svd_cuda.h b/include/matx/transforms/svd/svd_cuda.h index 3c7b017a0..720249e8f 100644 --- a/include/matx/transforms/svd/svd_cuda.h +++ b/include/matx/transforms/svd/svd_cuda.h @@ -75,7 +75,7 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) { auto RShape = A.Shape(); RShape[RANK-1] = d; - RShape[RANK-2] = d; + RShape[RANK-2] = d; cuda::std::array l2NormShape; for(int i=0;i(RShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto Z = make_tensor(QShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto l2Norm = make_tensor(l2NormShape, MATX_ASYNC_DEVICE_MEMORY, stream); - auto converged = make_tensor({}, MATX_ASYNC_DEVICE_MEMORY, stream); + auto converged = make_tensor({}, MATX_ASYNC_DEVICE_MEMORY, stream); return cuda::std::tuple(AT, Q, Qold, R, Z, l2Norm, converged); } @@ -124,7 +124,7 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) { * Input tensor or operator signaling the initial guess for x0 at each power iteration. A * Random tensor of size batches x min(n,m) is suggested. * @param iterations - * The number of power iterations to perform for each singular value. + * The number of power iterations to perform for each singular value. 
* @param exec * CUDA executor * @param k @@ -146,7 +146,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat auto m = A.Size(RANK-2); // rows auto n = A.Size(RANK-1); // cols auto d = cuda::std::min(n,m); // dim for AAT or ATA - + // if sentinal found get all singularvalues if( k == -1 ) k = (int) d; @@ -156,7 +156,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat MATX_ASSERT_STR(VT.Size(i) == A.Size(i), matxInvalidDim, "svdpi: VT and A must have the same batch sizes"); MATX_ASSERT_STR(S.Size(i) == A.Size(i), matxInvalidDim, "svdpi: S and A must have the same batch sizes"); } - + MATX_ASSERT_STR(U.Size(RANK-2) == m, matxInvalidDim, "svdpi: U must have Size(RANK-2) == m"); MATX_ASSERT_STR(U.Size(RANK-1) == k, matxInvalidDim, "svdpi: U must have Size(RANK-1) == k"); MATX_ASSERT_STR(VT.Size(RANK-2) == k, matxInvalidDim, "svdpi: VT must have Size(RANK-2) == k"); @@ -199,7 +199,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat auto uv = make_tensor(AShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto AT = make_tensor(ATShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto xm = make_tensor(xmShape, MATX_ASYNC_DEVICE_MEMORY, stream); - + // we shouldn't need sums but cub doesn't support strided tensors so we cannot write directly at this time. auto sums = make_tensor(sumsShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto x = slice(xm, xSliceB, xSliceE); @@ -341,7 +341,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat (u = u / clone(s, sCloneShape)).run(stream); } // end ufirst - + // Remove current singular vectors from matrix if(i < k - 1) { @@ -384,7 +384,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat * @param A * Input tensor or operator for tensor A input with size "batches by m by n" * @param max_iters - * The approximate maximum number of QR iterations to perform. + * The approximate maximum number of QR iterations to perform. * @param tol * The termination tolerance for the QR iteration. Setting this to 0 will skip the tolerance check. * @param exec @@ -466,7 +466,7 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_ //compute L2(Q-Qold) // sqrt folded into next operation - (l2Norm = sum(abs2(Q-Qold))).run(stream); + (l2Norm = sum(abs2(Q-Qold))).run(stream); // compute if all batches have converged if constexpr (RANK > 2) { @@ -474,14 +474,14 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_ } else { (converged = as_int(sqrt(l2Norm) < tol)).run(stream); } - + // event to record when converged is ready in stream cudaEventRecord(event, stream); // wait for d2h transfer until converged is ready cudaStreamWaitEvent(d2h, event); - // copy convergence criteria to host. - // This is in unpinned memory and cannot on most systems run asynchronously. + // copy convergence criteria to host. + // This is in unpinned memory and cannot on most systems run asynchronously. // We do this here to hide the copy/sync behind prior launch latency/execution of next iteration. 
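      // (General CUDA note elaborating the comment above, not part of this change:
      // with pageable host memory the runtime may stage/serialize the copy, which
      // is why it is issued early and hidden behind the next iteration's launches.
      // A copy that can run fully asynchronously would need a pinned destination,
      // e.g. — illustrative names only:
      //   int *converged_host_pinned;
      //   cudaMallocHost(&converged_host_pinned, sizeof(int));
      //   cudaMemcpyAsync(converged_host_pinned, converged.Data(), sizeof(int),
      //                   cudaMemcpyDeviceToHost, d2h);
      // The cudaStreamWaitEvent above still orders the copy after the flag write
      // without stalling work queued later on the compute stream.)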
cudaMemcpyAsync(&converged_host, converged.Data(), sizeof(int), cudaMemcpyDeviceToHost, d2h); } @@ -498,7 +498,7 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_ DShape[RANK-2] = m; auto D = clone(S, DShape); - // normalize U by singular values + // normalize U by singular values // IF required to avoid nans when singular value is 0 (IF(D != STypeS(0), U = U / D)).run(stream); @@ -548,7 +548,7 @@ struct DnSVDCUDAParams_t { template static __MATX_INLINE__ SVDMethod GetCUDASVDMethod(const ATensor &a) { - static constexpr int RANK = ATensor::Rank(); + static constexpr int RANK = ATensor::Rank(); index_t m = a.Size(RANK - 2); index_t n = a.Size(RANK - 1); @@ -556,16 +556,16 @@ static __MATX_INLINE__ SVDMethod GetCUDASVDMethod(const ATensor &a) { SVDMethod method = detail::SVDMethod::GESVD; if (a.Rank() != 2) { - if (a.Size(RANK-2) <= 32 && + if (a.Size(RANK-2) <= 32 && a.Size(RANK-1) <= 32) { if constexpr (is_tensor_view_v) { - #if !defined(INDEX_32_BIT) + #if !defined(MATX_INDEX_32_BIT) if (a.Stride(0) < std::numeric_limits::max()) { method = detail::SVDMethod::GESVDJ_BATCHED; } #else method = detail::SVDMethod::GESVDJ_BATCHED; - #endif + #endif } else { method = detail::SVDMethod::GESVDJ_BATCHED; @@ -651,7 +651,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnCreateGesvdjInfo"); ret = cusolverDnXgesvdjSetTolerance(batch_params, 1e-9); - MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetTolerance"); + MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetTolerance"); ret = cusolverDnXgesvdjSetMaxSweeps(batch_params, 15); MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetMaxSweeps"); @@ -666,7 +666,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { cusolverStatus_t ret; // Use all mode for a larger workspace size that works for all modes - if (!batched) { + if (params.method == SVDMethod::GESVD) { ret = cusolverDnXgesvd_bufferSize( this->handle, this->dn_params, 'A', 'A', params.m, params.n, MatXTypeToCudaType(), params.A, params.m, @@ -675,44 +675,45 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { MatXTypeToCudaType(), &this->dspace, &this->hspace); } else { + int i_dspace; + if constexpr (std::is_same_v) { ret = cusolverDnSgesvdjBatched_bufferSize( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(&this->dspace), batch_params, static_cast(params.batch_size)); + &i_dspace, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v) { ret = cusolverDnDgesvdjBatched_bufferSize( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(&this->dspace), batch_params, static_cast(params.batch_size)); + &i_dspace, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v, T1>) { 
ret = cusolverDnCgesvdjBatched_bufferSize( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(&this->dspace), batch_params, static_cast(params.batch_size)); + &i_dspace, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v, T1>) { ret = cusolverDnZgesvdjBatched_bufferSize( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(&this->dspace), batch_params, static_cast(params.batch_size)); + &i_dspace, batch_params, static_cast(params.batch_size)); } else { MATX_THROW(matxInvalidType, "Invalid data type passed to svd()"); - } + } - int dspace_tmp = *reinterpret_cast(&this->dspace); - this->dspace = dspace_tmp; + this->dspace = i_dspace; this->hspace = 0; } @@ -722,7 +723,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { static DnSVDCUDAParams_t GetSVDParams(UTensor &u, STensor &s, - VtTensor &vt, const ATensor &a, + VtTensor &vt, const ATensor &a, const char jobz, const cudaExecutor &exec) { DnSVDCUDAParams_t params; @@ -773,7 +774,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { SetBatchPointers(vt, this->batch_vt_ptrs); SetBatchPointers(s, this->batch_s_ptrs); - const int64_t ldvt = vt.Size(RANK-2); + const int64_t ldvt = vt.Size(RANK-2); if (params.method == SVDMethod::GESVD) { // At this time cuSolver does not have a batched 64-bit SVD interface. 
Change @@ -797,9 +798,9 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { ret = cusolverDnSgesvdjBatched( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(this->d_workspace), static_cast(this->dspace), + reinterpret_cast(this->d_workspace), static_cast(this->dspace), this->d_info, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v) { @@ -808,30 +809,30 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { reinterpret_cast(params.A), static_cast(params.m), reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(this->d_workspace), static_cast(this->dspace), + reinterpret_cast(this->d_workspace), static_cast(this->dspace), this->d_info, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v, T1>) { ret = cusolverDnCgesvdjBatched( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(this->d_workspace), static_cast(this->dspace), + reinterpret_cast(this->d_workspace), static_cast(this->dspace), this->d_info, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v, T1>) { ret = cusolverDnZgesvdjBatched( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(this->d_workspace), static_cast(this->dspace), + reinterpret_cast(this->d_workspace), static_cast(this->dspace), this->d_info, batch_params, static_cast(params.batch_size)); } else { MATX_THROW(matxInvalidType, "Invalid data type passed to svd()"); - } + } MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "cusolverDnSgesvdjBatched failed"); } @@ -850,10 +851,10 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { MATX_ASSERT_STR_EXP(info, 0, matxSolverError, ("Parameter " + std::to_string(-info) + " had an illegal value in cuSolver Xgesvd").c_str()); } else { - MATX_ASSERT_STR_EXP(info, 0, matxSolverError, + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, (std::to_string(info) + " superdiagonals of an intermediate bidiagonal form did not converge to zero in cuSolver Xgesvd").c_str()); } - } + } } /** @@ -871,7 +872,6 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { std::vector batch_vt_ptrs; gesvdjInfo_t batch_params = nullptr; DnSVDCUDAParams_t params; - bool batched; }; /** @@ -893,10 +893,10 @@ struct DnSVDCUDAParamsKeyHash { struct DnSVDCUDAParamsKeyEq { bool operator()(const DnSVDCUDAParams_t &l, const DnSVDCUDAParams_t &t) const noexcept { - return l.n == t.n && - l.m == t.m && - l.batch_size == t.batch_size && - l.dtype == t.dtype && + return l.n == t.n && + l.m == t.m && + l.batch_size == t.batch_size && + l.dtype == t.dtype && l.exec.getStream() == t.exec.getStream(); } 
}; @@ -961,7 +961,7 @@ void svd_impl(UTensor &&u, STensor &&s, const char job_cusolver = detail::SVDModeToChar(jobz); const bool m_leq_n = a.Size(RANK-2) <= a.Size(RANK-1); - const auto method = GetCUDASVDMethod(a); + const auto method = GetCUDASVDMethod(a); // The power iteration method is a custom kernel so we don't need to test the types if (method == detail::SVDMethod::POW_ITER) { @@ -973,8 +973,8 @@ void svd_impl(UTensor &&u, STensor &&s, T1 *tp; auto a_shape = a.Shape(); auto a_total_size = std::accumulate(a_shape.begin(), a_shape.begin() + ATensor::Rank(), 1, std::multiplies()); - matxAlloc(reinterpret_cast(&tp), sizeof(T1) * a_total_size, MATX_ASYNC_DEVICE_MEMORY, stream); - + matxAlloc(reinterpret_cast(&tp), sizeof(T1) * a_total_size, MATX_ASYNC_DEVICE_MEMORY, stream); + if (m_leq_n) { // get col-major AT auto a_new = make_tensor(tp, a_shape); @@ -987,7 +987,7 @@ void svd_impl(UTensor &&u, STensor &&s, auto vt_new = getSolverSupportedTensor(vt, exec); // swap U and VT. svd(AT) = V*S*UT - // svd(AT) = V*S*UT + // svd(AT) = V*S*UT // Need the tensors to appear like V and UT since that is expected based on AT view // inputted, although the results when read as row-major are VT and U. auto u_in = transpose_matrix(vt_new); diff --git a/include/matx/transforms/svd/svd_lapack.h b/include/matx/transforms/svd/svd_lapack.h index 0621bd1d4..14d04d7f2 100644 --- a/include/matx/transforms/svd/svd_lapack.h +++ b/include/matx/transforms/svd/svd_lapack.h @@ -184,7 +184,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t { // the real part of the first elem of work holds the optimal lwork. if constexpr (is_complex_v) { this->lwork = static_cast(work_query.real()); - + lapack_int_t mnthr = (mn * 5) / 3; if (mx >= mnthr) { // mx >> mn condition in LAPACK this->lrwork = 5*mn*mn + 5*mn; @@ -263,7 +263,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t { reinterpret_cast(this->rwork), &info); - MATX_ASSERT_STR_EXP(info, 0, matxSolverError, + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, (std::to_string(info) + " superdiagonals of an intermediate bidiagonal form did not converge to zero in LAPACK").c_str()); } } else if (params.algo == SVDHostAlgo::DC) { @@ -277,7 +277,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t { reinterpret_cast(this->rwork), reinterpret_cast(this->iwork), &info); - MATX_ASSERT_STR_EXP(info, 0, matxSolverError, "gesdd error in LAPACK"); + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, "gesdd error in LAPACK"); } } } @@ -300,7 +300,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t { const lapack_int_t *lwork_in, [[maybe_unused]] T3 *rwork_in, lapack_int_t *info) { // TODO: remove warning suppression once gesvd is optimized in NVPL LAPACK -IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") +MATX_IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") if constexpr (std::is_same_v) { LAPACK_CALL(sgesvd)(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, info); } else if constexpr (std::is_same_v) { @@ -310,7 +310,7 @@ IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") } else if constexpr (std::is_same_v>) { LAPACK_CALL(zgesvd)(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, rwork_in, info); } -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } /** @@ -332,7 +332,7 @@ IGNORE_WARNING_POP_GCC LAPACK_CALL(zgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, rwork_in, iwork_in, info); } } - + std::vector batch_u_ptrs; std::vector batch_s_ptrs; std::vector batch_vt_ptrs; @@ -427,7 +427,7 @@ void svd_impl([[maybe_unused]] UTensor 
&&u, equivalent to col-major AT, and swapping the inputs U and VT. */ - // LAPACK destroys the input, so we need to make a copy of A regardless + // LAPACK destroys the input, so we need to make a copy of A regardless auto a_copy = make_tensor(a.Shape(), MATX_HOST_MALLOC_MEMORY); (a_copy = a).run(exec); auto at_col_maj = transpose_matrix(a_copy); diff --git a/test/00_io/PrintTests.cu b/test/00_io/PrintTests.cu index 3a9e66215..b07bb8e63 100644 --- a/test/00_io/PrintTests.cu +++ b/test/00_io/PrintTests.cu @@ -97,8 +97,9 @@ TEST_F(PrintTest, DefaultTest1) auto pft = get_print_format_type(); ASSERT_EQ(MATX_PRINT_FORMAT_DEFAULT, pft); + A1.set_name("A1 Matrix"); print_checker(A1, - "Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" + "A1 Matrix: Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" "000000: -9.2466e-01+9.9114e-01j \n" "000001: -4.2534e-01+1.0676e+00j \n" "000002: -2.6438e+00-6.2723e-01j \n" @@ -192,11 +193,11 @@ TEST_F(PrintTest, DefaultTest5) MATX_ENTER_HANDLER(); auto pft = get_print_format_type(); ASSERT_EQ(MATX_PRINT_FORMAT_DEFAULT, pft); - + auto testSlice = matx::slice<0>(A1, {0}, {matx::matxDropDim}); print_checker(testSlice, - "Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" + "tensor_0_f64c: Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" "-9.2466e-01+9.9114e-01j \n"); MATX_EXIT_HANDLER(); @@ -210,7 +211,7 @@ TEST_F(PrintTest, MlabTest1) ASSERT_EQ(MATX_PRINT_FORMAT_MLAB, pft); print_checker(A1, - "Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" + "tensor_1_f64c: Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" "[-9.2466e-01+9.9114e-01j ,\n" " -4.2534e-01+1.0676e+00j ,\n" " -2.6438e+00-6.2723e-01j ,\n" @@ -306,7 +307,7 @@ TEST_F(PrintTest, MlabTest5) auto testSlice = matx::slice<0>(A1, {0}, {matx::matxDropDim}); print_checker(testSlice, - "Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" + "tensor_0_f64c: Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" "-9.2466e-01+9.9114e-01j \n"); MATX_EXIT_HANDLER(); @@ -320,7 +321,7 @@ TEST_F(PrintTest, PythonTest1) ASSERT_EQ(MATX_PRINT_FORMAT_PYTHON, pft); print_checker(A1, - "Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" + "tensor_1_f64c: Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" "[-9.2466e-01+9.9114e-01j ,\n" " -4.2534e-01+1.0676e+00j ,\n" " -2.6438e+00-6.2723e-01j ,\n" @@ -412,7 +413,7 @@ TEST_F(PrintTest, PythonTest5) auto testSlice = matx::slice<0>(A1, {0}, {matx::matxDropDim}); print_checker(testSlice, - "Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" + "tensor_0_f64c: Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" "-9.2466e-01+9.9114e-01j \n"); MATX_EXIT_HANDLER();
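The updated expectations above include a tensor name in the print header, either the automatic tensor_<rank>_<type> name or one set explicitly via set_name(). A minimal usage sketch of that behavior — make_tensor, SetVals, set_name, and print are the calls these tests exercise indirectly; the exact header text is whatever the expectations above assert:

    #include "matx.h"

    int main() {
      auto A = matx::make_tensor<float>({4});
      A.SetVals({1.f, 2.f, 3.f, 4.f});
      A.set_name("A Vector");   // the name now leads the printed header
      matx::print(A);           // e.g. "A Vector: Tensor{...} Rank: 1, Sizes:[4], Strides:[1]" followed by the values
      return 0;
    }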