From db2ae6b2ffb8a39e66e885dab8df4b578c9e40fb Mon Sep 17 00:00:00 2001 From: Aayush Gupta Date: Fri, 31 May 2024 00:30:14 +0000 Subject: [PATCH] Added synchronization support through .sync(). Updated all tests to use calls to .sync() instead of the cuda api and removed Prefetch calls. --- .gitignore | 1 + bench/00_operators/operators.cu | 20 +- bench/00_operators/reduction.cu | 5 +- bench/00_transform/conv.cu | 20 +- bench/00_transform/cub.cu | 3 +- bench/00_transform/qr.cu | 11 +- bench/00_transform/svd_power.cu | 38 +- docs_input/api/index.rst | 1 + docs_input/api/synchronization/index.rst | 10 + docs_input/api/synchronization/sync.rst | 18 + .../exercises/example1_assignment1.cu | 2 - .../exercises/example2_assignment1.cu | 8 +- .../exercises/example3_assignment1.cu | 6 +- .../notebooks/exercises/example3_fft_conv.cu | 5 +- .../solutions/example1_assignment1.cu | 2 - .../solutions/example2_assignment1.cu | 14 +- .../solutions/example3_assignment1.cu | 12 +- examples/black_scholes.cu | 16 +- examples/cgsolve.cu | 19 +- examples/channelize_poly_bench.cu | 9 +- examples/convolution.cu | 15 +- examples/eigenExample.cu | 23 +- examples/fft_conv.cu | 24 +- examples/mvdr_beamformer.cu | 7 +- examples/mvdr_beamformer.h | 21 +- examples/pwelch.cu | 11 +- examples/qr.cu | 17 +- examples/recursive_filter.cu | 10 +- examples/resample.cu | 13 +- examples/resample_poly_bench.cu | 16 +- examples/simple_radar_pipeline.cu | 5 +- examples/simple_radar_pipeline.h | 47 +- examples/spectrogram.cu | 24 +- examples/spectrogram_graph.cu | 26 +- examples/spherical_harmonics.cu | 6 +- examples/svd_power.cu | 58 +- include/matx/executors/device.h | 12 +- include/matx/executors/host.h | 6 + test/00_io/FileIOTests.cu | 6 +- test/00_operators/GeneratorTests.cu | 46 +- test/00_operators/OperatorTests.cu | 531 +++++++++--------- test/00_operators/PWelch.cu | 19 +- test/00_operators/ReductionTests.cu | 180 +++--- test/00_solver/Cholesky.cu | 6 +- test/00_solver/Det.cu | 2 +- 
test/00_solver/Eigen.cu | 2 +- test/00_solver/Inverse.cu | 10 +- test/00_solver/LU.cu | 2 +- test/00_solver/QR.cu | 2 +- test/00_solver/QR2.cu | 19 +- test/00_solver/SVD.cu | 24 +- test/00_tensor/BasicTensorTests.cu | 20 +- test/00_tensor/CUBTests.cu | 18 +- test/00_tensor/EinsumTests.cu | 14 +- test/00_tensor/ViewTests.cu | 4 +- test/00_transform/ChannelizePoly.cu | 24 +- test/00_transform/ConvCorr.cu | 18 +- test/00_transform/Copy.cu | 44 +- test/00_transform/Cov.cu | 4 +- test/00_transform/FFT.cu | 90 +-- test/00_transform/MatMul.cu | 72 +-- test/00_transform/ResamplePoly.cu | 40 +- test/00_transform/Solve.cu | 2 +- test/01_radar/MVDRBeamformer.cu | 8 +- test/01_radar/MultiChannelRadarPipeline.cu | 6 - test/CMakeLists.txt | 6 +- 66 files changed, 904 insertions(+), 876 deletions(-) create mode 100644 docs_input/api/synchronization/index.rst create mode 100644 docs_input/api/synchronization/sync.rst diff --git a/.gitignore b/.gitignore index 8947ab01f..253de50b3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .vscode +.cache build* *.pyc diff --git a/bench/00_operators/operators.cu b/bench/00_operators/operators.cu index ea8c49756..8bbbf3429 100644 --- a/bench/00_operators/operators.cu +++ b/bench/00_operators/operators.cu @@ -10,6 +10,7 @@ template void vector_add(nvbench::state &state, nvbench::type_list) { // Get current parameters: + cudaExecutor exec{0}; const int x_len = static_cast(state.get_int64("Vector size")); state.add_element_count(x_len, "NumElements"); @@ -19,9 +20,9 @@ void vector_add(nvbench::state &state, nvbench::type_list) tensor_t xv{{x_len}}; tensor_t xv2{{x_len}}; - xv.PrefetchDevice(0); - (xv = xv + xv2).run(); - cudaDeviceSynchronize(); + + (xv = xv + xv2).run(exec); + exec.sync(); state.exec( [&xv, &xv2](nvbench::launch &launch) { @@ -38,6 +39,7 @@ using permute_types = nvbench::type_list void permute(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto x = make_tensor({1000,200,6,300}); auto y = 
make_tensor({300,1000,6,200}); @@ -46,7 +48,7 @@ void permute(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(x.TotalSize()); x.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&x, &y](nvbench::launch &launch) { @@ -61,17 +63,18 @@ using random_types = nvbench::type_list template void random(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto x = make_tensor({1966800}); auto y = make_tensor({1966800}); x.PrefetchDevice(0); y.PrefetchDevice(0); - (y = random(x.Shape(), NORMAL)).run(); + (y = random(x.Shape(), NORMAL)).run(exec); state.add_element_count(x.TotalSize(), "NumElements"); state.add_global_memory_writes(x.TotalSize()); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&x, &y](nvbench::launch &launch) { @@ -98,6 +101,7 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list) int n = 600; ValueType dx = M_PI/n; + cudaExecutor exec{}; auto col = range<0>({n+1},ValueType(0), ValueType(dx)); auto az = range<0>({2*n+1}, ValueType(0), ValueType(dx)); @@ -122,8 +126,8 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list) auto Y = make_tensor(Ym.Shape()); auto Z = make_tensor(Zm.Shape()); - cudaDeviceSynchronize(); - + exec.sync(); + state.add_element_count(n+1, "Elements"); state.exec( diff --git a/bench/00_operators/reduction.cu b/bench/00_operators/reduction.cu index ed9f5817f..ffd44ab6c 100644 --- a/bench/00_operators/reduction.cu +++ b/bench/00_operators/reduction.cu @@ -127,6 +127,7 @@ void reduce_4d( nvbench::type_list ) { + cudaExecutor exec{0}; const int size0 = static_cast(state.get_int64("Size0")); const int size1 = static_cast(state.get_int64("Size1")); const int size2 = static_cast(state.get_int64("Size2")); @@ -138,8 +139,8 @@ void reduce_4d( t1.PrefetchDevice(0); t4.PrefetchDevice(0); - (t4 = random(t4.Shape(), UNIFORM)).run(); - cudaDeviceSynchronize(); + (t4 = random(t4.Shape(), UNIFORM)).run(exec); + exec.sync(); state.exec([&t4, 
&t1](nvbench::launch &launch) { (t1 = matx::sum(t4, {1, 2, 3})).run((cudaStream_t)launch.get_stream()); }); diff --git a/bench/00_transform/conv.cu b/bench/00_transform/conv.cu index 5f59754db..23b307f2c 100644 --- a/bench/00_transform/conv.cu +++ b/bench/00_transform/conv.cu @@ -13,6 +13,7 @@ template void conv1d_direct_4d_batch(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto out = make_tensor({4, 2, 14, 288 + 4096 + 133 - 1}); auto at = make_tensor({ 4, 2, 14, 133}); auto bt = make_tensor({ 4, 2, 14, 288 + 4096}); @@ -21,7 +22,7 @@ void conv1d_direct_4d_batch(nvbench::state &state, at.PrefetchDevice(0); bt.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 ) state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); }); @@ -35,7 +36,7 @@ template void conv1d_direct_2d_batch(nvbench::state &state, nvbench::type_list) { - + cudaExecutor exec{0}; auto out = make_tensor({4 * 2* 14, 288 + 4096 + 133 - 1}); auto at = make_tensor({ 4 * 2* 14, 133}); @@ -45,7 +46,7 @@ void conv1d_direct_2d_batch(nvbench::state &state, at.PrefetchDevice(0); bt.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); }); @@ -56,6 +57,7 @@ template void conv1d_direct_large(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto at = make_tensor({state.get_int64("Signal Size")}); auto bt = make_tensor({state.get_int64("Filter Size")}); auto out = make_tensor({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1}); @@ -64,9 +66,9 @@ void conv1d_direct_large(nvbench::state &state, at.PrefetchDevice(0); bt.PrefetchDevice(0); - (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(); + (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(exec); - cudaDeviceSynchronize(); + 
exec.sync(); state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); }); @@ -79,17 +81,18 @@ template void conv1d_fft_large(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto at = make_tensor({state.get_int64("Signal Size")}); auto bt = make_tensor({state.get_int64("Filter Size")}); auto out = make_tensor({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1}); - (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(); + (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(exec); out.PrefetchDevice(0); at.PrefetchDevice(0); bt.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(cudaExecutor(launch.get_stream())); }); @@ -103,6 +106,7 @@ template void conv2d_direct_batch(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto at = make_tensor({256, 1024, 1024}); auto bt = make_tensor({256, 16, 16}); auto out = make_tensor({256, @@ -113,7 +117,7 @@ void conv2d_direct_batch(nvbench::state &state, at.PrefetchDevice(0); bt.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv2d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); }); diff --git a/bench/00_transform/cub.cu b/bench/00_transform/cub.cu index c37f0d7be..f0e9abc8b 100644 --- a/bench/00_transform/cub.cu +++ b/bench/00_transform/cub.cu @@ -24,6 +24,7 @@ void sort1d( nvbench::type_list ) { + cudaExecutor exec{0}; const int dataSize = static_cast(state.get_int64("Tensor Size")); auto sortedData = matx::make_tensor({dataSize}); @@ -31,7 +32,7 @@ void sort1d( sortedData.PrefetchDevice(0); randomData.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); (randomData = random(sortedData.Shape(), NORMAL)).run(); diff --git a/bench/00_transform/qr.cu 
b/bench/00_transform/qr.cu index ee745072c..09db69958 100644 --- a/bench/00_transform/qr.cu +++ b/bench/00_transform/qr.cu @@ -17,6 +17,7 @@ void qr_batch(nvbench::state &state, cudaStream_t stream = 0; state.set_cuda_stream(nvbench::make_cuda_stream_view(stream)); + cudaExecutor exec{stream}; int batch = state.get_int64("batch"); int m = state.get_int64("rows"); @@ -26,17 +27,13 @@ void qr_batch(nvbench::state &state, auto Q = make_tensor({batch, m, m}); auto R = make_tensor({batch, m, n}); - A.PrefetchDevice(stream); - Q.PrefetchDevice(stream); - R.PrefetchDevice(stream); - - (A = random({batch, m, n}, NORMAL)).run(stream); + (A = random({batch, m, n}, NORMAL)).run(exec); // warm up nvtxRangePushA("Warmup"); - (mtie(Q, R) = qr(A)).run(stream); + (mtie(Q, R) = qr(A)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); nvtxRangePop(); MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 ) diff --git a/bench/00_transform/svd_power.cu b/bench/00_transform/svd_power.cu index 3e165dc0e..7736a19e2 100644 --- a/bench/00_transform/svd_power.cu +++ b/bench/00_transform/svd_power.cu @@ -17,6 +17,7 @@ void svdpi_batch(nvbench::state &state, cudaStream_t stream = 0; state.set_cuda_stream(nvbench::make_cuda_stream_view(stream)); + cudaExecutor exec{stream}; int batch = state.get_int64("batch"); int m = state.get_int64("rows"); @@ -30,22 +31,17 @@ void svdpi_batch(nvbench::state &state, int iterations = 10; - (A = random({batch, m, n}, NORMAL)).run(stream); - - A.PrefetchDevice(stream); - U.PrefetchDevice(stream); - S.PrefetchDevice(stream); - VT.PrefetchDevice(stream); - - (U = 0).run(stream); - (S = 0).run(stream); - (VT = 0).run(stream); + (A = random({batch, m, n}, NORMAL)).run(exec); + + (U = 0).run(exec); + (S = 0).run(exec); + (VT = 0).run(exec); auto x0 = random({batch, r}, NORMAL); // warm up nvtxRangePushA("Warmup"); - (mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(stream); - cudaDeviceSynchronize(); + (mtie(U, S, VT) = svdpi(A, x0, iterations, 
r)).run(exec); + exec.sync(); nvtxRangePop(); MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 ) @@ -70,6 +66,7 @@ void svdbpi_batch(nvbench::state &state, cudaStream_t stream = 0; state.set_cuda_stream(nvbench::make_cuda_stream_view(stream)); + cudaExecutor exec{stream}; int batch = state.get_int64("batch"); int m = state.get_int64("rows"); @@ -83,21 +80,16 @@ void svdbpi_batch(nvbench::state &state, int iterations = 10; - (A = random({batch, m, n}, NORMAL)).run(stream); - - A.PrefetchDevice(stream); - U.PrefetchDevice(stream); - S.PrefetchDevice(stream); - VT.PrefetchDevice(stream); + (A = random({batch, m, n}, NORMAL)).run(exec); - (U = 0).run(stream); - (S = 0).run(stream); - (VT = 0).run(stream); + (U = 0).run(exec); + (S = 0).run(exec); + (VT = 0).run(exec); // warm up nvtxRangePushA("Warmup"); - (mtie(U, S, VT) = svdbpi(A, iterations)).run(stream); - cudaDeviceSynchronize(); + (mtie(U, S, VT) = svdbpi(A, iterations)).run(exec); + exec.sync(); nvtxRangePop(); MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 ) diff --git a/docs_input/api/index.rst b/docs_input/api/index.rst index 4d6764bb6..63383ef83 100644 --- a/docs_input/api/index.rst +++ b/docs_input/api/index.rst @@ -17,6 +17,7 @@ API Reference casting/index.rst window/index.rst signalimage/index.rst + synchronization/index.rst polynomials/index.rst random/random.rst dft/index.rst diff --git a/docs_input/api/synchronization/index.rst b/docs_input/api/synchronization/index.rst new file mode 100644 index 000000000..0819f6330 --- /dev/null +++ b/docs_input/api/synchronization/index.rst @@ -0,0 +1,10 @@ +.. _synchronization: + +Synchronization +############### + +.. toctree:: + :maxdepth: 1 + :glob: + + * \ No newline at end of file diff --git a/docs_input/api/synchronization/sync.rst b/docs_input/api/synchronization/sync.rst new file mode 100644 index 000000000..27358b7c1 --- /dev/null +++ b/docs_input/api/synchronization/sync.rst @@ -0,0 +1,18 @@ +.. 
_sync_func: + +sync +==== + +Wait for any code running on an executor to complete. + +.. doxygenfunction:: matx::cudaExecutor::sync() +.. doxygenfunction:: matx::HostExecutor::sync() + +Examples +~~~~~~~~ + +.. literalinclude:: ../../../examples/cgsolve.cu + :language: cpp + :start-after: example-begin sync-test-1 + :end-before: example-end sync-test-1 + :dedent: diff --git a/docs_input/notebooks/exercises/example1_assignment1.cu b/docs_input/notebooks/exercises/example1_assignment1.cu index 164d17ed3..14d923a20 100644 --- a/docs_input/notebooks/exercises/example1_assignment1.cu +++ b/docs_input/notebooks/exercises/example1_assignment1.cu @@ -57,8 +57,6 @@ int main() { // t2 = ; /*** End editing ***/ - t2.PrefetchDevice(0); - /**************************************************************************************************** * Get a slice of the second and third rows with all columns * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#slicing-and-dicing diff --git a/docs_input/notebooks/exercises/example2_assignment1.cu b/docs_input/notebooks/exercises/example2_assignment1.cu index 9c1cdd7af..69d2f3e65 100644 --- a/docs_input/notebooks/exercises/example2_assignment1.cu +++ b/docs_input/notebooks/exercises/example2_assignment1.cu @@ -45,6 +45,8 @@ int main() { auto B = make_tensor({2, 3}); auto V = make_tensor({3}); + cudaExecutor exec{}; + /**************************************************************************************************** * Initialize tensor A with increasing values from 0.5 to 3.0 in steps of 0.4, *and tensor V from -1 to -3 in steps of -1. @@ -83,7 +85,7 @@ int main() { /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); step = 0.5; for (int row = 0; row < A.Size(0); row++) { @@ -111,7 +113,7 @@ int main() { /// auto tvs = ; /*** End editing. 
***/ - // cudaStreamSynchronize(0); + // exec.sync(); // step = 0.5; // for (int row = 0; row < A.Size(0); row++) { @@ -137,7 +139,7 @@ int main() { /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); for (int row = 0; row < B.Size(0); row++) { for (int col = 0; col < B.Size(1); col++) { diff --git a/docs_input/notebooks/exercises/example3_assignment1.cu b/docs_input/notebooks/exercises/example3_assignment1.cu index 9a755b8c6..07f37ef14 100644 --- a/docs_input/notebooks/exercises/example3_assignment1.cu +++ b/docs_input/notebooks/exercises/example3_assignment1.cu @@ -45,6 +45,8 @@ int main() { auto A = make_tensor({2, 3}); auto B = make_tensor({2, 3}); + cudaExecutor exec{}; + /**************************************************************************************************** * Use the random number generator with a seed of 12345 to generate * normally-distributed numbers in the tensor A. Next, take the FFT across @@ -63,7 +65,7 @@ int main() { {{0.5646, 0.8638}, {1.6400, 0.3494}, {-0.5709, 0.5919}}}); A.print(); B.print(); - cudaStreamSynchronize(0); + exec.sync(); for (int row = 0; row < A.Size(0); row++) { for (int col = 0; col < A.Size(1); col++) { if (fabs(A(row, col).real() - B(row, col).real()) > 0.001) { @@ -101,7 +103,7 @@ int main() { /*** End editing ***/ // Verify init is correct - cudaStreamSynchronize(0); + exec.sync(); if (fabs(redv() - 1.0) > 0.001) { printf("Mismatch on final reduction. 
Expected=1.0, actual = %f\n", redv()); exit(-1); diff --git a/docs_input/notebooks/exercises/example3_fft_conv.cu b/docs_input/notebooks/exercises/example3_fft_conv.cu index b0a2f675f..d922ec820 100644 --- a/docs_input/notebooks/exercises/example3_fft_conv.cu +++ b/docs_input/notebooks/exercises/example3_fft_conv.cu @@ -37,7 +37,8 @@ using namespace matx; int main() { using complex = cuda::std::complex; - + cudaExecutor exec{}; + index_t signal_size = 16; index_t filter_size = 3; index_t filtered_size = signal_size + filter_size - 1; @@ -73,7 +74,7 @@ int main() { // TODO: Perform a time-domain convolution - cudaStreamSynchronize(0); + exec.sync(); // Compare signals for (index_t i = 0; i < filtered_size; i++) { diff --git a/docs_input/notebooks/exercises/solutions/example1_assignment1.cu b/docs_input/notebooks/exercises/solutions/example1_assignment1.cu index f18b5a80f..0331b965a 100644 --- a/docs_input/notebooks/exercises/solutions/example1_assignment1.cu +++ b/docs_input/notebooks/exercises/solutions/example1_assignment1.cu @@ -49,8 +49,6 @@ int main() { {17, 18, 19, 20}}); /*** End editing ***/ - t2.PrefetchDevice(0); - int count = 1; for (int row = 0; row < t2.Size(0); row++) { for (int col = 0; col < t2.Size(1); col++) { diff --git a/docs_input/notebooks/exercises/solutions/example2_assignment1.cu b/docs_input/notebooks/exercises/solutions/example2_assignment1.cu index 9781389d8..4e0560336 100644 --- a/docs_input/notebooks/exercises/solutions/example2_assignment1.cu +++ b/docs_input/notebooks/exercises/solutions/example2_assignment1.cu @@ -41,6 +41,8 @@ int main() { tensor_t B(shape); tensor_t V({3}); + cudaExecutor exec{}; + /**************************************************************************************************** * Initialize tensor A with increasing values from 0.5 to 3.0 in steps of 0.5, *and tensor V from -1 to -3 in steps of -1. 
@@ -78,10 +80,10 @@ int main() { /**************************************************************************************************** * Add 5.0 to all elements of A and store the results back in A ****************************************************************************************************/ - (A = A + 5.0).run(); + (A = A + 5.0).run(exec); /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); step = 0.5; for (int row = 0; row < A.Size(0); row++) { @@ -107,10 +109,10 @@ int main() { * ****************************************************************************************************/ auto tvs = V.Clone<2>({A.Size(0), matxKeepDim}); - (A = A - tvs).run(); + (A = A - tvs).run(exec); /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); step = 0.5; for (int row = 0; row < A.Size(0); row++) { @@ -134,10 +136,10 @@ int main() { * * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/tensorops.html#_CPPv4N4matx3powE2Op2Op ****************************************************************************************************/ - (B = (pow(A, 2) * 2) - V).run(); + (B = (pow(A, 2) * 2) - V).run(exec); /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); for (int row = 0; row < B.Size(0); row++) { for (int col = 0; col < B.Size(1); col++) { diff --git a/docs_input/notebooks/exercises/solutions/example3_assignment1.cu b/docs_input/notebooks/exercises/solutions/example3_assignment1.cu index 0828788e8..5aff55999 100644 --- a/docs_input/notebooks/exercises/solutions/example3_assignment1.cu +++ b/docs_input/notebooks/exercises/solutions/example3_assignment1.cu @@ -41,6 +41,8 @@ int main() { tensor_t A(shape); tensor_t B(shape); + cudaExecutor exec{}; + /**************************************************************************************************** * Use the random number generator with a seed of 12345 to generate * normally-distributed numbers in the tensor A. 
Next, take the FFT across @@ -51,7 +53,7 @@ int main() { * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#random-numbers * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/random.html ****************************************************************************************************/ - (A = random(A.Shape(), NORMAL, 12345)).run(); + (A = random(A.Shape(), NORMAL, 12345)).run(exec); auto At = A.Permute({1, 0}); fft(At, At); @@ -62,7 +64,7 @@ int main() { {{0.5646, 0.8638}, {1.6400, 0.3494}, {-0.5709, 0.5919}}}); print(A); print(B); - cudaStreamSynchronize(0); + exec.sync(); for (int row = 0; row < A.Size(0); row++) { for (int col = 0; col < A.Size(1); col++) { if (fabs(A(row, col).real() - B(row, col).real()) > 0.001) { @@ -95,15 +97,15 @@ int main() { // Create and initialize 3D tensor auto dv = make_tensor({10, 5, 15}); - (dv = random(dv.Shape(), NORMAL)).run(); + (dv = random(dv.Shape(), NORMAL)).run(exec); tensor_t redv; max(redv, dv, 0); - (dv = dv / redv).run(); + (dv = dv / redv).run(exec); max(redv, dv, 0); /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); // Verify init is correct if (fabs(redv() - 1.0) > 0.001) { printf("Mismatch on final reduction. 
Expected=1.0, actual = %f\n", redv()); diff --git a/examples/black_scholes.cu b/examples/black_scholes.cu index c880a259b..1938cab28 100644 --- a/examples/black_scholes.cu +++ b/examples/black_scholes.cu @@ -93,7 +93,7 @@ void compute_black_scholes_matx(tensor_t& K, tensor_t& r, tensor_t& T, tensor_t& output, - cudaStream_t& stream) + cudaExecutor& exec) { auto VsqrtT = V * sqrt(T); auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ; @@ -102,7 +102,7 @@ void compute_black_scholes_matx(tensor_t& K, auto cdf_d2 = normcdf(d2); auto expRT = exp(-1 * r * T); - (output = S * cdf_d1 - K * expRT * cdf_d2).run(stream); + (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec); } int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) @@ -124,9 +124,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; - // Prefetch all data - compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, stream); + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -135,10 +135,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventRecord(start, stream); // Time non-operator version for (uint32_t i = 0; i < num_iterations; i++) { - compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, stream); + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Time without custom operator = %.2fms per iteration\n", @@ -147,10 +147,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventRecord(start, stream); // Time non-operator version for (uint32_t i = 0; i < num_iterations; i++) { - 
BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(stream); + BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Time with custom operator = %.2fms per iteration\n", diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu index 015bfd031..9027d634d 100644 --- a/examples/cgsolve.cu +++ b/examples/cgsolve.cu @@ -54,6 +54,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto norm = make_tensor({BATCH}); auto maxn = make_tensor({}); + cudaExecutor exec{}; // Simple Poisson matrix for(int b = 0; b < BATCH; b++) { @@ -72,19 +73,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } } - A.PrefetchDevice(0); - B.PrefetchDevice(0); - X.PrefetchDevice(0); - (X = TypeParam(1)).run(); + (X = TypeParam(1)).run(exec); - (X = cgsolve(A, B, .0001, max_iters)).run(); + (X = cgsolve(A, B, .0001, max_iters)).run(exec); + // example-begin sync-test-1 + (Bout = matvec(A, X)).run(exec); + (norm = sum((Bout-B)*(Bout-B))).run(exec); + (maxn = matx::max(sqrt(norm))).run(exec); - (Bout = matvec(A, X)).run(); - (norm = sum((Bout-B)*(Bout-B))).run(); - (maxn = matx::max(sqrt(norm))).run(); - - cudaDeviceSynchronize(); + exec.sync(); + // example-end sync-test-1 printf ("max l2 norm: %f\n", (float)sqrt(maxn())); CUDA_CHECK_LAST_ERROR(); diff --git a/examples/channelize_poly_bench.cu b/examples/channelize_poly_bench.cu index 3665c9223..ac38367a4 100644 --- a/examples/channelize_poly_bench.cu +++ b/examples/channelize_poly_bench.cu @@ -73,6 +73,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop cudaEventCreate(&start); cudaEventCreate(&stop); + cudaExecutor exec{}; for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { for (matx::index_t num_channels = channel_start; num_channels <= channel_stop; 
num_channels++) { @@ -88,18 +89,18 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop const matx::index_t decimation_factor = num_channels; for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) { - (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(stream); + (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec); } - cudaStreamSynchronize(stream); + exec.sync(); float elapsed_ms = 0.0f; cudaEventRecord(start, stream); for (int k = 0; k < NUM_ITERATIONS; k++) { - (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(stream); + (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); CUDA_CHECK_LAST_ERROR(); cudaEventElapsedTime(&elapsed_ms, start, stop); diff --git a/examples/convolution.cu b/examples/convolution.cu index 8d328d3d4..f45b625ad 100644 --- a/examples/convolution.cu +++ b/examples/convolution.cu @@ -53,6 +53,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; cudaEvent_t start, stop; @@ -85,20 +86,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) filterView(f) = filterView(f-1) * 0.99f; } - inView.PrefetchDevice(stream); - filterView.PrefetchDevice(stream); - // Measure recursive runtime - cudaStreamSynchronize(stream); + exec.sync(); cudaEventRecord(start, stream); for (uint32_t i = 0; i < iterations; i++) { - (outView = conv1d(inView, filterView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(stream); + (outView = conv1d(inView, filterView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); time_ms /= static_cast(iterations); @@ -120,9 +118,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] 
char **argv) // {(uint32_t) 10e3 + filter_dim_2d - 1, (uint32_t) 10e3 + filter_dim_2d - // 1}); // auto out2DView = out2DData.View(); - // filter2DData.PrefetchDevice(stream); - // in2DData.PrefetchDevice(stream); - // out2DData.PrefetchDevice(stream); // Measure recursive runtime // cudaEventRecord(start, stream); @@ -136,7 +131,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // } // cudaEventRecord(stop, stream); - // cudaStreamSynchronize(stream); + // exec.sync(); // cudaEventElapsedTime(&time_ms, start, stop); // time_ms /= static_cast(iterations); diff --git a/examples/eigenExample.cu b/examples/eigenExample.cu index 6e53cd701..d204c4be7 100644 --- a/examples/eigenExample.cu +++ b/examples/eigenExample.cu @@ -47,6 +47,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) int dimX = 3; int dimY = 3; + matx::cudaExecutor exec{}; /////////////////////////////////////////////////////////////////////////////// ////////////// Eigen Test Data Setup ////////////// @@ -91,10 +92,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #else std::cout <<"!!!!!!!!! Eigen NOT USED in Test !!!!!!!!!" 
<< std ::endl; // provide data in tensors if eigen is not used - (aTensor = matx::random({dimX, dimY}, matx::UNIFORM)).run(); - (bTensor = matx::random({dimX, dimY}, matx::UNIFORM)).run(); - (complexTensor = matx::random>({2, 2}, matx::UNIFORM)).run(); - (matTensor10x10 = matx::random({10, 10}, matx::UNIFORM)).run(); + (aTensor = matx::random({dimX, dimY}, matx::UNIFORM)).run(exec); + (bTensor = matx::random({dimX, dimY}, matx::UNIFORM)).run(exec); + (complexTensor = matx::random>({2, 2}, matx::UNIFORM)).run(exec); + (matTensor10x10 = matx::random({10, 10}, matx::UNIFORM)).run(exec); #endif @@ -110,16 +111,16 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaMemcpy(complexTensor.Data(), complexMatrix.data(), sizeof(std::complex)*2*2, cudaMemcpyHostToDevice); cudaMemcpy(matTensor10x10.Data(), matrix10x10.data(), sizeof(float)*10*10, cudaMemcpyHostToDevice); - (aTensor = matx::transpose(aTensor)).run(); - // (bTensor = matx::transpose(bTensor)).run(); // do not need to transpose because b has the same layout - (complexTensor = matx::transpose(complexTensor)).run(); - (matTensor10x10 = matx::transpose(matTensor10x10)).run(); + (aTensor = matx::transpose(aTensor)).run(exec); + // (bTensor = matx::transpose(bTensor)).run(exec); // do not need to transpose because b has the same layout + (complexTensor = matx::transpose(complexTensor)).run(exec); + (matTensor10x10 = matx::transpose(matTensor10x10)).run(exec); #endif tensor1D(0) = 1; tensor1D(1) = 2; tensor1D(2) = 3; - cudaDeviceSynchronize(); + exec.sync(); // slower alternative of copying per-element // for(int curX=0; curX; + cudaExecutor exec{}; index_t signal_size = 1ULL << 16; index_t filter_size = 16; @@ -113,39 +114,34 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - // Prefetch the data we just created - sig_time.PrefetchDevice(0); - filt_time.PrefetchDevice(0); - - // Perform the FFT in-place on both signal and filter for (int i = 0; i < iterations; i++) { if (i 
== 1) { cudaEventRecord(start, stream); } - (sig_freq = fft(sig_time, filtered_size)).run(stream); - (filt_freq = fft(filt_time, filtered_size)).run(stream); + (sig_freq = fft(sig_time, filtered_size)).run(exec); + (filt_freq = fft(filt_time, filtered_size)).run(exec); - (sig_freq = sig_freq * filt_freq).run(stream); + (sig_freq = sig_freq * filt_freq).run(exec); // IFFT in-place - (sig_freq = ifft(sig_freq)).run(stream); + (sig_freq = ifft(sig_freq)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&separate_ms, start, stop); for (int i = 0; i < iterations; i++) { if (i == 1) { cudaEventRecord(start, stream); } - (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(stream); + (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&fused_ms, start, stop); printf("FFT runtimes for separate = %.2f ms, fused = %.2f ms\n", separate_ms/(iterations-1), fused_ms/(iterations-1)); @@ -154,9 +150,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // a direct convolution. The conv1d function only accepts a 1D filter, so we // create a sliced view here. 
auto filt1 = filt_time.Slice<1>({0,0}, {matxDropDim, matxEnd}); - (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); // Compare signals for (index_t b = 0; b < batches; b++) { diff --git a/examples/mvdr_beamformer.cu b/examples/mvdr_beamformer.cu index 5096c7236..07fc97ebf 100644 --- a/examples/mvdr_beamformer.cu +++ b/examples/mvdr_beamformer.cu @@ -53,6 +53,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -76,16 +77,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - mvdr.Prefetch(stream); - cudaEventRecord(start, stream); for (uint32_t i = 0; i < num_iterations; i++) { - mvdr.Run(stream); + mvdr.Run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("MVDR Kernel Time = %.2fms per iteration\n", time_ms / num_iterations); diff --git a/examples/mvdr_beamformer.h b/examples/mvdr_beamformer.h index 5fe78ee85..63fef3d0d 100644 --- a/examples/mvdr_beamformer.h +++ b/examples/mvdr_beamformer.h @@ -99,32 +99,33 @@ class MVDRBeamformer { /** * Run the entire beamformer * - * @param stream CUDA stream + * @param exec CUDA executor */ - void Run(cudaStream_t stream) + void Run(cudaExecutor exec) { - (vhView = hermitianT(vView)).run(stream); + cudaStream_t stream = exec.getStream(); + (vhView = hermitianT(vView)).run(exec); - (cbfView = matmul(vhView, inVecView)).run(stream); + (cbfView = matmul(vhView, inVecView)).run(exec); matx::copy(ivsView, inVecView.Slice({0, 0}, {matxEnd, snap_len_}), stream); - (ivshView = hermitianT(ivsView)).run(stream); + (ivshView = hermitianT(ivsView)).run(exec); - (covMatView = matmul(ivsView, ivshView)).run(stream); + 
(covMatView = matmul(ivsView, ivshView)).run(exec); (covMatView = (covMatView * (1.0f / static_cast(snap_len_))) + eye() * load_coeff_) - .run(stream); + .run(exec); inv_impl(invCovMatView, covMatView, stream); // Find A and B to solve xA=B. Matlab uses A/B to solve for x, which is the // same as x = BA^-1 - (abfBView = matmul(invCovMatView, vView)).run(stream); - (abfAView = matmul(vhView, abfBView)).run(stream); + (abfBView = matmul(invCovMatView, vView)).run(exec); + (abfAView = matmul(vhView, abfBView)).run(exec); inv_impl(abfAInvView, abfAView, stream); - (abfWeightsView = matmul(abfBView, abfAInvView)).run(stream); + (abfWeightsView = matmul(abfBView, abfAInvView)).run(exec); } /** diff --git a/examples/pwelch.cu b/examples/pwelch.cu index 84cfb0bc0..ea2679180 100644 --- a/examples/pwelch.cu +++ b/examples/pwelch.cu @@ -62,20 +62,21 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); + cudaExecutor exec{stream}; // Create input signal as a complex exponential auto sample_index_range = range<0>({signal_size},0.f,1.f); auto phase = 2.f * static_cast(M_PI) * ftone * sample_index_range / static_cast(nfft); auto tmp_x = expj(phase); auto x = make_tensor({signal_size}); - (x = tmp_x).run(stream); // pre-compute x, tmp_x is otherwise lazily evaluated + (x = tmp_x).run(exec); // pre-compute x, tmp_x is otherwise lazily evaluated // Create output tensor auto Pxx = make_tensor({nfft}); // Run one time to pre-cache the FFT plan - (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(stream); - cudaStreamSynchronize(stream); + (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec); + exec.sync(); // Start the timing cudaEventRecord(start, stream); @@ -85,11 +86,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (int iteration = 0; iteration < num_iterations; iteration++) { // Use the PWelch operator - (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(stream); + (Pxx = 
pwelch(x, nperseg, noverlap, nfft)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&exec_time_ms, start, stop); printf("Output Pxx:\n"); diff --git a/examples/qr.cu b/examples/qr.cu index 0e53ef950..2953dabcf 100644 --- a/examples/qr.cu +++ b/examples/qr.cu @@ -45,6 +45,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) using AType = cuda::std::complex; cudaStream_t stream = 0; + cudaExecutor exec{stream}; int batch = 1; int m = 4; @@ -56,10 +57,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto Q = make_tensor({batch, m, m}); auto R = make_tensor({batch, m, n}); - (A = random(A.Shape(), NORMAL)).run(stream); + (A = random(A.Shape(), NORMAL)).run(exec); #if 0 - cudaDeviceSynchronize(); + exec.sync(); A(0,0,0) = 10000; A(0,0,1) = 10001; A(0,1,0) = 10001; A(0,1,1) = 10002; A(0,2,0) = 10002; A(0,2,1) = 10003; @@ -67,15 +68,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) A(0,4,0) = 10004; A(0,4,1) = 10005; #endif - A.PrefetchDevice(stream); - Q.PrefetchDevice(stream); - R.PrefetchDevice(stream); + (mtie(Q, R) = qr(A)).run(exec); - (mtie(Q, R) = qr(A)).run(stream); - - (QR = matmul(Q, R)).run(stream); - (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(stream); - cudaDeviceSynchronize(); + (QR = matmul(Q, R)).run(exec); + (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(exec); + exec.sync(); printf("Q:\n"); print(Q); printf("R:\n"); print(R); diff --git a/examples/recursive_filter.cu b/examples/recursive_filter.cu index 49ff039e2..70eee34a7 100644 --- a/examples/recursive_filter.cu +++ b/examples/recursive_filter.cu @@ -63,6 +63,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; cudaEvent_t start, stop; @@ -109,23 +110,20 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - inView.PrefetchDevice(stream); - 
outView.PrefetchDevice(stream); - // Measure recursive runtime - cudaStreamSynchronize(stream); + exec.sync(); cudaEventRecord(start, stream); for (uint32_t i = 0; i < iterations; i++) { // example-begin filter-example-1 // Perform an IIR filter on "inView" with rCoeffs and nrCoeffs recursive/non-recursive // coefficients, respectively - (outView = filter(inView, rCoeffs, nrCoeffs)).run(stream); + (outView = filter(inView, rCoeffs, nrCoeffs)).run(exec); // example-end filter-example-1 } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); time_ms /= static_cast(iterations); diff --git a/examples/resample.cu b/examples/resample.cu index fb814aa58..7927ea34f 100644 --- a/examples/resample.cu +++ b/examples/resample.cu @@ -53,6 +53,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -63,31 +64,31 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) tensor_t sigViewComplex({num_samp / 2 + 1}); tensor_t resampView({num_samp_resamp}); - (sigView = random({num_samp}, NORMAL)).run(stream); + (sigView = random({num_samp}, NORMAL)).run(exec); - (sigViewComplex = fft(sigView)).run(stream); + (sigViewComplex = fft(sigView)).run(exec); // Slice auto sliceView = sigViewComplex.Slice({0}, {nyq}); // Inverse Transform - FFT size based on output - (resampView = ifft(sliceView)).run(stream); + (resampView = ifft(sliceView)).run(exec); cudaEventRecord(start, stream); for (uint32_t i = 0; i < num_iterations; i++) { // Launch 1D FFT - (sigViewComplex = fft(sigView)).run(stream); + (sigViewComplex = fft(sigView)).run(exec); // Slice auto sv = sigViewComplex.Slice({0}, {nyq}); // Inverse Transform - FFT size based on output - (resampView = ifft(sv)).run(stream); + (resampView = ifft(sv)).run(exec); } cudaEventRecord(stop, stream); - 
cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Resample Kernel Time = %.2fms per iteration\n", diff --git a/examples/resample_poly_bench.cu b/examples/resample_poly_bench.cu index 124bc412b..6debd4311 100644 --- a/examples/resample_poly_bench.cu +++ b/examples/resample_poly_bench.cu @@ -123,6 +123,8 @@ void ResamplePolyBench() cudaEventCreate(&start); cudaEventCreate(&stop); + cudaExecutor exec{stream}; + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { const matx::index_t num_batches = test_cases[i].num_batches; const matx::index_t input_len = test_cases[i].input_len; @@ -139,24 +141,24 @@ void ResamplePolyBench() auto filter = matx::make_tensor({filter_len}, MATX_DEVICE_MEMORY); auto output = matx::make_tensor({num_batches, output_len}, MATX_DEVICE_MEMORY); - (input = static_cast(1.0)).run(stream); - (filter = static_cast(1.0)).run(stream); + (input = static_cast(1.0)).run(exec); + (filter = static_cast(1.0)).run(exec); - cudaStreamSynchronize(stream); + exec.sync(); for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) { - (output = matx::resample_poly(input, filter, up, down)).run(stream); + (output = matx::resample_poly(input, filter, up, down)).run(exec); } - cudaStreamSynchronize(stream); + exec.sync(); float elapsed_ms = 0.0f; cudaEventRecord(start, stream); for (int k = 0; k < NUM_ITERATIONS; k++) { - (output = matx::resample_poly(input, filter, up, down)).run(stream); + (output = matx::resample_poly(input, filter, up, down)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); CUDA_CHECK_LAST_ERROR(); cudaEventElapsedTime(&elapsed_ms, start, stop); diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index ae965b6c6..a89cb1b1f 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -71,10 +71,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) 
MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1) printf("Initializing data structures for stream %d...\n", s); pipelines[s] = new RadarPipeline(numPulses, numSamples, waveformLength, numChannels, streams[s]); - pipelines[s]->GetInputView().PrefetchDevice(streams[s]); MATX_NVTX_END_RANGE(1) - cudaStreamSynchronize(streams[s]); + pipelines[s]->sync(); } MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) @@ -129,7 +128,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (int s = 0; s < num_streams; s++) { cudaEventRecord(stops[s], streams[s]); - cudaStreamSynchronize(streams[s]); + pipelines[s]->sync(); } MATX_NVTX_END_RANGE(2) diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index e94e5be21..de08ffd5c 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -135,7 +135,7 @@ class RadarPipeline { RadarPipeline(const index_t _numPulses, const index_t _numSamples, index_t _wfLen, index_t _numChannels, cudaStream_t _stream) : numPulses(_numPulses), numSamples(_numSamples), waveformLength(_wfLen), - numChannels(_numChannels), stream(_stream) + numChannels(_numChannels), stream(_stream), exec(_stream) { numSamplesRnd = 1; while (numSamplesRnd < numSamples) { @@ -202,7 +202,7 @@ class RadarPipeline { // Pre-process CFAR convolution (normT = conv2d(ones({numChannels, numPulsesRnd, numCompressedSamples}), - cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(stream); + cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); cancelMask.PrefetchDevice(stream); ba.PrefetchDevice(stream); @@ -216,6 +216,14 @@ class RadarPipeline { xPow.PrefetchDevice(stream); } + /** + * @brief Sync the pipeline using the underlying executor + * + */ + void sync() { + exec.sync(); + } + /** * @brief Stage 1 - Pulse compression - convolution via FFTs * @@ -242,19 +250,19 @@ class RadarPipeline { // Apply a Hamming window to the 
waveform to suppress sidelobes. Other // windows could be used as well (e.g., Taylor windows). Ultimately, it is // just an element-wise weighting by a pre-computed window function. - (waveformPart = waveformPart * hamming<0>({waveformLength})).run(stream); + (waveformPart = waveformPart * hamming<0>({waveformLength})).run(exec); // compute L2 norm - (norms = sum(norm(waveformPart))).run(stream); - (norms = sqrt(norms)).run(stream); + (norms = sum(norm(waveformPart))).run(exec); + (norms = sqrt(norms)).run(exec); - (waveformPart = waveformPart / norms).run(stream); - (waveformFull = fft(waveformPart, numSamplesRnd)).run(stream); - (waveformFull = conj(waveformFull)).run(stream); + (waveformPart = waveformPart / norms).run(exec); + (waveformFull = fft(waveformPart, numSamplesRnd)).run(exec); + (waveformFull = conj(waveformFull)).run(exec); - (x = fft(x)).run(stream); - (x = x * waveformT).run(stream); - (x = ifft(x)).run(stream); + (x = fft(x)).run(exec); + (x = x * waveformT).run(exec); + (x = ifft(x)).run(exec); } @@ -281,7 +289,7 @@ class RadarPipeline { {0, 0, 0}, {numChannels, numCompressedSamples, numPulses}); auto xo = tpcView.Permute({0, 2, 1}).Slice( {0, 0, 0}, {numChannels, numCompressedSamples, numPulses}); - (xo = conv1d(x, cancelMask, matxConvCorrMode_t::MATX_C_MODE_SAME)).run(stream); + (xo = conv1d(x, cancelMask, matxConvCorrMode_t::MATX_C_MODE_SAME)).run(exec); } /** @@ -309,8 +317,8 @@ class RadarPipeline { (xc = xc * hamming<1>({numChannels, numPulses - (cancelMask.Size(0) - 1), numCompressedSamples})) - .run(stream); - (xf = fft(xf)).run(stream); + .run(exec); + (xf = fft(xf)).run(exec); } /** @@ -350,11 +358,11 @@ class RadarPipeline { */ void CFARDetections() { - (xPow = norm(tpcView)).run(stream); + (xPow = norm(tpcView)).run(exec); // Estimate the background average power in each cell // background_averages = conv2(Xpow, mask, 'same') ./ norm; - (ba = conv2d(xPow, cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(stream); + (ba = 
conv2d(xPow, cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); // Computing number of cells contributing to each cell. // This can be done with a convolution of the cfarMask with @@ -367,7 +375,7 @@ class RadarPipeline { auto baTrim = ba.Slice({0, cfarMaskY / 2, cfarMaskX / 2}, {numChannels, numPulsesRnd + cfarMaskY / 2, numCompressedSamples + cfarMaskX / 2}); - (baTrim = baTrim / normTrim).run(stream); + (baTrim = baTrim / normTrim).run(exec); // The scalar alpha is used as a multiplier on the background averages // to achieve a constant false alarm rate (under certain assumptions); @@ -384,9 +392,9 @@ class RadarPipeline { // efficient as it can avoid repeated loads. #if 0 IFELSE(xPow > normTrim*(pow(pfa, -1.0f/normTrim) - 1.0f)*baTrim, - dets = 1, dets = 0).run(stream); + dets = 1, dets = 0).run(exec); #else - calcDets(dets, xPow, baTrim, normTrim, pfa).run(stream); + calcDets(dets, xPow, baTrim, normTrim, pfa).run(exec); #endif } @@ -457,4 +465,5 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; + cudaExecutor exec; }; diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index ad6b24c4f..b781690a9 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -65,6 +65,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventCreate(&start); cudaEventCreate(&stop); + cudaExecutor exec{stream}; + float fs = 10000; constexpr index_t N = 100000; float amp = static_cast(2 * sqrt(2)); @@ -92,17 +94,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Set up all static buffers // time = np.arange(N) / float(fs) (time = linspace<0>(num_samps, 0.0f, static_cast(N) - 1.0f) / fs) - .run(stream); + .run(exec); // mod = 500 * np.cos(2*np.pi*0.25*time) - (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(stream); + (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(exec); // carrier = amp * np.sin(2*np.pi*3e3*time + modulation) - (carrier = amp * sin(2 * M_PI * 3000 * time + 
modulation)).run(stream); + (carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(exec); // noise = 0.01 * fs / 2 * np.random.randn(time.shape) - (noise = sqrt(0.01 * fs / 2) * random({N}, NORMAL)).run(stream); + (noise = sqrt(0.01 * fs / 2) * random({N}, NORMAL)).run(exec); // noise *= np.exp(-time/5) - (noise = noise * exp(-1.0f * time / 5.0f)).run(stream); + (noise = noise * exp(-1.0f * time / 5.0f)).run(exec); // x = carrier + noise - (x = carrier + noise).run(stream); + (x = carrier + noise).run(exec); for (uint32_t i = 0; i < num_iterations; i++) { if (i == 2) { // Start timer on third loop to allow generation of plot @@ -112,15 +114,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // DFT Sample Frequencies (rfftfreq) (freqs = (1.0 / (static_cast(nfft) * 1 / fs)) * linspace<0>(half_win, 0.0f, static_cast(nfft) / 2.0f)) - .run(stream); + .run(exec); // Create overlapping matrix of segments. auto stackedMatrix = overlap(x, {nperseg}, {nstep}); // FFT along rows - (fftStackedMatrix = fft(stackedMatrix)).run(stream); + (fftStackedMatrix = fft(stackedMatrix)).run(exec); // Absolute value (fftStackedMatrix = conj(fftStackedMatrix) * fftStackedMatrix) - .run(stream); + .run(exec); // Get real part and transpose auto Sxx = fftStackedMatrix.RealView().Permute({1, 0}); @@ -128,7 +130,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (s_time = linspace<0>(s_time_shape, static_cast(nperseg) / 2.0f, static_cast(N - nperseg) / 2.0f + 1) / fs) - .run(stream); + .run(exec); if (i == 1) { #if MATX_ENABLE_VIZ @@ -142,7 +144,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time Without Graphs = %.2fus per iteration\n", diff --git a/examples/spectrogram_graph.cu b/examples/spectrogram_graph.cu index 9271c3fc0..c32534cca 100644 --- a/examples/spectrogram_graph.cu 
+++ b/examples/spectrogram_graph.cu @@ -62,6 +62,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; + cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); @@ -93,17 +95,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Set up all static buffers // time = np.arange(N) / float(fs) (time = linspace<0>(num_samps, 0.0f, static_cast(N) - 1.0f) / fs) - .run(stream); + .run(exec); // mod = 500 * np.cos(2*np.pi*0.25*time) - (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(stream); + (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(exec); // carrier = amp * np.sin(2*np.pi*3e3*time + modulation) - (carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(stream); + (carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(exec); // noise = 0.01 * fs / 2 * np.random.randn(time.shape) - (noise = sqrt(0.01 * fs / 2) * random({N}, NORMAL)).run(stream); + (noise = sqrt(0.01 * fs / 2) * random({N}, NORMAL)).run(exec); // noise *= np.exp(-time/5) - (noise = noise * exp(-1.0f * time / 5.0f)).run(stream); + (noise = noise * exp(-1.0f * time / 5.0f)).run(exec); // x = carrier + noise - (x = carrier + noise).run(stream); + (x = carrier + noise).run(exec); for (uint32_t i = 0; i < 2; i++) { // Record graph on second loop to get rid of plan caching in the graph @@ -114,15 +116,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // DFT Sample Frequencies (rfftfreq) (freqs = (1.0 / (static_cast(nfft) * 1 / fs)) * linspace<0>(half_win, 0.0f, static_cast(nfft) / 2.0f)) - .run(stream); + .run(exec); // Create overlapping matrix of segments. 
auto stackedMatrix = overlap(x, {nperseg}, {nstep}); // FFT along rows - (fftStackedMatrix = fft(stackedMatrix)).run(stream); + (fftStackedMatrix = fft(stackedMatrix)).run(exec); // Absolute value (fftStackedMatrix = conj(fftStackedMatrix) * fftStackedMatrix) - .run(stream); + .run(exec); // Get real part and transpose [[maybe_unused]] auto Sxx = fftStackedMatrix.RealView().Permute({1, 0}); @@ -130,7 +132,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (s_time = linspace<0>(s_time_shape, static_cast(nperseg) / 2.0f, static_cast(N - nperseg) / 2.0f + 1) / fs) - .run(stream); + .run(exec); if (i == 1) { cudaStreamEndCapture(stream, &graph); @@ -146,14 +148,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } - cudaStreamSynchronize(0); + exec.sync(); // Time graph execution of same kernels cudaEventRecord(start, stream); for (uint32_t i = 0; i < 10; i++) { cudaGraphLaunch(instance, stream); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time With Graphs = %.2fus per iteration\n", diff --git a/examples/spherical_harmonics.cu b/examples/spherical_harmonics.cu index e74fae638..35d650f5f 100644 --- a/examples/spherical_harmonics.cu +++ b/examples/spherical_harmonics.cu @@ -49,6 +49,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); + cudaExecutor exec{}; + using ValueType = double; int l = 3; @@ -76,9 +78,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto Y = make_tensor(Ym.Shape()); auto Z = make_tensor(Zm.Shape()); - (X = Xm, Y = Ym, Z=Zm).run(); + (X = Xm, Y = Ym, Z=Zm).run(exec); - cudaDeviceSynchronize(); + exec.sync(); #if MATX_ENABLE_VIZ matx::viz::surf(X, Y, Z, "test-viz.html"); diff --git a/examples/svd_power.cu b/examples/svd_power.cu index 6c86167fe..840688466 100644 --- a/examples/svd_power.cu +++ b/examples/svd_power.cu @@ -47,6 +47,7 @@ int 
main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) using SType = float; cudaStream_t stream = 0; + cudaExecutor exec{stream}; int m = 5; int n = 4; @@ -70,7 +71,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto VTV = make_tensor({batch, k, k}); auto x0 = random({batch, d}, NORMAL); - (A = random({batch, m, n}, NORMAL)).run(stream); + (A = random({batch, m, n}, NORMAL)).run(exec); #else auto A = make_tensor({m, n}); @@ -87,7 +88,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto VTV = make_tensor({k, k}); auto x0 = random({d}, NORMAL); - (A = random({m, n}, NORMAL)).run(stream); + (A = random({m, n}, NORMAL)).run(exec); #endif std::array Dshape; @@ -103,18 +104,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) printf("iterations: %d\n", iterations); - A.PrefetchDevice(stream); - U.PrefetchDevice(stream); - S.PrefetchDevice(stream); - VT.PrefetchDevice(stream); + (U = 0).run(exec); + (S = 0).run(exec); + (VT = 0).run(exec); - (U = 0).run(stream); - (S = 0).run(stream); - (VT = 0).run(stream); + (mtie(U, S, VT) = svdpi(A, x0, iterations, k)).run(exec); - (mtie(U, S, VT) = svdpi(A, x0, iterations, k)).run(stream); - - cudaDeviceSynchronize(); + exec.sync(); printf("svdpi:\n"); printf("S\n"); @@ -126,29 +122,29 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) if( m <= n) { printf("UUT:\n"); - (UUT = matmul(U, conj(transpose_matrix(U)))).run(stream); + (UUT = matmul(U, conj(transpose_matrix(U)))).run(exec); print(UUT); } printf("UTU:\n"); - (UTU = matmul(conj(transpose_matrix(U)), U)).run(stream); + (UTU = matmul(conj(transpose_matrix(U)), U)).run(exec); print(UTU); if( n >= m) { printf("VVT:\n"); - (VVT = matmul(conj(transpose_matrix(VT)), VT)).run(stream); + (VVT = matmul(conj(transpose_matrix(VT)), VT)).run(exec); print(VVT); } printf("VTV:\n"); - (VTV = matmul(VT, conj(transpose_matrix(VT)))).run(stream); // works on r x r + (VTV = matmul(VT, 
conj(transpose_matrix(VT)))).run(exec); // works on r x r print(VTV); // scale U by eigen values (equivalent to matmul of the diagonal matrix) - (UD = U * D).run(stream); + (UD = U * D).run(exec); - (UDVT = matmul(UD, VT)).run(stream); + (UDVT = matmul(UD, VT)).run(exec); printf("A\n"); print(A); @@ -156,7 +152,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) printf("UDVT\n"); print(UDVT); - (A = A - UDVT).run(stream); + (A = A - UDVT).run(exec); printf("A-UDVT\n"); print(A); @@ -165,13 +161,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Same as above but with svdbpi { - (U = 0).run(stream); - (S = 0).run(stream); - (VT = 0).run(stream); + (U = 0).run(exec); + (S = 0).run(exec); + (VT = 0).run(exec); // TODO add k - (mtie(U, S, VT) = svdbpi(A, iterations, tol)).run(stream); + (mtie(U, S, VT) = svdbpi(A, iterations, tol)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); printf("svdbpi:\n"); printf("S\n"); @@ -183,29 +179,29 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) if( m <= n) { printf("UUT:\n"); - (UUT = matmul(U, conj(transpose_matrix(U)))).run(stream); + (UUT = matmul(U, conj(transpose_matrix(U)))).run(exec); print(UUT); } printf("UTU:\n"); - (UTU = matmul(conj(transpose_matrix(U)), U)).run(stream); + (UTU = matmul(conj(transpose_matrix(U)), U)).run(exec); print(UTU); if( n >= m) { printf("VVT:\n"); - (VVT = matmul(conj(transpose_matrix(VT)), VT)).run(stream); + (VVT = matmul(conj(transpose_matrix(VT)), VT)).run(exec); print(VVT); } printf("VTV:\n"); - (VTV = matmul(VT, conj(transpose_matrix(VT)))).run(stream); // works on r x r + (VTV = matmul(VT, conj(transpose_matrix(VT)))).run(exec); // works on r x r print(VTV); // scale U by eigen values (equivalent to matmul of the diagonal matrix) - (UD = U * D).run(stream); + (UD = U * D).run(exec); - (UDVT = matmul(UD, VT)).run(stream); + (UDVT = matmul(UD, VT)).run(exec); printf("A\n"); print(A); @@ -213,7 +209,7 @@ int 
main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) printf("UDVT\n"); print(UDVT); - (A = A - UDVT).run(stream); + (A = A - UDVT).run(exec); printf("A-UDVT\n"); print(A); diff --git a/include/matx/executors/device.h b/include/matx/executors/device.h index 7125214be..b293ec1d8 100644 --- a/include/matx/executors/device.h +++ b/include/matx/executors/device.h @@ -63,11 +63,17 @@ namespace matx */ cudaExecutor() : stream_(0) {} - /* - * @breif Returns stream associated with executor - */ + /** + * @brief Returns stream associated with executor + */ auto getStream() const { return stream_; } + /** + * @brief Synchronize the cuda executor's stream + * + */ + void sync() { cudaStreamSynchronize(stream_); } + /** * Execute an operator on a device * diff --git a/include/matx/executors/host.h b/include/matx/executors/host.h index 969de943a..c17c96cf4 100644 --- a/include/matx/executors/host.h +++ b/include/matx/executors/host.h @@ -71,6 +71,12 @@ class HostExecutor { HostExecutor(const HostExecParams ¶ms = HostExecParams{}) : params_(params) {} + /** + * @brief Synchronize the host executor's threads. Currently a noop as the executor is single-threaded. 
+ * + */ + void sync() {} + /** * @brief Execute an operator * diff --git a/test/00_io/FileIOTests.cu b/test/00_io/FileIOTests.cu index 2769f361e..0b5d938bd 100644 --- a/test/00_io/FileIOTests.cu +++ b/test/00_io/FileIOTests.cu @@ -153,7 +153,7 @@ TYPED_TEST(FileIoTestsNonComplexFloatTypes, MATWriteRank5) (t = random(t.Shape(), UNIFORM)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); // Read "myvar" from mat file io::write_mat(t, "test_write.mat", "myvar"); @@ -182,7 +182,7 @@ TYPED_TEST(FileIoTestsNonComplexFloatTypes, MATWriteRank5GetShape) (t = random(t.Shape(), UNIFORM)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); // Read "myvar" from mat file io::write_mat(t, "test_write.mat", "myvar"); @@ -213,7 +213,7 @@ TYPED_TEST(FileIoTestsComplexFloatTypes, MATWriteRank5GetShape) (t = random(t.Shape(), UNIFORM)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); // Read "myvar" from mat file io::write_mat(t, "test_write.mat", "myvar"); diff --git a/test/00_operators/GeneratorTests.cu b/test/00_operators/GeneratorTests.cu index a4c92597a..a1b4f4a44 100644 --- a/test/00_operators/GeneratorTests.cu +++ b/test/00_operators/GeneratorTests.cu @@ -161,7 +161,7 @@ TYPED_TEST(BasicGeneratorTestsAll, Diag) // Assign the diagonal elements of `tc` to `td`. 
(td = diag(tc)).run(exec); // example-end diag-op-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < 10; i++) { for (int j = 0; j < 10; j++) { @@ -177,11 +177,11 @@ TYPED_TEST(BasicGeneratorTestsAll, Diag) { auto delta = make_tensor({1}); delta(0) = static_cast(1.0); - cudaStreamSynchronize(0); + exec.sync(); (td = 0).run(exec); (td = diag(conv1d(tc, delta, MATX_C_MODE_SAME))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < 10; i++) { for (int j = 0; j < 10; j++) { @@ -209,7 +209,7 @@ TYPED_TEST(BasicGeneratorTestsFloat, Alternate) (td = alternate(10)).run(exec); // example-end alternate-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < 10; i++) { MATX_ASSERT_EQ(td(i), (TestType)-2* (TestType)(i&1) + (TestType)1) @@ -235,7 +235,7 @@ TEST(OperatorTests, Kron) (ov = kron(eye({4, 4}), bv)).run(exec); // example-end kron-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, ov, "square", 0); tensor_t av({2, 3}); @@ -246,7 +246,7 @@ TEST(OperatorTests, Kron) // Explicit shape specified in ones() (ov2 = kron(av, ones({2, 2}))).run(exec); // example-end ones-gen-test-2 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, ov2, "rect", 0); MATX_EXIT_HANDLER(); @@ -277,7 +277,7 @@ TEST(OperatorTests, MeshGrid) (yv = yy).run(exec); // example-end meshgrid-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, xv, "X", 0); MATX_TEST_ASSERT_COMPARE(pb, yv, "Y", 0); @@ -303,18 +303,18 @@ TYPED_TEST(BasicGeneratorTestsFloatNonComplex, FFTFreq) // Generate FFT frequencies using the length of the "t1" tensor and assign to t1 (t1 = fftfreq(t1.Size(0))).run(exec); // example-end fftfreq-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t1, "F1", 0.1); (t2 = fftfreq(t2.Size(0))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t2, "F2", 0.1); // example-begin 
fftfreq-gen-test-2 // Generate FFT frequencies using the length of the "t1" tensor and a sample spacing of 0.5 and assign to t1 (t1 = fftfreq(t1.Size(0), 0.5)).run(exec); // example-end fftfreq-gen-test-2 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t1, "F3", 0.1); MATX_EXIT_HANDLER(); @@ -336,7 +336,7 @@ TYPED_TEST(BasicGeneratorTestsAll, Zeros) (t1 = zeros()).run(exec); // example-end zeros-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { if constexpr (IsHalfType()) { @@ -362,7 +362,7 @@ TYPED_TEST(BasicGeneratorTestsAll, Ones) (t1 = ones()).run(exec); // example-end ones-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { if constexpr (IsHalfType()) { @@ -390,7 +390,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Range) // Generate a sequence of 100 numbers starting at 1 and spaced by 1 (t1 = range<0>(t1.Shape(), 1, 1)).run(exec); // example-end range-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); TestType one = 1; TestType two = 1; @@ -403,7 +403,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Range) { (t1 = t1 * t1).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType it = static_cast>(i); @@ -413,7 +413,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Range) { (t1 = t1 * two).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType it = static_cast>(i); @@ -424,7 +424,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Range) { (t1 = three * t1).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType it = static_cast>(i); @@ -451,7 +451,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Linspace) // with `count` points in between (t1 = linspace<0>(t1.Shape(), (TestType)1, (TestType)100)).run(exec); // example-end linspace-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t 
i = 0; i < count; i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), i + 1)); @@ -459,7 +459,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Linspace) { (t1 = t1 + t1).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (i + 1) + (i + 1))); @@ -468,7 +468,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Linspace) { (t1 = (TestType)1 + t1).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { EXPECT_TRUE( @@ -478,7 +478,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Linspace) { (t1 = t1 + (TestType)2).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (i + 1) + (i + 1) + 1 + 2)); @@ -506,7 +506,7 @@ TYPED_TEST(BasicGeneratorTestsFloatNonComplex, Logspace) (t1 = logspace<0>(s, start, stop)).run(exec); // example-end logspace-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); // Use doubles for verification since half operators have no equivalent host // types @@ -563,7 +563,7 @@ TYPED_TEST(BasicGeneratorTestsNumeric, Eye) TestType one = 1.0f; TestType zero = 0.0f; - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { for (index_t j = 0; j < count; j++) { @@ -626,7 +626,7 @@ TYPED_TEST(BasicGeneratorTestsNumeric, Diag) TestType zero = 0.0f; - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { for (index_t j = 0; j < count; j++) { diff --git a/test/00_operators/OperatorTests.cu b/test/00_operators/OperatorTests.cu index 208dea374..208e31cb4 100644 --- a/test/00_operators/OperatorTests.cu +++ b/test/00_operators/OperatorTests.cu @@ -292,11 +292,11 @@ TYPED_TEST(OperatorTestsFloatAllExecs, IsClose) auto B = make_tensor({5, 5, 5}); auto C = make_tensor({5, 5, 5}); - (A = ones(A.Shape())).run(); - (B = ones(B.Shape())).run(); - (C = isclose(A, B)).run(); + (A = ones(A.Shape())).run(exec); + 
(B = ones(B.Shape())).run(exec); + (C = isclose(A, B)).run(exec); // example-end isclose-test-1 - cudaStreamSynchronize(0); + exec.sync(); for(int i=0; i < A.Size(0); i++) { for(int j=0; j < A.Size(1); j++) { @@ -307,8 +307,8 @@ TYPED_TEST(OperatorTestsFloatAllExecs, IsClose) } B(1,1,1) = 2; - (C = isclose(A, B)).run(); - cudaStreamSynchronize(0); + (C = isclose(A, B)).run(exec); + exec.sync(); for(int i=0; i < A.Size(0); i++) { for(int j=0; j < A.Size(1); j++) { @@ -349,7 +349,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, Frexp) (tofrac = ofrac, toint = oint).run(exec); // example-end frexp-test-1 - cudaStreamSynchronize(0); + exec.sync(); int texp; for (int i = 0; i < tiv0.Size(0); i++) { @@ -396,7 +396,7 @@ TYPED_TEST(OperatorTestsComplexNonHalfTypesAllExecs, Frexpc) toint_imag = oint_imag).run(exec); // example-end frexpc-test-1 - cudaStreamSynchronize(0); + exec.sync(); int texp_real, texp_imag; for (int i = 0; i < tiv0.Size(0); i++) { if constexpr (std::is_same_v>) { @@ -438,7 +438,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, FMod) tiv1() = (TestType)3.1; (tov0 = fmod(tiv0, tiv1)).run(exec); // example-end fmod-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_fmod((TestType)5.0, (TestType)3.1))); MATX_EXIT_HANDLER(); @@ -460,73 +460,73 @@ TYPED_TEST(OperatorTestsFloatAllExecs, TrigFuncs) // example-begin sin-test-1 (tov0 = sin(tiv0)).run(exec); // example-end sin-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_sin(c))); // example-begin cos-test-1 (tov0 = cos(tiv0)).run(exec); // example-end cos-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_cos(c))); // example-begin tan-test-1 (tov0 = tan(tiv0)).run(exec); // example-end tan-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), 
detail::_internal_tan(c))); // example-begin asin-test-1 (tov0 = asin(tiv0)).run(exec); // example-end asin-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_asin(c))); // example-begin acos-test-1 (tov0 = acos(tiv0)).run(exec); // example-end acos-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_acos(c))); // example-begin atan-test-1 (tov0 = atan(tiv0)).run(exec); // example-end atan-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_atan(c))); // example-begin sinh-test-1 (tov0 = sinh(tiv0)).run(exec); // example-end sinh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_sinh(c))); // example-begin cosh-test-1 (tov0 = cosh(tiv0)).run(exec); // example-end cosh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_cosh(c))); // example-begin tanh-test-1 (tov0 = tanh(tiv0)).run(exec); // example-end tanh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_tanh(c))); // example-begin asinh-test-1 (tov0 = asinh(tiv0)).run(exec); // example-end asinh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_asinh(c))); // example-begin acosh-test-1 (tov0 = acosh(tiv0)).run(exec); // example-end acosh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_acosh(c))); // example-begin atanh-test-1 (tov0 = atanh(tiv0)).run(exec); // example-end atanh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_atanh(c))); MATX_EXIT_HANDLER(); @@ -549,7 +549,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, AngleOp) // 
example-begin angle-test-1 (tov0 = angle(tiv0)).run(exec); // example-end angle-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_angle(c))); MATX_EXIT_HANDLER(); @@ -590,7 +590,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -625,7 +625,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -660,7 +660,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -697,7 +697,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -734,7 +734,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -759,11 +759,11 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) delta(0,0) = static_cast::type>(1.0); - cudaDeviceSynchronize(); + exec.sync(); (tov = clone<3>(conv2d(tiv, delta, MATX_C_MODE_SAME), {N, matxKeepDim, matxKeepDim})).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -797,12 +797,12 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AtOp) // `t1(3)` after execution (t0 = at(t1, 3)).run(exec); // example-end at-test-1 - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t0(), t1(3)); (t0 = at(t2, 1, 4)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t0(), t2(1, 4)); @@ -810,7 +810,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AtOp) using ComplexType = 
detail::complex_from_scalar_t; auto c0 = make_tensor({}); (c0 = at(fft(t1), 0)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); // The first component of the FFT output (DC) is the sum of all elements, so // 10+20+...+100 = 550. The imaginary component should be 0. @@ -869,7 +869,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SliceOp) (t2 = linspace<1>(t2.Shape(), (inner_type)0, (inner_type)10)).run(exec); (t3 = linspace<2>(t3.Shape(), (inner_type)0, (inner_type)10)).run(exec); (t4 = linspace<3>(t4.Shape(), (inner_type)0, (inner_type)10)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); // Slice with different start and end points in each dimension auto t2t = slice(t2, {1, 2}, {3, 5}); @@ -916,7 +916,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SliceOp) // Test SliceOp applied to a transform, using transpose() as an example transform auto t2trans = make_tensor({3, 2}); (t2trans = slice(transpose(t2), {2, 1}, {5, 3})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t2trans.Size(0), 3); ASSERT_EQ(t2trans.Size(1), 2); @@ -932,7 +932,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SliceOp) auto t2s = slice(t2, {t2.Size(0) - 4, t2.Size(1) - 5}, {matxEnd, matxEnd}); // example-end slice-test-4 - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t2sn.Size(0), t2s.Size(0)); ASSERT_EQ(t2sn.Size(1), t2s.Size(1)); for (index_t i = 0; i < t2sn.Size(0); i++) { @@ -956,7 +956,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SliceAndReduceOp) tensor_t t3t{{30, 20, 10}}; (t2t = linspace<1>(t2t.Shape(), (inner_type)0, (inner_type)10)).run(exec); (t3t = linspace<2>(t3t.Shape(), (inner_type)0, (inner_type)10)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); { index_t j = 0; @@ -1074,7 +1074,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1100,7 +1100,7 @@ 
TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1121,7 +1121,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1142,7 +1142,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1167,7 +1167,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1192,7 +1192,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1238,7 +1238,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) // Remap 2D operator "tiv" by selecting elements from dimension 0 stored in "idx" (tov = remap<0>(tiv, idx)).run(exec); // example-end remap-test-1 - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1247,7 +1247,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1259,7 +1259,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) // Remap 2D operator "tiv" by selecting elements from dimensions 0 and 1 stored in "idx" (tov = remap<0,1>(tiv, idx, idx)).run(exec); // example-end remap-test-2 - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; 
j++) { @@ -1279,9 +1279,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) (tov = (TestType)0).run(exec); - (remap<0>(tov, idx) = tiv).run(); + (remap<0>(tov, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1291,7 +1291,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) (tov = (TestType)0).run(exec); (remap<1>(tov, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1301,7 +1301,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) (tov = (TestType)0).run(exec); (remap<0,1>(tov, idx, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1320,7 +1320,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (tov = remap<0>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1329,7 +1329,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1338,7 +1338,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (tov = remap<0,1>(tiv, idx, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1357,7 +1357,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (remap<0>(tov, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1366,7 +1366,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (remap<1>(tov, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1375,7 +1375,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (remap<0,1>(tov, idx, idx) = 
tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1396,7 +1396,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, N}); (tov = remap<0>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < N ; j++) { @@ -1409,7 +1409,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({N, M}); (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < M ; j++) { @@ -1422,7 +1422,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, M}); (tov = remap<0,1>(tiv, idx, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < M ; j++) { @@ -1444,7 +1444,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, N}); (tov = remap<0>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < N ; j++) { @@ -1457,7 +1457,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({N, M}); (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < M ; j++) { @@ -1479,7 +1479,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, N}); (tov = remap<0>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < N ; j++) { @@ -1492,7 +1492,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({N, M}); (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < M ; j++) { @@ -1505,7 +1505,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, M}); (tov = remap<0,1>(tiv, idx, 
idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < M ; j++) { @@ -1526,32 +1526,26 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapRankZero) ExecType exec{}; - auto sync = []() constexpr { - if constexpr (std::is_same_v) { - cudaDeviceSynchronize(); - } - }; - const int N = 16; // 1D source tensor cases { auto from = make_tensor({N}); (from = range<0>({N}, 0, 1)).run(exec); - sync(); + exec.sync(); auto ind = make_tensor({}); auto r = remap<0>(from, ind); auto to = make_tensor({1}); ind() = N/2; (to = r).run(exec); - sync(); + exec.sync(); ASSERT_EQ(to(0), N/2); ind() = N/4; (to = r).run(exec); - sync(); + exec.sync(); ASSERT_EQ(to(0), N/4); } @@ -1560,7 +1554,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapRankZero) { auto from = make_tensor({N,N}); (from = ones()).run(exec); - sync(); + exec.sync(); auto i0 = make_tensor({}); auto i1 = make_tensor({}); @@ -1575,7 +1569,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapRankZero) from(0,N/2) = 3; (to0 = r0).run(exec); (to1 = r1).run(exec); - sync(); + exec.sync(); ASSERT_EQ(to0(0,0), 2); ASSERT_EQ(to0(0,1), 1); @@ -1589,7 +1583,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapRankZero) // Select a single entry from the 2D input tensor auto entry = make_tensor({1,1}); (entry = remap<0,1>(from, i0, i1)).run(exec); - sync(); + exec.sync(); ASSERT_EQ(entry(0,0), 11); } @@ -1613,13 +1607,13 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, RealImagOp) // example-begin real-test-1 (tov0 = real(tiv0)).run(exec); // example-end real-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c.real())); // example-begin imag-test-1 (tov0 = imag(tiv0)).run(exec); // example-end imag-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c.imag())); MATX_EXIT_HANDLER(); @@ -1646,19 +1640,19 @@ TYPED_TEST(OperatorTestsAllExecs, OperatorFuncs) // example-begin IFELSE-test-1 
IFELSE(tiv0 == d, tov0 = z, tov0 = d).run(exec); // example-end IFELSE-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), z)); IFELSE(tiv0 == d, tov0 = tiv0, tov0 = d).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), tiv0())); IFELSE(tiv0 != d, tov0 = d, tov0 = z).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), z)); (tov0 = c, tov00 = c).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov00(), c)); @@ -1674,20 +1668,14 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Abs2) ExecType exec{}; - auto sync = []() constexpr { - if constexpr (std::is_same_v) { - cudaDeviceSynchronize(); - } - }; - if constexpr (std::is_same_v> && std::is_same_v) { // example-begin abs2-test-1 auto x = make_tensor>({}); auto y = make_tensor({}); x() = { 1.5f, 2.5f }; - (y = abs2(x)).run(); - cudaDeviceSynchronize(); + (y = abs2(x)).run(exec); + exec.sync(); ASSERT_NEAR(y(), 1.5f*1.5f+2.5f*2.5f, 1.0e-6); // example-end abs2-test-1 } @@ -1697,12 +1685,12 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Abs2) if constexpr (is_complex_v) { x() = TestType{2.0, 2.0}; (y = abs2(x)).run(exec); - sync(); + exec.sync(); ASSERT_NEAR(y(), 8.0, 1.0e-6); } else { x() = 2.0; (y = abs2(x)).run(exec); - sync(); + exec.sync(); ASSERT_NEAR(y(), 4.0, 1.0e-6); // Test with higher rank tensor @@ -1717,7 +1705,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Abs2) } (y3 = abs2(x3)).run(exec); - sync(); + exec.sync(); for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { @@ -1748,7 +1736,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, OperatorFuncsR2C) // example-begin expj-test-1 (tov0 = expj(tiv0)).run(exec); // example-end expj-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare( tov0(), @@ -1773,43 +1761,43 @@ 
TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, OperatorFuncs) // example-begin log10-test-1 (tov0 = log10(tiv0)).run(exec); // example-end log10-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_log10(c))); // example-begin log-test-1 (tov0 = log(tiv0)).run(exec); // example-end log-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_log(c))); // example-begin log2-test-1 (tov0 = log2(tiv0)).run(exec); // example-end log2-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_log2(c))); // example-begin floor-test-1 (tov0 = floor(tiv0)).run(exec); // example-end floor-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_floor(c))); // example-begin ceil-test-1 (tov0 = ceil(tiv0)).run(exec); // example-end ceil-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_ceil(c))); // example-begin round-test-1 (tov0 = round(tiv0)).run(exec); // example-end round-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_round(c))); // example-begin sqrt-test-1 (tov0 = sqrt(tiv0)).run(exec); // example-end sqrt-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_sqrt(c))); MATX_EXIT_HANDLER(); @@ -1827,14 +1815,14 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, NDOperatorFuncs) auto a = make_tensor({1,2,3,4,5}); auto b = make_tensor({1,2,3,4,5}); (a = ones(a.Shape())).run(exec); - cudaDeviceSynchronize(); + exec.sync(); (b = ones(b.Shape())).run(exec); - cudaDeviceSynchronize(); + exec.sync(); (a = a + b).run(exec); auto t0 = make_tensor({}); (t0 = sum(a)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t0(), static_cast(2 * 
a.TotalSize())); MATX_EXIT_HANDLER(); } @@ -1857,13 +1845,13 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, OperatorFuncs) // example-begin max-el-test-1 (tov0 = max(tiv0, d)).run(exec); // example-end max-el-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), std::max(c, d))); // example-begin min-el-test-1 (tov0 = min(tiv0, d)).run(exec); // example-end min-el-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), std::min(c, d))); // These operators convert type T into bool @@ -1872,37 +1860,37 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, OperatorFuncs) // example-begin lt-test-1 (tob = tiv0 < d).run(exec); // example-end lt-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c < d)); // example-begin gt-test-1 (tob = tiv0 > d).run(exec); // example-end gt-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c > d)); // example-begin lte-test-1 (tob = tiv0 <= d).run(exec); // example-end lte-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c <= d)); // example-begin gte-test-1 (tob = tiv0 >= d).run(exec); // example-end gte-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c >= d)); // example-begin eq-test-1 (tob = tiv0 == d).run(exec); // example-end eq-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c == d)); // example-begin neq-test-1 (tob = tiv0 != d).run(exec); // example-end neq-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c != d)); MATX_EXIT_HANDLER(); @@ -1924,7 +1912,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, OperatorFuncDivComplex) tiv0() = c; (tov0 = s / tiv0).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), s / 
tiv0())); MATX_EXIT_HANDLER(); @@ -1947,50 +1935,50 @@ TYPED_TEST(OperatorTestsNumericAllExecs, OperatorFuncs) // example-begin add-test-1 (tov0 = tiv0 + tiv0).run(exec); // example-end add-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c + c)); // example-begin sub-test-1 (tov0 = tiv0 - tiv0).run(exec); // example-end sub-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c - c)); // example-begin mul-test-1 (tov0 = tiv0 * tiv0).run(exec); // example-end mul-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c * c)); // example-begin div-test-1 (tov0 = tiv0 / tiv0).run(exec); // example-end div-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c / c)); // example-begin neg-test-1 (tov0 = -tiv0).run(exec); // example-end neg-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), -c)); // example-begin IF-test-1 IF(tiv0 == tiv0, tov0 = c).run(exec); // example-end IF-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c)); TestType p = 2.0f; // example-begin pow-test-1 (tov0 = as_type(pow(tiv0, p))).run(exec); // example-end pow-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_pow(c, p))); TestType three = 3.0f; - (tov0 = tiv0 * tiv0 * (tiv0 + tiv0) / tiv0 + three).run(); - cudaStreamSynchronize(0); + (tov0 = tiv0 * tiv0 * (tiv0 + tiv0) / tiv0 + three).run(exec); + exec.sync(); TestType res; res = c * c * (c + c) / c + three; @@ -2005,9 +1993,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, OperatorFuncs) } auto tob = make_tensor({}); // example-begin nan-test-1 - (tob = matx::isnan(nan)).run(); + (tob = matx::isnan(nan)).run(exec); // example-end nan-test-1 - cudaDeviceSynchronize(); + exec.sync(); 
EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), std::is_floating_point_v ? true : false)); auto notnanorinf = make_tensor({}); @@ -2016,8 +2004,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, OperatorFuncs) } else { notnanorinf() = 0; } - (tob = matx::isnan(notnanorinf)).run(); - cudaDeviceSynchronize(); + (tob = matx::isnan(notnanorinf)).run(exec); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), false)); auto inf = make_tensor({}); @@ -2028,13 +2016,13 @@ TYPED_TEST(OperatorTestsNumericAllExecs, OperatorFuncs) inf() = std::numeric_limits::infinity(); } // example-begin inf-test-1 - (tob = matx::isinf(inf)).run(); + (tob = matx::isinf(inf)).run(exec); // example-end inf-test-1 - cudaDeviceSynchronize(); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), std::is_floating_point_v ? true : false)); - (tob = matx::isinf(notnanorinf)).run(); - cudaDeviceSynchronize(); + (tob = matx::isinf(notnanorinf)).run(exec); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), false)); MATX_EXIT_HANDLER(); @@ -2058,7 +2046,7 @@ TYPED_TEST(OperatorTestsIntegralAllExecs, OperatorFuncs) // example-begin mod-test-1 (tov0 = tiv0 % mod).run(exec); // example-end mod-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c % mod)); MATX_EXIT_HANDLER(); @@ -2082,37 +2070,37 @@ TYPED_TEST(OperatorTestsBooleanAllExecs, OperatorFuncs) // example-begin land-test-1 (tov0 = tiv0 && d).run(exec); // example-end land-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c && d)); // example-begin lor-test-1 (tov0 = tiv0 || d).run(exec); // example-end lor-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c || d)); // example-begin lnot-test-1 (tov0 = !tiv0).run(exec); // example-end lnot-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), !c)); // example-begin xor-test-1 (tov0 = tiv0 ^ 
d).run(exec); // example-end xor-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c ^ d)); // example-begin or-test-1 (tov0 = tiv0 | d).run(exec); // example-end or-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c | d)); // example-begin and-test-1 (tov0 = tiv0 & d).run(exec); // example-end and-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c & d)); MATX_EXIT_HANDLER(); @@ -2136,13 +2124,13 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, OperatorFuncs) // example-begin exp-test-1 (tov0 = exp(tiv0)).run(exec); // example-end exp-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_exp(c))); // example-begin conj-test-1 (tov0 = conj(tiv0)).run(exec); // example-end conj-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_conj(c))); // abs and norm take a complex and output a floating point value @@ -2150,13 +2138,13 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, OperatorFuncs) // example-begin norm-test-1 (tdd0 = norm(tiv0)).run(exec); // example-end norm-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tdd0(), detail::_internal_norm(c))); // example-begin abs-test-1 (tdd0 = abs(tiv0)).run(exec); // example-end abs-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tdd0(), detail::_internal_abs(c))); MATX_EXIT_HANDLER(); @@ -2184,7 +2172,7 @@ TYPED_TEST(OperatorTestsAllExecs, Flatten) auto t1 = make_tensor({t2.Size(0)*t2.Size(1)}); (t1 = flatten(t2)).run(exec); // example-end flatten-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0)*t2.Size(1); i++) { ASSERT_EQ(t1(i), val); @@ -2215,7 +2203,7 @@ TYPED_TEST(OperatorTestsNumericNoHalfAllExecs, AdvancedOperators) { (c = a + b).run(exec); 
- cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = static_cast>(i); @@ -2227,7 +2215,7 @@ TYPED_TEST(OperatorTestsNumericNoHalfAllExecs, AdvancedOperators) { (c = a * b).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = static_cast>(i); @@ -2238,7 +2226,7 @@ TYPED_TEST(OperatorTestsNumericNoHalfAllExecs, AdvancedOperators) { (c = a * b + a).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = static_cast>(i); @@ -2250,7 +2238,7 @@ TYPED_TEST(OperatorTestsNumericNoHalfAllExecs, AdvancedOperators) { (c = a * b + a * (TestType)4.0f).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = static_cast>(i); @@ -2285,7 +2273,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AdvancedOperators) { (c = a + b).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = (TestType)i; @@ -2295,7 +2283,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AdvancedOperators) { (c = a * b).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = (TestType)i; @@ -2306,7 +2294,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AdvancedOperators) { (c = a * b + a).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = (TestType)i; @@ -2318,7 +2306,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AdvancedOperators) { (c = a * b + a * (TestType)2.0f).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = (TestType)i; @@ -2356,7 +2344,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) } (dview = dview * fview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { 
ASSERT_EQ(static_cast>(dview(i).real()), @@ -2373,7 +2361,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) } (dview = dview / fview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { ASSERT_EQ(static_cast>(dview(i).real()), @@ -2393,7 +2381,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) (dview = dview + fview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { ASSERT_EQ(static_cast>(dview(i).real()), @@ -2410,7 +2398,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) } (dview = dview - fview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { ASSERT_EQ(static_cast>(dview(i).real()), @@ -2427,7 +2415,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) } (dview = fview - dview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { ASSERT_EQ(static_cast>(dview(i).real()), @@ -2458,12 +2446,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SquareCopyTranspose) } } - t2.PrefetchDevice(0); - t2t.PrefetchDevice(0); matx::copy(t2t, t2, exec); - t2t.PrefetchHost(0); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { for (index_t j = 0; j < count; j++) { @@ -2472,11 +2457,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SquareCopyTranspose) } } - t2t.PrefetchDevice(0); (t2t = transpose(t2)).run(exec); - t2t.PrefetchHost(0); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { for (index_t j = 0; j < count; j++) { @@ -2509,7 +2492,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, NonSquareTranspose) } (t2t = transpose(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count1; i++) { for (index_t j = 0; j < count2; j++) { @@ -2544,8 +2527,11 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Transpose3D) } (t3t = 
transpose_matrix(t3)).run(exec); - cudaError_t error = cudaStreamSynchronize(0); - ASSERT_EQ(error, cudaSuccess); + exec.sync(); + + if constexpr (std::is_same_v) { + ASSERT_EQ(cudaGetLastError(), cudaSuccess); + } for (index_t i = 0; i < num_rows; i++) { for (index_t j = 0; j < num_cols; j++) { @@ -2587,9 +2573,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, TransposeVsTransposeMatrix) (t3t = transpose(t3)).run(exec); (t3tm = transpose_matrix(t3)).run(exec); - if constexpr (is_cuda_executor_v) { - cudaError_t error = cudaStreamSynchronize(0); - ASSERT_EQ(error, cudaSuccess); + exec.sync(); + if constexpr (std::is_same_v) { + ASSERT_EQ(cudaGetLastError(), cudaSuccess); } for (index_t i = 0; i < m; i++) { @@ -2646,9 +2632,9 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, CloneAndAdd) auto vah = clone<4>(velAccelHypoth, {numBeams, matxKeepDim, matxKeepDim, matxKeepDim}); - (beamwiseRangeDoppler = smx + vah).run(); + (beamwiseRangeDoppler = smx + vah).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < numBeams; i++) { for (index_t j = 0; j < numPulses; j++) { for (index_t k = 0; k < numPairs; k++) { @@ -2813,7 +2799,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, Overlap) ASSERT_EQ(b4out.Size(0), 4); ASSERT_EQ(b4out.Size(1), 3); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < b4.Size(0); i++) { for (index_t j = 0; j < b4.Size(1); j++) { ASSERT_EQ(b4out(i, j), b4(i, j)); @@ -2853,7 +2839,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) } (t4o = t4i * t0).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t4o.Size(0); i++) { for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2871,7 +2857,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) } } (t4o = t0 * t4i).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t4o.Size(0); i++) { for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2909,8 +2895,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // 
} - // (t4o = t4i * t1).run(); - // cudaStreamSynchronize(0); + // (t4o = t4i * t1).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2928,8 +2914,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t1 * t4i).run(); - // cudaStreamSynchronize(0); + // (t4o = t1 * t4i).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2970,8 +2956,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t4i * t2).run(); - // cudaStreamSynchronize(0); + // (t4o = t4i * t2).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2989,8 +2975,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t2 * t4i).run(); - // cudaStreamSynchronize(0); + // (t4o = t2 * t4i).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3033,8 +3019,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t4i * t3).run(); - // cudaStreamSynchronize(0); + // (t4o = t4i * t3).run(exec); + // exec.sync() // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3052,8 +3038,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t3 * t4i).run(); - // cudaStreamSynchronize(0); + // (t4o = t3 * t4i).run(exec); + // exec.sync() // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3111,8 +3097,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t4i + t3 + t2 + t1 + t0).run(); - // cudaStreamSynchronize(0); + // (t4o = t4i + t3 + t2 + t1 + t0).run(exec); + // exec.sync() // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3133,8 +3119,8 @@ 
TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t0 + t1 + t2 + t3 + t4i).run(); - // cudaStreamSynchronize(0); + // (t4o = t0 + t1 + t2 + t3 + t4i).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3179,7 +3165,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) // Concatenate "t11" and "t12" into a new 1D tensor (t1o = concat(0, t11, t12)).run(exec); // example-end concat-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (i = 0; i < t11.Size(0) + t12.Size(0); i++) { if (i < t11.Size(0)) { @@ -3198,7 +3184,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) (t1o = 0).run(exec); (t1o = concat(0, conv1d(t11, delta, MATX_C_MODE_SAME), conv1d(t12, delta, MATX_C_MODE_SAME))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (i = 0; i < t11.Size(0) + t12.Size(0); i++) { if (i < t11.Size(0)) { @@ -3230,7 +3216,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) {10,11,12}}); (t2o1 = concat(0, t21, t22)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (i = 0; i < t21.Size(0) + t22.Size(0); i++) { for (j = 0; j < t21.Size(1); j++) { @@ -3244,7 +3230,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) } (t2o2 = concat(1, t21, t23)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (j = 0; j < t21.Size(1) + t23.Size(1); j++) { for (i = 0; i < t21.Size(0); i++) { @@ -3261,7 +3247,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) // Concatenating 3 tensors (t1o1 = concat(0, t11, t11, t11)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (i = 0; i < t1o1.Size(0); i++) { ASSERT_EQ(t1o1(i), t11(i % t11.Size(0))); @@ -3285,7 +3271,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) auto tempConcat2 = matx::concat(0, c, d); (result = matx::concat(0, tempConcat1, tempConcat2 )).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int cnt = 0; cnt 
< result.Size(0); cnt++) { ASSERT_EQ(result(cnt), cnt + 1); } @@ -3310,7 +3296,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Stack) auto cop = concat(0, t1a, t1b, t1c); (cop = (TestType)2).run(exec); - cudaDeviceSynchronize(); + exec.sync(); { // example-begin stack-test-1 @@ -3362,7 +3348,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, HermitianTranspose) // example-begin hermitianT-test-1 (t2s = hermitianT(t2)).run(exec); // example-end hermitianT-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3397,7 +3383,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, PlanarTransform) } (t2p = planar(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < m; i++) { for (index_t j = 0; j < k; j++) { @@ -3434,7 +3420,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, InterleavedTransform) } (t2 = interleaved(t2p)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < m; i++) { for (index_t j = 0; j < k; j++) { @@ -3474,7 +3460,7 @@ TYPED_TEST(OperatorTestsAllExecs, RepMat) ASSERT_TRUE(repop.Size(1) == same_reps * t2.Size(1)); (t2s = repop).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0 * same_reps; i++) { for (index_t j = 0; j < count1 * same_reps; j++) { @@ -3491,7 +3477,7 @@ TYPED_TEST(OperatorTestsAllExecs, RepMat) ASSERT_TRUE(rrepop.Size(1) == same_reps * 2 * t2.Size(1)); (t2r = rrepop).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0 * same_reps; i++) { for (index_t j = 0; j < count1 * same_reps * 2; j++) { @@ -3550,9 +3536,9 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, PolyVal) pb->NumpyToTensorView(c, "c"); // example-begin polyval-test-1 - (out = polyval(x, c)).run(); + (out = polyval(x, c)).run(exec); // example-end polyval-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, out, "out", 0.01); MATX_EXIT_HANDLER(); @@ 
-3576,14 +3562,15 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Upsample) (t1 = static_cast(1)).run(exec); auto us_op = upsample(t1, 0, n); // example-end upsample-test-1 + exec.sync(); ASSERT_TRUE(us_op.Size(0) == t1.Size(0) * n); for (index_t i = 0; i < us_op.Size(0); i++) { if ((i % n) == 0) { - MatXUtils::MatXTypeCompare(us_op(i), t1(i / n)); + ASSERT_TRUE(MatXUtils::MatXTypeCompare(us_op(i), t1(i / n))); } else { - MatXUtils::MatXTypeCompare(us_op(i), static_cast(0)); + ASSERT_TRUE(MatXUtils::MatXTypeCompare(us_op(i), static_cast(0))); } } } @@ -3607,10 +3594,11 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Downsample) (t1 = static_cast(1)).run(exec); auto ds_op = downsample(t1, 0, n); // example-end downsample-test-1 + exec.sync(); ASSERT_TRUE(ds_op.Size(0) == t1.Size(0) / n); for (index_t i = 0; i < ds_op.Size(0); i++) { - MatXUtils::MatXTypeCompare(ds_op(i), t1(i * n)); + ASSERT_TRUE(MatXUtils::MatXTypeCompare(ds_op(i), t1(i * n))); } } @@ -3623,7 +3611,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Downsample) ASSERT_TRUE(ds_op.Size(0) == t1.Size(0) / n + 1); for (index_t i = 0; i < ds_op.Size(0); i++) { - MatXUtils::MatXTypeCompare(ds_op(i), t1(i * n)); + ASSERT_TRUE(MatXUtils::MatXTypeCompare(ds_op(i), t1(i * n))); } } @@ -3653,7 +3641,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, R2COp) for (int i = 0; i < N1; i++) { t1(i) = static_cast(i+1); } for (int i = 0; i < N2; i++) { t2(i) = static_cast(i+1); } - cudaStreamSynchronize(0); + exec.sync(); const std::array T1_expected = {{ { 15.0, 0.0 }, { -2.5, static_cast(3.4409548) }, { -2.5, static_cast(0.81229924) }, @@ -3672,7 +3660,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, R2COp) (T1 = r2c(fft(t1), N1)).run(exec); (T2 = r2c(fft(t2), N2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < N1; i++) { ASSERT_NEAR(T1(i).real(), T1_expected[i].real(), thresh); @@ -3688,7 +3676,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, R2COp) (T1 = r2c(fft(t1, 
N1), N1)).run(exec); (T2 = r2c(fft(t2, N2), N2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < N1; i++) { ASSERT_NEAR(T1(i).real(), T1_expected[i].real(), thresh); @@ -3706,7 +3694,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, R2COp) (T1 = ifft(r2c(fft(t1), N1))).run(exec); (T2 = ifft(r2c(fft(t2), N2))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < N1; i++) { ASSERT_NEAR(T1(i).real(), t1(i), thresh); @@ -3725,9 +3713,16 @@ TYPED_TEST(OperatorTestsFloatNonHalf, FftShiftWithTransform) { MATX_ENTER_HANDLER(); using TestType = std::tuple_element_t<0, TypeParam>; + using ExecType = std::tuple_element_t<1, TypeParam>; using inner_type = typename inner_op_type_t::type; using complex_type = detail::complex_from_scalar_t; + if constexpr (!detail::CheckFFTSupport()) { + GTEST_SKIP(); + } + + ExecType exec{}; + [[maybe_unused]] const inner_type thresh = static_cast(1.0e-6); // Verify that fftshift1D/ifftshift1D work with nested transforms. 
@@ -3750,12 +3745,12 @@ TYPED_TEST(OperatorTestsFloatNonHalf, FftShiftWithTransform) for (int i = 0; i < N1; i++) { t3(i) = t3_vals[i]; }; for (int i = 0; i < N2; i++) { t4(i) = t4_vals[i]; }; - cudaStreamSynchronize(0); + exec.sync(); - (T3 = fftshift1D(fft(t3))).run(); - (T4 = fftshift1D(fft(t4))).run(); + (T3 = fftshift1D(fft(t3))).run(exec); + (T4 = fftshift1D(fft(t4))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); const std::array T3_expected = {{ { -1.5, static_cast(-0.8660254) }, { 6.0, 0.0 }, { -1.5, static_cast(0.8660254) } @@ -3774,10 +3769,10 @@ TYPED_TEST(OperatorTestsFloatNonHalf, FftShiftWithTransform) ASSERT_NEAR(T4(i).imag(), T4_expected[i].imag(), thresh); } - (T3 = ifftshift1D(fft(t3))).run(); - (T4 = ifftshift1D(fft(t4))).run(); + (T3 = ifftshift1D(fft(t3))).run(exec); + (T4 = ifftshift1D(fft(t4))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); const std::array T3_ifftshift_expected = {{ { -1.5, static_cast(0.8660254) }, { -1.5, static_cast(-0.8660254) }, { 6.0, 0.0 } @@ -3803,14 +3798,14 @@ TYPED_TEST(OperatorTestsFloatNonHalf, FftShiftWithTransform) auto x = make_tensor({N,N}); auto X = make_tensor({N,N}); - (x = 0).run(); + (x = static_cast(0)).run(exec); - (X = fftshift2D(fft2(x))).run(); - (X = fftshift2D(ifft2(x))).run(); - (X = ifftshift2D(fft2(x))).run(); - (X = ifftshift2D(ifft2(x))).run(); + (X = fftshift2D(fft2(x))).run(exec); + (X = fftshift2D(ifft2(x))).run(exec); + (X = ifftshift2D(fft2(x))).run(exec); + (X = ifftshift2D(ifft2(x))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); } MATX_EXIT_HANDLER(); @@ -3844,7 +3839,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // Shift the first dimension of "t2" by -5 so the 5th element of "t2" is the first element of "t2s" (t2s = shift<0>(t2, -5)).run(exec); // example-end shift-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3856,7 +3851,7 @@ 
TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) { (t2s = shift<0>(t2, t0)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3868,7 +3863,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) { (t2s = shift<1>(t2, -5)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3880,7 +3875,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) { (t2s = shift<1,0>(t2, -5, -6)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3894,7 +3889,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // example-begin fftshift2D-test-1 (t2s = fftshift2D(t2)).run(exec); // example-end fftshift2D-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3909,7 +3904,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // example-begin ifftshift2D-test-1 (t2s = ifftshift2D(t2)).run(exec); // example-end ifftshift2D-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3923,7 +3918,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // Right shifts { (t2s = shift<0>(t2, 5)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3935,7 +3930,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) { (t2s = shift<1>(t2, 5)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3948,7 +3943,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // Large shifts { (t2s = shift<0>(t2, -t2.Size(0) * 4)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; 
j++) { @@ -3961,8 +3956,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // Shift 4 times the size back, minus one. This should be equivalent to // simply shifting by -1 (t2s = shift<0>(t2, -t2.Size(0) * 4 - 1)).run(exec); - (t2s2 = shift<0>(t2, -1)).run(); - cudaStreamSynchronize(0); + (t2s2 = shift<0>(t2, -1)).run(exec); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3999,7 +3994,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) // Reverse the values of t2 along dimension 0 (t2r = reverse<0>(t2)).run(exec); // example-end reverse-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4011,7 +4006,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) { (t2r = reverse<1>(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4023,7 +4018,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) { (t2r = reverse<0,1>(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4038,7 +4033,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) // example-begin flipud-test-1 (t2r = flipud(t2)).run(exec); // example-end flipud-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4052,7 +4047,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) // example-begin fliplr-test-1 (t2r = fliplr(t2)).run(exec); // example-end fliplr-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4073,22 +4068,24 @@ TEST(OperatorTests, Cast) auto t2 = make_tensor({count0}); auto to = make_tensor({count0}); + cudaExecutor exec{}; + t.SetVals({126, 126, 126, 126}); t2.SetVals({126, 126, 126, 126}); // example-begin as_type-test-1 - (to 
= as_type(t + t2)).run(); + (to = as_type(t + t2)).run(exec); // example-end as_type-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < t.Size(0); i++) { ASSERT_EQ(to(i), -4); // -4 from 126 + 126 wrap-around } // example-begin as_int8-test-1 - (to = as_int8(t + t2)).run(); + (to = as_int8(t + t2)).run(exec); // example-end as_int8-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < t.Size(0); i++) { ASSERT_EQ(to(i), -4); // -4 from 126 + 126 wrap-around @@ -4155,7 +4152,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) (out = legendre(n, m, x)).run(exec); // example-end legendre-test-1 - cudaStreamSynchronize(0); + exec.sync(); for(int j = 0; j < order; j++) { for(int p = 0; p < order; p++) { @@ -4179,7 +4176,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) (out = lcollapse<2>(legendre(order, m, x))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int i = 0 ; i < size; i++) { for(int p = 0; p < order; p++) { @@ -4200,7 +4197,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) (out = lcollapse<3>(legendre(order, order, x))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int i = 0 ; i < size; i++) { if constexpr (is_matx_half_v) { @@ -4220,7 +4217,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) (out = lcollapse<3>(legendre(order, m, x))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int i = 0 ; i < size; i++) { if constexpr (is_matx_half_v) { @@ -4237,6 +4234,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) TEST(OperatorTestsAdvanced, AdvancedRemapOp) { typedef cuda::std::complex complex; + cudaExecutor exec{}; MATX_ENTER_HANDLER(); int I = 4; @@ -4275,7 +4273,7 @@ TEST(OperatorTestsAdvanced, AdvancedRemapOp) } } - (B = 0).run(); + (B = 0).run(exec); auto rop = remap<1>(A, idx); auto lop = lcollapse<3>(rop); @@ -4284,9 +4282,9 @@ TEST(OperatorTestsAdvanced, AdvancedRemapOp) ASSERT_EQ(lop.Size(1) , A.Size(3)); 
ASSERT_EQ(lop.Size(0) , I * M * K); - (B = lop).run(); + (B = lop).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (int i = 0; i < I; i++) { for (int m = 0; m < M; m++) { @@ -4337,17 +4335,8 @@ TEST(OperatorTestsAdvanced, AdvancedRemapOp) } } } - - A.PrefetchDevice(0); - B.PrefetchDevice(0); - C.PrefetchDevice(0); - D.PrefetchDevice(0); - O1.PrefetchDevice(0); - O2.PrefetchDevice(0); - O3.PrefetchDevice(0); - O4.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); auto o1op = lcollapse<3>(remap<1>(O1, idx)); auto o2op = lcollapse<3>(remap<1>(O2, idx)); @@ -4357,27 +4346,27 @@ TEST(OperatorTestsAdvanced, AdvancedRemapOp) auto cop = C.Clone<4>({matxKeepDim, M, matxKeepDim, matxKeepDim}); auto rcop = lcollapse<3>(remap<1>(cop, idx)); - (O1 = 1).run(); - (O2 = 2).run(); - (O3 = 3).run(); - (O4 = 4).run(); + (O1 = 1).run(exec); + (O2 = 2).run(exec); + (O3 = 3).run(exec); + (O4 = 4).run(exec); - (B = lop).run(); - (D = rcop).run(); + (B = lop).run(exec); + (D = rcop).run(exec); // two operators as input - (o1op = conv1d(lop, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (o1op = conv1d(lop, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); // one tensor and one operators as input - (o2op = conv1d(B, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (o2op = conv1d(B, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); // one tensor and one operators as input - (o3op = conv1d(lop, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (o3op = conv1d(lop, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); //two tensors as input - (o4op = conv1d(B, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (o4op = conv1d(B, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (int i = 0; i < o1op.Size(0); i++) { for (int l = 0; l < o1op.Size(1); l++) { diff --git a/test/00_operators/PWelch.cu b/test/00_operators/PWelch.cu index 98bf8eddf..059bcad13 100644 
--- a/test/00_operators/PWelch.cu +++ b/test/00_operators/PWelch.cu @@ -100,33 +100,35 @@ void helper(PWelchComplexExponentialTest& test) auto Pxx = make_tensor({test.params.nfft}); + cudaExecutor exec{}; + if (test.params.window_name == "none") { - (Pxx = pwelch(x, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(); + (Pxx = pwelch(x, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(exec); } else { auto w = make_tensor({test.params.nperseg}); if (test.params.window_name == "boxcar") { - (w = ones({test.params.nperseg})).run(); + (w = ones({test.params.nperseg})).run(exec); } else if (test.params.window_name == "hann") { - (w = hanning<0,1,typename TypeParam::value_type>({test.params.nperseg})).run(); + (w = hanning<0,1,typename TypeParam::value_type>({test.params.nperseg})).run(exec); } else if (test.params.window_name == "flattop") { - (w = flattop<0,1,typename TypeParam::value_type>({test.params.nperseg})).run(); + (w = flattop<0,1,typename TypeParam::value_type>({test.params.nperseg})).run(exec); } else { ASSERT_TRUE(false) << "Unknown window parameter name " + test.params.window_name; } - (Pxx = pwelch(x, w, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(); + (Pxx = pwelch(x, w, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(exec); } - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(test.pb, Pxx, "Pxx_out", test.thresh); MATX_EXIT_HANDLER(); @@ -154,13 +156,14 @@ TEST(PWelchOpTest, xin_complex_float) index_t noverlap = 0; index_t nfft = 8; auto x = ones>({signal_size}); + cudaExecutor exec{}; // example-begin pwelch-test-1 auto Pxx = make_tensor({nfft}); auto w = ones({nperseg}); - (Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(); + (Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec); // example-end pwelch-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_NEAR(Pxx(0), 64, thresh); for (index_t k=1; k({3, 4, 5}); (t2 = sum(t4, {2, 3})).run(exec); - 
cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0); i++) { for (index_t j = 0; j < t2.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -294,7 +294,7 @@ TYPED_TEST(ReductionTestsNumericNoHalfAllExecs, Sum) } (t2 = sum(t3, {2})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0); i++) { for (index_t j = 0; j < t2.Size(1); j++) { ASSERT_TRUE( @@ -371,7 +371,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = sum(permute(t4,{2,3,0,1}))).run(exec); (t2b = sum(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -384,7 +384,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = mean(permute(t4,{2,3,0,1}))).run(exec); (t2b = mean(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -398,7 +398,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) median(t2a, permute(t4,{2,3,0,1}), exec); median(t2b, t4, {0,1}, exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -413,7 +413,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = prod(permute(t4,{2,3,0,1}))).run(exec); (t2b = prod(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -431,7 +431,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2b = max(t4, {0,1})).run(exec); // example-end max-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 
0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -449,7 +449,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2b = min(t4, {0,1})).run(exec); // example-end min-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -467,7 +467,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (mtie(t2b, t2bi) = argmax(t4, {0,1})).run(exec); // example-end argmax-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -487,7 +487,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (mtie(t2b, t2bi) = argmin(t4, {0,1})).run(exec); // example-end argmin-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -507,7 +507,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2b = any(t4, {0,1})).run(exec); // example-end any-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -525,7 +525,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2b = all(t4, {0,1})).run(exec); // example-end all-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -539,7 +539,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = var(permute(t4,{2,3,0,1}))).run(exec); (t2b = var(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 
0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -553,7 +553,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = stdd(permute(t4,{2,3,0,1}))).run(exec); (t2b = stdd(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -585,7 +585,7 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, Any) (t2 = zeros(t2.Shape())).run(exec); (t3 = zeros(t3.Shape())).run(exec); (t4 = zeros(t4.Shape())).run(exec); - cudaStreamSynchronize(0); + exec.sync(); t1(5) = 5; t3(1, 1, 1) = 6; @@ -595,19 +595,19 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, Any) // convertible to "true" (t0 = any(t4)).run(exec); // example-end any-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(0))); (t0 = any(t3)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = any(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(0))); (t0 = any(t1)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); } @@ -631,13 +631,13 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, AllClose) (B = ones(B.Shape())).run(exec); allclose(C, A, B, 1e-5, 1e-8, exec); // example-end allclose-test-1 - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(C(), 1); B(1,1,1) = 2; allclose(C, A, B, 1e-5, 1e-8, exec); - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(C(), 0); @@ -661,11 +661,11 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, All) tensor_t t3({30, 40, 50}); tensor_t t4({30, 40, 50, 60}); - (t1 = ones(t1.Shape())).run(); - (t2 = ones(t2.Shape())).run(); - (t3 = ones(t3.Shape())).run(); - (t4 = 
ones(t4.Shape())).run(); - cudaStreamSynchronize(0); + (t1 = ones(t1.Shape())).run(exec); + (t2 = ones(t2.Shape())).run(exec); + (t3 = ones(t3.Shape())).run(exec); + (t4 = ones(t4.Shape())).run(exec); + exec.sync(); t1(5) = 0; t3(1, 1, 1) = 0; @@ -675,19 +675,19 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, All) // convert to "true", or "false" otherwise (t0 = all(t4)).run(exec); // example-end all-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = all(t3)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(0))); (t0 = all(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = all(t1)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(0))); } @@ -718,51 +718,51 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Percentile) // Find the 50th percentile value in `t1e` using linear interpolation between midpoints (t0 = percentile(t1e, 50, PercentileMethod::LINEAR)).run(exec); // example-end percentile-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_linear50", 0.01); (t0 = percentile(t1e, 80, PercentileMethod::LINEAR)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_linear80", 0.01); (t0 = percentile(t1e, 50, PercentileMethod::LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_lower50", 0.01); (t0 = percentile(t1e, 80, PercentileMethod::LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_lower80", 0.01); (t0 = percentile(t1e, 50, PercentileMethod::HIGHER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_higher50", 0.01); (t0 = percentile(t1e, 80, 
PercentileMethod::HIGHER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_higher80", 0.01); (t0 = percentile(t1o, 50, PercentileMethod::LINEAR)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_linear50", 0.01); (t0 = percentile(t1o, 80, PercentileMethod::LINEAR)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_linear80", 0.01); (t0 = percentile(t1o, 50, PercentileMethod::LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_lower50", 0.01); (t0 = percentile(t1o, 80, PercentileMethod::LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_lower80", 0.01); (t0 = percentile(t1o, 50, PercentileMethod::HIGHER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_higher50", 0.01); (t0 = percentile(t1o, 80, PercentileMethod::HIGHER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_higher80", 0.01); } @@ -794,20 +794,20 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Median) // Compute media over all elements in "t1e" and store result in "t0" (t0 = median(t1e)).run(exec); // example-end median-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(4.5f))); (t0 = median(t1o)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(5))); (t1out = median(t2e, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1out(0), (TestType)(2.5f))); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1out(1), (TestType)(2.5f))); (t1out = median(t2o, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1out(0), (TestType)(3.0f))); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1out(1), 
(TestType)(3.0f))); } @@ -824,10 +824,12 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, MinMaxNegative) auto t = matx::make_tensor({3}); t.SetVals({-3, -1, -7}); + ExecType exec{}; + matx::tensor_t max_val{{}}; matx::tensor_t max_idx{{}}; - (mtie(max_val, max_idx) = matx::argmax(t)).run(ExecType{}); - cudaStreamSynchronize(0); + (mtie(max_val, max_idx) = matx::argmax(t)).run(exec); + exec.sync(); ASSERT_EQ(max_val(), -1); ASSERT_EQ(max_idx(), 1); } @@ -853,7 +855,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Max) // Reduce all inputs in "t1o" into "t0" by the maximum of all elements (t0 = max(t1o)).run(exec); // example-end max-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(11))); } @@ -878,7 +880,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Min) // Reduce all inputs in "t1o" into "t0" by the minimum of all elements (t0 = min(t1o)).run(exec); // example-end min-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); } @@ -903,7 +905,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMax) (mtie(t0, t0i) = argmax(t1o)).run(exec); // example-end argmax-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(11))); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0i(), (TestType)(10))); @@ -913,7 +915,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMax) t2o.SetVals({{(T)2, (T)4, (T)1, (T)3, (T)5}, {(T)3, (T)1, (T)5, (T)2, (T)4}}); (mtie(t1o_small, t1i_small) = argmax(t2o, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); auto rel = GetIdxFromAbs(t2o, t1i_small(0)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5))); @@ -942,7 +944,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMin) (mtie(t0, t0i) = argmin(t1o)).run(exec); // example-end argmin-test-1 - cudaStreamSynchronize(0); + exec.sync(); 
EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0i(), (TestType)(0))); @@ -952,7 +954,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMin) t2o.SetVals({{(T)2, (T)4, (T)1, (T)3, (T)5}, {(T)3, (T)1, (T)5, (T)2, (T)4}}); (mtie(t1o_small, t1i_small) = argmin(t2o, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); auto rel = GetIdxFromAbs(t2o, t1i_small(0)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(1))); @@ -982,19 +984,19 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Mean) // Compute the mean over all dimensions in "t4" and store the result in "t0" (t0 = mean(t4)).run(exec); // example-end mean-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = mean(t3)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = mean(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = mean(t1)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); } { @@ -1005,19 +1007,19 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Mean) auto t2 = ones({30, 40}); (t1 = mean(t4, {1, 2, 3})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t1.Size(0); i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (TestType)(1))); } (t1 = mean(t3, {1, 2})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t1.Size(0); i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (TestType)(1))); } (t1 = mean(t2, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t1.Size(0); i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (TestType)(1))); } @@ -1030,7 +1032,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Mean) auto t3 = ones({30, 
40, 50}); (t2 = mean(t4, {2, 3})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0); i++) { for (index_t j = 0; j < t2.Size(1); j++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2(i, j), (TestType)(1))); @@ -1038,7 +1040,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Mean) } (t2 = mean(t3, {2})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0); i++) { for (index_t j = 0; j < t2.Size(1); j++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2(i, j), (TestType)(1))); @@ -1085,11 +1087,11 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, Prod) // Compute the product of all elements in "t2" and store into "t0" (t0 = prod(t2)).run(exec); // example-end prod-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), t2p)); (t0 = prod(t1)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), t1p)); } @@ -1104,6 +1106,8 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Find) using TestType = std::tuple_element_t<0, TypeParam>; using ExecType = std::tuple_element_t<1, TypeParam>; + ExecType exec{}; + tensor_t num_found{{}}; tensor_t t1{{100}}; tensor_t t1o{{100}}; @@ -1116,9 +1120,9 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Find) // example-begin find-test-1 // Find values greater than 0.5 TestType thresh = (TestType)0.5; - (mtie(t1o, num_found) = find(t1, GT{thresh})).run(ExecType{}); + (mtie(t1o, num_found) = find(t1, GT{thresh})).run(exec); // example-end find-test-1 - cudaStreamSynchronize(0); + exec.sync(); int output_found = 0; for (int i = 0; i < t1.Size(0); i++) { @@ -1143,6 +1147,8 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, FindIdx) using TestType = std::tuple_element_t<0, TypeParam>; using ExecType = std::tuple_element_t<1, TypeParam>; + ExecType exec{}; + tensor_t num_found{{}}; tensor_t t1{{100}}; tensor_t t1o{{100}}; @@ -1155,9 +1161,9 @@ 
TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, FindIdx) // example-begin find_idx-test-1 // Find indices with values greater than 0.5 TestType thresh = (TestType)0.5; - (mtie(t1o, num_found) = find_idx(t1, GT{thresh})).run(ExecType{}); + (mtie(t1o, num_found) = find_idx(t1, GT{thresh})).run(exec); // example-end find_idx-test-1 - cudaStreamSynchronize(0); + exec.sync(); int output_found = 0; for (int i = 0; i < t1.Size(0); i++) { @@ -1187,7 +1193,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, FindIdxAndSelect) tensor_t t1o_2{{100}}; TestType thresh = (TestType)0.5; - auto executor = ExecType{}; + ExecType exec{}; for (int i = 0; i < t1.Size(0); i++) { t1(i) = static_cast>((float)rand() / @@ -1196,19 +1202,19 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, FindIdxAndSelect) // Find indices with values greater than 0.5 // example-begin select-test-1 - (mtie(t1o_idx, num_found) = find_idx(t1, GT{thresh})).run(executor); + (mtie(t1o_idx, num_found) = find_idx(t1, GT{thresh})).run(exec); // Since we use the output on the host in select() we need to synchronize first - cudaStreamSynchronize(0); + exec.sync(); auto t1o_slice = t1o.Slice({0}, {num_found()}); auto t1o_idx_slice = t1o_idx.Slice({0}, {num_found()}); - (t1o_slice = select(t1o_slice, t1o_idx_slice)).run(executor); + (t1o_slice = select(t1o_slice, t1o_idx_slice)).run(exec); // Compare to simply finding the values - (mtie(t1o_2, num_found2) = find(t1, GT{thresh})).run(executor); + (mtie(t1o_2, num_found2) = find(t1, GT{thresh})).run(exec); // example-end select-test-1 - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(num_found(), num_found2()); @@ -1227,6 +1233,8 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Unique) using TestType = std::tuple_element_t<0, TypeParam>; using ExecType = std::tuple_element_t<1, TypeParam>; + ExecType exec{}; + tensor_t num_found{{}}; tensor_t t1{{100}}; tensor_t t1o{{100}}; @@ -1236,9 +1244,9 @@ 
TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Unique) } // example-begin unique-test-1 - (mtie(t1o, num_found) = unique(t1)).run(ExecType{}); + (mtie(t1o, num_found) = unique(t1)).run(exec); // example-end unique-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < 10; i++) { ASSERT_NEAR(t1o(i), i, 0.01); @@ -1270,7 +1278,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Trace) (t0 = trace(t2)).run(exec); // example-end trace-test-1 - cudaDeviceSynchronize(); + exec.sync(); ASSERT_EQ(t0(), count); MATX_EXIT_HANDLER(); diff --git a/test/00_solver/Cholesky.cu b/test/00_solver/Cholesky.cu index 79bfc7762..576745496 100644 --- a/test/00_solver/Cholesky.cu +++ b/test/00_solver/Cholesky.cu @@ -83,7 +83,7 @@ TYPED_TEST(CholSolverTestNonHalfFloatTypes, CholeskyBasic) // example-begin chol-test-1 (Bv = chol(Bv, CUBLAS_FILL_MODE_LOWER)).run(exec); // example-end chol-test-1 - cudaStreamSynchronize(0); + exec.sync(); // Cholesky fills the lower triangular portion (due to CUBLAS_FILL_MODE_LOWER) // and destroys the upper triangular portion. @@ -130,10 +130,10 @@ TYPED_TEST(CholSolverTestNonHalfFloatTypes, CholeskyWindowed) this->pb->NumpyToTensorView(Cv, "B"); this->pb->NumpyToTensorView(Lv, "L"); (Bslice = Cv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); (Bslice = chol(Bslice, CUBLAS_FILL_MODE_LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); // Cholesky fills the lower triangular portion (due to CUBLAS_FILL_MODE_LOWER) // and destroys the upper triangular portion. 
diff --git a/test/00_solver/Det.cu b/test/00_solver/Det.cu index 870979889..e6fea1663 100644 --- a/test/00_solver/Det.cu +++ b/test/00_solver/Det.cu @@ -76,7 +76,7 @@ TYPED_TEST(DetSolverTestNonComplexFloatTypes, Determinant) (this->detv = det(this->Atv)).run(this->exec); (this->Av = transpose(this->Atv)).run(this->exec); // Transpose back to row-major - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, this->detv, "det", 0.1); diff --git a/test/00_solver/Eigen.cu b/test/00_solver/Eigen.cu index 3076f579d..b1a4c1c8d 100644 --- a/test/00_solver/Eigen.cu +++ b/test/00_solver/Eigen.cu @@ -94,7 +94,7 @@ TYPED_TEST(EigenSolverTestNonComplexFloatTypes, EigenBasic) // Compute A*v (this->Gtv = matmul(this->Bv, this->Wv)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Compare for (index_t j = 0; j < dim_size; j++) { ASSERT_NEAR(this->Gtv(j, 0), this->Lvv(j, 0), 0.001); diff --git a/test/00_solver/Inverse.cu b/test/00_solver/Inverse.cu index 74fa61ee4..4896cfb64 100644 --- a/test/00_solver/Inverse.cu +++ b/test/00_solver/Inverse.cu @@ -77,7 +77,7 @@ TYPED_TEST(InvSolverTestFloatTypes, Inv4x4) // Perform an inverse on matrix "A" and store the output in "Ainv" (Ainv = inv(A)).run(this->exec); // example-end inv-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < A.Size(0); i++) { for (index_t j = 0; j <= i; j++) { @@ -108,7 +108,7 @@ TYPED_TEST(InvSolverTestFloatTypes, Inv4x4Batched) this->pb->NumpyToTensorView(Ainv_ref, "A_inv"); (Ainv = inv(A)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t b = 0; b < A.Size(0); b++) { for (index_t i = 0; i < A.Size(1); i++) { @@ -141,7 +141,7 @@ TYPED_TEST(InvSolverTestFloatTypes, Inv8x8) this->pb->NumpyToTensorView(Ainv_ref, "A_inv"); (Ainv = inv(A)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < A.Size(0); i++) { for (index_t j = 0; j <= i; j++) { @@ -172,7 +172,7 @@ 
TYPED_TEST(InvSolverTestFloatTypes, Inv8x8Batched) this->pb->NumpyToTensorView(Ainv_ref, "A_inv"); (Ainv = inv(A)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t b = 0; b < A.Size(0); b++) { for (index_t i = 0; i < A.Size(1); i++) { @@ -206,7 +206,7 @@ TYPED_TEST(InvSolverTestFloatTypes, Inv256x256) this->pb->NumpyToTensorView(Ainv_ref, "A_inv"); (Ainv = inv(A)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < A.Size(0); i++) { for (index_t j = 0; j <= i; j++) { diff --git a/test/00_solver/LU.cu b/test/00_solver/LU.cu index 8f57dd5c4..36cbdc0c7 100644 --- a/test/00_solver/LU.cu +++ b/test/00_solver/LU.cu @@ -77,7 +77,7 @@ TYPED_TEST(LUSolverTestNonComplexFloatTypes, LUBasic) // example-begin lu-test-1 (mtie(this->Av, this->PivV) = lu(this->Av)).run(this->exec); // example-end lu-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); // The upper and lower triangle components are saved in Av. Python saves them // as separate matrices with the diagonal of the lower matrix set to 0 diff --git a/test/00_solver/QR.cu b/test/00_solver/QR.cu index 1c11be41b..2ac40a85f 100644 --- a/test/00_solver/QR.cu +++ b/test/00_solver/QR.cu @@ -82,7 +82,7 @@ TYPED_TEST(QRSolverTestNonComplexFloatTypes, QRBasic) // solve, then transpose again to compare to Python (mtie(this->Av, this->TauV) = cusolver_qr(this->Av)).run(this->exec); // example-end cusolver_qr-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); // For now we're only verifying R. 
Q is a bit more complex to compute since // cuSolver/BLAS don't return Q, and instead return Householder reflections diff --git a/test/00_solver/QR2.cu b/test/00_solver/QR2.cu index 38abc510e..25f730adf 100644 --- a/test/00_solver/QR2.cu +++ b/test/00_solver/QR2.cu @@ -51,6 +51,7 @@ void qr_test( const index_t (&AshapeA)[RANK]) { using SType = typename inner_op_type_t::type; cudaStream_t stream = 0; + cudaExecutor exec{stream}; std::array Ashape = detail::to_array(AshapeA); std::array Qshape = Ashape; @@ -69,14 +70,10 @@ void qr_test( const index_t (&AshapeA)[RANK]) { auto Q = make_tensor(Qshape); auto R = make_tensor(Rshape); - (A = random(Ashape, NORMAL)).run(stream); - - A.PrefetchDevice(stream); - Q.PrefetchDevice(stream); - R.PrefetchDevice(stream); + (A = random(Ashape, NORMAL)).run(exec); // example-begin qr-test-1 - (mtie(Q, R) = qr(A)).run(stream); + (mtie(Q, R) = qr(A)).run(exec); // example-end qr-test-1 auto mdiffQTQ = make_tensor({}); @@ -85,7 +82,7 @@ void qr_test( const index_t (&AshapeA)[RANK]) { { // QTQ == Identity auto QTQ = make_tensor(Qshape); - (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(stream); + (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(exec); auto e = eye({m, m}); auto eShape = Qshape; @@ -93,19 +90,19 @@ void qr_test( const index_t (&AshapeA)[RANK]) { eShape[RANK-2] = matxKeepDim; auto I = clone(e, eShape); - (mdiffQTQ = max(abs(QTQ-I))).run(stream); + (mdiffQTQ = max(abs(QTQ-I))).run(exec); } { // Q*R == A auto QR = make_tensor(Ashape); - (QR = matmul(Q, R)).run(stream); + (QR = matmul(Q, R)).run(exec); - (mdiffQR = max(abs(A-QR))).run(stream); + (mdiffQR = max(abs(A-QR))).run(exec); } - cudaDeviceSynchronize(); + exec.sync(); ASSERT_NEAR( mdiffQTQ(), SType(0), .00001); ASSERT_NEAR( mdiffQR(), SType(0), .00001); diff --git a/test/00_solver/SVD.cu b/test/00_solver/SVD.cu index 11a2905e3..990e6d04f 100644 --- a/test/00_solver/SVD.cu +++ b/test/00_solver/SVD.cu @@ -94,7 +94,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasic) 
(mtie(Uv, Sv, Vv) = svd(Atv2)).run(this->exec); // example-end svd-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); // Since SVD produces a solution that's not necessarily unique, we cannot // compare against Python output. Instead, we just make sure that A = U*S*V'. @@ -105,14 +105,14 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasic) // Zero out s (Sav = zeros::type>({m, n})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Construct S matrix since it's just a vector from cuSolver for (index_t i = 0; i < n; i++) { Sav(i, i) = Sv(i); } - cudaStreamSynchronize(0); + this->exec.sync(); (SSolav = 0).run(this->exec); if constexpr (is_complex_v) { @@ -124,7 +124,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasic) (tmpV = matmul(Uav, SSolav)).run(this->exec); // U * S (SSolav = matmul(tmpV, Vav)).run(this->exec); // (U * S) * V' - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < Av.Size(0); i++) { for (index_t j = 0; j < Av.Size(1); j++) { @@ -175,7 +175,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched) auto Atv2 = Atv.View({batches, m, n}); (mtie(Uv, Sv, Vv) = svd(Atv2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Since SVD produces a solution that's not necessarily unique, we cannot // compare against Python output. Instead, we just make sure that A = U*S*V'. 
@@ -186,7 +186,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched) // Zero out s (Sav = zeros::type>({batches, m, n})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Construct S matrix since it's just a vector from cuSolver for (index_t b = 0; b < batches; b++) { @@ -195,7 +195,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched) } } - cudaStreamSynchronize(0); + this->exec.sync(); (SSolav = 0).run(this->exec); if constexpr (is_complex_v) { @@ -207,7 +207,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched) (tmpV = matmul(Uav, SSolav)).run(this->exec); // U * S (SSolav = matmul(tmpV, Vav)).run(this->exec); // (U * S) * V' - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t b = 0; b < batches; b++) { for (index_t i = 0; i < Av.Size(0); i++) { @@ -313,7 +313,7 @@ void svdpi_test( const index_t (&AshapeA)[RANK], Executor exec) { (mdiffV = max(VTVd)).run(exec); (mdiffA = max(Ad)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); #if 0 printf("A\n"); print(A); @@ -363,7 +363,7 @@ void svdbpi_test( const index_t (&AshapeA)[RANK], Executor exec) { std::array Ashape = detail::to_array(AshapeA); - cudaDeviceSynchronize(); + exec.sync(); index_t mm = Ashape[RANK-2]; index_t nn = Ashape[RANK-1]; @@ -445,7 +445,7 @@ void svdbpi_test( const index_t (&AshapeA)[RANK], Executor exec) { (mdiffV = max(VTVd)).run(exec); (mdiffA = max(Ad)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); #if 0 printf("A\n"); print(A); @@ -467,7 +467,7 @@ void svdbpi_test( const index_t (&AshapeA)[RANK], Executor exec) { ASSERT_NEAR( mdiffV(), SType(0), .1); ASSERT_NEAR( mdiffA(), SType(0), .00001); - cudaDeviceSynchronize(); + exec.sync(); } TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBPI) diff --git a/test/00_tensor/BasicTensorTests.cu b/test/00_tensor/BasicTensorTests.cu index bd19ac0ba..9d5ee8e9b 100644 --- a/test/00_tensor/BasicTensorTests.cu +++ b/test/00_tensor/BasicTensorTests.cu @@ -242,7 +242,7 @@ TYPED_TEST(BasicTensorTestsAll, 
AssignmentOps) (t2c = this->t2).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), t2c(i,j)); @@ -250,7 +250,7 @@ TYPED_TEST(BasicTensorTestsAll, AssignmentOps) } (this->t2 = t2c = t2c2).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), t2c2(i,j)); @@ -277,7 +277,7 @@ TYPED_TEST(BasicTensorTestsNumeric, AssignmentOps) } (this->t2 += t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), t2c(i,j) + t2c(i,j)); @@ -285,7 +285,7 @@ TYPED_TEST(BasicTensorTestsNumeric, AssignmentOps) } (this->t2 -= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), t2c(i,j)); @@ -293,7 +293,7 @@ TYPED_TEST(BasicTensorTestsNumeric, AssignmentOps) } (this->t2 *= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), static_cast(1)); @@ -301,8 +301,8 @@ TYPED_TEST(BasicTensorTestsNumeric, AssignmentOps) } (t2c = this->t2).run(this->exec); - (this->t2 /= static_cast(1)).run(); - cudaStreamSynchronize(0); + (this->t2 /= static_cast(1)).run(this->exec); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(t2c(i,j) , this->t2(i,j)); @@ -328,7 +328,7 @@ TYPED_TEST(BasicTensorTestsIntegral, AssignmentOps) } (this->t2 |= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { 
ASSERT_EQ(this->t2(i,j), 3); @@ -336,7 +336,7 @@ TYPED_TEST(BasicTensorTestsIntegral, AssignmentOps) } (this->t2 &= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), 2); @@ -344,7 +344,7 @@ TYPED_TEST(BasicTensorTestsIntegral, AssignmentOps) } (this->t2 ^= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), 0); diff --git a/test/00_tensor/CUBTests.cu b/test/00_tensor/CUBTests.cu index d162b8e83..9a34901fb 100644 --- a/test/00_tensor/CUBTests.cu +++ b/test/00_tensor/CUBTests.cu @@ -117,10 +117,12 @@ TEST(TensorStats, Hist) inv.SetVals({2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5}); + cudaExecutor exec{}; + // example-begin hist-test-1 - (outv = hist(inv, 0.0f, 12.0f)).run();; + (outv = hist(inv, 0.0f, 12.0f)).run(exec); // example-end hist-test-1 - cudaStreamSynchronize(0); + exec.sync(); std::array sol = {1, 5, 0, 3, 0, 0}; for (index_t i = 0; i < outv.Lsize(); i++) { @@ -147,7 +149,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, CumSum) // Compute the cumulative sum/exclusive scan across "t1" (tmpv = cumsum(this->t1)).run(this->exec); // example-end cumsum-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); TestType ttl = 0; for (index_t i = 0; i < tmpv.Lsize(); i++) { @@ -165,7 +167,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, CumSum) } (tmpv2 = cumsum(this->t2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < tmpv2.Size(0); i++) { ttl = 0; for (index_t j = 0; j < tmpv2.Size(1); j++) { @@ -194,7 +196,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, Sort) // Ascending sort of 1D input (tmpv = matx::sort(this->t1, SORT_DIR_ASC)).run(this->exec); // example-end sort-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 1; i 
< tmpv.Lsize(); i++) { ASSERT_TRUE(tmpv(i) > tmpv(i - 1)); @@ -204,7 +206,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, Sort) // Descending sort of 1D input (tmpv = matx::sort(this->t1, SORT_DIR_DESC)).run(this->exec); // example-end sort-test-2 - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 1; i < tmpv.Lsize(); i++) { ASSERT_TRUE(tmpv(i) < tmpv(i - 1)); @@ -220,7 +222,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, Sort) } (tmpv2 = matx::sort(this->t2, SORT_DIR_ASC)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < tmpv2.Size(0); i++) { for (index_t j = 1; j < tmpv2.Size(1); j++) { @@ -230,7 +232,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, Sort) // Descending (tmpv2 = matx::sort(this->t2, SORT_DIR_DESC)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < tmpv2.Size(0); i++) { for (index_t j = 1; j < tmpv2.Size(1); j++) { diff --git a/test/00_tensor/EinsumTests.cu b/test/00_tensor/EinsumTests.cu index 02c9bcbd1..ab1c68786 100644 --- a/test/00_tensor/EinsumTests.cu +++ b/test/00_tensor/EinsumTests.cu @@ -121,7 +121,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Contraction3D) // Perform a 3D tensor contraction (c2 = cutensor::einsum("ijk,jil->kl", a, b)).run(exec); // example-end einsum-contraction-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c2, "c_float3d", 0.01); MATX_EXIT_HANDLER(); @@ -146,7 +146,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Dot) // Perform a dot product of b1 with itself and store in a1 (c0 = cutensor::einsum("i,i->", a1, b1)).run(exec); // example-end einsum-dot-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_ASSERT_EQ(c0(), 4 * a1.Size(0)); MATX_EXIT_HANDLER(); @@ -173,7 +173,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, GEMM) (c2 = cutensor::einsum("mk,kn->mn", a2, b2)).run(exec); (c22 = matmul(a2, b2)).run(exec); // example-end einsum-gemm-1 - 
cudaStreamSynchronize(0); + exec.sync(); for (auto i = 0; i < c2.Size(0); i++) { for (auto j = 0; j < c2.Size(1); j++) { @@ -203,7 +203,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, GEMMTranspose) (c2 = cutensor::einsum("mk,kn->nm", a2, b2)).run(exec); // example-end einsum-gemm-2 (c22 = matmul(a2, b2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); auto c22t = c22.Permute({1,0}); // Permute to match cutensor @@ -232,7 +232,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Permute) (b = cutensor::einsum("ijkl->jlki", a)).run(exec); (b2 = a.Permute({1,3,2,0})).run(exec); // example-end einsum-permute-1 - cudaStreamSynchronize(0); + exec.sync(); for (auto i = 0; i < b.Size(0); i++) { for (auto j = 0; j < b.Size(1); j++) { @@ -264,7 +264,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Sum) (b = matx::cutensor::einsum("ij->j", a)).run(exec); // example-end einsum-sum-1 - cudaStreamSynchronize(0); + exec.sync(); for (auto i = 0; i < a.Size(1); i++) { TestType s = 0; for (auto j = 0; j < a.Size(0); j++) { @@ -294,7 +294,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Trace) (c0_1 = trace(a2)).run(exec); // example-end einsum-trace-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_ASSERT_EQ(c0_0(), c0_1()); MATX_ASSERT_EQ(c0_0(), 10); diff --git a/test/00_tensor/ViewTests.cu b/test/00_tensor/ViewTests.cu index 9c913bc9b..599d98f8c 100644 --- a/test/00_tensor/ViewTests.cu +++ b/test/00_tensor/ViewTests.cu @@ -430,7 +430,7 @@ TYPED_TEST(ViewTestsFloatNonComplexNonHalf, Random) (t3f = (TestType)-1000000).run(this->exec); (t3f = random({count, count, count}, UNIFORM)).run(this->exec); // example-end random-test-1 - cudaDeviceSynchronize(); + this->exec.sync(); TestType total = 0; for (index_t i = 0; i < count; i++) { @@ -449,7 +449,7 @@ TYPED_TEST(ViewTestsFloatNonComplexNonHalf, Random) (t3f = (TestType)-1000000).run(this->exec); (t3f = random({count, count, count}, NORMAL)).run(this->exec); - cudaDeviceSynchronize(); + 
this->exec.sync(); total = 0; diff --git a/test/00_transform/ChannelizePoly.cu b/test/00_transform/ChannelizePoly.cu index ca5947aea..582063da2 100644 --- a/test/00_transform/ChannelizePoly.cu +++ b/test/00_transform/ChannelizePoly.cu @@ -149,8 +149,6 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Simple) { 1000000, 40*16, 40 } }; - cudaStream_t stream = 0; - for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { const index_t a_len = test_cases[i].a_len; const index_t f_len = test_cases[i].f_len; @@ -172,14 +170,14 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Simple) (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); // example-end channelize_poly-test-1 - cudaStreamSynchronize(stream); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); // Now test with a multiplicative operator on the input. The channelizer is linear, // so we can inverse-scale the output to compare against the golden outputs. (b = channelize_poly(static_cast(4.0) * a, f, num_channels, decimation_factor)).run(this->exec); (b = b * static_cast(0.25)).run(this->exec); - cudaStreamSynchronize(stream); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } @@ -233,7 +231,7 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Batched) this->pb->NumpyToTensorView(f, "filter_random"); (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); @@ -241,7 +239,7 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Batched) // so we can inverse-scale the output to compare against the golden outputs. 
(b = channelize_poly(static_cast(4.0) * a, f, num_channels, decimation_factor)).run(this->exec); (b = b * static_cast(0.25)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } @@ -277,12 +275,12 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, IdentityFilter) this->pb->NumpyToTensorView(a, "a"); for (auto i = 0; i < num_channels; i++) { f(i) = 1; } - cudaStreamSynchronize(0); + this->exec.sync(); const index_t decimation_factor = num_channels; (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (auto k = 0; k < b_elem_per_channel; k++) { // Explicit DFT in the channel dimension. The complex exponential sign here is opposite @@ -315,8 +313,6 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Operators) using InnerType = typename test_types::inner_type::type; using ComplexType = typename test_types::complex_type::type; - cudaStream_t stream = 0; - const index_t a_len = 2500; [[maybe_unused]] const index_t f_len = 90; const index_t num_channels = 10; @@ -331,13 +327,13 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Operators) this->pb->NumpyToTensorView(a, "a"); this->pb->NumpyToTensorView(f, "filter_random"); - cudaStreamSynchronize(stream); + this->exec.sync(); const index_t decimation_factor = num_channels; auto b = permute(bp, {1, 0}); (b = channelize_poly(shift<0>(shift<0>(a, 8), -8), f, num_channels, decimation_factor)).run(this->exec); - cudaStreamSynchronize(stream); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); @@ -610,12 +606,12 @@ TYPED_TEST(ChannelizePolyTestDoubleType, Harris2003) cudaMemcpyAsync(a.Data(), input.data(), signal_len * sizeof(TestType), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(f.Data(), filter.data(), filter_len * sizeof(TestType), cudaMemcpyHostToDevice, stream); - cudaStreamSynchronize(stream); + this->exec.sync(); const index_t 
decimation_factor = num_channels; (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); - cudaStreamSynchronize(stream); + this->exec.sync(); for (auto chan = 0; chan < num_channels; chan++) { for (auto k = 0; k < b_elem_per_channel; k++) { diff --git a/test/00_transform/ConvCorr.cu b/test/00_transform/ConvCorr.cu index b1522058e..f0c32c7ea 100644 --- a/test/00_transform/ConvCorr.cu +++ b/test/00_transform/ConvCorr.cu @@ -593,7 +593,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out2 = conv1d(in1, in2, {2}, MATX_C_MODE_SAME)).run(this->exec); // example-end conv1d-test-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -608,7 +608,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out2 = conv1d(in1, in2, {1}, MATX_C_MODE_SAME)).run(this->exec); // example-end conv1d-test-3 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -621,7 +621,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out1.Permute({1,2,0}) = conv1d(in1.Permute({1,2,0}), in2.Permute({1,2,0}), MATX_C_MODE_SAME)).run(this->exec); (out2 = conv1d(in1, in2, {0}, MATX_C_MODE_SAME)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -634,7 +634,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out1 = corr(in1, in2, MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); (out2 = corr(in1, in2, {2}, MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -647,7 +647,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out1.Permute({0,2,1}) = corr(in1.Permute({0,2,1}), in2.Permute({0,2,1}), MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); (out2 = corr(in1, in2, {1}, MATX_C_MODE_SAME, 
MATX_C_METHOD_DIRECT)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -660,7 +660,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out1.Permute({1,2,0}) = corr(in1.Permute({1,2,0}), in2.Permute({1,2,0}), MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); (out2 = corr(in1, in2, {0}, MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -699,7 +699,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv2Axis) (out1 = conv2d(in1, in2, MATX_C_MODE_SAME)).run(this->exec); (out2 = conv2d(in1, in2, {1, 2}, MATX_C_MODE_SAME)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -712,7 +712,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv2Axis) (out1.Permute({0,2,1}) = conv2d(in1.Permute({0,2,1}), in2.Permute({0,2,1}), MATX_C_MODE_SAME)).run(this->exec); (out2 = conv2d(in1, in2, {2, 1}, MATX_C_MODE_SAME)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -725,7 +725,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv2Axis) (out1.Permute({1,2,0}) = conv2d(in1.Permute({1,2,0}), in2.Permute({1,2,0}), MATX_C_MODE_SAME)).run(this->exec); (out2 = conv2d(in1, in2, {2, 0}, MATX_C_MODE_SAME)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { diff --git a/test/00_transform/Copy.cu b/test/00_transform/Copy.cu index 7e17ea8e4..1dd32155e 100755 --- a/test/00_transform/Copy.cu +++ b/test/00_transform/Copy.cu @@ -54,12 +54,6 @@ TYPED_TEST(CopyTestsAll, CopyOutParam) ExecType exec{}; - auto sync = []() constexpr { - if constexpr (std::is_same_v) { - cudaDeviceSynchronize(); - } - }; - const int SZ = 5; TestType DEFAULT, 
TEST_VAL; if constexpr (std::is_same_v) { @@ -81,13 +75,13 @@ TYPED_TEST(CopyTestsAll, CopyOutParam) auto in = make_tensor(dims); \ auto out = make_tensor(dims); \ (in = DEFAULT).run(exec); \ - sync(); \ + exec.sync(); \ std::array inds; \ inds.fill(SZ/2); \ in(inds) = TEST_VAL; \ - sync(); \ + exec.sync(); \ matx::copy(out, in, exec); \ - sync(); \ + exec.sync(); \ ASSERT_EQ(in(inds), out(inds)); \ ASSERT_EQ(out(inds), TEST_VAL); \ inds.fill(0); \ @@ -109,9 +103,9 @@ TYPED_TEST(CopyTestsAll, CopyOutParam) auto in = make_tensor({}); auto out = make_tensor({}); in() = TEST_VAL; - sync(); + exec.sync(); matx::copy(out, in, exec); - sync(); + exec.sync(); ASSERT_EQ(in(), out()); ASSERT_EQ(out(), TEST_VAL); } @@ -121,11 +115,11 @@ TYPED_TEST(CopyTestsAll, CopyOutParam) auto in = make_tensor({SZ, SZ, SZ}); auto out = make_tensor({SZ}); (in = DEFAULT).run(exec); - sync(); + exec.sync(); in(0, SZ/2, 0) = TEST_VAL; - sync(); + exec.sync(); matx::copy(out, slice<1>(in, {0,0,0}, {matxDropDim,matxEnd,matxDropDim}), exec); - sync(); + exec.sync(); ASSERT_EQ(out.Rank(), 1); ASSERT_EQ(out.Size(0), SZ); ASSERT_EQ(out(SZ/2), TEST_VAL); @@ -148,12 +142,6 @@ TYPED_TEST(CopyTestsAll, CopyReturn) ExecType exec{}; - auto sync = []() constexpr { - if constexpr (std::is_same_v) { - cudaDeviceSynchronize(); - } - }; - const int SZ = 5; TestType DEFAULT, TEST_VAL; if constexpr (std::is_same_v) { @@ -174,13 +162,13 @@ TYPED_TEST(CopyTestsAll, CopyReturn) dims.fill(SZ); \ auto in = make_tensor(dims); \ (in = DEFAULT).run(exec); \ - sync(); \ + exec.sync(); \ std::array inds; \ inds.fill(SZ/2); \ in(inds) = TEST_VAL; \ - sync(); \ + exec.sync(); \ auto out = matx::copy(in, exec); \ - sync(); \ + exec.sync(); \ ASSERT_EQ(in(inds), out(inds)); \ ASSERT_EQ(out(inds), TEST_VAL); \ inds.fill(0); \ @@ -201,9 +189,9 @@ TYPED_TEST(CopyTestsAll, CopyReturn) { auto in = make_tensor({}); in() = TEST_VAL; - sync(); + exec.sync(); auto out = matx::copy(in, exec); - sync(); + exec.sync(); ASSERT_EQ(in(), 
out()); ASSERT_EQ(out(), TEST_VAL); } @@ -212,11 +200,11 @@ TYPED_TEST(CopyTestsAll, CopyReturn) { auto in = make_tensor({SZ, SZ, SZ}); (in = DEFAULT).run(exec); - sync(); + exec.sync(); in(0, SZ/2, 0) = TEST_VAL; - sync(); + exec.sync(); auto out = matx::copy(slice<1>(in, {0,0,0}, {matxDropDim,matxEnd,matxDropDim}), exec); - sync(); + exec.sync(); ASSERT_EQ(out.Rank(), 1); ASSERT_EQ(out.Size(0), SZ); ASSERT_EQ(out(SZ/2), TEST_VAL); diff --git a/test/00_transform/Cov.cu b/test/00_transform/Cov.cu index 217797c51..0ca3e7a43 100644 --- a/test/00_transform/Cov.cu +++ b/test/00_transform/Cov.cu @@ -82,7 +82,7 @@ TYPED_TEST(CovarianceTestFloatTypes, SmallCov) // example-begin cov-test-1 (this->cv = cov(this->av)).run(this->exec); // example-end cov-test-1 - cudaDeviceSynchronize(); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, this->cv, "c_cov", this->thresh); MATX_EXIT_HANDLER(); } @@ -107,7 +107,7 @@ TYPED_TEST(CovarianceTestFloatTypes, BatchedCov) (batched_in = clone<5>(this->av, {m, n, k, matxKeepDim, matxKeepDim})).run(this->exec); (batched_out = cov(batched_in)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); for (int im = 0; im < m; im++) { for (int in = 0; in < n; in++) { diff --git a/test/00_transform/FFT.cu b/test/00_transform/FFT.cu index 7a3b35c0b..0ff8a0a65 100644 --- a/test/00_transform/FFT.cu +++ b/test/00_transform/FFT.cu @@ -98,7 +98,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1D1024C2C) // type of the tensors and output size. (avo = fft(av)).run(this->exec); // example-end fft-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -121,7 +121,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1DFWD1024C2C) // type of the tensors and output size. 
(avo = fft(av, fft_dim, FFTNorm::FORWARD)).run(this->exec); // example-end fft-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -144,7 +144,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1DORTHO1024C2C) // type of the tensors and output size. (avo = fft(av, fft_dim, FFTNorm::ORTHO)).run(this->exec); // example-end fft-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -178,7 +178,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out2 = fft(in, {2})).run(this->exec); // example-end fft-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -195,7 +195,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out2 = fft(in, {1})).run(this->exec); // example-end fft-3 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -211,7 +211,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out1 = ifft(in)).run(this->exec); (out2 = ifft(in, {2})).run(this->exec); // example-end ifft-1 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -227,7 +227,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out1.Permute({0,2,1}) = ifft(in.Permute({0,2,1}))).run(this->exec); (out2 = ifft(in, {1})).run(this->exec); // example-end ifft-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -242,7 +242,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out1.Permute({0,2,1}) = fft(permute(in1, {0,2,1}))).run(this->exec); (out2 = fft(in1, {1})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -255,7 +255,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, 
FFT1Axis) (out1.Permute({1,2,0}) = ifft(permute(in1, {1,2,0}))).run(this->exec); (out2 = ifft(in1, {0})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -296,7 +296,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1 = fft2(in)).run(this->exec); (out2 = fft2(in, {1,2})).run(this->exec); // example-end fft2-1 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -312,7 +312,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,2,0}) = fft2(in.Permute({1,2,0}))).run(this->exec); (out2 = fft2(in, {2,0})).run(this->exec); // example-end fft2-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -324,7 +324,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,0,2}) = fft2(in.Permute({1,0,2}))).run(this->exec); (out2 = fft2(in, {0,2})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -340,7 +340,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1 = ifft2(in)).run(this->exec); (out2 = ifft2(in, {1,2})).run(this->exec); // example-end ifft2-1 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -356,7 +356,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,2,0}) = ifft2(in.Permute({1,2,0}))).run(this->exec); (out2 = ifft2(in, {2,0})).run(this->exec); // example-end ifft2-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -368,7 +368,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,0,2}) = ifft2(in.Permute({1,0,2}))).run(this->exec); (out2 = ifft2(in, {0,2})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < 
d1; i++) { for(int j = 0; j < d2; j++) { @@ -383,7 +383,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,0,2}) = fft2(permute(in1, {1,0,2}))).run(this->exec); (out2 = fft2(in1, {0,2})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -396,7 +396,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,0,2}) = ifft2(permute(in1, {1,0,2}))).run(this->exec); (out2 = ifft2(in1, {0,2})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -422,7 +422,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT1D1024C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -440,7 +440,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT1DORTHO1024C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft(av, fft_dim, FFTNorm::ORTHO)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -458,7 +458,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT1DFWD1024C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft(av, fft_dim, FFTNorm::FORWARD)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -479,7 +479,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1D1024PadC2C) // Specify the FFT size as bigger than av. 
Thus, av will be zero-padded to the appropriate size (avo = fft(av, fft_dim * 2)).run(this->exec); // example-end fft-4 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -497,7 +497,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1D1024PadBatchedC2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av, fft_dim*2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); @@ -505,7 +505,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1D1024PadBatchedC2C) // Perform an FFT but force the size to be fft_dim * 2 instead of the output size (avo = fft(av, fft_dim * 2)).run(this->exec); // Force the FFT size // example-end fft-5 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -524,7 +524,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT1D1024PadC2C) // Specify the IFFT size as bigger than av. 
Thus, av will be zero-padded to the appropriate size (avo = ifft(av, fft_dim * 2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -545,7 +545,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1D1024R2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -566,7 +566,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1D1024PadR2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av, fft_dim*2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -588,28 +588,28 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1DSizeChecks) auto t2 = make_tensor({2*N}); // We do not implicitly zero-pad to a larger transform size (t2 = fft(tc)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); // C2C, output size smaller than input size ASSERT_THROW({ auto t2 = make_tensor({(N/2)+1}); (t2 = fft(tc)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); // R2C, output size smaller than N/2 + 1 ASSERT_THROW({ auto t2 = make_tensor({N/2}); (t2 = fft(tr)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); // R2C, output size larger than N/2 + 1 ASSERT_THROW({ auto t2 = make_tensor({N/2+2}); (t2 = fft(tr)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); // C2R, output size smaller than N @@ -617,7 +617,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1DSizeChecks) auto tcs = slice(tc, {0}, {N/2+1}); auto t2 = make_tensor({N-1}); (t2 = fft(tcs)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, 
matx::detail::matxException); // C2R, output size too large @@ -625,7 +625,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1DSizeChecks) auto tcs = slice(tc, {0}, {N/2+1}); auto t2 = make_tensor({N+2}); (t2 = fft(tcs)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); MATX_EXIT_HANDLER(); @@ -645,7 +645,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1D1024PadBatchedR2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av, fft_dim*2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -664,7 +664,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT2D16C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -683,7 +683,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT2D16x32C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -704,7 +704,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT2D16BatchedC2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -726,7 +726,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT2D16BatchedStridedC2C) const int32_t axes[] = {0, 2}; (avo = fft2(av, axes)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -745,7 +745,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT2D16C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, 
"a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -764,7 +764,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT2D16x32C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -784,7 +784,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypes, FFT2D16R2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -804,7 +804,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypes, FFT2D16x32R2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -824,7 +824,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypes, IFFT2D16C2R) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -844,7 +844,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypes, IFFT2D16x32C2R) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -864,7 +864,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1D1024C2CShort) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av, fft_dim - 16)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -883,7 +883,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, IFFT1D1024C2CShort) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft(av, fft_dim - 16)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); 
MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); diff --git a/test/00_transform/MatMul.cu b/test/00_transform/MatMul.cu index 99c78d767..28133f573 100644 --- a/test/00_transform/MatMul.cu +++ b/test/00_transform/MatMul.cu @@ -369,9 +369,9 @@ TYPED_TEST(MatMulTestFloatTypes, MediumRectBatched0StrideA) - (b = b0).run(); + (b = b0).run(this->exec); // Perform a batched gemm with "batches" GEMMs - (c = matmul(a0, b)).run(); + (c = matmul(a0, b)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (int i = 0; i < c.Size(0); i++) { for (int j = 0; j < c.Size(1); j++) { @@ -409,9 +409,9 @@ TYPED_TEST(MatMulTestFloatTypes, MediumRectBatched0StrideB) - (a = a0).run(); + (a = a0).run(this->exec); // Perform a batched gemm with "batches" GEMMs - (c = matmul(a, b0)).run(); + (c = matmul(a, b0)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (int i = 0; i < c.Size(0); i++) { for (int j = 0; j < c.Size(1); j++) { @@ -545,14 +545,14 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulAxis) auto bp = permute(bi, perm); auto cp = permute(ci, perm); - (ap = a3).run(); - (bp = b3).run(); + (ap = a3).run(this->exec); + (bp = b3).run(this->exec); - (ci = matmul(ai, bi, axis)).run(); + (ci = matmul(ai, bi, axis)).run(this->exec); - (c3 = cp).run(); + (c3 = cp).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c3, "c", this->thresh); } @@ -571,17 +571,17 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulAxis) auto cp = permute(ci, perm); // copy data into permuted inputs - (ap = a3).run(); - (bp = b3).run(); + (ap = a3).run(this->exec); + (bp = b3).run(this->exec); // Perform a GEMM with the last two dimensions permuted - (ci = matmul(ai, bi, axis)).run(); + (ci = matmul(ai, bi, axis)).run(this->exec); // example-end matmul-test-6 // copy result from permuted output - (c3 = cp).run(); + (c3 = cp).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c3, "c", this->thresh); } @@ -599,15 +599,15 @@
TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulAxis) auto cp = permute(ci, perm); // copy data into permuted inputs - (ap = a3).run(); - (bp = b3).run(); + (ap = a3).run(this->exec); + (bp = b3).run(this->exec); - (ci = matmul(ai, bi, axis)).run(); + (ci = matmul(ai, bi, axis)).run(this->exec); // copy result from permuted output - (c3 = cp).run(); + (c3 = cp).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c3, "c", this->thresh); } @@ -625,15 +625,15 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulAxis) auto cp = permute(ci, perm); // copy data into permuted inputs - (ap = a3).run(); - (bp = b3).run(); + (ap = a3).run(this->exec); + (bp = b3).run(this->exec); - (ci = matmul(ai, bi, axis)).run(); + (ci = matmul(ai, bi, axis)).run(this->exec); // copy result from permuted output - (c3 = cp).run(); + (c3 = cp).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c3, "c", this->thresh); } @@ -701,7 +701,7 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulBroadcast) // Broadcast eye2, scaling each entry in a5 by 2 (c5 = matmul(eye2, a5)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); for (index_t i0 = 0; i0 < x; i0++) for (index_t i1 = 0; i1 < y; i1++) @@ -720,7 +720,7 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulBroadcast) // Broadcast eye2, scaling each entry in a5 by 2 (c5 = matmul(a5, eye2)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); for (index_t i0 = 0; i0 < x; i0++) for (index_t i1 = 0; i1 < y; i1++) @@ -828,9 +828,9 @@ TYPED_TEST(MatMulTestFloatTypes, MediumMatVecBatch) tensor_t bv{{blocks, k}}; tensor_t cv{{blocks, m}}; - (bv = bs).run(); - (cv = cs).run(); - (cv = matvec(a, bv)).run(); + (bv = bs).run(this->exec); + (cv = cs).run(this->exec); + (cv = matvec(a, bv)).run(this->exec); MATX_TEST_ASSERT_COMPARE(this->pb, c, "c", this->thresh); @@ -859,15 +859,15 @@ TYPED_TEST(MatMulTestFloatTypes, MatVecRowVector) auto cs = 
slice<2>(c, {0,0,0}, {matxEnd, matxEnd, matxDropDim}); auto bs = slice<2>(b, {0,0,0}, {matxEnd, matxEnd, matxDropDim}); - (cs = matvec(a, bs)).run(); + (cs = matvec(a, bs)).run(this->exec); MATX_TEST_ASSERT_COMPARE(this->pb, c, "c", this->thresh); tensor_t bv{{blocks, k}}; tensor_t cv{{blocks, m}}; - (bv = bs).run(); - (cv = cs).run(); - (cv = matvec(a, bv)).run(); + (bv = bs).run(this->exec); + (cv = cs).run(this->exec); + (cv = matvec(a, bv)).run(this->exec); MATX_TEST_ASSERT_COMPARE(this->pb, c, "c", this->thresh); @@ -894,10 +894,10 @@ TYPED_TEST(MatMulTestFloatTypes, OuterProduct) this->pb->NumpyToTensorView(b, "b"); // example-begin outer-test-1 - (c = outer(a, b)).run(); + (c = outer(a, b)).run(this->exec); // example-end outer-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c, "c", this->thresh); auto ba = make_tensor({batches, an}); @@ -906,9 +906,9 @@ TYPED_TEST(MatMulTestFloatTypes, OuterProduct) this->pb->NumpyToTensorView(bb, "bb"); auto bc = make_tensor({batches, an, bn}); - (bc = outer(ba, bb)).run(); + (bc = outer(ba, bb)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, bc, "bc", this->thresh); MATX_EXIT_HANDLER(); diff --git a/test/00_transform/ResamplePoly.cu b/test/00_transform/ResamplePoly.cu index 1124d4cfe..1dbfc2041 100644 --- a/test/00_transform/ResamplePoly.cu +++ b/test/00_transform/ResamplePoly.cu @@ -128,7 +128,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleOddLength) (b = resample_poly(a, f, up, down)).run(this->exec); // example-end resample_poly-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); @@ -136,7 +136,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleOddLength) // so we can inverse-scale the output to compare against the golden outputs. 
(b = resample_poly(static_cast(4.0) * a, f, up, down)).run(this->exec); (b = b * static_cast(0.25)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } @@ -190,7 +190,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleEvenLength) this->pb->NumpyToTensorView(f, "filter_random"); (b = resample_poly(a, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); @@ -198,7 +198,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleEvenLength) // so we can inverse-scale the output to compare against the golden outputs. (b = resample_poly(static_cast(4.0) * a, f, up, down)).run(this->exec); (b = b * static_cast(0.25)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } @@ -244,11 +244,11 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, DefaultFilter) this->pb->NumpyToTensorView(a, "a"); this->pb->NumpyToTensorView(f, "filter_default"); - cudaStreamSynchronize(0); + this->exec.sync(); (b = resample_poly(a, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_default", this->thresh); } @@ -296,11 +296,11 @@ TYPED_TEST(ResamplePolyTestFloatTypes, DefaultFilter) this->pb->NumpyToTensorView(a, "a"); this->pb->NumpyToTensorView(f, "filter_default"); - cudaStreamSynchronize(0); + this->exec.sync(); (b = resample_poly(a, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_default", this->thresh); } @@ -344,7 +344,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Batched) this->pb->NumpyToTensorView(f, "filter_random"); (b = resample_poly(ac, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Verify that the 4D tensor was handled in a batched fashion for (int ia 
= 0; ia < nA; ia++) { @@ -361,11 +361,11 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Batched) (full = ac).run(this->exec); (b = 0).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); (b = resample_poly(ac, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Verify that the 4D tensor was handled in a batched fashion for (int ia = 0; ia < nA; ia++) { @@ -377,7 +377,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Batched) } } - cudaStreamSynchronize(0); + this->exec.sync(); } MATX_EXIT_HANDLER(); @@ -399,7 +399,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Identity) auto zero = make_tensor({1}); (zero = 0).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { const index_t a_len = test_cases[i].a_len; @@ -413,7 +413,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Identity) this->pb->NumpyToTensorView(a, "a"); (b = resample_poly(a, zero, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // The output should equal the input because up == down. for (index_t k = 0; k < a_len; k++) { @@ -442,7 +442,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Downsample) auto seven = make_tensor({1}); (seven = 7).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { const index_t a_len = test_cases[i].a_len; @@ -457,7 +457,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Downsample) this->pb->NumpyToTensorView(a, "a"); (b = resample_poly(a, seven, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t j = 0; j < b_len; j++) { double aj, bj; @@ -501,7 +501,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Upsample) // The resample kernel scales the filter by up, so we use 1/up to get an // effective filter of 1. 
(f = 1.0/static_cast(up)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); auto a = make_tensor({a_len}); const index_t b_len = a_len * up; @@ -509,7 +509,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Upsample) this->pb->NumpyToTensorView(a, "a"); (b = resample_poly(a, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Since the filter is single tapped and == 1, we should get the sequence // [a_0, 0, ..., 0, a_1, 0, ...] with up-1 zeros between successive values @@ -569,11 +569,11 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Operators) this->pb->NumpyToTensorView(a, "a"); this->pb->NumpyToTensorView(f, "filter_random"); - cudaStreamSynchronize(0); + this->exec.sync(); (b = resample_poly(shift<0>(shift<0>(a, 8), -8), shift<0>(shift<0>(f, 3), -3), up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } diff --git a/test/00_transform/Solve.cu b/test/00_transform/Solve.cu index 6a14dd0a7..02369df8b 100644 --- a/test/00_transform/Solve.cu +++ b/test/00_transform/Solve.cu @@ -83,7 +83,7 @@ TYPED_TEST(SolveTestsFloatNonComplexNonHalf, CGSolve) (X = cgsolve(A, B, .00001, 10)).run(exec); // example-end cgsolve-test-1 (B = matvec(A, X)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int i = 0; i < BATCH; i++) { for(int j = 0; j < N; j++) { diff --git a/test/01_radar/MVDRBeamformer.cu b/test/01_radar/MVDRBeamformer.cu index c2ce47fc2..21c5d430d 100644 --- a/test/01_radar/MVDRBeamformer.cu +++ b/test/01_radar/MVDRBeamformer.cu @@ -48,9 +48,9 @@ TEST(Radar, MVDRBeamformer) index_t data_len = 1000; index_t snap_len = 2 * num_el; - auto mvdr = MVDRBeamformer(num_beams, num_el, data_len, snap_len); + cudaExecutor exec{}; - mvdr.Prefetch(0); + auto mvdr = MVDRBeamformer(num_beams, num_el, data_len, snap_len); auto pb = std::make_unique(); pb->InitAndRunTVGenerator("mvdr_beamformer", "mvdr_beamformer", @@ -64,8 +64,8 @@ 
TEST(Radar, MVDRBeamformer) pb->NumpyToTensorView(in_vec, "in_vec"); pb->NumpyToTensorView(v, "v"); - mvdr.Run(0); - cudaStreamSynchronize(0); + mvdr.Run(exec); + exec.sync(); auto cbf = mvdr.GetCBFView(); diff --git a/test/01_radar/MultiChannelRadarPipeline.cu b/test/01_radar/MultiChannelRadarPipeline.cu index 98fc66418..1abbd4efb 100644 --- a/test/01_radar/MultiChannelRadarPipeline.cu +++ b/test/01_radar/MultiChannelRadarPipeline.cu @@ -93,15 +93,11 @@ TYPED_TEST(MultiChannelRadarPipelineTypes, PulseCompression) // Copy the replicated data into the actual data pointer matx::copy(d, x_clone, 0); - d.PrefetchDevice(0); - auto wfd = p.GetwaveformView(); auto wf = wfd.Slice({0}, {this->waveformLength}); this->pb->NumpyToTensorView(wf, "waveform"); - wfd.PrefetchDevice(0); - p.PulseCompression(); auto xc = d.Slice({0, 0, 0}, {this->numChannels, this->numPulses, @@ -155,8 +151,6 @@ TYPED_TEST(MultiChannelRadarPipelineTypes, Doppler) this->numCompressedSamples}); this->pb->NumpyToTensorView(v, "x_conv2"); - in.PrefetchHost(0); - p.DopplerProcessing(); auto out = p.GetTPCView(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 34e498140..3db9bef96 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -37,7 +37,7 @@ set (test_sources main.cu ) -# Some of <00_io> tests need csv files and the binary 'test.mat' which all +# Some of <00_io> tests need csv files and binaries which all # are located under 'CMAKE_SOURCE_DIR/test/00_io'. When calling the test # executable from its location in 'CMAKE_BINARY_DIR/test' the # search paths according are @@ -57,6 +57,10 @@ file(COPY ${CMAKE_SOURCE_DIR}/test/00_io/test.mat DESTINATION ${CMAKE_BINARY_DIR}/test/00_io ) +file(COPY + ${CMAKE_SOURCE_DIR}/test/00_io/test.npy + DESTINATION ${CMAKE_BINARY_DIR}/test/00_io +) # Find proprietary parameters file (GLOB_RECURSE proprietary_sources ../proprietary/*/tests/*.cu)