NVIDIA · cliffburdick · May 31, 2024 · May 31, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 .vscode
+.cache
 build*
 *.pyc
diff --git a/bench/00_operators/operators.cu b/bench/00_operators/operators.cu
@@ -10,6 +10,7 @@ template <typename ValueType>
 void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)
 {
   // Get current parameters:
+  cudaExecutor exec{0};
   const int x_len = static_cast<int>(state.get_int64("Vector size"));
 
   state.add_element_count(x_len, "NumElements");
@@ -19,9 +20,9 @@ void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)
 
   tensor_t<ValueType, 1> xv{{x_len}};
   tensor_t<ValueType, 1> xv2{{x_len}};
-  xv.PrefetchDevice(0);
-  (xv = xv + xv2).run();
-  cudaDeviceSynchronize();
+
+  (xv = xv + xv2).run(exec);
+  exec.sync();
 
   state.exec( 
     [&xv, &xv2](nvbench::launch &launch) {
@@ -38,6 +39,7 @@ using permute_types = nvbench::type_list<float, double, cuda::std::complex<float
 template <typename ValueType>
 void permute(nvbench::state &state, nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto x = make_tensor<ValueType>({1000,200,6,300});
   auto y = make_tensor<ValueType>({300,1000,6,200});
 
@@ -46,7 +48,7 @@ void permute(nvbench::state &state, nvbench::type_list<ValueType>)
   state.add_global_memory_writes<ValueType>(x.TotalSize());    
 
   x.PrefetchDevice(0);
-  cudaDeviceSynchronize();
+  exec.sync();
 
   state.exec( 
     [&x, &y](nvbench::launch &launch) {
@@ -61,17 +63,18 @@ using random_types = nvbench::type_list<float, double, cuda::std::complex<float>
 template <typename ValueType>
 void random(nvbench::state &state, nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto x = make_tensor<ValueType>({1966800});
   auto y = make_tensor<ValueType>({1966800});
   x.PrefetchDevice(0);
   y.PrefetchDevice(0);
 
-  (y = random<float>(x.Shape(), NORMAL)).run();
+  (y = random<float>(x.Shape(), NORMAL)).run(exec);
 
   state.add_element_count(x.TotalSize(), "NumElements");
   state.add_global_memory_writes<ValueType>(x.TotalSize());    
 
-  cudaDeviceSynchronize();
+  exec.sync();
 
   state.exec( 
     [&x, &y](nvbench::launch &launch) {
@@ -98,6 +101,7 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
   int n = 600;
   ValueType dx = M_PI/n;
 
+  cudaExecutor exec{};
   auto col = range<0>({n+1},ValueType(0), ValueType(dx));
   auto az = range<0>({2*n+1}, ValueType(0), ValueType(dx));
 
@@ -122,8 +126,8 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
   auto Y = make_tensor<ValueType>(Ym.Shape());
   auto Z = make_tensor<ValueType>(Zm.Shape());
 
-  cudaDeviceSynchronize();
-
+  exec.sync();
+  
   state.add_element_count(n+1, "Elements");
 
   state.exec( 

diff --git a/bench/00_operators/reduction.cu b/bench/00_operators/reduction.cu
@@ -127,6 +127,7 @@ void reduce_4d(
            nvbench::type_list<ValueType>
            )
 {
+  cudaExecutor exec{0};
   const int size0 = static_cast<int>(state.get_int64("Size0"));
   const int size1 = static_cast<int>(state.get_int64("Size1"));
   const int size2 = static_cast<int>(state.get_int64("Size2"));
@@ -138,8 +139,8 @@ void reduce_4d(
   t1.PrefetchDevice(0);
   t4.PrefetchDevice(0);
 
-  (t4 = random<float>(t4.Shape(), UNIFORM)).run();
-  cudaDeviceSynchronize();
+  (t4 = random<float>(t4.Shape(), UNIFORM)).run(exec);
+  exec.sync();
 
   state.exec([&t4, &t1](nvbench::launch &launch) { 
     (t1 = matx::sum(t4, {1, 2, 3})).run((cudaStream_t)launch.get_stream()); });

diff --git a/bench/00_transform/conv.cu b/bench/00_transform/conv.cu
@@ -13,6 +13,7 @@ template <typename ValueType>
 void conv1d_direct_4d_batch(nvbench::state &state,
                             nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto out = make_tensor<ValueType>({4, 2, 14, 288 + 4096 + 133 - 1});
   auto at = make_tensor<ValueType>({ 4, 2, 14, 133});
   auto bt = make_tensor<ValueType>({ 4, 2, 14, 288 + 4096});
@@ -21,7 +22,7 @@ void conv1d_direct_4d_batch(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);
 
-  cudaDeviceSynchronize();
+  exec.sync();
   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
   state.exec(
       [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
@@ -35,7 +36,7 @@ template <typename ValueType>
 void conv1d_direct_2d_batch(nvbench::state &state,
                             nvbench::type_list<ValueType>)
 {
-
+  cudaExecutor exec{0};
 
   auto out = make_tensor<ValueType>({4 * 2* 14, 288 + 4096 + 133 - 1});
   auto at = make_tensor<ValueType>({ 4 * 2* 14, 133});
@@ -45,7 +46,7 @@ void conv1d_direct_2d_batch(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);
 
-  cudaDeviceSynchronize();
+  exec.sync();
 
   state.exec(
       [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
@@ -56,6 +57,7 @@ template <typename ValueType>
 void conv1d_direct_large(nvbench::state &state,
                             nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
   auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
   auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});
@@ -64,9 +66,9 @@ void conv1d_direct_large(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);
 
-  (out = conv1d(at, bt, MATX_C_MODE_FULL)).run();
+  (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(exec);
 
-  cudaDeviceSynchronize();
+  exec.sync();
 
   state.exec(
       [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
@@ -79,17 +81,18 @@ template <typename ValueType>
 void conv1d_fft_large(nvbench::state &state,
                             nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
   auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
   auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});
 
-  (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run();
+  (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(exec);
 
   out.PrefetchDevice(0);
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);
 
-  cudaDeviceSynchronize();
+  exec.sync();
 
   state.exec(
       [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(cudaExecutor(launch.get_stream())); });
@@ -103,6 +106,7 @@ template <typename ValueType>
 void conv2d_direct_batch(nvbench::state &state,
                             nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto at = make_tensor<ValueType>({256, 1024, 1024});
   auto bt = make_tensor<ValueType>({256, 16, 16});
   auto out = make_tensor<ValueType>({256, 
@@ -113,7 +117,7 @@ void conv2d_direct_batch(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);
 
-  cudaDeviceSynchronize();
+  exec.sync();
 
   state.exec(
       [&out, &at, &bt](nvbench::launch &launch) { (out = conv2d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });

diff --git a/bench/00_transform/cub.cu b/bench/00_transform/cub.cu
@@ -24,14 +24,15 @@ void sort1d(
            nvbench::type_list<ValueType>
            )
 {
+  cudaExecutor exec{0};
   const int dataSize = static_cast<int>(state.get_int64("Tensor Size"));
 
   auto sortedData = matx::make_tensor<ValueType>({dataSize});
   auto randomData = matx::make_tensor<ValueType>({dataSize});
 
   sortedData.PrefetchDevice(0);
   randomData.PrefetchDevice(0);
-  cudaDeviceSynchronize();
+  exec.sync();
 
   (randomData = random<float>(sortedData.Shape(), NORMAL)).run();
 

diff --git a/bench/00_transform/qr.cu b/bench/00_transform/qr.cu
@@ -17,6 +17,7 @@ void qr_batch(nvbench::state &state,
 
   cudaStream_t stream = 0;
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+  cudaExecutor exec{stream};
 
   int batch = state.get_int64("batch");
   int m = state.get_int64("rows");
@@ -26,17 +27,13 @@ void qr_batch(nvbench::state &state,
   auto Q = make_tensor<AType>({batch, m, m});
   auto R = make_tensor<AType>({batch, m, n});
 
-  A.PrefetchDevice(stream);
-  Q.PrefetchDevice(stream);
-  R.PrefetchDevice(stream);
-
-  (A = random<float>({batch, m, n}, NORMAL)).run(stream);
+  (A = random<float>({batch, m, n}, NORMAL)).run(exec);
 
   // warm up
   nvtxRangePushA("Warmup");
-  (mtie(Q, R) = qr(A)).run(stream);
+  (mtie(Q, R) = qr(A)).run(exec);
 
-  cudaDeviceSynchronize();
+  exec.sync();
   nvtxRangePop();
 
   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )

diff --git a/bench/00_transform/svd_power.cu b/bench/00_transform/svd_power.cu
@@ -17,6 +17,7 @@ void svdpi_batch(nvbench::state &state,
 
   cudaStream_t stream = 0;
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+  cudaExecutor exec{stream};
 
   int batch = state.get_int64("batch");
   int m = state.get_int64("rows");
@@ -30,22 +31,17 @@ void svdpi_batch(nvbench::state &state,
 
   int iterations = 10;
 
-  (A = random<float>({batch, m, n}, NORMAL)).run(stream);
-
-  A.PrefetchDevice(stream);
-  U.PrefetchDevice(stream);
-  S.PrefetchDevice(stream);
-  VT.PrefetchDevice(stream);
-
-  (U = 0).run(stream);
-  (S = 0).run(stream);
-  (VT = 0).run(stream);
+  (A = random<float>({batch, m, n}, NORMAL)).run(exec);
+
+  (U = 0).run(exec);
+  (S = 0).run(exec);
+  (VT = 0).run(exec);
   auto x0 = random<float>({batch, r}, NORMAL);
 
   // warm up
   nvtxRangePushA("Warmup");
-  (mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(stream);
-  cudaDeviceSynchronize();
+  (mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(exec);
+  exec.sync();
   nvtxRangePop();
 
   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
@@ -70,6 +66,7 @@ void svdbpi_batch(nvbench::state &state,
 
   cudaStream_t stream = 0;
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+  cudaExecutor exec{stream};
 
   int batch = state.get_int64("batch");
   int m = state.get_int64("rows");
@@ -83,21 +80,16 @@ void svdbpi_batch(nvbench::state &state,
 
   int iterations = 10;
 
-  (A = random<float>({batch, m, n}, NORMAL)).run(stream);
-
-  A.PrefetchDevice(stream);
-  U.PrefetchDevice(stream);
-  S.PrefetchDevice(stream);
-  VT.PrefetchDevice(stream);
+  (A = random<float>({batch, m, n}, NORMAL)).run(exec);
 
-  (U = 0).run(stream);
-  (S = 0).run(stream);
-  (VT = 0).run(stream);
+  (U = 0).run(exec);
+  (S = 0).run(exec);
+  (VT = 0).run(exec);
 
   // warm up
   nvtxRangePushA("Warmup");
-  (mtie(U, S, VT) = svdbpi(A, iterations)).run(stream);
-  cudaDeviceSynchronize();
+  (mtie(U, S, VT) = svdbpi(A, iterations)).run(exec);
+  exec.sync();
   nvtxRangePop();
 
   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )

diff --git a/docs_input/api/index.rst b/docs_input/api/index.rst
@@ -17,6 +17,7 @@ API Reference
    casting/index.rst
    window/index.rst
    signalimage/index.rst
+   synchronization/index.rst
    polynomials/index.rst
    random/random.rst
    dft/index.rst

diff --git a/docs_input/api/synchronization/index.rst b/docs_input/api/synchronization/index.rst
@@ -0,0 +1,10 @@
+.. _synchronization:
+
+Synchronization
+###############
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   *
diff --git a/docs_input/api/synchronization/sync.rst b/docs_input/api/synchronization/sync.rst
@@ -0,0 +1,18 @@
+.. _sync_func:
+
+sync
+====
+
+Wait for any code running on an executor to complete.
+
+.. doxygenfunction:: matx::cudaExecutor::sync()
+.. doxygenfunction:: matx::HostExecutor::sync()
+
+Examples
+~~~~~~~~
+
+.. literalinclude:: ../../../examples/cgsolve.cu
+   :language: cpp
+   :start-after: example-begin sync-test-1
+   :end-before: example-end sync-test-1
+   :dedent:
diff --git a/docs_input/notebooks/exercises/example1_assignment1.cu b/docs_input/notebooks/exercises/example1_assignment1.cu
@@ -57,8 +57,6 @@ int main() {
   // t2 = ;
   /*** End editing ***/
 
-  t2.PrefetchDevice(0);
-
   /****************************************************************************************************
    * Get a slice of the second and third rows with all columns
    * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#slicing-and-dicing

diff --git a/docs_input/notebooks/exercises/example2_assignment1.cu b/docs_input/notebooks/exercises/example2_assignment1.cu
@@ -45,6 +45,8 @@ int main() {
   auto B = make_tensor<float>({2, 3});
   auto V = make_tensor<float>({3});
 
+  cudaExecutor exec{};
+
   /****************************************************************************************************
    * Initialize tensor A with increasing values from 0.5 to 3.0 in steps of 0.4,
    *and tensor V from -1 to -3 in steps of -1.
@@ -83,7 +85,7 @@ int main() {
 
   /*** End editing ***/
 
-  cudaStreamSynchronize(0);
+  exec.sync();
 
   step = 0.5;
   for (int row = 0; row < A.Size(0); row++) {
@@ -111,7 +113,7 @@ int main() {
   /// auto tvs = ;
   /*** End editing. ***/
 
-  // cudaStreamSynchronize(0);
+  // exec.sync();
 
   // step = 0.5;
   // for (int row = 0; row < A.Size(0); row++) {
@@ -137,7 +139,7 @@ int main() {
 
   /*** End editing ***/
 
-  cudaStreamSynchronize(0);
+  exec.sync();
 
   for (int row = 0; row < B.Size(0); row++) {
     for (int col = 0; col < B.Size(1); col++) {
-Original file line number
+Diff line change
@@ -1,3 +1,4 @@
     .vscode
+    .cache
     build*
     *.pyc