
Commit db2ae6b

Added synchronization support through .sync().
Updated all tests to call .sync() instead of the CUDA API directly, and removed Prefetch calls.
1 parent 464e4a7 commit db2ae6b
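The change at each call site follows one pattern: work is launched on an executor and synchronized through that executor, rather than through a raw CUDA runtime call. A minimal before/after sketch of the pattern (variable names are illustrative; the calls mirror the diffs below):

    // Before: the expression implicitly runs on stream 0 and is
    // synchronized through the CUDA runtime API
    (y = x + x2).run();
    cudaDeviceSynchronize();

    // After: a cudaExecutor owns the stream, and .sync() replaces
    // the raw CUDA call
    cudaExecutor exec{0};     // executor bound to CUDA stream 0
    (y = x + x2).run(exec);   // launch the expression on the executor's stream
    exec.sync();              // block until the executor's queued work completes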


66 files changed: +904, -876 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 .vscode
+.cache
 build*
 *.pyc

bench/00_operators/operators.cu

Lines changed: 12 additions & 8 deletions
@@ -10,6 +10,7 @@ template <typename ValueType>
 void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)
 {
   // Get current parameters:
+  cudaExecutor exec{0};
   const int x_len = static_cast<int>(state.get_int64("Vector size"));

   state.add_element_count(x_len, "NumElements");
@@ -19,9 +20,9 @@ void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)

   tensor_t<ValueType, 1> xv{{x_len}};
   tensor_t<ValueType, 1> xv2{{x_len}};
-  xv.PrefetchDevice(0);
-  (xv = xv + xv2).run();
-  cudaDeviceSynchronize();
+
+  (xv = xv + xv2).run(exec);
+  exec.sync();

   state.exec(
     [&xv, &xv2](nvbench::launch &launch) {
@@ -38,6 +39,7 @@ using permute_types = nvbench::type_list<float, double, cuda::std::complex<float
 template <typename ValueType>
 void permute(nvbench::state &state, nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto x = make_tensor<ValueType>({1000,200,6,300});
   auto y = make_tensor<ValueType>({300,1000,6,200});

@@ -46,7 +48,7 @@ void permute(nvbench::state &state, nvbench::type_list<ValueType>)
   state.add_global_memory_writes<ValueType>(x.TotalSize());

   x.PrefetchDevice(0);
-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&x, &y](nvbench::launch &launch) {
@@ -61,17 +63,18 @@ using random_types = nvbench::type_list<float, double, cuda::std::complex<float>
 template <typename ValueType>
 void random(nvbench::state &state, nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto x = make_tensor<ValueType>({1966800});
   auto y = make_tensor<ValueType>({1966800});
   x.PrefetchDevice(0);
   y.PrefetchDevice(0);

-  (y = random<float>(x.Shape(), NORMAL)).run();
+  (y = random<float>(x.Shape(), NORMAL)).run(exec);

   state.add_element_count(x.TotalSize(), "NumElements");
   state.add_global_memory_writes<ValueType>(x.TotalSize());

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&x, &y](nvbench::launch &launch) {
@@ -98,6 +101,7 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
   int n = 600;
   ValueType dx = M_PI/n;

+  cudaExecutor exec{};
   auto col = range<0>({n+1},ValueType(0), ValueType(dx));
   auto az = range<0>({2*n+1}, ValueType(0), ValueType(dx));

@@ -122,8 +126,8 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
   auto Y = make_tensor<ValueType>(Ym.Shape());
   auto Z = make_tensor<ValueType>(Zm.Shape());

-  cudaDeviceSynchronize();
-
+  exec.sync();
+
   state.add_element_count(n+1, "Elements");

   state.exec(

bench/00_operators/reduction.cu

Lines changed: 3 additions & 2 deletions
@@ -127,6 +127,7 @@ void reduce_4d(
   nvbench::type_list<ValueType>
 )
 {
+  cudaExecutor exec{0};
   const int size0 = static_cast<int>(state.get_int64("Size0"));
   const int size1 = static_cast<int>(state.get_int64("Size1"));
   const int size2 = static_cast<int>(state.get_int64("Size2"));
@@ -138,8 +139,8 @@
   t1.PrefetchDevice(0);
   t4.PrefetchDevice(0);

-  (t4 = random<float>(t4.Shape(), UNIFORM)).run();
-  cudaDeviceSynchronize();
+  (t4 = random<float>(t4.Shape(), UNIFORM)).run(exec);
+  exec.sync();

   state.exec([&t4, &t1](nvbench::launch &launch) {
     (t1 = matx::sum(t4, {1, 2, 3})).run((cudaStream_t)launch.get_stream()); });

bench/00_transform/conv.cu

Lines changed: 12 additions & 8 deletions
@@ -13,6 +13,7 @@ template <typename ValueType>
 void conv1d_direct_4d_batch(nvbench::state &state,
                             nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto out = make_tensor<ValueType>({4, 2, 14, 288 + 4096 + 133 - 1});
   auto at = make_tensor<ValueType>({ 4, 2, 14, 133});
   auto bt = make_tensor<ValueType>({ 4, 2, 14, 288 + 4096});
@@ -21,7 +22,7 @@ void conv1d_direct_4d_batch(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  cudaDeviceSynchronize();
+  exec.sync();
   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
@@ -35,7 +36,7 @@ template <typename ValueType>
 void conv1d_direct_2d_batch(nvbench::state &state,
                             nvbench::type_list<ValueType>)
 {
-
+  cudaExecutor exec{0};

   auto out = make_tensor<ValueType>({4 * 2* 14, 288 + 4096 + 133 - 1});
   auto at = make_tensor<ValueType>({ 4 * 2* 14, 133});
@@ -45,7 +46,7 @@ void conv1d_direct_2d_batch(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
@@ -56,6 +57,7 @@ template <typename ValueType>
 void conv1d_direct_large(nvbench::state &state,
                          nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
   auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
   auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});
@@ -64,9 +66,9 @@ void conv1d_direct_large(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  (out = conv1d(at, bt, MATX_C_MODE_FULL)).run();
+  (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(exec);

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
@@ -79,17 +81,18 @@ template <typename ValueType>
 void conv1d_fft_large(nvbench::state &state,
                       nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
   auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
   auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});

-  (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run();
+  (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(exec);

   out.PrefetchDevice(0);
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(cudaExecutor(launch.get_stream())); });
@@ -103,6 +106,7 @@ template <typename ValueType>
 void conv2d_direct_batch(nvbench::state &state,
                          nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto at = make_tensor<ValueType>({256, 1024, 1024});
   auto bt = make_tensor<ValueType>({256, 16, 16});
   auto out = make_tensor<ValueType>({256,
@@ -113,7 +117,7 @@ void conv2d_direct_batch(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv2d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });

bench/00_transform/cub.cu

Lines changed: 2 additions & 1 deletion
@@ -24,14 +24,15 @@ void sort1d(
   nvbench::type_list<ValueType>
 )
 {
+  cudaExecutor exec{0};
   const int dataSize = static_cast<int>(state.get_int64("Tensor Size"));

   auto sortedData = matx::make_tensor<ValueType>({dataSize});
   auto randomData = matx::make_tensor<ValueType>({dataSize});

   sortedData.PrefetchDevice(0);
   randomData.PrefetchDevice(0);
-  cudaDeviceSynchronize();
+  exec.sync();

   (randomData = random<float>(sortedData.Shape(), NORMAL)).run();

bench/00_transform/qr.cu

Lines changed: 4 additions & 7 deletions
@@ -17,6 +17,7 @@ void qr_batch(nvbench::state &state,

   cudaStream_t stream = 0;
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+  cudaExecutor exec{stream};

   int batch = state.get_int64("batch");
   int m = state.get_int64("rows");
@@ -26,17 +27,13 @@ void qr_batch(nvbench::state &state,
   auto Q = make_tensor<AType>({batch, m, m});
   auto R = make_tensor<AType>({batch, m, n});

-  A.PrefetchDevice(stream);
-  Q.PrefetchDevice(stream);
-  R.PrefetchDevice(stream);
-
-  (A = random<float>({batch, m, n}, NORMAL)).run(stream);
+  (A = random<float>({batch, m, n}, NORMAL)).run(exec);

   // warm up
   nvtxRangePushA("Warmup");
-  (mtie(Q, R) = qr(A)).run(stream);
+  (mtie(Q, R) = qr(A)).run(exec);

-  cudaDeviceSynchronize();
+  exec.sync();
   nvtxRangePop();

   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )

bench/00_transform/svd_power.cu

Lines changed: 15 additions & 23 deletions
@@ -17,6 +17,7 @@ void svdpi_batch(nvbench::state &state,

   cudaStream_t stream = 0;
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+  cudaExecutor exec{stream};

   int batch = state.get_int64("batch");
   int m = state.get_int64("rows");
@@ -30,22 +31,17 @@ void svdpi_batch(nvbench::state &state,

   int iterations = 10;

-  (A = random<float>({batch, m, n}, NORMAL)).run(stream);
-
-  A.PrefetchDevice(stream);
-  U.PrefetchDevice(stream);
-  S.PrefetchDevice(stream);
-  VT.PrefetchDevice(stream);
-
-  (U = 0).run(stream);
-  (S = 0).run(stream);
-  (VT = 0).run(stream);
+  (A = random<float>({batch, m, n}, NORMAL)).run(exec);
+
+  (U = 0).run(exec);
+  (S = 0).run(exec);
+  (VT = 0).run(exec);
   auto x0 = random<float>({batch, r}, NORMAL);

   // warm up
   nvtxRangePushA("Warmup");
-  (mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(stream);
-  cudaDeviceSynchronize();
+  (mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(exec);
+  exec.sync();
   nvtxRangePop();

   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
@@ -70,6 +66,7 @@ void svdbpi_batch(nvbench::state &state,

   cudaStream_t stream = 0;
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+  cudaExecutor exec{stream};

   int batch = state.get_int64("batch");
   int m = state.get_int64("rows");
@@ -83,21 +80,16 @@ void svdbpi_batch(nvbench::state &state,

   int iterations = 10;

-  (A = random<float>({batch, m, n}, NORMAL)).run(stream);
-
-  A.PrefetchDevice(stream);
-  U.PrefetchDevice(stream);
-  S.PrefetchDevice(stream);
-  VT.PrefetchDevice(stream);
+  (A = random<float>({batch, m, n}, NORMAL)).run(exec);

-  (U = 0).run(stream);
-  (S = 0).run(stream);
-  (VT = 0).run(stream);
+  (U = 0).run(exec);
+  (S = 0).run(exec);
+  (VT = 0).run(exec);

   // warm up
   nvtxRangePushA("Warmup");
-  (mtie(U, S, VT) = svdbpi(A, iterations)).run(stream);
-  cudaDeviceSynchronize();
+  (mtie(U, S, VT) = svdbpi(A, iterations)).run(exec);
+  exec.sync();
   nvtxRangePop();

   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )

docs_input/api/index.rst

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ API Reference
    casting/index.rst
    window/index.rst
    signalimage/index.rst
+   synchronization/index.rst
    polynomials/index.rst
    random/random.rst
    dft/index.rst

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+.. _synchronization:
+
+Synchronization
+###############
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   *

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+.. _sync_func:
+
+sync
+====
+
+Wait for any code running on an executor to complete.
+
+.. doxygenfunction:: matx::cudaExecutor::sync()
+.. doxygenfunction:: matx::HostExecutor::sync()
+
+Examples
+~~~~~~~~
+
+.. literalinclude:: ../../../examples/cgsolve.cu
+   :language: cpp
+   :start-after: example-begin sync-test-1
+   :end-before: example-end sync-test-1
+   :dedent:
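
As a usage reference, a typical call site after this commit looks like the sketch below (distilled from the benchmark diffs above; per the doxygen entries, HostExecutor exposes the same sync() entry point for host-side execution):

    cudaStream_t stream = 0;
    cudaExecutor exec{stream};                          // executor bound to a CUDA stream
    auto A = make_tensor<float>({16, 16});
    (A = random<float>(A.Shape(), NORMAL)).run(exec);   // asynchronous launch on the stream
    exec.sync();                                        // wait for the executor's queued work to drain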
