From db2ae6b2ffb8a39e66e885dab8df4b578c9e40fb Mon Sep 17 00:00:00 2001 From: Aayush Gupta Date: Fri, 31 May 2024 00:30:14 +0000 Subject: [PATCH] Added synchronization support through .sync(). Updated all tests to use calls to .sync() instead of the cuda api and removed Prefetch calls. --- .gitignore | 1 + bench/00_operators/operators.cu | 20 +- bench/00_operators/reduction.cu | 5 +- bench/00_transform/conv.cu | 20 +- bench/00_transform/cub.cu | 3 +- bench/00_transform/qr.cu | 11 +- bench/00_transform/svd_power.cu | 38 +- docs_input/api/index.rst | 1 + docs_input/api/synchronization/index.rst | 10 + docs_input/api/synchronization/sync.rst | 18 + .../exercises/example1_assignment1.cu | 2 - .../exercises/example2_assignment1.cu | 8 +- .../exercises/example3_assignment1.cu | 6 +- .../notebooks/exercises/example3_fft_conv.cu | 5 +- .../solutions/example1_assignment1.cu | 2 - .../solutions/example2_assignment1.cu | 14 +- .../solutions/example3_assignment1.cu | 12 +- examples/black_scholes.cu | 16 +- examples/cgsolve.cu | 19 +- examples/channelize_poly_bench.cu | 9 +- examples/convolution.cu | 15 +- examples/eigenExample.cu | 23 +- examples/fft_conv.cu | 24 +- examples/mvdr_beamformer.cu | 7 +- examples/mvdr_beamformer.h | 21 +- examples/pwelch.cu | 11 +- examples/qr.cu | 17 +- examples/recursive_filter.cu | 10 +- examples/resample.cu | 13 +- examples/resample_poly_bench.cu | 16 +- examples/simple_radar_pipeline.cu | 5 +- examples/simple_radar_pipeline.h | 47 +- examples/spectrogram.cu | 24 +- examples/spectrogram_graph.cu | 26 +- examples/spherical_harmonics.cu | 6 +- examples/svd_power.cu | 58 +- include/matx/executors/device.h | 12 +- include/matx/executors/host.h | 6 + test/00_io/FileIOTests.cu | 6 +- test/00_operators/GeneratorTests.cu | 46 +- test/00_operators/OperatorTests.cu | 531 +++++++++--------- test/00_operators/PWelch.cu | 19 +- test/00_operators/ReductionTests.cu | 180 +++--- test/00_solver/Cholesky.cu | 6 +- test/00_solver/Det.cu | 2 +- 
test/00_solver/Eigen.cu | 2 +- test/00_solver/Inverse.cu | 10 +- test/00_solver/LU.cu | 2 +- test/00_solver/QR.cu | 2 +- test/00_solver/QR2.cu | 19 +- test/00_solver/SVD.cu | 24 +- test/00_tensor/BasicTensorTests.cu | 20 +- test/00_tensor/CUBTests.cu | 18 +- test/00_tensor/EinsumTests.cu | 14 +- test/00_tensor/ViewTests.cu | 4 +- test/00_transform/ChannelizePoly.cu | 24 +- test/00_transform/ConvCorr.cu | 18 +- test/00_transform/Copy.cu | 44 +- test/00_transform/Cov.cu | 4 +- test/00_transform/FFT.cu | 90 +-- test/00_transform/MatMul.cu | 72 +-- test/00_transform/ResamplePoly.cu | 40 +- test/00_transform/Solve.cu | 2 +- test/01_radar/MVDRBeamformer.cu | 8 +- test/01_radar/MultiChannelRadarPipeline.cu | 6 - test/CMakeLists.txt | 6 +- 66 files changed, 904 insertions(+), 876 deletions(-) create mode 100644 docs_input/api/synchronization/index.rst create mode 100644 docs_input/api/synchronization/sync.rst diff --git a/.gitignore b/.gitignore index 8947ab01f..253de50b3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .vscode +.cache build* *.pyc diff --git a/bench/00_operators/operators.cu b/bench/00_operators/operators.cu index ea8c49756..8bbbf3429 100644 --- a/bench/00_operators/operators.cu +++ b/bench/00_operators/operators.cu @@ -10,6 +10,7 @@ template void vector_add(nvbench::state &state, nvbench::type_list) { // Get current parameters: + cudaExecutor exec{0}; const int x_len = static_cast(state.get_int64("Vector size")); state.add_element_count(x_len, "NumElements"); @@ -19,9 +20,9 @@ void vector_add(nvbench::state &state, nvbench::type_list) tensor_t xv{{x_len}}; tensor_t xv2{{x_len}}; - xv.PrefetchDevice(0); - (xv = xv + xv2).run(); - cudaDeviceSynchronize(); + + (xv = xv + xv2).run(exec); + exec.sync(); state.exec( [&xv, &xv2](nvbench::launch &launch) { @@ -38,6 +39,7 @@ using permute_types = nvbench::type_list void permute(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto x = make_tensor({1000,200,6,300}); auto y = 
make_tensor({300,1000,6,200}); @@ -46,7 +48,7 @@ void permute(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(x.TotalSize()); x.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&x, &y](nvbench::launch &launch) { @@ -61,17 +63,18 @@ using random_types = nvbench::type_list template void random(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto x = make_tensor({1966800}); auto y = make_tensor({1966800}); x.PrefetchDevice(0); y.PrefetchDevice(0); - (y = random(x.Shape(), NORMAL)).run(); + (y = random(x.Shape(), NORMAL)).run(exec); state.add_element_count(x.TotalSize(), "NumElements"); state.add_global_memory_writes(x.TotalSize()); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&x, &y](nvbench::launch &launch) { @@ -98,6 +101,7 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list) int n = 600; ValueType dx = M_PI/n; + cudaExecutor exec{}; auto col = range<0>({n+1},ValueType(0), ValueType(dx)); auto az = range<0>({2*n+1}, ValueType(0), ValueType(dx)); @@ -122,8 +126,8 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list) auto Y = make_tensor(Ym.Shape()); auto Z = make_tensor(Zm.Shape()); - cudaDeviceSynchronize(); - + exec.sync(); + state.add_element_count(n+1, "Elements"); state.exec( diff --git a/bench/00_operators/reduction.cu b/bench/00_operators/reduction.cu index ed9f5817f..ffd44ab6c 100644 --- a/bench/00_operators/reduction.cu +++ b/bench/00_operators/reduction.cu @@ -127,6 +127,7 @@ void reduce_4d( nvbench::type_list ) { + cudaExecutor exec{0}; const int size0 = static_cast(state.get_int64("Size0")); const int size1 = static_cast(state.get_int64("Size1")); const int size2 = static_cast(state.get_int64("Size2")); @@ -138,8 +139,8 @@ void reduce_4d( t1.PrefetchDevice(0); t4.PrefetchDevice(0); - (t4 = random(t4.Shape(), UNIFORM)).run(); - cudaDeviceSynchronize(); + (t4 = random(t4.Shape(), UNIFORM)).run(exec); + exec.sync(); state.exec([&t4, 
&t1](nvbench::launch &launch) { (t1 = matx::sum(t4, {1, 2, 3})).run((cudaStream_t)launch.get_stream()); }); diff --git a/bench/00_transform/conv.cu b/bench/00_transform/conv.cu index 5f59754db..23b307f2c 100644 --- a/bench/00_transform/conv.cu +++ b/bench/00_transform/conv.cu @@ -13,6 +13,7 @@ template void conv1d_direct_4d_batch(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto out = make_tensor({4, 2, 14, 288 + 4096 + 133 - 1}); auto at = make_tensor({ 4, 2, 14, 133}); auto bt = make_tensor({ 4, 2, 14, 288 + 4096}); @@ -21,7 +22,7 @@ void conv1d_direct_4d_batch(nvbench::state &state, at.PrefetchDevice(0); bt.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 ) state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); }); @@ -35,7 +36,7 @@ template void conv1d_direct_2d_batch(nvbench::state &state, nvbench::type_list) { - + cudaExecutor exec{0}; auto out = make_tensor({4 * 2* 14, 288 + 4096 + 133 - 1}); auto at = make_tensor({ 4 * 2* 14, 133}); @@ -45,7 +46,7 @@ void conv1d_direct_2d_batch(nvbench::state &state, at.PrefetchDevice(0); bt.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); }); @@ -56,6 +57,7 @@ template void conv1d_direct_large(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto at = make_tensor({state.get_int64("Signal Size")}); auto bt = make_tensor({state.get_int64("Filter Size")}); auto out = make_tensor({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1}); @@ -64,9 +66,9 @@ void conv1d_direct_large(nvbench::state &state, at.PrefetchDevice(0); bt.PrefetchDevice(0); - (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(); + (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(exec); - cudaDeviceSynchronize(); + 
exec.sync(); state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); }); @@ -79,17 +81,18 @@ template void conv1d_fft_large(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto at = make_tensor({state.get_int64("Signal Size")}); auto bt = make_tensor({state.get_int64("Filter Size")}); auto out = make_tensor({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1}); - (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(); + (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(exec); out.PrefetchDevice(0); at.PrefetchDevice(0); bt.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(cudaExecutor(launch.get_stream())); }); @@ -103,6 +106,7 @@ template void conv2d_direct_batch(nvbench::state &state, nvbench::type_list) { + cudaExecutor exec{0}; auto at = make_tensor({256, 1024, 1024}); auto bt = make_tensor({256, 16, 16}); auto out = make_tensor({256, @@ -113,7 +117,7 @@ void conv2d_direct_batch(nvbench::state &state, at.PrefetchDevice(0); bt.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); state.exec( [&out, &at, &bt](nvbench::launch &launch) { (out = conv2d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); }); diff --git a/bench/00_transform/cub.cu b/bench/00_transform/cub.cu index c37f0d7be..f0e9abc8b 100644 --- a/bench/00_transform/cub.cu +++ b/bench/00_transform/cub.cu @@ -24,6 +24,7 @@ void sort1d( nvbench::type_list ) { + cudaExecutor exec{0}; const int dataSize = static_cast(state.get_int64("Tensor Size")); auto sortedData = matx::make_tensor({dataSize}); @@ -31,7 +32,7 @@ void sort1d( sortedData.PrefetchDevice(0); randomData.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); (randomData = random(sortedData.Shape(), NORMAL)).run(); diff --git a/bench/00_transform/qr.cu 
b/bench/00_transform/qr.cu index ee745072c..09db69958 100644 --- a/bench/00_transform/qr.cu +++ b/bench/00_transform/qr.cu @@ -17,6 +17,7 @@ void qr_batch(nvbench::state &state, cudaStream_t stream = 0; state.set_cuda_stream(nvbench::make_cuda_stream_view(stream)); + cudaExecutor exec{stream}; int batch = state.get_int64("batch"); int m = state.get_int64("rows"); @@ -26,17 +27,13 @@ void qr_batch(nvbench::state &state, auto Q = make_tensor({batch, m, m}); auto R = make_tensor({batch, m, n}); - A.PrefetchDevice(stream); - Q.PrefetchDevice(stream); - R.PrefetchDevice(stream); - - (A = random({batch, m, n}, NORMAL)).run(stream); + (A = random({batch, m, n}, NORMAL)).run(exec); // warm up nvtxRangePushA("Warmup"); - (mtie(Q, R) = qr(A)).run(stream); + (mtie(Q, R) = qr(A)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); nvtxRangePop(); MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 ) diff --git a/bench/00_transform/svd_power.cu b/bench/00_transform/svd_power.cu index 3e165dc0e..7736a19e2 100644 --- a/bench/00_transform/svd_power.cu +++ b/bench/00_transform/svd_power.cu @@ -17,6 +17,7 @@ void svdpi_batch(nvbench::state &state, cudaStream_t stream = 0; state.set_cuda_stream(nvbench::make_cuda_stream_view(stream)); + cudaExecutor exec{stream}; int batch = state.get_int64("batch"); int m = state.get_int64("rows"); @@ -30,22 +31,17 @@ void svdpi_batch(nvbench::state &state, int iterations = 10; - (A = random({batch, m, n}, NORMAL)).run(stream); - - A.PrefetchDevice(stream); - U.PrefetchDevice(stream); - S.PrefetchDevice(stream); - VT.PrefetchDevice(stream); - - (U = 0).run(stream); - (S = 0).run(stream); - (VT = 0).run(stream); + (A = random({batch, m, n}, NORMAL)).run(exec); + + (U = 0).run(exec); + (S = 0).run(exec); + (VT = 0).run(exec); auto x0 = random({batch, r}, NORMAL); // warm up nvtxRangePushA("Warmup"); - (mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(stream); - cudaDeviceSynchronize(); + (mtie(U, S, VT) = svdpi(A, x0, iterations, 
r)).run(exec); + exec.sync(); nvtxRangePop(); MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 ) @@ -70,6 +66,7 @@ void svdbpi_batch(nvbench::state &state, cudaStream_t stream = 0; state.set_cuda_stream(nvbench::make_cuda_stream_view(stream)); + cudaExecutor exec{stream}; int batch = state.get_int64("batch"); int m = state.get_int64("rows"); @@ -83,21 +80,16 @@ void svdbpi_batch(nvbench::state &state, int iterations = 10; - (A = random({batch, m, n}, NORMAL)).run(stream); - - A.PrefetchDevice(stream); - U.PrefetchDevice(stream); - S.PrefetchDevice(stream); - VT.PrefetchDevice(stream); + (A = random({batch, m, n}, NORMAL)).run(exec); - (U = 0).run(stream); - (S = 0).run(stream); - (VT = 0).run(stream); + (U = 0).run(exec); + (S = 0).run(exec); + (VT = 0).run(exec); // warm up nvtxRangePushA("Warmup"); - (mtie(U, S, VT) = svdbpi(A, iterations)).run(stream); - cudaDeviceSynchronize(); + (mtie(U, S, VT) = svdbpi(A, iterations)).run(exec); + exec.sync(); nvtxRangePop(); MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 ) diff --git a/docs_input/api/index.rst b/docs_input/api/index.rst index 4d6764bb6..63383ef83 100644 --- a/docs_input/api/index.rst +++ b/docs_input/api/index.rst @@ -17,6 +17,7 @@ API Reference casting/index.rst window/index.rst signalimage/index.rst + synchronization/index.rst polynomials/index.rst random/random.rst dft/index.rst diff --git a/docs_input/api/synchronization/index.rst b/docs_input/api/synchronization/index.rst new file mode 100644 index 000000000..0819f6330 --- /dev/null +++ b/docs_input/api/synchronization/index.rst @@ -0,0 +1,10 @@ +.. _synchronization: + +Synchronization +############### + +.. toctree:: + :maxdepth: 1 + :glob: + + * \ No newline at end of file diff --git a/docs_input/api/synchronization/sync.rst b/docs_input/api/synchronization/sync.rst new file mode 100644 index 000000000..27358b7c1 --- /dev/null +++ b/docs_input/api/synchronization/sync.rst @@ -0,0 +1,18 @@ +.. 
_sync_func: + +sync +==== + +Wait for any code running on an executor to complete. + +.. doxygenfunction:: matx::cudaExecutor::sync() +.. doxygenfunction:: matx::HostExecutor::sync() + +Examples +~~~~~~~~ + +.. literalinclude:: ../../../examples/cgsolve.cu + :language: cpp + :start-after: example-begin sync-test-1 + :end-before: example-end sync-test-1 + :dedent: diff --git a/docs_input/notebooks/exercises/example1_assignment1.cu b/docs_input/notebooks/exercises/example1_assignment1.cu index 164d17ed3..14d923a20 100644 --- a/docs_input/notebooks/exercises/example1_assignment1.cu +++ b/docs_input/notebooks/exercises/example1_assignment1.cu @@ -57,8 +57,6 @@ int main() { // t2 = ; /*** End editing ***/ - t2.PrefetchDevice(0); - /**************************************************************************************************** * Get a slice of the second and third rows with all columns * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#slicing-and-dicing diff --git a/docs_input/notebooks/exercises/example2_assignment1.cu b/docs_input/notebooks/exercises/example2_assignment1.cu index 9c1cdd7af..69d2f3e65 100644 --- a/docs_input/notebooks/exercises/example2_assignment1.cu +++ b/docs_input/notebooks/exercises/example2_assignment1.cu @@ -45,6 +45,8 @@ int main() { auto B = make_tensor({2, 3}); auto V = make_tensor({3}); + cudaExecutor exec{}; + /**************************************************************************************************** * Initialize tensor A with increasing values from 0.5 to 3.0 in steps of 0.4, *and tensor V from -1 to -3 in steps of -1. @@ -83,7 +85,7 @@ int main() { /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); step = 0.5; for (int row = 0; row < A.Size(0); row++) { @@ -111,7 +113,7 @@ int main() { /// auto tvs = ; /*** End editing. 
***/ - // cudaStreamSynchronize(0); + // exec.sync(); // step = 0.5; // for (int row = 0; row < A.Size(0); row++) { @@ -137,7 +139,7 @@ int main() { /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); for (int row = 0; row < B.Size(0); row++) { for (int col = 0; col < B.Size(1); col++) { diff --git a/docs_input/notebooks/exercises/example3_assignment1.cu b/docs_input/notebooks/exercises/example3_assignment1.cu index 9a755b8c6..07f37ef14 100644 --- a/docs_input/notebooks/exercises/example3_assignment1.cu +++ b/docs_input/notebooks/exercises/example3_assignment1.cu @@ -45,6 +45,8 @@ int main() { auto A = make_tensor({2, 3}); auto B = make_tensor({2, 3}); + cudaExecutor exec{}; + /**************************************************************************************************** * Use the random number generator with a seed of 12345 to generate * normally-distributed numbers in the tensor A. Next, take the FFT across @@ -63,7 +65,7 @@ int main() { {{0.5646, 0.8638}, {1.6400, 0.3494}, {-0.5709, 0.5919}}}); A.print(); B.print(); - cudaStreamSynchronize(0); + exec.sync(); for (int row = 0; row < A.Size(0); row++) { for (int col = 0; col < A.Size(1); col++) { if (fabs(A(row, col).real() - B(row, col).real()) > 0.001) { @@ -101,7 +103,7 @@ int main() { /*** End editing ***/ // Verify init is correct - cudaStreamSynchronize(0); + exec.sync(); if (fabs(redv() - 1.0) > 0.001) { printf("Mismatch on final reduction. 
Expected=1.0, actual = %f\n", redv()); exit(-1); diff --git a/docs_input/notebooks/exercises/example3_fft_conv.cu b/docs_input/notebooks/exercises/example3_fft_conv.cu index b0a2f675f..d922ec820 100644 --- a/docs_input/notebooks/exercises/example3_fft_conv.cu +++ b/docs_input/notebooks/exercises/example3_fft_conv.cu @@ -37,7 +37,8 @@ using namespace matx; int main() { using complex = cuda::std::complex; - + cudaExecutor exec{}; + index_t signal_size = 16; index_t filter_size = 3; index_t filtered_size = signal_size + filter_size - 1; @@ -73,7 +74,7 @@ int main() { // TODO: Perform a time-domain convolution - cudaStreamSynchronize(0); + exec.sync(); // Compare signals for (index_t i = 0; i < filtered_size; i++) { diff --git a/docs_input/notebooks/exercises/solutions/example1_assignment1.cu b/docs_input/notebooks/exercises/solutions/example1_assignment1.cu index f18b5a80f..0331b965a 100644 --- a/docs_input/notebooks/exercises/solutions/example1_assignment1.cu +++ b/docs_input/notebooks/exercises/solutions/example1_assignment1.cu @@ -49,8 +49,6 @@ int main() { {17, 18, 19, 20}}); /*** End editing ***/ - t2.PrefetchDevice(0); - int count = 1; for (int row = 0; row < t2.Size(0); row++) { for (int col = 0; col < t2.Size(1); col++) { diff --git a/docs_input/notebooks/exercises/solutions/example2_assignment1.cu b/docs_input/notebooks/exercises/solutions/example2_assignment1.cu index 9781389d8..4e0560336 100644 --- a/docs_input/notebooks/exercises/solutions/example2_assignment1.cu +++ b/docs_input/notebooks/exercises/solutions/example2_assignment1.cu @@ -41,6 +41,8 @@ int main() { tensor_t B(shape); tensor_t V({3}); + cudaExecutor exec{}; + /**************************************************************************************************** * Initialize tensor A with increasing values from 0.5 to 3.0 in steps of 0.5, *and tensor V from -1 to -3 in steps of -1. 
@@ -78,10 +80,10 @@ int main() { /**************************************************************************************************** * Add 5.0 to all elements of A and store the results back in A ****************************************************************************************************/ - (A = A + 5.0).run(); + (A = A + 5.0).run(exec); /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); step = 0.5; for (int row = 0; row < A.Size(0); row++) { @@ -107,10 +109,10 @@ int main() { * ****************************************************************************************************/ auto tvs = V.Clone<2>({A.Size(0), matxKeepDim}); - (A = A - tvs).run(); + (A = A - tvs).run(exec); /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); step = 0.5; for (int row = 0; row < A.Size(0); row++) { @@ -134,10 +136,10 @@ int main() { * * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/tensorops.html#_CPPv4N4matx3powE2Op2Op ****************************************************************************************************/ - (B = (pow(A, 2) * 2) - V).run(); + (B = (pow(A, 2) * 2) - V).run(exec); /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); for (int row = 0; row < B.Size(0); row++) { for (int col = 0; col < B.Size(1); col++) { diff --git a/docs_input/notebooks/exercises/solutions/example3_assignment1.cu b/docs_input/notebooks/exercises/solutions/example3_assignment1.cu index 0828788e8..5aff55999 100644 --- a/docs_input/notebooks/exercises/solutions/example3_assignment1.cu +++ b/docs_input/notebooks/exercises/solutions/example3_assignment1.cu @@ -41,6 +41,8 @@ int main() { tensor_t A(shape); tensor_t B(shape); + cudaExecutor exec{}; + /**************************************************************************************************** * Use the random number generator with a seed of 12345 to generate * normally-distributed numbers in the tensor A. 
Next, take the FFT across @@ -51,7 +53,7 @@ int main() { * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#random-numbers * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/random.html ****************************************************************************************************/ - (A = random(A.Shape(), NORMAL, 12345)).run(); + (A = random(A.Shape(), NORMAL, 12345)).run(exec); auto At = A.Permute({1, 0}); fft(At, At); @@ -62,7 +64,7 @@ int main() { {{0.5646, 0.8638}, {1.6400, 0.3494}, {-0.5709, 0.5919}}}); print(A); print(B); - cudaStreamSynchronize(0); + exec.sync(); for (int row = 0; row < A.Size(0); row++) { for (int col = 0; col < A.Size(1); col++) { if (fabs(A(row, col).real() - B(row, col).real()) > 0.001) { @@ -95,15 +97,15 @@ int main() { // Create and initialize 3D tensor auto dv = make_tensor({10, 5, 15}); - (dv = random(dv.Shape(), NORMAL)).run(); + (dv = random(dv.Shape(), NORMAL)).run(exec); tensor_t redv; max(redv, dv, 0); - (dv = dv / redv).run(); + (dv = dv / redv).run(exec); max(redv, dv, 0); /*** End editing ***/ - cudaStreamSynchronize(0); + exec.sync(); // Verify init is correct if (fabs(redv() - 1.0) > 0.001) { printf("Mismatch on final reduction. 
Expected=1.0, actual = %f\n", redv()); diff --git a/examples/black_scholes.cu b/examples/black_scholes.cu index c880a259b..1938cab28 100644 --- a/examples/black_scholes.cu +++ b/examples/black_scholes.cu @@ -93,7 +93,7 @@ void compute_black_scholes_matx(tensor_t& K, tensor_t& r, tensor_t& T, tensor_t& output, - cudaStream_t& stream) + cudaExecutor& exec) { auto VsqrtT = V * sqrt(T); auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ; @@ -102,7 +102,7 @@ void compute_black_scholes_matx(tensor_t& K, auto cdf_d2 = normcdf(d2); auto expRT = exp(-1 * r * T); - (output = S * cdf_d1 - K * expRT * cdf_d2).run(stream); + (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec); } int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) @@ -124,9 +124,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; - // Prefetch all data - compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, stream); + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -135,10 +135,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventRecord(start, stream); // Time non-operator version for (uint32_t i = 0; i < num_iterations; i++) { - compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, stream); + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Time without custom operator = %.2fms per iteration\n", @@ -147,10 +147,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventRecord(start, stream); // Time non-operator version for (uint32_t i = 0; i < num_iterations; i++) { - 
BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(stream); + BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Time with custom operator = %.2fms per iteration\n", diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu index 015bfd031..9027d634d 100644 --- a/examples/cgsolve.cu +++ b/examples/cgsolve.cu @@ -54,6 +54,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto norm = make_tensor({BATCH}); auto maxn = make_tensor({}); + cudaExecutor exec{}; // Simple Poisson matrix for(int b = 0; b < BATCH; b++) { @@ -72,19 +73,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } } - A.PrefetchDevice(0); - B.PrefetchDevice(0); - X.PrefetchDevice(0); - (X = TypeParam(1)).run(); + (X = TypeParam(1)).run(exec); - (X = cgsolve(A, B, .0001, max_iters)).run(); + (X = cgsolve(A, B, .0001, max_iters)).run(exec); + // example-begin sync-test-1 + (Bout = matvec(A, X)).run(exec); + (norm = sum((Bout-B)*(Bout-B))).run(exec); + (maxn = matx::max(sqrt(norm))).run(exec); - (Bout = matvec(A, X)).run(); - (norm = sum((Bout-B)*(Bout-B))).run(); - (maxn = matx::max(sqrt(norm))).run(); - - cudaDeviceSynchronize(); + exec.sync(); + // example-end sync-test-1 printf ("max l2 norm: %f\n", (float)sqrt(maxn())); CUDA_CHECK_LAST_ERROR(); diff --git a/examples/channelize_poly_bench.cu b/examples/channelize_poly_bench.cu index 3665c9223..ac38367a4 100644 --- a/examples/channelize_poly_bench.cu +++ b/examples/channelize_poly_bench.cu @@ -73,6 +73,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop cudaEventCreate(&start); cudaEventCreate(&stop); + cudaExecutor exec{}; for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { for (matx::index_t num_channels = channel_start; num_channels <= channel_stop; 
num_channels++) { @@ -88,18 +89,18 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop const matx::index_t decimation_factor = num_channels; for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) { - (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(stream); + (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec); } - cudaStreamSynchronize(stream); + exec.sync(); float elapsed_ms = 0.0f; cudaEventRecord(start, stream); for (int k = 0; k < NUM_ITERATIONS; k++) { - (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(stream); + (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); CUDA_CHECK_LAST_ERROR(); cudaEventElapsedTime(&elapsed_ms, start, stop); diff --git a/examples/convolution.cu b/examples/convolution.cu index 8d328d3d4..f45b625ad 100644 --- a/examples/convolution.cu +++ b/examples/convolution.cu @@ -53,6 +53,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; cudaEvent_t start, stop; @@ -85,20 +86,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) filterView(f) = filterView(f-1) * 0.99f; } - inView.PrefetchDevice(stream); - filterView.PrefetchDevice(stream); - // Measure recursive runtime - cudaStreamSynchronize(stream); + exec.sync(); cudaEventRecord(start, stream); for (uint32_t i = 0; i < iterations; i++) { - (outView = conv1d(inView, filterView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(stream); + (outView = conv1d(inView, filterView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); time_ms /= static_cast(iterations); @@ -120,9 +118,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] 
char **argv) // {(uint32_t) 10e3 + filter_dim_2d - 1, (uint32_t) 10e3 + filter_dim_2d - // 1}); // auto out2DView = out2DData.View(); - // filter2DData.PrefetchDevice(stream); - // in2DData.PrefetchDevice(stream); - // out2DData.PrefetchDevice(stream); // Measure recursive runtime // cudaEventRecord(start, stream); @@ -136,7 +131,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // } // cudaEventRecord(stop, stream); - // cudaStreamSynchronize(stream); + // exec.sync(); // cudaEventElapsedTime(&time_ms, start, stop); // time_ms /= static_cast(iterations); diff --git a/examples/eigenExample.cu b/examples/eigenExample.cu index 6e53cd701..d204c4be7 100644 --- a/examples/eigenExample.cu +++ b/examples/eigenExample.cu @@ -47,6 +47,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) int dimX = 3; int dimY = 3; + matx::cudaExecutor exec{}; /////////////////////////////////////////////////////////////////////////////// ////////////// Eigen Test Data Setup ////////////// @@ -91,10 +92,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #else std::cout <<"!!!!!!!!! Eigen NOT USED in Test !!!!!!!!!" 
<< std ::endl; // provide data in tensors if eigen is not used - (aTensor = matx::random({dimX, dimY}, matx::UNIFORM)).run(); - (bTensor = matx::random({dimX, dimY}, matx::UNIFORM)).run(); - (complexTensor = matx::random>({2, 2}, matx::UNIFORM)).run(); - (matTensor10x10 = matx::random({10, 10}, matx::UNIFORM)).run(); + (aTensor = matx::random({dimX, dimY}, matx::UNIFORM)).run(exec); + (bTensor = matx::random({dimX, dimY}, matx::UNIFORM)).run(exec); + (complexTensor = matx::random>({2, 2}, matx::UNIFORM)).run(exec); + (matTensor10x10 = matx::random({10, 10}, matx::UNIFORM)).run(exec); #endif @@ -110,16 +111,16 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaMemcpy(complexTensor.Data(), complexMatrix.data(), sizeof(std::complex)*2*2, cudaMemcpyHostToDevice); cudaMemcpy(matTensor10x10.Data(), matrix10x10.data(), sizeof(float)*10*10, cudaMemcpyHostToDevice); - (aTensor = matx::transpose(aTensor)).run(); - // (bTensor = matx::transpose(bTensor)).run(); // do not need to transpose because b has the same layout - (complexTensor = matx::transpose(complexTensor)).run(); - (matTensor10x10 = matx::transpose(matTensor10x10)).run(); + (aTensor = matx::transpose(aTensor)).run(exec); + // (bTensor = matx::transpose(bTensor)).run(exec); // do not need to transpose because b has the same layout + (complexTensor = matx::transpose(complexTensor)).run(exec); + (matTensor10x10 = matx::transpose(matTensor10x10)).run(exec); #endif tensor1D(0) = 1; tensor1D(1) = 2; tensor1D(2) = 3; - cudaDeviceSynchronize(); + exec.sync(); // slower alternative of copying per-element // for(int curX=0; curX; + cudaExecutor exec{}; index_t signal_size = 1ULL << 16; index_t filter_size = 16; @@ -113,39 +114,34 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - // Prefetch the data we just created - sig_time.PrefetchDevice(0); - filt_time.PrefetchDevice(0); - - // Perform the FFT in-place on both signal and filter for (int i = 0; i < iterations; i++) { if (i 
== 1) { cudaEventRecord(start, stream); } - (sig_freq = fft(sig_time, filtered_size)).run(stream); - (filt_freq = fft(filt_time, filtered_size)).run(stream); + (sig_freq = fft(sig_time, filtered_size)).run(exec); + (filt_freq = fft(filt_time, filtered_size)).run(exec); - (sig_freq = sig_freq * filt_freq).run(stream); + (sig_freq = sig_freq * filt_freq).run(exec); // IFFT in-place - (sig_freq = ifft(sig_freq)).run(stream); + (sig_freq = ifft(sig_freq)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&separate_ms, start, stop); for (int i = 0; i < iterations; i++) { if (i == 1) { cudaEventRecord(start, stream); } - (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(stream); + (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&fused_ms, start, stop); printf("FFT runtimes for separate = %.2f ms, fused = %.2f ms\n", separate_ms/(iterations-1), fused_ms/(iterations-1)); @@ -154,9 +150,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // a direct convolution. The conv1d function only accepts a 1D filter, so we // create a sliced view here. 
auto filt1 = filt_time.Slice<1>({0,0}, {matxDropDim, matxEnd}); - (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); // Compare signals for (index_t b = 0; b < batches; b++) { diff --git a/examples/mvdr_beamformer.cu b/examples/mvdr_beamformer.cu index 5096c7236..07fc97ebf 100644 --- a/examples/mvdr_beamformer.cu +++ b/examples/mvdr_beamformer.cu @@ -53,6 +53,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -76,16 +77,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - mvdr.Prefetch(stream); - cudaEventRecord(start, stream); for (uint32_t i = 0; i < num_iterations; i++) { - mvdr.Run(stream); + mvdr.Run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("MVDR Kernel Time = %.2fms per iteration\n", time_ms / num_iterations); diff --git a/examples/mvdr_beamformer.h b/examples/mvdr_beamformer.h index 5fe78ee85..63fef3d0d 100644 --- a/examples/mvdr_beamformer.h +++ b/examples/mvdr_beamformer.h @@ -99,32 +99,33 @@ class MVDRBeamformer { /** * Run the entire beamformer * - * @param stream CUDA stream + * @param exec CUDA executor */ - void Run(cudaStream_t stream) + void Run(cudaExecutor exec) { - (vhView = hermitianT(vView)).run(stream); + cudaStream_t stream = exec.getStream(); + (vhView = hermitianT(vView)).run(exec); - (cbfView = matmul(vhView, inVecView)).run(stream); + (cbfView = matmul(vhView, inVecView)).run(exec); matx::copy(ivsView, inVecView.Slice({0, 0}, {matxEnd, snap_len_}), stream); - (ivshView = hermitianT(ivsView)).run(stream); + (ivshView = hermitianT(ivsView)).run(exec); - (covMatView = matmul(ivsView, ivshView)).run(stream); + 
(covMatView = matmul(ivsView, ivshView)).run(exec); (covMatView = (covMatView * (1.0f / static_cast(snap_len_))) + eye() * load_coeff_) - .run(stream); + .run(exec); inv_impl(invCovMatView, covMatView, stream); // Find A and B to solve xA=B. Matlab uses A/B to solve for x, which is the // same as x = BA^-1 - (abfBView = matmul(invCovMatView, vView)).run(stream); - (abfAView = matmul(vhView, abfBView)).run(stream); + (abfBView = matmul(invCovMatView, vView)).run(exec); + (abfAView = matmul(vhView, abfBView)).run(exec); inv_impl(abfAInvView, abfAView, stream); - (abfWeightsView = matmul(abfBView, abfAInvView)).run(stream); + (abfWeightsView = matmul(abfBView, abfAInvView)).run(exec); } /** diff --git a/examples/pwelch.cu b/examples/pwelch.cu index 84cfb0bc0..ea2679180 100644 --- a/examples/pwelch.cu +++ b/examples/pwelch.cu @@ -62,20 +62,21 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); + cudaExecutor exec{stream}; // Create input signal as a complex exponential auto sample_index_range = range<0>({signal_size},0.f,1.f); auto phase = 2.f * static_cast(M_PI) * ftone * sample_index_range / static_cast(nfft); auto tmp_x = expj(phase); auto x = make_tensor({signal_size}); - (x = tmp_x).run(stream); // pre-compute x, tmp_x is otherwise lazily evaluated + (x = tmp_x).run(exec); // pre-compute x, tmp_x is otherwise lazily evaluated // Create output tensor auto Pxx = make_tensor({nfft}); // Run one time to pre-cache the FFT plan - (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(stream); - cudaStreamSynchronize(stream); + (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec); + exec.sync(); // Start the timing cudaEventRecord(start, stream); @@ -85,11 +86,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (int iteration = 0; iteration < num_iterations; iteration++) { // Use the PWelch operator - (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(stream); + (Pxx = 
pwelch(x, nperseg, noverlap, nfft)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&exec_time_ms, start, stop); printf("Output Pxx:\n"); diff --git a/examples/qr.cu b/examples/qr.cu index 0e53ef950..2953dabcf 100644 --- a/examples/qr.cu +++ b/examples/qr.cu @@ -45,6 +45,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) using AType = cuda::std::complex; cudaStream_t stream = 0; + cudaExecutor exec{stream}; int batch = 1; int m = 4; @@ -56,10 +57,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto Q = make_tensor({batch, m, m}); auto R = make_tensor({batch, m, n}); - (A = random(A.Shape(), NORMAL)).run(stream); + (A = random(A.Shape(), NORMAL)).run(exec); #if 0 - cudaDeviceSynchronize(); + exec.sync(); A(0,0,0) = 10000; A(0,0,1) = 10001; A(0,1,0) = 10001; A(0,1,1) = 10002; A(0,2,0) = 10002; A(0,2,1) = 10003; @@ -67,15 +68,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) A(0,4,0) = 10004; A(0,4,1) = 10005; #endif - A.PrefetchDevice(stream); - Q.PrefetchDevice(stream); - R.PrefetchDevice(stream); + (mtie(Q, R) = qr(A)).run(exec); - (mtie(Q, R) = qr(A)).run(stream); - - (QR = matmul(Q, R)).run(stream); - (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(stream); - cudaDeviceSynchronize(); + (QR = matmul(Q, R)).run(exec); + (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(exec); + exec.sync(); printf("Q:\n"); print(Q); printf("R:\n"); print(R); diff --git a/examples/recursive_filter.cu b/examples/recursive_filter.cu index 49ff039e2..70eee34a7 100644 --- a/examples/recursive_filter.cu +++ b/examples/recursive_filter.cu @@ -63,6 +63,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; cudaEvent_t start, stop; @@ -109,23 +110,20 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } } - inView.PrefetchDevice(stream); - 
outView.PrefetchDevice(stream); - // Measure recursive runtime - cudaStreamSynchronize(stream); + exec.sync(); cudaEventRecord(start, stream); for (uint32_t i = 0; i < iterations; i++) { // example-begin filter-example-1 // Perform an IIR filter on "inView" with rCoeffs and nrCoeffs recursive/non-recursive // coefficients, respectively - (outView = filter(inView, rCoeffs, nrCoeffs)).run(stream); + (outView = filter(inView, rCoeffs, nrCoeffs)).run(exec); // example-end filter-example-1 } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); time_ms /= static_cast(iterations); diff --git a/examples/resample.cu b/examples/resample.cu index fb814aa58..7927ea34f 100644 --- a/examples/resample.cu +++ b/examples/resample.cu @@ -53,6 +53,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -63,31 +64,31 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) tensor_t sigViewComplex({num_samp / 2 + 1}); tensor_t resampView({num_samp_resamp}); - (sigView = random({num_samp}, NORMAL)).run(stream); + (sigView = random({num_samp}, NORMAL)).run(exec); - (sigViewComplex = fft(sigView)).run(stream); + (sigViewComplex = fft(sigView)).run(exec); // Slice auto sliceView = sigViewComplex.Slice({0}, {nyq}); // Inverse Transform - FFT size based on output - (resampView = ifft(sliceView)).run(stream); + (resampView = ifft(sliceView)).run(exec); cudaEventRecord(start, stream); for (uint32_t i = 0; i < num_iterations; i++) { // Launch 1D FFT - (sigViewComplex = fft(sigView)).run(stream); + (sigViewComplex = fft(sigView)).run(exec); // Slice auto sv = sigViewComplex.Slice({0}, {nyq}); // Inverse Transform - FFT size based on output - (resampView = ifft(sv)).run(stream); + (resampView = ifft(sv)).run(exec); } cudaEventRecord(stop, stream); - 
cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Resample Kernel Time = %.2fms per iteration\n", diff --git a/examples/resample_poly_bench.cu b/examples/resample_poly_bench.cu index 124bc412b..6debd4311 100644 --- a/examples/resample_poly_bench.cu +++ b/examples/resample_poly_bench.cu @@ -123,6 +123,8 @@ void ResamplePolyBench() cudaEventCreate(&start); cudaEventCreate(&stop); + cudaExecutor exec{stream}; + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { const matx::index_t num_batches = test_cases[i].num_batches; const matx::index_t input_len = test_cases[i].input_len; @@ -139,24 +141,24 @@ void ResamplePolyBench() auto filter = matx::make_tensor({filter_len}, MATX_DEVICE_MEMORY); auto output = matx::make_tensor({num_batches, output_len}, MATX_DEVICE_MEMORY); - (input = static_cast(1.0)).run(stream); - (filter = static_cast(1.0)).run(stream); + (input = static_cast(1.0)).run(exec); + (filter = static_cast(1.0)).run(exec); - cudaStreamSynchronize(stream); + exec.sync(); for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) { - (output = matx::resample_poly(input, filter, up, down)).run(stream); + (output = matx::resample_poly(input, filter, up, down)).run(exec); } - cudaStreamSynchronize(stream); + exec.sync(); float elapsed_ms = 0.0f; cudaEventRecord(start, stream); for (int k = 0; k < NUM_ITERATIONS; k++) { - (output = matx::resample_poly(input, filter, up, down)).run(stream); + (output = matx::resample_poly(input, filter, up, down)).run(exec); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); CUDA_CHECK_LAST_ERROR(); cudaEventElapsedTime(&elapsed_ms, start, stop); diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index ae965b6c6..a89cb1b1f 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -71,10 +71,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) 
MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1) printf("Initializing data structures for stream %d...\n", s); pipelines[s] = new RadarPipeline(numPulses, numSamples, waveformLength, numChannels, streams[s]); - pipelines[s]->GetInputView().PrefetchDevice(streams[s]); MATX_NVTX_END_RANGE(1) - cudaStreamSynchronize(streams[s]); + pipelines[s]->sync(); } MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) @@ -129,7 +128,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (int s = 0; s < num_streams; s++) { cudaEventRecord(stops[s], streams[s]); - cudaStreamSynchronize(streams[s]); + pipelines[s]->sync(); } MATX_NVTX_END_RANGE(2) diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index e94e5be21..de08ffd5c 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -135,7 +135,7 @@ class RadarPipeline { RadarPipeline(const index_t _numPulses, const index_t _numSamples, index_t _wfLen, index_t _numChannels, cudaStream_t _stream) : numPulses(_numPulses), numSamples(_numSamples), waveformLength(_wfLen), - numChannels(_numChannels), stream(_stream) + numChannels(_numChannels), stream(_stream), exec(_stream) { numSamplesRnd = 1; while (numSamplesRnd < numSamples) { @@ -202,7 +202,7 @@ class RadarPipeline { // Pre-process CFAR convolution (normT = conv2d(ones({numChannels, numPulsesRnd, numCompressedSamples}), - cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(stream); + cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); cancelMask.PrefetchDevice(stream); ba.PrefetchDevice(stream); @@ -216,6 +216,14 @@ class RadarPipeline { xPow.PrefetchDevice(stream); } + /** + * @brief Sync the pipeline using the underlying executor + * + */ + void sync() { + exec.sync(); + } + /** * @brief Stage 1 - Pulse compression - convolution via FFTs * @@ -242,19 +250,19 @@ class RadarPipeline { // Apply a Hamming window to the 
waveform to suppress sidelobes. Other // windows could be used as well (e.g., Taylor windows). Ultimately, it is // just an element-wise weighting by a pre-computed window function. - (waveformPart = waveformPart * hamming<0>({waveformLength})).run(stream); + (waveformPart = waveformPart * hamming<0>({waveformLength})).run(exec); // compute L2 norm - (norms = sum(norm(waveformPart))).run(stream); - (norms = sqrt(norms)).run(stream); + (norms = sum(norm(waveformPart))).run(exec); + (norms = sqrt(norms)).run(exec); - (waveformPart = waveformPart / norms).run(stream); - (waveformFull = fft(waveformPart, numSamplesRnd)).run(stream); - (waveformFull = conj(waveformFull)).run(stream); + (waveformPart = waveformPart / norms).run(exec); + (waveformFull = fft(waveformPart, numSamplesRnd)).run(exec); + (waveformFull = conj(waveformFull)).run(exec); - (x = fft(x)).run(stream); - (x = x * waveformT).run(stream); - (x = ifft(x)).run(stream); + (x = fft(x)).run(exec); + (x = x * waveformT).run(exec); + (x = ifft(x)).run(exec); } @@ -281,7 +289,7 @@ class RadarPipeline { {0, 0, 0}, {numChannels, numCompressedSamples, numPulses}); auto xo = tpcView.Permute({0, 2, 1}).Slice( {0, 0, 0}, {numChannels, numCompressedSamples, numPulses}); - (xo = conv1d(x, cancelMask, matxConvCorrMode_t::MATX_C_MODE_SAME)).run(stream); + (xo = conv1d(x, cancelMask, matxConvCorrMode_t::MATX_C_MODE_SAME)).run(exec); } /** @@ -309,8 +317,8 @@ class RadarPipeline { (xc = xc * hamming<1>({numChannels, numPulses - (cancelMask.Size(0) - 1), numCompressedSamples})) - .run(stream); - (xf = fft(xf)).run(stream); + .run(exec); + (xf = fft(xf)).run(exec); } /** @@ -350,11 +358,11 @@ class RadarPipeline { */ void CFARDetections() { - (xPow = norm(tpcView)).run(stream); + (xPow = norm(tpcView)).run(exec); // Estimate the background average power in each cell // background_averages = conv2(Xpow, mask, 'same') ./ norm; - (ba = conv2d(xPow, cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(stream); + (ba = 
conv2d(xPow, cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); // Computing number of cells contributing to each cell. // This can be done with a convolution of the cfarMask with @@ -367,7 +375,7 @@ class RadarPipeline { auto baTrim = ba.Slice({0, cfarMaskY / 2, cfarMaskX / 2}, {numChannels, numPulsesRnd + cfarMaskY / 2, numCompressedSamples + cfarMaskX / 2}); - (baTrim = baTrim / normTrim).run(stream); + (baTrim = baTrim / normTrim).run(exec); // The scalar alpha is used as a multiplier on the background averages // to achieve a constant false alarm rate (under certain assumptions); @@ -384,9 +392,9 @@ class RadarPipeline { // efficient as it can avoid repeated loads. #if 0 IFELSE(xPow > normTrim*(pow(pfa, -1.0f/normTrim) - 1.0f)*baTrim, - dets = 1, dets = 0).run(stream); + dets = 1, dets = 0).run(exec); #else - calcDets(dets, xPow, baTrim, normTrim, pfa).run(stream); + calcDets(dets, xPow, baTrim, normTrim, pfa).run(exec); #endif } @@ -457,4 +465,5 @@ class RadarPipeline { tensor_t cfarMaskView; cudaStream_t stream; + cudaExecutor exec; }; diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index ad6b24c4f..b781690a9 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -65,6 +65,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventCreate(&start); cudaEventCreate(&stop); + cudaExecutor exec{stream}; + float fs = 10000; constexpr index_t N = 100000; float amp = static_cast(2 * sqrt(2)); @@ -92,17 +94,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Set up all static buffers // time = np.arange(N) / float(fs) (time = linspace<0>(num_samps, 0.0f, static_cast(N) - 1.0f) / fs) - .run(stream); + .run(exec); // mod = 500 * np.cos(2*np.pi*0.25*time) - (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(stream); + (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(exec); // carrier = amp * np.sin(2*np.pi*3e3*time + modulation) - (carrier = amp * sin(2 * M_PI * 3000 * time + 
modulation)).run(stream); + (carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(exec); // noise = 0.01 * fs / 2 * np.random.randn(time.shape) - (noise = sqrt(0.01 * fs / 2) * random({N}, NORMAL)).run(stream); + (noise = sqrt(0.01 * fs / 2) * random({N}, NORMAL)).run(exec); // noise *= np.exp(-time/5) - (noise = noise * exp(-1.0f * time / 5.0f)).run(stream); + (noise = noise * exp(-1.0f * time / 5.0f)).run(exec); // x = carrier + noise - (x = carrier + noise).run(stream); + (x = carrier + noise).run(exec); for (uint32_t i = 0; i < num_iterations; i++) { if (i == 2) { // Start timer on third loop to allow generation of plot @@ -112,15 +114,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // DFT Sample Frequencies (rfftfreq) (freqs = (1.0 / (static_cast(nfft) * 1 / fs)) * linspace<0>(half_win, 0.0f, static_cast(nfft) / 2.0f)) - .run(stream); + .run(exec); // Create overlapping matrix of segments. auto stackedMatrix = overlap(x, {nperseg}, {nstep}); // FFT along rows - (fftStackedMatrix = fft(stackedMatrix)).run(stream); + (fftStackedMatrix = fft(stackedMatrix)).run(exec); // Absolute value (fftStackedMatrix = conj(fftStackedMatrix) * fftStackedMatrix) - .run(stream); + .run(exec); // Get real part and transpose auto Sxx = fftStackedMatrix.RealView().Permute({1, 0}); @@ -128,7 +130,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (s_time = linspace<0>(s_time_shape, static_cast(nperseg) / 2.0f, static_cast(N - nperseg) / 2.0f + 1) / fs) - .run(stream); + .run(exec); if (i == 1) { #if MATX_ENABLE_VIZ @@ -142,7 +144,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time Without Graphs = %.2fus per iteration\n", diff --git a/examples/spectrogram_graph.cu b/examples/spectrogram_graph.cu index 9271c3fc0..c32534cca 100644 --- a/examples/spectrogram_graph.cu 
+++ b/examples/spectrogram_graph.cu @@ -62,6 +62,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream; cudaStreamCreate(&stream); + cudaExecutor exec{stream}; + cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); @@ -93,17 +95,17 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Set up all static buffers // time = np.arange(N) / float(fs) (time = linspace<0>(num_samps, 0.0f, static_cast(N) - 1.0f) / fs) - .run(stream); + .run(exec); // mod = 500 * np.cos(2*np.pi*0.25*time) - (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(stream); + (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(exec); // carrier = amp * np.sin(2*np.pi*3e3*time + modulation) - (carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(stream); + (carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(exec); // noise = 0.01 * fs / 2 * np.random.randn(time.shape) - (noise = sqrt(0.01 * fs / 2) * random({N}, NORMAL)).run(stream); + (noise = sqrt(0.01 * fs / 2) * random({N}, NORMAL)).run(exec); // noise *= np.exp(-time/5) - (noise = noise * exp(-1.0f * time / 5.0f)).run(stream); + (noise = noise * exp(-1.0f * time / 5.0f)).run(exec); // x = carrier + noise - (x = carrier + noise).run(stream); + (x = carrier + noise).run(exec); for (uint32_t i = 0; i < 2; i++) { // Record graph on second loop to get rid of plan caching in the graph @@ -114,15 +116,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // DFT Sample Frequencies (rfftfreq) (freqs = (1.0 / (static_cast(nfft) * 1 / fs)) * linspace<0>(half_win, 0.0f, static_cast(nfft) / 2.0f)) - .run(stream); + .run(exec); // Create overlapping matrix of segments. 
auto stackedMatrix = overlap(x, {nperseg}, {nstep}); // FFT along rows - (fftStackedMatrix = fft(stackedMatrix)).run(stream); + (fftStackedMatrix = fft(stackedMatrix)).run(exec); // Absolute value (fftStackedMatrix = conj(fftStackedMatrix) * fftStackedMatrix) - .run(stream); + .run(exec); // Get real part and transpose [[maybe_unused]] auto Sxx = fftStackedMatrix.RealView().Permute({1, 0}); @@ -130,7 +132,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (s_time = linspace<0>(s_time_shape, static_cast(nperseg) / 2.0f, static_cast(N - nperseg) / 2.0f + 1) / fs) - .run(stream); + .run(exec); if (i == 1) { cudaStreamEndCapture(stream, &graph); @@ -146,14 +148,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } - cudaStreamSynchronize(0); + exec.sync(); // Time graph execution of same kernels cudaEventRecord(start, stream); for (uint32_t i = 0; i < 10; i++) { cudaGraphLaunch(instance, stream); } cudaEventRecord(stop, stream); - cudaStreamSynchronize(stream); + exec.sync(); cudaEventElapsedTime(&time_ms, start, stop); printf("Spectrogram Time With Graphs = %.2fus per iteration\n", diff --git a/examples/spherical_harmonics.cu b/examples/spherical_harmonics.cu index e74fae638..35d650f5f 100644 --- a/examples/spherical_harmonics.cu +++ b/examples/spherical_harmonics.cu @@ -49,6 +49,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); + cudaExecutor exec{}; + using ValueType = double; int l = 3; @@ -76,9 +78,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto Y = make_tensor(Ym.Shape()); auto Z = make_tensor(Zm.Shape()); - (X = Xm, Y = Ym, Z=Zm).run(); + (X = Xm, Y = Ym, Z=Zm).run(exec); - cudaDeviceSynchronize(); + exec.sync(); #if MATX_ENABLE_VIZ matx::viz::surf(X, Y, Z, "test-viz.html"); diff --git a/examples/svd_power.cu b/examples/svd_power.cu index 6c86167fe..840688466 100644 --- a/examples/svd_power.cu +++ b/examples/svd_power.cu @@ -47,6 +47,7 @@ int 
main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) using SType = float; cudaStream_t stream = 0; + cudaExecutor exec{stream}; int m = 5; int n = 4; @@ -70,7 +71,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto VTV = make_tensor({batch, k, k}); auto x0 = random({batch, d}, NORMAL); - (A = random({batch, m, n}, NORMAL)).run(stream); + (A = random({batch, m, n}, NORMAL)).run(exec); #else auto A = make_tensor({m, n}); @@ -87,7 +88,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto VTV = make_tensor({k, k}); auto x0 = random({d}, NORMAL); - (A = random({m, n}, NORMAL)).run(stream); + (A = random({m, n}, NORMAL)).run(exec); #endif std::array Dshape; @@ -103,18 +104,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) printf("iterations: %d\n", iterations); - A.PrefetchDevice(stream); - U.PrefetchDevice(stream); - S.PrefetchDevice(stream); - VT.PrefetchDevice(stream); + (U = 0).run(exec); + (S = 0).run(exec); + (VT = 0).run(exec); - (U = 0).run(stream); - (S = 0).run(stream); - (VT = 0).run(stream); + (mtie(U, S, VT) = svdpi(A, x0, iterations, k)).run(exec); - (mtie(U, S, VT) = svdpi(A, x0, iterations, k)).run(stream); - - cudaDeviceSynchronize(); + exec.sync(); printf("svdpi:\n"); printf("S\n"); @@ -126,29 +122,29 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) if( m <= n) { printf("UUT:\n"); - (UUT = matmul(U, conj(transpose_matrix(U)))).run(stream); + (UUT = matmul(U, conj(transpose_matrix(U)))).run(exec); print(UUT); } printf("UTU:\n"); - (UTU = matmul(conj(transpose_matrix(U)), U)).run(stream); + (UTU = matmul(conj(transpose_matrix(U)), U)).run(exec); print(UTU); if( n >= m) { printf("VVT:\n"); - (VVT = matmul(conj(transpose_matrix(VT)), VT)).run(stream); + (VVT = matmul(conj(transpose_matrix(VT)), VT)).run(exec); print(VVT); } printf("VTV:\n"); - (VTV = matmul(VT, conj(transpose_matrix(VT)))).run(stream); // works on r x r + (VTV = matmul(VT, 
conj(transpose_matrix(VT)))).run(exec); // works on r x r print(VTV); // scale U by eigen values (equivalent to matmul of the diagonal matrix) - (UD = U * D).run(stream); + (UD = U * D).run(exec); - (UDVT = matmul(UD, VT)).run(stream); + (UDVT = matmul(UD, VT)).run(exec); printf("A\n"); print(A); @@ -156,7 +152,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) printf("UDVT\n"); print(UDVT); - (A = A - UDVT).run(stream); + (A = A - UDVT).run(exec); printf("A-UDVT\n"); print(A); @@ -165,13 +161,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // Same as above but with svdbpi { - (U = 0).run(stream); - (S = 0).run(stream); - (VT = 0).run(stream); + (U = 0).run(exec); + (S = 0).run(exec); + (VT = 0).run(exec); // TODO add k - (mtie(U, S, VT) = svdbpi(A, iterations, tol)).run(stream); + (mtie(U, S, VT) = svdbpi(A, iterations, tol)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); printf("svdbpi:\n"); printf("S\n"); @@ -183,29 +179,29 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) if( m <= n) { printf("UUT:\n"); - (UUT = matmul(U, conj(transpose_matrix(U)))).run(stream); + (UUT = matmul(U, conj(transpose_matrix(U)))).run(exec); print(UUT); } printf("UTU:\n"); - (UTU = matmul(conj(transpose_matrix(U)), U)).run(stream); + (UTU = matmul(conj(transpose_matrix(U)), U)).run(exec); print(UTU); if( n >= m) { printf("VVT:\n"); - (VVT = matmul(conj(transpose_matrix(VT)), VT)).run(stream); + (VVT = matmul(conj(transpose_matrix(VT)), VT)).run(exec); print(VVT); } printf("VTV:\n"); - (VTV = matmul(VT, conj(transpose_matrix(VT)))).run(stream); // works on r x r + (VTV = matmul(VT, conj(transpose_matrix(VT)))).run(exec); // works on r x r print(VTV); // scale U by eigen values (equivalent to matmul of the diagonal matrix) - (UD = U * D).run(stream); + (UD = U * D).run(exec); - (UDVT = matmul(UD, VT)).run(stream); + (UDVT = matmul(UD, VT)).run(exec); printf("A\n"); print(A); @@ -213,7 +209,7 @@ int 
main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) printf("UDVT\n"); print(UDVT); - (A = A - UDVT).run(stream); + (A = A - UDVT).run(exec); printf("A-UDVT\n"); print(A); diff --git a/include/matx/executors/device.h b/include/matx/executors/device.h index 7125214be..b293ec1d8 100644 --- a/include/matx/executors/device.h +++ b/include/matx/executors/device.h @@ -63,11 +63,17 @@ namespace matx */ cudaExecutor() : stream_(0) {} - /* - * @breif Returns stream associated with executor - */ + /** + * @brief Returns stream associated with executor + */ auto getStream() const { return stream_; } + /** + * @brief Synchronize the cuda executor's stream + * + */ + void sync() { cudaStreamSynchronize(stream_); } + /** * Execute an operator on a device * diff --git a/include/matx/executors/host.h b/include/matx/executors/host.h index 969de943a..c17c96cf4 100644 --- a/include/matx/executors/host.h +++ b/include/matx/executors/host.h @@ -71,6 +71,12 @@ class HostExecutor { HostExecutor(const HostExecParams ¶ms = HostExecParams{}) : params_(params) {} + /** + * @brief Synchronize the host executor's threads. Currently a noop as the executor is single-threaded. 
+ * + */ + void sync() {} + /** * @brief Execute an operator * diff --git a/test/00_io/FileIOTests.cu b/test/00_io/FileIOTests.cu index 2769f361e..0b5d938bd 100644 --- a/test/00_io/FileIOTests.cu +++ b/test/00_io/FileIOTests.cu @@ -153,7 +153,7 @@ TYPED_TEST(FileIoTestsNonComplexFloatTypes, MATWriteRank5) (t = random(t.Shape(), UNIFORM)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); // Read "myvar" from mat file io::write_mat(t, "test_write.mat", "myvar"); @@ -182,7 +182,7 @@ TYPED_TEST(FileIoTestsNonComplexFloatTypes, MATWriteRank5GetShape) (t = random(t.Shape(), UNIFORM)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); // Read "myvar" from mat file io::write_mat(t, "test_write.mat", "myvar"); @@ -213,7 +213,7 @@ TYPED_TEST(FileIoTestsComplexFloatTypes, MATWriteRank5GetShape) (t = random(t.Shape(), UNIFORM)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); // Read "myvar" from mat file io::write_mat(t, "test_write.mat", "myvar"); diff --git a/test/00_operators/GeneratorTests.cu b/test/00_operators/GeneratorTests.cu index a4c92597a..a1b4f4a44 100644 --- a/test/00_operators/GeneratorTests.cu +++ b/test/00_operators/GeneratorTests.cu @@ -161,7 +161,7 @@ TYPED_TEST(BasicGeneratorTestsAll, Diag) // Assign the diagonal elements of `tc` to `td`. 
(td = diag(tc)).run(exec); // example-end diag-op-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < 10; i++) { for (int j = 0; j < 10; j++) { @@ -177,11 +177,11 @@ TYPED_TEST(BasicGeneratorTestsAll, Diag) { auto delta = make_tensor({1}); delta(0) = static_cast(1.0); - cudaStreamSynchronize(0); + exec.sync(); (td = 0).run(exec); (td = diag(conv1d(tc, delta, MATX_C_MODE_SAME))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < 10; i++) { for (int j = 0; j < 10; j++) { @@ -209,7 +209,7 @@ TYPED_TEST(BasicGeneratorTestsFloat, Alternate) (td = alternate(10)).run(exec); // example-end alternate-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < 10; i++) { MATX_ASSERT_EQ(td(i), (TestType)-2* (TestType)(i&1) + (TestType)1) @@ -235,7 +235,7 @@ TEST(OperatorTests, Kron) (ov = kron(eye({4, 4}), bv)).run(exec); // example-end kron-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, ov, "square", 0); tensor_t av({2, 3}); @@ -246,7 +246,7 @@ TEST(OperatorTests, Kron) // Explicit shape specified in ones() (ov2 = kron(av, ones({2, 2}))).run(exec); // example-end ones-gen-test-2 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, ov2, "rect", 0); MATX_EXIT_HANDLER(); @@ -277,7 +277,7 @@ TEST(OperatorTests, MeshGrid) (yv = yy).run(exec); // example-end meshgrid-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, xv, "X", 0); MATX_TEST_ASSERT_COMPARE(pb, yv, "Y", 0); @@ -303,18 +303,18 @@ TYPED_TEST(BasicGeneratorTestsFloatNonComplex, FFTFreq) // Generate FFT frequencies using the length of the "t1" tensor and assign to t1 (t1 = fftfreq(t1.Size(0))).run(exec); // example-end fftfreq-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t1, "F1", 0.1); (t2 = fftfreq(t2.Size(0))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t2, "F2", 0.1); // example-begin 
fftfreq-gen-test-2 // Generate FFT frequencies using the length of the "t1" tensor and a sample spacing of 0.5 and assign to t1 (t1 = fftfreq(t1.Size(0), 0.5)).run(exec); // example-end fftfreq-gen-test-2 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t1, "F3", 0.1); MATX_EXIT_HANDLER(); @@ -336,7 +336,7 @@ TYPED_TEST(BasicGeneratorTestsAll, Zeros) (t1 = zeros()).run(exec); // example-end zeros-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { if constexpr (IsHalfType()) { @@ -362,7 +362,7 @@ TYPED_TEST(BasicGeneratorTestsAll, Ones) (t1 = ones()).run(exec); // example-end ones-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { if constexpr (IsHalfType()) { @@ -390,7 +390,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Range) // Generate a sequence of 100 numbers starting at 1 and spaced by 1 (t1 = range<0>(t1.Shape(), 1, 1)).run(exec); // example-end range-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); TestType one = 1; TestType two = 1; @@ -403,7 +403,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Range) { (t1 = t1 * t1).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType it = static_cast>(i); @@ -413,7 +413,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Range) { (t1 = t1 * two).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType it = static_cast>(i); @@ -424,7 +424,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Range) { (t1 = three * t1).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType it = static_cast>(i); @@ -451,7 +451,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Linspace) // with `count` points in between (t1 = linspace<0>(t1.Shape(), (TestType)1, (TestType)100)).run(exec); // example-end linspace-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t 
i = 0; i < count; i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), i + 1)); @@ -459,7 +459,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Linspace) { (t1 = t1 + t1).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (i + 1) + (i + 1))); @@ -468,7 +468,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Linspace) { (t1 = (TestType)1 + t1).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { EXPECT_TRUE( @@ -478,7 +478,7 @@ TYPED_TEST(BasicGeneratorTestsNumericNonComplex, Linspace) { (t1 = t1 + (TestType)2).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (i + 1) + (i + 1) + 1 + 2)); @@ -506,7 +506,7 @@ TYPED_TEST(BasicGeneratorTestsFloatNonComplex, Logspace) (t1 = logspace<0>(s, start, stop)).run(exec); // example-end logspace-gen-test-1 - cudaStreamSynchronize(0); + exec.sync(); // Use doubles for verification since half operators have no equivalent host // types @@ -563,7 +563,7 @@ TYPED_TEST(BasicGeneratorTestsNumeric, Eye) TestType one = 1.0f; TestType zero = 0.0f; - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { for (index_t j = 0; j < count; j++) { @@ -626,7 +626,7 @@ TYPED_TEST(BasicGeneratorTestsNumeric, Diag) TestType zero = 0.0f; - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { for (index_t j = 0; j < count; j++) { diff --git a/test/00_operators/OperatorTests.cu b/test/00_operators/OperatorTests.cu index 208dea374..208e31cb4 100644 --- a/test/00_operators/OperatorTests.cu +++ b/test/00_operators/OperatorTests.cu @@ -292,11 +292,11 @@ TYPED_TEST(OperatorTestsFloatAllExecs, IsClose) auto B = make_tensor({5, 5, 5}); auto C = make_tensor({5, 5, 5}); - (A = ones(A.Shape())).run(); - (B = ones(B.Shape())).run(); - (C = isclose(A, B)).run(); + (A = ones(A.Shape())).run(exec); + 
(B = ones(B.Shape())).run(exec); + (C = isclose(A, B)).run(exec); // example-end isclose-test-1 - cudaStreamSynchronize(0); + exec.sync(); for(int i=0; i < A.Size(0); i++) { for(int j=0; j < A.Size(1); j++) { @@ -307,8 +307,8 @@ TYPED_TEST(OperatorTestsFloatAllExecs, IsClose) } B(1,1,1) = 2; - (C = isclose(A, B)).run(); - cudaStreamSynchronize(0); + (C = isclose(A, B)).run(exec); + exec.sync(); for(int i=0; i < A.Size(0); i++) { for(int j=0; j < A.Size(1); j++) { @@ -349,7 +349,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, Frexp) (tofrac = ofrac, toint = oint).run(exec); // example-end frexp-test-1 - cudaStreamSynchronize(0); + exec.sync(); int texp; for (int i = 0; i < tiv0.Size(0); i++) { @@ -396,7 +396,7 @@ TYPED_TEST(OperatorTestsComplexNonHalfTypesAllExecs, Frexpc) toint_imag = oint_imag).run(exec); // example-end frexpc-test-1 - cudaStreamSynchronize(0); + exec.sync(); int texp_real, texp_imag; for (int i = 0; i < tiv0.Size(0); i++) { if constexpr (std::is_same_v>) { @@ -438,7 +438,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, FMod) tiv1() = (TestType)3.1; (tov0 = fmod(tiv0, tiv1)).run(exec); // example-end fmod-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_fmod((TestType)5.0, (TestType)3.1))); MATX_EXIT_HANDLER(); @@ -460,73 +460,73 @@ TYPED_TEST(OperatorTestsFloatAllExecs, TrigFuncs) // example-begin sin-test-1 (tov0 = sin(tiv0)).run(exec); // example-end sin-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_sin(c))); // example-begin cos-test-1 (tov0 = cos(tiv0)).run(exec); // example-end cos-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_cos(c))); // example-begin tan-test-1 (tov0 = tan(tiv0)).run(exec); // example-end tan-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), 
detail::_internal_tan(c))); // example-begin asin-test-1 (tov0 = asin(tiv0)).run(exec); // example-end asin-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_asin(c))); // example-begin acos-test-1 (tov0 = acos(tiv0)).run(exec); // example-end acos-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_acos(c))); // example-begin atan-test-1 (tov0 = atan(tiv0)).run(exec); // example-end atan-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_atan(c))); // example-begin sinh-test-1 (tov0 = sinh(tiv0)).run(exec); // example-end sinh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_sinh(c))); // example-begin cosh-test-1 (tov0 = cosh(tiv0)).run(exec); // example-end cosh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_cosh(c))); // example-begin tanh-test-1 (tov0 = tanh(tiv0)).run(exec); // example-end tanh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_tanh(c))); // example-begin asinh-test-1 (tov0 = asinh(tiv0)).run(exec); // example-end asinh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_asinh(c))); // example-begin acosh-test-1 (tov0 = acosh(tiv0)).run(exec); // example-end acosh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_acosh(c))); // example-begin atanh-test-1 (tov0 = atanh(tiv0)).run(exec); // example-end atanh-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_atanh(c))); MATX_EXIT_HANDLER(); @@ -549,7 +549,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, AngleOp) // 
example-begin angle-test-1 (tov0 = angle(tiv0)).run(exec); // example-end angle-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_angle(c))); MATX_EXIT_HANDLER(); @@ -590,7 +590,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -625,7 +625,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -660,7 +660,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -697,7 +697,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -734,7 +734,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) } (tov = op).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -759,11 +759,11 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp) delta(0,0) = static_cast::type>(1.0); - cudaDeviceSynchronize(); + exec.sync(); (tov = clone<3>(conv2d(tiv, delta, MATX_C_MODE_SAME), {N, matxKeepDim, matxKeepDim})).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -797,12 +797,12 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AtOp) // `t1(3)` after execution (t0 = at(t1, 3)).run(exec); // example-end at-test-1 - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t0(), t1(3)); (t0 = at(t2, 1, 4)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t0(), t2(1, 4)); @@ -810,7 +810,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AtOp) using ComplexType = 
detail::complex_from_scalar_t; auto c0 = make_tensor({}); (c0 = at(fft(t1), 0)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); // The first component of the FFT output (DC) is the sum of all elements, so // 10+20+...+100 = 550. The imaginary component should be 0. @@ -869,7 +869,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SliceOp) (t2 = linspace<1>(t2.Shape(), (inner_type)0, (inner_type)10)).run(exec); (t3 = linspace<2>(t3.Shape(), (inner_type)0, (inner_type)10)).run(exec); (t4 = linspace<3>(t4.Shape(), (inner_type)0, (inner_type)10)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); // Slice with different start and end points in each dimension auto t2t = slice(t2, {1, 2}, {3, 5}); @@ -916,7 +916,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SliceOp) // Test SliceOp applied to a transform, using transpose() as an example transform auto t2trans = make_tensor({3, 2}); (t2trans = slice(transpose(t2), {2, 1}, {5, 3})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t2trans.Size(0), 3); ASSERT_EQ(t2trans.Size(1), 2); @@ -932,7 +932,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SliceOp) auto t2s = slice(t2, {t2.Size(0) - 4, t2.Size(1) - 5}, {matxEnd, matxEnd}); // example-end slice-test-4 - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t2sn.Size(0), t2s.Size(0)); ASSERT_EQ(t2sn.Size(1), t2s.Size(1)); for (index_t i = 0; i < t2sn.Size(0); i++) { @@ -956,7 +956,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SliceAndReduceOp) tensor_t t3t{{30, 20, 10}}; (t2t = linspace<1>(t2t.Shape(), (inner_type)0, (inner_type)10)).run(exec); (t3t = linspace<2>(t3t.Shape(), (inner_type)0, (inner_type)10)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); { index_t j = 0; @@ -1074,7 +1074,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1100,7 +1100,7 @@ 
TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1121,7 +1121,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1142,7 +1142,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1167,7 +1167,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1192,7 +1192,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CollapseOp) (tov = (TestType)0).run(exec); (tov = op).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int n = 0; n < N; n++) { for(int m = 0; m < M; m++) { @@ -1238,7 +1238,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) // Remap 2D operator "tiv" by selecting elements from dimension 0 stored in "idx" (tov = remap<0>(tiv, idx)).run(exec); // example-end remap-test-1 - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1247,7 +1247,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1259,7 +1259,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) // Remap 2D operator "tiv" by selecting elements from dimensions 0 and 1 stored in "idx" (tov = remap<0,1>(tiv, idx, idx)).run(exec); // example-end remap-test-2 - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; 
j++) { @@ -1279,9 +1279,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) (tov = (TestType)0).run(exec); - (remap<0>(tov, idx) = tiv).run(); + (remap<0>(tov, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1291,7 +1291,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) (tov = (TestType)0).run(exec); (remap<1>(tov, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1301,7 +1301,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) (tov = (TestType)0).run(exec); (remap<0,1>(tov, idx, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1320,7 +1320,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (tov = remap<0>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1329,7 +1329,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1338,7 +1338,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (tov = remap<0,1>(tiv, idx, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1357,7 +1357,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (remap<0>(tov, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1366,7 +1366,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (remap<1>(tov, idx) = tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1375,7 +1375,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) } (remap<0,1>(tov, idx, idx) = 
tiv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < N ; j++) { @@ -1396,7 +1396,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, N}); (tov = remap<0>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < N ; j++) { @@ -1409,7 +1409,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({N, M}); (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < M ; j++) { @@ -1422,7 +1422,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, M}); (tov = remap<0,1>(tiv, idx, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < M ; j++) { @@ -1444,7 +1444,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, N}); (tov = remap<0>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < N ; j++) { @@ -1457,7 +1457,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({N, M}); (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < M ; j++) { @@ -1479,7 +1479,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, N}); (tov = remap<0>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < N ; j++) { @@ -1492,7 +1492,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({N, M}); (tov = remap<1>(tiv, idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < N ; i++) { for( int j = 0; j < M ; j++) { @@ -1505,7 +1505,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapOp) auto tov = make_tensor({M, M}); (tov = remap<0,1>(tiv, idx, 
idx)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for( int i = 0; i < M ; i++) { for( int j = 0; j < M ; j++) { @@ -1526,32 +1526,26 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapRankZero) ExecType exec{}; - auto sync = []() constexpr { - if constexpr (std::is_same_v) { - cudaDeviceSynchronize(); - } - }; - const int N = 16; // 1D source tensor cases { auto from = make_tensor({N}); (from = range<0>({N}, 0, 1)).run(exec); - sync(); + exec.sync(); auto ind = make_tensor({}); auto r = remap<0>(from, ind); auto to = make_tensor({1}); ind() = N/2; (to = r).run(exec); - sync(); + exec.sync(); ASSERT_EQ(to(0), N/2); ind() = N/4; (to = r).run(exec); - sync(); + exec.sync(); ASSERT_EQ(to(0), N/4); } @@ -1560,7 +1554,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapRankZero) { auto from = make_tensor({N,N}); (from = ones()).run(exec); - sync(); + exec.sync(); auto i0 = make_tensor({}); auto i1 = make_tensor({}); @@ -1575,7 +1569,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapRankZero) from(0,N/2) = 3; (to0 = r0).run(exec); (to1 = r1).run(exec); - sync(); + exec.sync(); ASSERT_EQ(to0(0,0), 2); ASSERT_EQ(to0(0,1), 1); @@ -1589,7 +1583,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, RemapRankZero) // Select a single entry from the 2D input tensor auto entry = make_tensor({1,1}); (entry = remap<0,1>(from, i0, i1)).run(exec); - sync(); + exec.sync(); ASSERT_EQ(entry(0,0), 11); } @@ -1613,13 +1607,13 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, RealImagOp) // example-begin real-test-1 (tov0 = real(tiv0)).run(exec); // example-end real-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c.real())); // example-begin imag-test-1 (tov0 = imag(tiv0)).run(exec); // example-end imag-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c.imag())); MATX_EXIT_HANDLER(); @@ -1646,19 +1640,19 @@ TYPED_TEST(OperatorTestsAllExecs, OperatorFuncs) // example-begin IFELSE-test-1 
IFELSE(tiv0 == d, tov0 = z, tov0 = d).run(exec); // example-end IFELSE-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), z)); IFELSE(tiv0 == d, tov0 = tiv0, tov0 = d).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), tiv0())); IFELSE(tiv0 != d, tov0 = d, tov0 = z).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), z)); (tov0 = c, tov00 = c).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov00(), c)); @@ -1674,20 +1668,14 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Abs2) ExecType exec{}; - auto sync = []() constexpr { - if constexpr (std::is_same_v) { - cudaDeviceSynchronize(); - } - }; - if constexpr (std::is_same_v> && std::is_same_v) { // example-begin abs2-test-1 auto x = make_tensor>({}); auto y = make_tensor({}); x() = { 1.5f, 2.5f }; - (y = abs2(x)).run(); - cudaDeviceSynchronize(); + (y = abs2(x)).run(exec); + exec.sync(); ASSERT_NEAR(y(), 1.5f*1.5f+2.5f*2.5f, 1.0e-6); // example-end abs2-test-1 } @@ -1697,12 +1685,12 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Abs2) if constexpr (is_complex_v) { x() = TestType{2.0, 2.0}; (y = abs2(x)).run(exec); - sync(); + exec.sync(); ASSERT_NEAR(y(), 8.0, 1.0e-6); } else { x() = 2.0; (y = abs2(x)).run(exec); - sync(); + exec.sync(); ASSERT_NEAR(y(), 4.0, 1.0e-6); // Test with higher rank tensor @@ -1717,7 +1705,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Abs2) } (y3 = abs2(x3)).run(exec); - sync(); + exec.sync(); for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { @@ -1748,7 +1736,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, OperatorFuncsR2C) // example-begin expj-test-1 (tov0 = expj(tiv0)).run(exec); // example-end expj-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare( tov0(), @@ -1773,43 +1761,43 @@ 
TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, OperatorFuncs) // example-begin log10-test-1 (tov0 = log10(tiv0)).run(exec); // example-end log10-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_log10(c))); // example-begin log-test-1 (tov0 = log(tiv0)).run(exec); // example-end log-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_log(c))); // example-begin log2-test-1 (tov0 = log2(tiv0)).run(exec); // example-end log2-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_log2(c))); // example-begin floor-test-1 (tov0 = floor(tiv0)).run(exec); // example-end floor-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_floor(c))); // example-begin ceil-test-1 (tov0 = ceil(tiv0)).run(exec); // example-end ceil-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_ceil(c))); // example-begin round-test-1 (tov0 = round(tiv0)).run(exec); // example-end round-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_round(c))); // example-begin sqrt-test-1 (tov0 = sqrt(tiv0)).run(exec); // example-end sqrt-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_sqrt(c))); MATX_EXIT_HANDLER(); @@ -1827,14 +1815,14 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, NDOperatorFuncs) auto a = make_tensor({1,2,3,4,5}); auto b = make_tensor({1,2,3,4,5}); (a = ones(a.Shape())).run(exec); - cudaDeviceSynchronize(); + exec.sync(); (b = ones(b.Shape())).run(exec); - cudaDeviceSynchronize(); + exec.sync(); (a = a + b).run(exec); auto t0 = make_tensor({}); (t0 = sum(a)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(t0(), static_cast(2 * 
a.TotalSize())); MATX_EXIT_HANDLER(); } @@ -1857,13 +1845,13 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, OperatorFuncs) // example-begin max-el-test-1 (tov0 = max(tiv0, d)).run(exec); // example-end max-el-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), std::max(c, d))); // example-begin min-el-test-1 (tov0 = min(tiv0, d)).run(exec); // example-end min-el-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), std::min(c, d))); // These operators convert type T into bool @@ -1872,37 +1860,37 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, OperatorFuncs) // example-begin lt-test-1 (tob = tiv0 < d).run(exec); // example-end lt-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c < d)); // example-begin gt-test-1 (tob = tiv0 > d).run(exec); // example-end gt-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c > d)); // example-begin lte-test-1 (tob = tiv0 <= d).run(exec); // example-end lte-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c <= d)); // example-begin gte-test-1 (tob = tiv0 >= d).run(exec); // example-end gte-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c >= d)); // example-begin eq-test-1 (tob = tiv0 == d).run(exec); // example-end eq-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c == d)); // example-begin neq-test-1 (tob = tiv0 != d).run(exec); // example-end neq-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), c != d)); MATX_EXIT_HANDLER(); @@ -1924,7 +1912,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, OperatorFuncDivComplex) tiv0() = c; (tov0 = s / tiv0).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), s / 
tiv0())); MATX_EXIT_HANDLER(); @@ -1947,50 +1935,50 @@ TYPED_TEST(OperatorTestsNumericAllExecs, OperatorFuncs) // example-begin add-test-1 (tov0 = tiv0 + tiv0).run(exec); // example-end add-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c + c)); // example-begin sub-test-1 (tov0 = tiv0 - tiv0).run(exec); // example-end sub-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c - c)); // example-begin mul-test-1 (tov0 = tiv0 * tiv0).run(exec); // example-end mul-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c * c)); // example-begin div-test-1 (tov0 = tiv0 / tiv0).run(exec); // example-end div-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c / c)); // example-begin neg-test-1 (tov0 = -tiv0).run(exec); // example-end neg-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), -c)); // example-begin IF-test-1 IF(tiv0 == tiv0, tov0 = c).run(exec); // example-end IF-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c)); TestType p = 2.0f; // example-begin pow-test-1 (tov0 = as_type(pow(tiv0, p))).run(exec); // example-end pow-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_pow(c, p))); TestType three = 3.0f; - (tov0 = tiv0 * tiv0 * (tiv0 + tiv0) / tiv0 + three).run(); - cudaStreamSynchronize(0); + (tov0 = tiv0 * tiv0 * (tiv0 + tiv0) / tiv0 + three).run(exec); + exec.sync(); TestType res; res = c * c * (c + c) / c + three; @@ -2005,9 +1993,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, OperatorFuncs) } auto tob = make_tensor({}); // example-begin nan-test-1 - (tob = matx::isnan(nan)).run(); + (tob = matx::isnan(nan)).run(exec); // example-end nan-test-1 - cudaDeviceSynchronize(); + exec.sync(); 
EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), std::is_floating_point_v ? true : false)); auto notnanorinf = make_tensor({}); @@ -2016,8 +2004,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, OperatorFuncs) } else { notnanorinf() = 0; } - (tob = matx::isnan(notnanorinf)).run(); - cudaDeviceSynchronize(); + (tob = matx::isnan(notnanorinf)).run(exec); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), false)); auto inf = make_tensor({}); @@ -2028,13 +2016,13 @@ TYPED_TEST(OperatorTestsNumericAllExecs, OperatorFuncs) inf() = std::numeric_limits::infinity(); } // example-begin inf-test-1 - (tob = matx::isinf(inf)).run(); + (tob = matx::isinf(inf)).run(exec); // example-end inf-test-1 - cudaDeviceSynchronize(); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), std::is_floating_point_v ? true : false)); - (tob = matx::isinf(notnanorinf)).run(); - cudaDeviceSynchronize(); + (tob = matx::isinf(notnanorinf)).run(exec); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tob(), false)); MATX_EXIT_HANDLER(); @@ -2058,7 +2046,7 @@ TYPED_TEST(OperatorTestsIntegralAllExecs, OperatorFuncs) // example-begin mod-test-1 (tov0 = tiv0 % mod).run(exec); // example-end mod-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c % mod)); MATX_EXIT_HANDLER(); @@ -2082,37 +2070,37 @@ TYPED_TEST(OperatorTestsBooleanAllExecs, OperatorFuncs) // example-begin land-test-1 (tov0 = tiv0 && d).run(exec); // example-end land-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c && d)); // example-begin lor-test-1 (tov0 = tiv0 || d).run(exec); // example-end lor-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c || d)); // example-begin lnot-test-1 (tov0 = !tiv0).run(exec); // example-end lnot-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), !c)); // example-begin xor-test-1 (tov0 = tiv0 ^ 
d).run(exec); // example-end xor-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c ^ d)); // example-begin or-test-1 (tov0 = tiv0 | d).run(exec); // example-end or-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c | d)); // example-begin and-test-1 (tov0 = tiv0 & d).run(exec); // example-end and-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), c & d)); MATX_EXIT_HANDLER(); @@ -2136,13 +2124,13 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, OperatorFuncs) // example-begin exp-test-1 (tov0 = exp(tiv0)).run(exec); // example-end exp-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_exp(c))); // example-begin conj-test-1 (tov0 = conj(tiv0)).run(exec); // example-end conj-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_conj(c))); // abs and norm take a complex and output a floating point value @@ -2150,13 +2138,13 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, OperatorFuncs) // example-begin norm-test-1 (tdd0 = norm(tiv0)).run(exec); // example-end norm-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tdd0(), detail::_internal_norm(c))); // example-begin abs-test-1 (tdd0 = abs(tiv0)).run(exec); // example-end abs-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tdd0(), detail::_internal_abs(c))); MATX_EXIT_HANDLER(); @@ -2184,7 +2172,7 @@ TYPED_TEST(OperatorTestsAllExecs, Flatten) auto t1 = make_tensor({t2.Size(0)*t2.Size(1)}); (t1 = flatten(t2)).run(exec); // example-end flatten-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0)*t2.Size(1); i++) { ASSERT_EQ(t1(i), val); @@ -2215,7 +2203,7 @@ TYPED_TEST(OperatorTestsNumericNoHalfAllExecs, AdvancedOperators) { (c = a + b).run(exec); 
- cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = static_cast>(i); @@ -2227,7 +2215,7 @@ TYPED_TEST(OperatorTestsNumericNoHalfAllExecs, AdvancedOperators) { (c = a * b).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = static_cast>(i); @@ -2238,7 +2226,7 @@ TYPED_TEST(OperatorTestsNumericNoHalfAllExecs, AdvancedOperators) { (c = a * b + a).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = static_cast>(i); @@ -2250,7 +2238,7 @@ TYPED_TEST(OperatorTestsNumericNoHalfAllExecs, AdvancedOperators) { (c = a * b + a * (TestType)4.0f).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = static_cast>(i); @@ -2285,7 +2273,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AdvancedOperators) { (c = a + b).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = (TestType)i; @@ -2295,7 +2283,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AdvancedOperators) { (c = a * b).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = (TestType)i; @@ -2306,7 +2294,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AdvancedOperators) { (c = a * b + a).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = (TestType)i; @@ -2318,7 +2306,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AdvancedOperators) { (c = a * b + a * (TestType)2.0f).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { TestType tcnt = (TestType)i; @@ -2356,7 +2344,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) } (dview = dview * fview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { 
ASSERT_EQ(static_cast>(dview(i).real()), @@ -2373,7 +2361,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) } (dview = dview / fview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { ASSERT_EQ(static_cast>(dview(i).real()), @@ -2393,7 +2381,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) (dview = dview + fview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { ASSERT_EQ(static_cast>(dview(i).real()), @@ -2410,7 +2398,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) } (dview = dview - fview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { ASSERT_EQ(static_cast>(dview(i).real()), @@ -2427,7 +2415,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, ComplexTypeCompatibility) } (dview = fview - dview).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (index_t i = 0; i < count; i++) { ASSERT_EQ(static_cast>(dview(i).real()), @@ -2458,12 +2446,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SquareCopyTranspose) } } - t2.PrefetchDevice(0); - t2t.PrefetchDevice(0); matx::copy(t2t, t2, exec); - t2t.PrefetchHost(0); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { for (index_t j = 0; j < count; j++) { @@ -2472,11 +2457,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, SquareCopyTranspose) } } - t2t.PrefetchDevice(0); (t2t = transpose(t2)).run(exec); - t2t.PrefetchHost(0); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count; i++) { for (index_t j = 0; j < count; j++) { @@ -2509,7 +2492,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, NonSquareTranspose) } (t2t = transpose(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count1; i++) { for (index_t j = 0; j < count2; j++) { @@ -2544,8 +2527,11 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Transpose3D) } (t3t = 
transpose_matrix(t3)).run(exec); - cudaError_t error = cudaStreamSynchronize(0); - ASSERT_EQ(error, cudaSuccess); + exec.sync(); + + if constexpr (std::is_same_v) { + ASSERT_EQ(cudaGetLastError(), cudaSuccess); + } for (index_t i = 0; i < num_rows; i++) { for (index_t j = 0; j < num_cols; j++) { @@ -2587,9 +2573,9 @@ TYPED_TEST(OperatorTestsNumericAllExecs, TransposeVsTransposeMatrix) (t3t = transpose(t3)).run(exec); (t3tm = transpose_matrix(t3)).run(exec); - if constexpr (is_cuda_executor_v) { - cudaError_t error = cudaStreamSynchronize(0); - ASSERT_EQ(error, cudaSuccess); + exec.sync(); + if constexpr (std::is_same_v) { + ASSERT_EQ(cudaGetLastError(), cudaSuccess); } for (index_t i = 0; i < m; i++) { @@ -2646,9 +2632,9 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, CloneAndAdd) auto vah = clone<4>(velAccelHypoth, {numBeams, matxKeepDim, matxKeepDim, matxKeepDim}); - (beamwiseRangeDoppler = smx + vah).run(); + (beamwiseRangeDoppler = smx + vah).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < numBeams; i++) { for (index_t j = 0; j < numPulses; j++) { for (index_t k = 0; k < numPairs; k++) { @@ -2813,7 +2799,7 @@ TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, Overlap) ASSERT_EQ(b4out.Size(0), 4); ASSERT_EQ(b4out.Size(1), 3); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < b4.Size(0); i++) { for (index_t j = 0; j < b4.Size(1); j++) { ASSERT_EQ(b4out(i, j), b4(i, j)); @@ -2853,7 +2839,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) } (t4o = t4i * t0).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t4o.Size(0); i++) { for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2871,7 +2857,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) } } (t4o = t0 * t4i).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t4o.Size(0); i++) { for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2909,8 +2895,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // 
} - // (t4o = t4i * t1).run(); - // cudaStreamSynchronize(0); + // (t4o = t4i * t1).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2928,8 +2914,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t1 * t4i).run(); - // cudaStreamSynchronize(0); + // (t4o = t1 * t4i).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2970,8 +2956,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t4i * t2).run(); - // cudaStreamSynchronize(0); + // (t4o = t4i * t2).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -2989,8 +2975,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t2 * t4i).run(); - // cudaStreamSynchronize(0); + // (t4o = t2 * t4i).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3033,8 +3019,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t4i * t3).run(); - // cudaStreamSynchronize(0); + // (t4o = t4i * t3).run(exec); + // exec.sync() // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3052,8 +3038,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t3 * t4i).run(); - // cudaStreamSynchronize(0); + // (t4o = t3 * t4i).run(exec); + // exec.sync() // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3111,8 +3097,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t4i + t3 + t2 + t1 + t0).run(); - // cudaStreamSynchronize(0); + // (t4o = t4i + t3 + t2 + t1 + t0).run(exec); + // exec.sync() // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3133,8 +3119,8 @@ 
TYPED_TEST(OperatorTestsNumericAllExecs, Broadcast) // } // } - // (t4o = t0 + t1 + t2 + t3 + t4i).run(); - // cudaStreamSynchronize(0); + // (t4o = t0 + t1 + t2 + t3 + t4i).run(exec); + // exec.sync(); // for (index_t i = 0; i < t4o.Size(0); i++) { // for (index_t j = 0; j < t4o.Size(1); j++) { @@ -3179,7 +3165,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) // Concatenate "t11" and "t12" into a new 1D tensor (t1o = concat(0, t11, t12)).run(exec); // example-end concat-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (i = 0; i < t11.Size(0) + t12.Size(0); i++) { if (i < t11.Size(0)) { @@ -3198,7 +3184,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) (t1o = 0).run(exec); (t1o = concat(0, conv1d(t11, delta, MATX_C_MODE_SAME), conv1d(t12, delta, MATX_C_MODE_SAME))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (i = 0; i < t11.Size(0) + t12.Size(0); i++) { if (i < t11.Size(0)) { @@ -3230,7 +3216,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) {10,11,12}}); (t2o1 = concat(0, t21, t22)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (i = 0; i < t21.Size(0) + t22.Size(0); i++) { for (j = 0; j < t21.Size(1); j++) { @@ -3244,7 +3230,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) } (t2o2 = concat(1, t21, t23)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (j = 0; j < t21.Size(1) + t23.Size(1); j++) { for (i = 0; i < t21.Size(0); i++) { @@ -3261,7 +3247,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) // Concatenating 3 tensors (t1o1 = concat(0, t11, t11, t11)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (i = 0; i < t1o1.Size(0); i++) { ASSERT_EQ(t1o1(i), t11(i % t11.Size(0))); @@ -3285,7 +3271,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Concatenate) auto tempConcat2 = matx::concat(0, c, d); (result = matx::concat(0, tempConcat1, tempConcat2 )).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int cnt = 0; cnt 
< result.Size(0); cnt++) { ASSERT_EQ(result(cnt), cnt + 1); } @@ -3310,7 +3296,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Stack) auto cop = concat(0, t1a, t1b, t1c); (cop = (TestType)2).run(exec); - cudaDeviceSynchronize(); + exec.sync(); { // example-begin stack-test-1 @@ -3362,7 +3348,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, HermitianTranspose) // example-begin hermitianT-test-1 (t2s = hermitianT(t2)).run(exec); // example-end hermitianT-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3397,7 +3383,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, PlanarTransform) } (t2p = planar(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < m; i++) { for (index_t j = 0; j < k; j++) { @@ -3434,7 +3420,7 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, InterleavedTransform) } (t2 = interleaved(t2p)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < m; i++) { for (index_t j = 0; j < k; j++) { @@ -3474,7 +3460,7 @@ TYPED_TEST(OperatorTestsAllExecs, RepMat) ASSERT_TRUE(repop.Size(1) == same_reps * t2.Size(1)); (t2s = repop).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0 * same_reps; i++) { for (index_t j = 0; j < count1 * same_reps; j++) { @@ -3491,7 +3477,7 @@ TYPED_TEST(OperatorTestsAllExecs, RepMat) ASSERT_TRUE(rrepop.Size(1) == same_reps * 2 * t2.Size(1)); (t2r = rrepop).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0 * same_reps; i++) { for (index_t j = 0; j < count1 * same_reps * 2; j++) { @@ -3550,9 +3536,9 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, PolyVal) pb->NumpyToTensorView(c, "c"); // example-begin polyval-test-1 - (out = polyval(x, c)).run(); + (out = polyval(x, c)).run(exec); // example-end polyval-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, out, "out", 0.01); MATX_EXIT_HANDLER(); @@ 
-3576,14 +3562,15 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Upsample) (t1 = static_cast(1)).run(exec); auto us_op = upsample(t1, 0, n); // example-end upsample-test-1 + exec.sync(); ASSERT_TRUE(us_op.Size(0) == t1.Size(0) * n); for (index_t i = 0; i < us_op.Size(0); i++) { if ((i % n) == 0) { - MatXUtils::MatXTypeCompare(us_op(i), t1(i / n)); + ASSERT_TRUE(MatXUtils::MatXTypeCompare(us_op(i), t1(i / n))); } else { - MatXUtils::MatXTypeCompare(us_op(i), static_cast(0)); + ASSERT_TRUE(MatXUtils::MatXTypeCompare(us_op(i), static_cast(0))); } } } @@ -3607,10 +3594,11 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Downsample) (t1 = static_cast(1)).run(exec); auto ds_op = downsample(t1, 0, n); // example-end downsample-test-1 + exec.sync(); ASSERT_TRUE(ds_op.Size(0) == t1.Size(0) / n); for (index_t i = 0; i < ds_op.Size(0); i++) { - MatXUtils::MatXTypeCompare(ds_op(i), t1(i * n)); + ASSERT_TRUE(MatXUtils::MatXTypeCompare(ds_op(i), t1(i * n))); } } @@ -3623,7 +3611,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Downsample) ASSERT_TRUE(ds_op.Size(0) == t1.Size(0) / n + 1); for (index_t i = 0; i < ds_op.Size(0); i++) { - MatXUtils::MatXTypeCompare(ds_op(i), t1(i * n)); + ASSERT_TRUE(MatXUtils::MatXTypeCompare(ds_op(i), t1(i * n))); } } @@ -3653,7 +3641,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, R2COp) for (int i = 0; i < N1; i++) { t1(i) = static_cast(i+1); } for (int i = 0; i < N2; i++) { t2(i) = static_cast(i+1); } - cudaStreamSynchronize(0); + exec.sync(); const std::array T1_expected = {{ { 15.0, 0.0 }, { -2.5, static_cast(3.4409548) }, { -2.5, static_cast(0.81229924) }, @@ -3672,7 +3660,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, R2COp) (T1 = r2c(fft(t1), N1)).run(exec); (T2 = r2c(fft(t2), N2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < N1; i++) { ASSERT_NEAR(T1(i).real(), T1_expected[i].real(), thresh); @@ -3688,7 +3676,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, R2COp) (T1 = r2c(fft(t1, 
N1), N1)).run(exec); (T2 = r2c(fft(t2, N2), N2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < N1; i++) { ASSERT_NEAR(T1(i).real(), T1_expected[i].real(), thresh); @@ -3706,7 +3694,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexNonHalfAllExecs, R2COp) (T1 = ifft(r2c(fft(t1), N1))).run(exec); (T2 = ifft(r2c(fft(t2), N2))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < N1; i++) { ASSERT_NEAR(T1(i).real(), t1(i), thresh); @@ -3725,9 +3713,16 @@ TYPED_TEST(OperatorTestsFloatNonHalf, FftShiftWithTransform) { MATX_ENTER_HANDLER(); using TestType = std::tuple_element_t<0, TypeParam>; + using ExecType = std::tuple_element_t<1, TypeParam>; using inner_type = typename inner_op_type_t::type; using complex_type = detail::complex_from_scalar_t; + if constexpr (!detail::CheckFFTSupport()) { + GTEST_SKIP(); + } + + ExecType exec{}; + [[maybe_unused]] const inner_type thresh = static_cast(1.0e-6); // Verify that fftshift1D/ifftshift1D work with nested transforms. 
@@ -3750,12 +3745,12 @@ TYPED_TEST(OperatorTestsFloatNonHalf, FftShiftWithTransform) for (int i = 0; i < N1; i++) { t3(i) = t3_vals[i]; }; for (int i = 0; i < N2; i++) { t4(i) = t4_vals[i]; }; - cudaStreamSynchronize(0); + exec.sync(); - (T3 = fftshift1D(fft(t3))).run(); - (T4 = fftshift1D(fft(t4))).run(); + (T3 = fftshift1D(fft(t3))).run(exec); + (T4 = fftshift1D(fft(t4))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); const std::array T3_expected = {{ { -1.5, static_cast(-0.8660254) }, { 6.0, 0.0 }, { -1.5, static_cast(0.8660254) } @@ -3774,10 +3769,10 @@ TYPED_TEST(OperatorTestsFloatNonHalf, FftShiftWithTransform) ASSERT_NEAR(T4(i).imag(), T4_expected[i].imag(), thresh); } - (T3 = ifftshift1D(fft(t3))).run(); - (T4 = ifftshift1D(fft(t4))).run(); + (T3 = ifftshift1D(fft(t3))).run(exec); + (T4 = ifftshift1D(fft(t4))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); const std::array T3_ifftshift_expected = {{ { -1.5, static_cast(0.8660254) }, { -1.5, static_cast(-0.8660254) }, { 6.0, 0.0 } @@ -3803,14 +3798,14 @@ TYPED_TEST(OperatorTestsFloatNonHalf, FftShiftWithTransform) auto x = make_tensor({N,N}); auto X = make_tensor({N,N}); - (x = 0).run(); + (x = static_cast(0)).run(exec); - (X = fftshift2D(fft2(x))).run(); - (X = fftshift2D(ifft2(x))).run(); - (X = ifftshift2D(fft2(x))).run(); - (X = ifftshift2D(ifft2(x))).run(); + (X = fftshift2D(fft2(x))).run(exec); + (X = fftshift2D(ifft2(x))).run(exec); + (X = ifftshift2D(fft2(x))).run(exec); + (X = ifftshift2D(ifft2(x))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); } MATX_EXIT_HANDLER(); @@ -3844,7 +3839,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // Shift the first dimension of "t2" by -5 so the 5th element of "t2" is the first element of "t2s" (t2s = shift<0>(t2, -5)).run(exec); // example-end shift-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3856,7 +3851,7 @@ 
TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) { (t2s = shift<0>(t2, t0)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3868,7 +3863,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) { (t2s = shift<1>(t2, -5)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3880,7 +3875,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) { (t2s = shift<1,0>(t2, -5, -6)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3894,7 +3889,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // example-begin fftshift2D-test-1 (t2s = fftshift2D(t2)).run(exec); // example-end fftshift2D-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3909,7 +3904,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // example-begin ifftshift2D-test-1 (t2s = ifftshift2D(t2)).run(exec); // example-end ifftshift2D-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3923,7 +3918,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // Right shifts { (t2s = shift<0>(t2, 5)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3935,7 +3930,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) { (t2s = shift<1>(t2, 5)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3948,7 +3943,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // Large shifts { (t2s = shift<0>(t2, -t2.Size(0) * 4)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; 
j++) { @@ -3961,8 +3956,8 @@ TYPED_TEST(OperatorTestsNumericAllExecs, ShiftOp) // Shift 4 times the size back, minus one. This should be equivalent to // simply shifting by -1 (t2s = shift<0>(t2, -t2.Size(0) * 4 - 1)).run(exec); - (t2s2 = shift<0>(t2, -1)).run(); - cudaStreamSynchronize(0); + (t2s2 = shift<0>(t2, -1)).run(exec); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -3999,7 +3994,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) // Reverse the values of t2 along dimension 0 (t2r = reverse<0>(t2)).run(exec); // example-end reverse-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4011,7 +4006,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) { (t2r = reverse<1>(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4023,7 +4018,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) { (t2r = reverse<0,1>(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4038,7 +4033,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) // example-begin flipud-test-1 (t2r = flipud(t2)).run(exec); // example-end flipud-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4052,7 +4047,7 @@ TYPED_TEST(OperatorTestsNumericAllExecs, Reverse) // example-begin fliplr-test-1 (t2r = fliplr(t2)).run(exec); // example-end fliplr-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < count0; i++) { for (index_t j = 0; j < count1; j++) { @@ -4073,22 +4068,24 @@ TEST(OperatorTests, Cast) auto t2 = make_tensor({count0}); auto to = make_tensor({count0}); + cudaExecutor exec{}; + t.SetVals({126, 126, 126, 126}); t2.SetVals({126, 126, 126, 126}); // example-begin as_type-test-1 - (to 
= as_type(t + t2)).run(); + (to = as_type(t + t2)).run(exec); // example-end as_type-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < t.Size(0); i++) { ASSERT_EQ(to(i), -4); // -4 from 126 + 126 wrap-around } // example-begin as_int8-test-1 - (to = as_int8(t + t2)).run(); + (to = as_int8(t + t2)).run(exec); // example-end as_int8-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < t.Size(0); i++) { ASSERT_EQ(to(i), -4); // -4 from 126 + 126 wrap-around @@ -4155,7 +4152,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) (out = legendre(n, m, x)).run(exec); // example-end legendre-test-1 - cudaStreamSynchronize(0); + exec.sync(); for(int j = 0; j < order; j++) { for(int p = 0; p < order; p++) { @@ -4179,7 +4176,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) (out = lcollapse<2>(legendre(order, m, x))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int i = 0 ; i < size; i++) { for(int p = 0; p < order; p++) { @@ -4200,7 +4197,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) (out = lcollapse<3>(legendre(order, order, x))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int i = 0 ; i < size; i++) { if constexpr (is_matx_half_v) { @@ -4220,7 +4217,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) (out = lcollapse<3>(legendre(order, m, x))).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for(int i = 0 ; i < size; i++) { if constexpr (is_matx_half_v) { @@ -4237,6 +4234,7 @@ TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, Legendre) TEST(OperatorTestsAdvanced, AdvancedRemapOp) { typedef cuda::std::complex complex; + cudaExecutor exec{}; MATX_ENTER_HANDLER(); int I = 4; @@ -4275,7 +4273,7 @@ TEST(OperatorTestsAdvanced, AdvancedRemapOp) } } - (B = 0).run(); + (B = 0).run(exec); auto rop = remap<1>(A, idx); auto lop = lcollapse<3>(rop); @@ -4284,9 +4282,9 @@ TEST(OperatorTestsAdvanced, AdvancedRemapOp) ASSERT_EQ(lop.Size(1) , A.Size(3)); 
ASSERT_EQ(lop.Size(0) , I * M * K); - (B = lop).run(); + (B = lop).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (int i = 0; i < I; i++) { for (int m = 0; m < M; m++) { @@ -4337,17 +4335,8 @@ TEST(OperatorTestsAdvanced, AdvancedRemapOp) } } } - - A.PrefetchDevice(0); - B.PrefetchDevice(0); - C.PrefetchDevice(0); - D.PrefetchDevice(0); - O1.PrefetchDevice(0); - O2.PrefetchDevice(0); - O3.PrefetchDevice(0); - O4.PrefetchDevice(0); - cudaDeviceSynchronize(); + exec.sync(); auto o1op = lcollapse<3>(remap<1>(O1, idx)); auto o2op = lcollapse<3>(remap<1>(O2, idx)); @@ -4357,27 +4346,27 @@ TEST(OperatorTestsAdvanced, AdvancedRemapOp) auto cop = C.Clone<4>({matxKeepDim, M, matxKeepDim, matxKeepDim}); auto rcop = lcollapse<3>(remap<1>(cop, idx)); - (O1 = 1).run(); - (O2 = 2).run(); - (O3 = 3).run(); - (O4 = 4).run(); + (O1 = 1).run(exec); + (O2 = 2).run(exec); + (O3 = 3).run(exec); + (O4 = 4).run(exec); - (B = lop).run(); - (D = rcop).run(); + (B = lop).run(exec); + (D = rcop).run(exec); // two operators as input - (o1op = conv1d(lop, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (o1op = conv1d(lop, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); // one tensor and one operators as input - (o2op = conv1d(B, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (o2op = conv1d(B, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); // one tensor and one operators as input - (o3op = conv1d(lop, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (o3op = conv1d(lop, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); //two tensors as input - (o4op = conv1d(B, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(); + (o4op = conv1d(B, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for (int i = 0; i < o1op.Size(0); i++) { for (int l = 0; l < o1op.Size(1); l++) { diff --git a/test/00_operators/PWelch.cu b/test/00_operators/PWelch.cu index 98bf8eddf..059bcad13 100644 
--- a/test/00_operators/PWelch.cu +++ b/test/00_operators/PWelch.cu @@ -100,33 +100,35 @@ void helper(PWelchComplexExponentialTest& test) auto Pxx = make_tensor({test.params.nfft}); + cudaExecutor exec{}; + if (test.params.window_name == "none") { - (Pxx = pwelch(x, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(); + (Pxx = pwelch(x, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(exec); } else { auto w = make_tensor({test.params.nperseg}); if (test.params.window_name == "boxcar") { - (w = ones({test.params.nperseg})).run(); + (w = ones({test.params.nperseg})).run(exec); } else if (test.params.window_name == "hann") { - (w = hanning<0,1,typename TypeParam::value_type>({test.params.nperseg})).run(); + (w = hanning<0,1,typename TypeParam::value_type>({test.params.nperseg})).run(exec); } else if (test.params.window_name == "flattop") { - (w = flattop<0,1,typename TypeParam::value_type>({test.params.nperseg})).run(); + (w = flattop<0,1,typename TypeParam::value_type>({test.params.nperseg})).run(exec); } else { ASSERT_TRUE(false) << "Unknown window parameter name " + test.params.window_name; } - (Pxx = pwelch(x, w, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(); + (Pxx = pwelch(x, w, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(exec); } - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(test.pb, Pxx, "Pxx_out", test.thresh); MATX_EXIT_HANDLER(); @@ -154,13 +156,14 @@ TEST(PWelchOpTest, xin_complex_float) index_t noverlap = 0; index_t nfft = 8; auto x = ones>({signal_size}); + cudaExecutor exec{}; // example-begin pwelch-test-1 auto Pxx = make_tensor({nfft}); auto w = ones({nperseg}); - (Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(); + (Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec); // example-end pwelch-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_NEAR(Pxx(0), 64, thresh); for (index_t k=1; k({3, 4, 5}); (t2 = sum(t4, {2, 3})).run(exec); - 
cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0); i++) { for (index_t j = 0; j < t2.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -294,7 +294,7 @@ TYPED_TEST(ReductionTestsNumericNoHalfAllExecs, Sum) } (t2 = sum(t3, {2})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0); i++) { for (index_t j = 0; j < t2.Size(1); j++) { ASSERT_TRUE( @@ -371,7 +371,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = sum(permute(t4,{2,3,0,1}))).run(exec); (t2b = sum(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -384,7 +384,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = mean(permute(t4,{2,3,0,1}))).run(exec); (t2b = mean(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -398,7 +398,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) median(t2a, permute(t4,{2,3,0,1}), exec); median(t2b, t4, {0,1}, exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -413,7 +413,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = prod(permute(t4,{2,3,0,1}))).run(exec); (t2b = prod(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -431,7 +431,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2b = max(t4, {0,1})).run(exec); // example-end max-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 
0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -449,7 +449,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2b = min(t4, {0,1})).run(exec); // example-end min-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -467,7 +467,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (mtie(t2b, t2bi) = argmax(t4, {0,1})).run(exec); // example-end argmax-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -487,7 +487,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (mtie(t2b, t2bi) = argmin(t4, {0,1})).run(exec); // example-end argmin-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -507,7 +507,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2b = any(t4, {0,1})).run(exec); // example-end any-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -525,7 +525,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2b = all(t4, {0,1})).run(exec); // example-end all-test-2 - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -539,7 +539,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = var(permute(t4,{2,3,0,1}))).run(exec); (t2b = var(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 
0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -553,7 +553,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, PermutedReduce) (t2a = stdd(permute(t4,{2,3,0,1}))).run(exec); (t2b = stdd(t4, {0,1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2a.Size(0); i++) { for (index_t j = 0; j < t2a.Size(1); j++) { ASSERT_TRUE(MatXUtils::MatXTypeCompare( @@ -585,7 +585,7 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, Any) (t2 = zeros(t2.Shape())).run(exec); (t3 = zeros(t3.Shape())).run(exec); (t4 = zeros(t4.Shape())).run(exec); - cudaStreamSynchronize(0); + exec.sync(); t1(5) = 5; t3(1, 1, 1) = 6; @@ -595,19 +595,19 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, Any) // convertible to "true" (t0 = any(t4)).run(exec); // example-end any-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(0))); (t0 = any(t3)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = any(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(0))); (t0 = any(t1)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); } @@ -631,13 +631,13 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, AllClose) (B = ones(B.Shape())).run(exec); allclose(C, A, B, 1e-5, 1e-8, exec); // example-end allclose-test-1 - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(C(), 1); B(1,1,1) = 2; allclose(C, A, B, 1e-5, 1e-8, exec); - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(C(), 0); @@ -661,11 +661,11 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, All) tensor_t t3({30, 40, 50}); tensor_t t4({30, 40, 50, 60}); - (t1 = ones(t1.Shape())).run(); - (t2 = ones(t2.Shape())).run(); - (t3 = ones(t3.Shape())).run(); - (t4 = 
ones(t4.Shape())).run(); - cudaStreamSynchronize(0); + (t1 = ones(t1.Shape())).run(exec); + (t2 = ones(t2.Shape())).run(exec); + (t3 = ones(t3.Shape())).run(exec); + (t4 = ones(t4.Shape())).run(exec); + exec.sync(); t1(5) = 0; t3(1, 1, 1) = 0; @@ -675,19 +675,19 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, All) // convert to "true", or "false" otherwise (t0 = all(t4)).run(exec); // example-end all-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = all(t3)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(0))); (t0 = all(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = all(t1)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(0))); } @@ -718,51 +718,51 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Percentile) // Find the 50th percentile value in `t1e` using linear interpolation between midpoints (t0 = percentile(t1e, 50, PercentileMethod::LINEAR)).run(exec); // example-end percentile-test-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_linear50", 0.01); (t0 = percentile(t1e, 80, PercentileMethod::LINEAR)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_linear80", 0.01); (t0 = percentile(t1e, 50, PercentileMethod::LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_lower50", 0.01); (t0 = percentile(t1e, 80, PercentileMethod::LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_lower80", 0.01); (t0 = percentile(t1e, 50, PercentileMethod::HIGHER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_higher50", 0.01); (t0 = percentile(t1e, 80, 
PercentileMethod::HIGHER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_higher80", 0.01); (t0 = percentile(t1o, 50, PercentileMethod::LINEAR)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_linear50", 0.01); (t0 = percentile(t1o, 80, PercentileMethod::LINEAR)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_linear80", 0.01); (t0 = percentile(t1o, 50, PercentileMethod::LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_lower50", 0.01); (t0 = percentile(t1o, 80, PercentileMethod::LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_lower80", 0.01); (t0 = percentile(t1o, 50, PercentileMethod::HIGHER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_higher50", 0.01); (t0 = percentile(t1o, 80, PercentileMethod::HIGHER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_higher80", 0.01); } @@ -794,20 +794,20 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Median) // Compute media over all elements in "t1e" and store result in "t0" (t0 = median(t1e)).run(exec); // example-end median-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(4.5f))); (t0 = median(t1o)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(5))); (t1out = median(t2e, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1out(0), (TestType)(2.5f))); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1out(1), (TestType)(2.5f))); (t1out = median(t2o, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1out(0), (TestType)(3.0f))); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1out(1), 
(TestType)(3.0f))); } @@ -824,10 +824,12 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, MinMaxNegative) auto t = matx::make_tensor({3}); t.SetVals({-3, -1, -7}); + ExecType exec{}; + matx::tensor_t max_val{{}}; matx::tensor_t max_idx{{}}; - (mtie(max_val, max_idx) = matx::argmax(t)).run(ExecType{}); - cudaStreamSynchronize(0); + (mtie(max_val, max_idx) = matx::argmax(t)).run(exec); + exec.sync(); ASSERT_EQ(max_val(), -1); ASSERT_EQ(max_idx(), 1); } @@ -853,7 +855,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Max) // Reduce all inputs in "t1o" into "t0" by the maximum of all elements (t0 = max(t1o)).run(exec); // example-end max-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(11))); } @@ -878,7 +880,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Min) // Reduce all inputs in "t1o" into "t0" by the minimum of all elements (t0 = min(t1o)).run(exec); // example-end min-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); } @@ -903,7 +905,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMax) (mtie(t0, t0i) = argmax(t1o)).run(exec); // example-end argmax-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(11))); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0i(), (TestType)(10))); @@ -913,7 +915,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMax) t2o.SetVals({{(T)2, (T)4, (T)1, (T)3, (T)5}, {(T)3, (T)1, (T)5, (T)2, (T)4}}); (mtie(t1o_small, t1i_small) = argmax(t2o, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); auto rel = GetIdxFromAbs(t2o, t1i_small(0)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5))); @@ -942,7 +944,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMin) (mtie(t0, t0i) = argmin(t1o)).run(exec); // example-end argmin-test-1 - cudaStreamSynchronize(0); + exec.sync(); 
EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0i(), (TestType)(0))); @@ -952,7 +954,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMin) t2o.SetVals({{(T)2, (T)4, (T)1, (T)3, (T)5}, {(T)3, (T)1, (T)5, (T)2, (T)4}}); (mtie(t1o_small, t1i_small) = argmin(t2o, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); auto rel = GetIdxFromAbs(t2o, t1i_small(0)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(1))); @@ -982,19 +984,19 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Mean) // Compute the mean over all dimensions in "t4" and store the result in "t0" (t0 = mean(t4)).run(exec); // example-end mean-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = mean(t3)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = mean(t2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); (t0 = mean(t1)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), (TestType)(1))); } { @@ -1005,19 +1007,19 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Mean) auto t2 = ones({30, 40}); (t1 = mean(t4, {1, 2, 3})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t1.Size(0); i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (TestType)(1))); } (t1 = mean(t3, {1, 2})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t1.Size(0); i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (TestType)(1))); } (t1 = mean(t2, {1})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t1.Size(0); i++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t1(i), (TestType)(1))); } @@ -1030,7 +1032,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Mean) auto t3 = ones({30, 
40, 50}); (t2 = mean(t4, {2, 3})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0); i++) { for (index_t j = 0; j < t2.Size(1); j++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2(i, j), (TestType)(1))); @@ -1038,7 +1040,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Mean) } (t2 = mean(t3, {2})).run(exec); - cudaStreamSynchronize(0); + exec.sync(); for (index_t i = 0; i < t2.Size(0); i++) { for (index_t j = 0; j < t2.Size(1); j++) { EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2(i, j), (TestType)(1))); @@ -1085,11 +1087,11 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, Prod) // Compute the product of all elements in "t2" and store into "t0" (t0 = prod(t2)).run(exec); // example-end prod-test-1 - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), t2p)); (t0 = prod(t1)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t0(), t1p)); } @@ -1104,6 +1106,8 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Find) using TestType = std::tuple_element_t<0, TypeParam>; using ExecType = std::tuple_element_t<1, TypeParam>; + ExecType exec{}; + tensor_t num_found{{}}; tensor_t t1{{100}}; tensor_t t1o{{100}}; @@ -1116,9 +1120,9 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Find) // example-begin find-test-1 // Find values greater than 0.5 TestType thresh = (TestType)0.5; - (mtie(t1o, num_found) = find(t1, GT{thresh})).run(ExecType{}); + (mtie(t1o, num_found) = find(t1, GT{thresh})).run(exec); // example-end find-test-1 - cudaStreamSynchronize(0); + exec.sync(); int output_found = 0; for (int i = 0; i < t1.Size(0); i++) { @@ -1143,6 +1147,8 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, FindIdx) using TestType = std::tuple_element_t<0, TypeParam>; using ExecType = std::tuple_element_t<1, TypeParam>; + ExecType exec{}; + tensor_t num_found{{}}; tensor_t t1{{100}}; tensor_t t1o{{100}}; @@ -1155,9 +1161,9 @@ 
TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, FindIdx) // example-begin find_idx-test-1 // Find indices with values greater than 0.5 TestType thresh = (TestType)0.5; - (mtie(t1o, num_found) = find_idx(t1, GT{thresh})).run(ExecType{}); + (mtie(t1o, num_found) = find_idx(t1, GT{thresh})).run(exec); // example-end find_idx-test-1 - cudaStreamSynchronize(0); + exec.sync(); int output_found = 0; for (int i = 0; i < t1.Size(0); i++) { @@ -1187,7 +1193,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, FindIdxAndSelect) tensor_t t1o_2{{100}}; TestType thresh = (TestType)0.5; - auto executor = ExecType{}; + ExecType exec{}; for (int i = 0; i < t1.Size(0); i++) { t1(i) = static_cast>((float)rand() / @@ -1196,19 +1202,19 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, FindIdxAndSelect) // Find indices with values greater than 0.5 // example-begin select-test-1 - (mtie(t1o_idx, num_found) = find_idx(t1, GT{thresh})).run(executor); + (mtie(t1o_idx, num_found) = find_idx(t1, GT{thresh})).run(exec); // Since we use the output on the host in select() we need to synchronize first - cudaStreamSynchronize(0); + exec.sync(); auto t1o_slice = t1o.Slice({0}, {num_found()}); auto t1o_idx_slice = t1o_idx.Slice({0}, {num_found()}); - (t1o_slice = select(t1o_slice, t1o_idx_slice)).run(executor); + (t1o_slice = select(t1o_slice, t1o_idx_slice)).run(exec); // Compare to simply finding the values - (mtie(t1o_2, num_found2) = find(t1, GT{thresh})).run(executor); + (mtie(t1o_2, num_found2) = find(t1, GT{thresh})).run(exec); // example-end select-test-1 - cudaStreamSynchronize(0); + exec.sync(); ASSERT_EQ(num_found(), num_found2()); @@ -1227,6 +1233,8 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Unique) using TestType = std::tuple_element_t<0, TypeParam>; using ExecType = std::tuple_element_t<1, TypeParam>; + ExecType exec{}; + tensor_t num_found{{}}; tensor_t t1{{100}}; tensor_t t1o{{100}}; @@ -1236,9 +1244,9 @@ 
TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Unique) } // example-begin unique-test-1 - (mtie(t1o, num_found) = unique(t1)).run(ExecType{}); + (mtie(t1o, num_found) = unique(t1)).run(exec); // example-end unique-test-1 - cudaStreamSynchronize(0); + exec.sync(); for (int i = 0; i < 10; i++) { ASSERT_NEAR(t1o(i), i, 0.01); @@ -1270,7 +1278,7 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Trace) (t0 = trace(t2)).run(exec); // example-end trace-test-1 - cudaDeviceSynchronize(); + exec.sync(); ASSERT_EQ(t0(), count); MATX_EXIT_HANDLER(); diff --git a/test/00_solver/Cholesky.cu b/test/00_solver/Cholesky.cu index 79bfc7762..576745496 100644 --- a/test/00_solver/Cholesky.cu +++ b/test/00_solver/Cholesky.cu @@ -83,7 +83,7 @@ TYPED_TEST(CholSolverTestNonHalfFloatTypes, CholeskyBasic) // example-begin chol-test-1 (Bv = chol(Bv, CUBLAS_FILL_MODE_LOWER)).run(exec); // example-end chol-test-1 - cudaStreamSynchronize(0); + exec.sync(); // Cholesky fills the lower triangular portion (due to CUBLAS_FILL_MODE_LOWER) // and destroys the upper triangular portion. @@ -130,10 +130,10 @@ TYPED_TEST(CholSolverTestNonHalfFloatTypes, CholeskyWindowed) this->pb->NumpyToTensorView(Cv, "B"); this->pb->NumpyToTensorView(Lv, "L"); (Bslice = Cv).run(exec); - cudaStreamSynchronize(0); + exec.sync(); (Bslice = chol(Bslice, CUBLAS_FILL_MODE_LOWER)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); // Cholesky fills the lower triangular portion (due to CUBLAS_FILL_MODE_LOWER) // and destroys the upper triangular portion. 
diff --git a/test/00_solver/Det.cu b/test/00_solver/Det.cu index 870979889..e6fea1663 100644 --- a/test/00_solver/Det.cu +++ b/test/00_solver/Det.cu @@ -76,7 +76,7 @@ TYPED_TEST(DetSolverTestNonComplexFloatTypes, Determinant) (this->detv = det(this->Atv)).run(this->exec); (this->Av = transpose(this->Atv)).run(this->exec); // Transpose back to row-major - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, this->detv, "det", 0.1); diff --git a/test/00_solver/Eigen.cu b/test/00_solver/Eigen.cu index 3076f579d..b1a4c1c8d 100644 --- a/test/00_solver/Eigen.cu +++ b/test/00_solver/Eigen.cu @@ -94,7 +94,7 @@ TYPED_TEST(EigenSolverTestNonComplexFloatTypes, EigenBasic) // Compute A*v (this->Gtv = matmul(this->Bv, this->Wv)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Compare for (index_t j = 0; j < dim_size; j++) { ASSERT_NEAR(this->Gtv(j, 0), this->Lvv(j, 0), 0.001); diff --git a/test/00_solver/Inverse.cu b/test/00_solver/Inverse.cu index 74fa61ee4..4896cfb64 100644 --- a/test/00_solver/Inverse.cu +++ b/test/00_solver/Inverse.cu @@ -77,7 +77,7 @@ TYPED_TEST(InvSolverTestFloatTypes, Inv4x4) // Perform an inverse on matrix "A" and store the output in "Ainv" (Ainv = inv(A)).run(this->exec); // example-end inv-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < A.Size(0); i++) { for (index_t j = 0; j <= i; j++) { @@ -108,7 +108,7 @@ TYPED_TEST(InvSolverTestFloatTypes, Inv4x4Batched) this->pb->NumpyToTensorView(Ainv_ref, "A_inv"); (Ainv = inv(A)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t b = 0; b < A.Size(0); b++) { for (index_t i = 0; i < A.Size(1); i++) { @@ -141,7 +141,7 @@ TYPED_TEST(InvSolverTestFloatTypes, Inv8x8) this->pb->NumpyToTensorView(Ainv_ref, "A_inv"); (Ainv = inv(A)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < A.Size(0); i++) { for (index_t j = 0; j <= i; j++) { @@ -172,7 +172,7 @@ 
TYPED_TEST(InvSolverTestFloatTypes, Inv8x8Batched) this->pb->NumpyToTensorView(Ainv_ref, "A_inv"); (Ainv = inv(A)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t b = 0; b < A.Size(0); b++) { for (index_t i = 0; i < A.Size(1); i++) { @@ -206,7 +206,7 @@ TYPED_TEST(InvSolverTestFloatTypes, Inv256x256) this->pb->NumpyToTensorView(Ainv_ref, "A_inv"); (Ainv = inv(A)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < A.Size(0); i++) { for (index_t j = 0; j <= i; j++) { diff --git a/test/00_solver/LU.cu b/test/00_solver/LU.cu index 8f57dd5c4..36cbdc0c7 100644 --- a/test/00_solver/LU.cu +++ b/test/00_solver/LU.cu @@ -77,7 +77,7 @@ TYPED_TEST(LUSolverTestNonComplexFloatTypes, LUBasic) // example-begin lu-test-1 (mtie(this->Av, this->PivV) = lu(this->Av)).run(this->exec); // example-end lu-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); // The upper and lower triangle components are saved in Av. Python saves them // as separate matrices with the diagonal of the lower matrix set to 0 diff --git a/test/00_solver/QR.cu b/test/00_solver/QR.cu index 1c11be41b..2ac40a85f 100644 --- a/test/00_solver/QR.cu +++ b/test/00_solver/QR.cu @@ -82,7 +82,7 @@ TYPED_TEST(QRSolverTestNonComplexFloatTypes, QRBasic) // solve, then transpose again to compare to Python (mtie(this->Av, this->TauV) = cusolver_qr(this->Av)).run(this->exec); // example-end cusolver_qr-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); // For now we're only verifying R. 
Q is a bit more complex to compute since // cuSolver/BLAS don't return Q, and instead return Householder reflections diff --git a/test/00_solver/QR2.cu b/test/00_solver/QR2.cu index 38abc510e..25f730adf 100644 --- a/test/00_solver/QR2.cu +++ b/test/00_solver/QR2.cu @@ -51,6 +51,7 @@ void qr_test( const index_t (&AshapeA)[RANK]) { using SType = typename inner_op_type_t::type; cudaStream_t stream = 0; + cudaExecutor exec{stream}; std::array Ashape = detail::to_array(AshapeA); std::array Qshape = Ashape; @@ -69,14 +70,10 @@ void qr_test( const index_t (&AshapeA)[RANK]) { auto Q = make_tensor(Qshape); auto R = make_tensor(Rshape); - (A = random(Ashape, NORMAL)).run(stream); - - A.PrefetchDevice(stream); - Q.PrefetchDevice(stream); - R.PrefetchDevice(stream); + (A = random(Ashape, NORMAL)).run(exec); // example-begin qr-test-1 - (mtie(Q, R) = qr(A)).run(stream); + (mtie(Q, R) = qr(A)).run(exec); // example-end qr-test-1 auto mdiffQTQ = make_tensor({}); @@ -85,7 +82,7 @@ void qr_test( const index_t (&AshapeA)[RANK]) { { // QTQ == Identity auto QTQ = make_tensor(Qshape); - (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(stream); + (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(exec); auto e = eye({m, m}); auto eShape = Qshape; @@ -93,19 +90,19 @@ void qr_test( const index_t (&AshapeA)[RANK]) { eShape[RANK-2] = matxKeepDim; auto I = clone(e, eShape); - (mdiffQTQ = max(abs(QTQ-I))).run(stream); + (mdiffQTQ = max(abs(QTQ-I))).run(exec); } { // Q*R == A auto QR = make_tensor(Ashape); - (QR = matmul(Q, R)).run(stream); + (QR = matmul(Q, R)).run(exec); - (mdiffQR = max(abs(A-QR))).run(stream); + (mdiffQR = max(abs(A-QR))).run(exec); } - cudaDeviceSynchronize(); + exec.sync(); ASSERT_NEAR( mdiffQTQ(), SType(0), .00001); ASSERT_NEAR( mdiffQR(), SType(0), .00001); diff --git a/test/00_solver/SVD.cu b/test/00_solver/SVD.cu index 11a2905e3..990e6d04f 100644 --- a/test/00_solver/SVD.cu +++ b/test/00_solver/SVD.cu @@ -94,7 +94,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasic) 
(mtie(Uv, Sv, Vv) = svd(Atv2)).run(this->exec); // example-end svd-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); // Since SVD produces a solution that's not necessarily unique, we cannot // compare against Python output. Instead, we just make sure that A = U*S*V'. @@ -105,14 +105,14 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasic) // Zero out s (Sav = zeros::type>({m, n})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Construct S matrix since it's just a vector from cuSolver for (index_t i = 0; i < n; i++) { Sav(i, i) = Sv(i); } - cudaStreamSynchronize(0); + this->exec.sync(); (SSolav = 0).run(this->exec); if constexpr (is_complex_v) { @@ -124,7 +124,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasic) (tmpV = matmul(Uav, SSolav)).run(this->exec); // U * S (SSolav = matmul(tmpV, Vav)).run(this->exec); // (U * S) * V' - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < Av.Size(0); i++) { for (index_t j = 0; j < Av.Size(1); j++) { @@ -175,7 +175,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched) auto Atv2 = Atv.View({batches, m, n}); (mtie(Uv, Sv, Vv) = svd(Atv2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Since SVD produces a solution that's not necessarily unique, we cannot // compare against Python output. Instead, we just make sure that A = U*S*V'. 
@@ -186,7 +186,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched) // Zero out s (Sav = zeros::type>({batches, m, n})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Construct S matrix since it's just a vector from cuSolver for (index_t b = 0; b < batches; b++) { @@ -195,7 +195,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched) } } - cudaStreamSynchronize(0); + this->exec.sync(); (SSolav = 0).run(this->exec); if constexpr (is_complex_v) { @@ -207,7 +207,7 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched) (tmpV = matmul(Uav, SSolav)).run(this->exec); // U * S (SSolav = matmul(tmpV, Vav)).run(this->exec); // (U * S) * V' - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t b = 0; b < batches; b++) { for (index_t i = 0; i < Av.Size(0); i++) { @@ -313,7 +313,7 @@ void svdpi_test( const index_t (&AshapeA)[RANK], Executor exec) { (mdiffV = max(VTVd)).run(exec); (mdiffA = max(Ad)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); #if 0 printf("A\n"); print(A); @@ -363,7 +363,7 @@ void svdbpi_test( const index_t (&AshapeA)[RANK], Executor exec) { std::array Ashape = detail::to_array(AshapeA); - cudaDeviceSynchronize(); + exec.sync(); index_t mm = Ashape[RANK-2]; index_t nn = Ashape[RANK-1]; @@ -445,7 +445,7 @@ void svdbpi_test( const index_t (&AshapeA)[RANK], Executor exec) { (mdiffV = max(VTVd)).run(exec); (mdiffA = max(Ad)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); #if 0 printf("A\n"); print(A); @@ -467,7 +467,7 @@ void svdbpi_test( const index_t (&AshapeA)[RANK], Executor exec) { ASSERT_NEAR( mdiffV(), SType(0), .1); ASSERT_NEAR( mdiffA(), SType(0), .00001); - cudaDeviceSynchronize(); + exec.sync(); } TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBPI) diff --git a/test/00_tensor/BasicTensorTests.cu b/test/00_tensor/BasicTensorTests.cu index bd19ac0ba..9d5ee8e9b 100644 --- a/test/00_tensor/BasicTensorTests.cu +++ b/test/00_tensor/BasicTensorTests.cu @@ -242,7 +242,7 @@ TYPED_TEST(BasicTensorTestsAll, 
AssignmentOps) (t2c = this->t2).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), t2c(i,j)); @@ -250,7 +250,7 @@ TYPED_TEST(BasicTensorTestsAll, AssignmentOps) } (this->t2 = t2c = t2c2).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), t2c2(i,j)); @@ -277,7 +277,7 @@ TYPED_TEST(BasicTensorTestsNumeric, AssignmentOps) } (this->t2 += t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), t2c(i,j) + t2c(i,j)); @@ -285,7 +285,7 @@ TYPED_TEST(BasicTensorTestsNumeric, AssignmentOps) } (this->t2 -= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), t2c(i,j)); @@ -293,7 +293,7 @@ TYPED_TEST(BasicTensorTestsNumeric, AssignmentOps) } (this->t2 *= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), static_cast(1)); @@ -301,8 +301,8 @@ TYPED_TEST(BasicTensorTestsNumeric, AssignmentOps) } (t2c = this->t2).run(this->exec); - (this->t2 /= static_cast(1)).run(); - cudaStreamSynchronize(0); + (this->t2 /= static_cast(1)).run(this->exec); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(t2c(i,j) , this->t2(i,j)); @@ -328,7 +328,7 @@ TYPED_TEST(BasicTensorTestsIntegral, AssignmentOps) } (this->t2 |= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { 
ASSERT_EQ(this->t2(i,j), 3); @@ -336,7 +336,7 @@ TYPED_TEST(BasicTensorTestsIntegral, AssignmentOps) } (this->t2 &= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), 2); @@ -344,7 +344,7 @@ TYPED_TEST(BasicTensorTestsIntegral, AssignmentOps) } (this->t2 ^= t2c).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < t2c.Size(0); i++) { for (index_t j = 0; j < t2c.Size(1); j++) { ASSERT_EQ(this->t2(i,j), 0); diff --git a/test/00_tensor/CUBTests.cu b/test/00_tensor/CUBTests.cu index d162b8e83..9a34901fb 100644 --- a/test/00_tensor/CUBTests.cu +++ b/test/00_tensor/CUBTests.cu @@ -117,10 +117,12 @@ TEST(TensorStats, Hist) inv.SetVals({2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5}); + cudaExecutor exec{}; + // example-begin hist-test-1 - (outv = hist(inv, 0.0f, 12.0f)).run();; + (outv = hist(inv, 0.0f, 12.0f)).run(exec); // example-end hist-test-1 - cudaStreamSynchronize(0); + exec.sync(); std::array sol = {1, 5, 0, 3, 0, 0}; for (index_t i = 0; i < outv.Lsize(); i++) { @@ -147,7 +149,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, CumSum) // Compute the cumulative sum/exclusive scan across "t1" (tmpv = cumsum(this->t1)).run(this->exec); // example-end cumsum-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); TestType ttl = 0; for (index_t i = 0; i < tmpv.Lsize(); i++) { @@ -165,7 +167,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, CumSum) } (tmpv2 = cumsum(this->t2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < tmpv2.Size(0); i++) { ttl = 0; for (index_t j = 0; j < tmpv2.Size(1); j++) { @@ -194,7 +196,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, Sort) // Ascending sort of 1D input (tmpv = matx::sort(this->t1, SORT_DIR_ASC)).run(this->exec); // example-end sort-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 1; i 
< tmpv.Lsize(); i++) { ASSERT_TRUE(tmpv(i) > tmpv(i - 1)); @@ -204,7 +206,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, Sort) // Descending sort of 1D input (tmpv = matx::sort(this->t1, SORT_DIR_DESC)).run(this->exec); // example-end sort-test-2 - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 1; i < tmpv.Lsize(); i++) { ASSERT_TRUE(tmpv(i) < tmpv(i - 1)); @@ -220,7 +222,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, Sort) } (tmpv2 = matx::sort(this->t2, SORT_DIR_ASC)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < tmpv2.Size(0); i++) { for (index_t j = 1; j < tmpv2.Size(1); j++) { @@ -230,7 +232,7 @@ TYPED_TEST(CUBTestsNumericNonComplexAllExecs, Sort) // Descending (tmpv2 = matx::sort(this->t2, SORT_DIR_DESC)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t i = 0; i < tmpv2.Size(0); i++) { for (index_t j = 1; j < tmpv2.Size(1); j++) { diff --git a/test/00_tensor/EinsumTests.cu b/test/00_tensor/EinsumTests.cu index 02c9bcbd1..ab1c68786 100644 --- a/test/00_tensor/EinsumTests.cu +++ b/test/00_tensor/EinsumTests.cu @@ -121,7 +121,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Contraction3D) // Perform a 3D tensor contraction (c2 = cutensor::einsum("ijk,jil->kl", a, b)).run(exec); // example-end einsum-contraction-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c2, "c_float3d", 0.01); MATX_EXIT_HANDLER(); @@ -146,7 +146,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Dot) // Perform a dot product of b1 with itself and store in a1 (c0 = cutensor::einsum("i,i->", a1, b1)).run(exec); // example-end einsum-dot-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_ASSERT_EQ(c0(), 4 * a1.Size(0)); MATX_EXIT_HANDLER(); @@ -173,7 +173,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, GEMM) (c2 = cutensor::einsum("mk,kn->mn", a2, b2)).run(exec); (c22 = matmul(a2, b2)).run(exec); // example-end einsum-gemm-1 - 
cudaStreamSynchronize(0); + exec.sync(); for (auto i = 0; i < c2.Size(0); i++) { for (auto j = 0; j < c2.Size(1); j++) { @@ -203,7 +203,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, GEMMTranspose) (c2 = cutensor::einsum("mk,kn->nm", a2, b2)).run(exec); // example-end einsum-gemm-2 (c22 = matmul(a2, b2)).run(exec); - cudaStreamSynchronize(0); + exec.sync(); auto c22t = c22.Permute({1,0}); // Permute to match cutensor @@ -232,7 +232,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Permute) (b = cutensor::einsum("ijkl->jlki", a)).run(exec); (b2 = a.Permute({1,3,2,0})).run(exec); // example-end einsum-permute-1 - cudaStreamSynchronize(0); + exec.sync(); for (auto i = 0; i < b.Size(0); i++) { for (auto j = 0; j < b.Size(1); j++) { @@ -264,7 +264,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Sum) (b = matx::cutensor::einsum("ij->j", a)).run(exec); // example-end einsum-sum-1 - cudaStreamSynchronize(0); + exec.sync(); for (auto i = 0; i < a.Size(1); i++) { TestType s = 0; for (auto j = 0; j < a.Size(0); j++) { @@ -294,7 +294,7 @@ TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Trace) (c0_1 = trace(a2)).run(exec); // example-end einsum-trace-1 - cudaStreamSynchronize(0); + exec.sync(); MATX_ASSERT_EQ(c0_0(), c0_1()); MATX_ASSERT_EQ(c0_0(), 10); diff --git a/test/00_tensor/ViewTests.cu b/test/00_tensor/ViewTests.cu index 9c913bc9b..599d98f8c 100644 --- a/test/00_tensor/ViewTests.cu +++ b/test/00_tensor/ViewTests.cu @@ -430,7 +430,7 @@ TYPED_TEST(ViewTestsFloatNonComplexNonHalf, Random) (t3f = (TestType)-1000000).run(this->exec); (t3f = random({count, count, count}, UNIFORM)).run(this->exec); // example-end random-test-1 - cudaDeviceSynchronize(); + this->exec.sync(); TestType total = 0; for (index_t i = 0; i < count; i++) { @@ -449,7 +449,7 @@ TYPED_TEST(ViewTestsFloatNonComplexNonHalf, Random) (t3f = (TestType)-1000000).run(this->exec); (t3f = random({count, count, count}, NORMAL)).run(this->exec); - cudaDeviceSynchronize(); + 
this->exec.sync(); total = 0; diff --git a/test/00_transform/ChannelizePoly.cu b/test/00_transform/ChannelizePoly.cu index ca5947aea..582063da2 100644 --- a/test/00_transform/ChannelizePoly.cu +++ b/test/00_transform/ChannelizePoly.cu @@ -149,8 +149,6 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Simple) { 1000000, 40*16, 40 } }; - cudaStream_t stream = 0; - for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { const index_t a_len = test_cases[i].a_len; const index_t f_len = test_cases[i].f_len; @@ -172,14 +170,14 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Simple) (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); // example-end channelize_poly-test-1 - cudaStreamSynchronize(stream); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); // Now test with a multiplicative operator on the input. The channelizer is linear, // so we can inverse-scale the output to compare against the golden outputs. (b = channelize_poly(static_cast(4.0) * a, f, num_channels, decimation_factor)).run(this->exec); (b = b * static_cast(0.25)).run(this->exec); - cudaStreamSynchronize(stream); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } @@ -233,7 +231,7 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Batched) this->pb->NumpyToTensorView(f, "filter_random"); (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); @@ -241,7 +239,7 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Batched) // so we can inverse-scale the output to compare against the golden outputs. 
(b = channelize_poly(static_cast(4.0) * a, f, num_channels, decimation_factor)).run(this->exec); (b = b * static_cast(0.25)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } @@ -277,12 +275,12 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, IdentityFilter) this->pb->NumpyToTensorView(a, "a"); for (auto i = 0; i < num_channels; i++) { f(i) = 1; } - cudaStreamSynchronize(0); + this->exec.sync(); const index_t decimation_factor = num_channels; (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (auto k = 0; k < b_elem_per_channel; k++) { // Explicit DFT in the channel dimension. The complex exponential sign here is opposite @@ -315,8 +313,6 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Operators) using InnerType = typename test_types::inner_type::type; using ComplexType = typename test_types::complex_type::type; - cudaStream_t stream = 0; - const index_t a_len = 2500; [[maybe_unused]] const index_t f_len = 90; const index_t num_channels = 10; @@ -331,13 +327,13 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Operators) this->pb->NumpyToTensorView(a, "a"); this->pb->NumpyToTensorView(f, "filter_random"); - cudaStreamSynchronize(stream); + this->exec.sync(); const index_t decimation_factor = num_channels; auto b = permute(bp, {1, 0}); (b = channelize_poly(shift<0>(shift<0>(a, 8), -8), f, num_channels, decimation_factor)).run(this->exec); - cudaStreamSynchronize(stream); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); @@ -610,12 +606,12 @@ TYPED_TEST(ChannelizePolyTestDoubleType, Harris2003) cudaMemcpyAsync(a.Data(), input.data(), signal_len * sizeof(TestType), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(f.Data(), filter.data(), filter_len * sizeof(TestType), cudaMemcpyHostToDevice, stream); - cudaStreamSynchronize(stream); + this->exec.sync(); const index_t 
decimation_factor = num_channels; (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); - cudaStreamSynchronize(stream); + this->exec.sync(); for (auto chan = 0; chan < num_channels; chan++) { for (auto k = 0; k < b_elem_per_channel; k++) { diff --git a/test/00_transform/ConvCorr.cu b/test/00_transform/ConvCorr.cu index b1522058e..f0c32c7ea 100644 --- a/test/00_transform/ConvCorr.cu +++ b/test/00_transform/ConvCorr.cu @@ -593,7 +593,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out2 = conv1d(in1, in2, {2}, MATX_C_MODE_SAME)).run(this->exec); // example-end conv1d-test-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -608,7 +608,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out2 = conv1d(in1, in2, {1}, MATX_C_MODE_SAME)).run(this->exec); // example-end conv1d-test-3 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -621,7 +621,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out1.Permute({1,2,0}) = conv1d(in1.Permute({1,2,0}), in2.Permute({1,2,0}), MATX_C_MODE_SAME)).run(this->exec); (out2 = conv1d(in1, in2, {0}, MATX_C_MODE_SAME)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -634,7 +634,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out1 = corr(in1, in2, MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); (out2 = corr(in1, in2, {2}, MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -647,7 +647,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out1.Permute({0,2,1}) = corr(in1.Permute({0,2,1}), in2.Permute({0,2,1}), MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); (out2 = corr(in1, in2, {1}, MATX_C_MODE_SAME, 
MATX_C_METHOD_DIRECT)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -660,7 +660,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv1Axis) (out1.Permute({1,2,0}) = corr(in1.Permute({1,2,0}), in2.Permute({1,2,0}), MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); (out2 = corr(in1, in2, {0}, MATX_C_MODE_SAME, MATX_C_METHOD_DIRECT)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -699,7 +699,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv2Axis) (out1 = conv2d(in1, in2, MATX_C_MODE_SAME)).run(this->exec); (out2 = conv2d(in1, in2, {1, 2}, MATX_C_MODE_SAME)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -712,7 +712,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv2Axis) (out1.Permute({0,2,1}) = conv2d(in1.Permute({0,2,1}), in2.Permute({0,2,1}), MATX_C_MODE_SAME)).run(this->exec); (out2 = conv2d(in1, in2, {2, 1}, MATX_C_MODE_SAME)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -725,7 +725,7 @@ TYPED_TEST(CorrelationConvolutionTestFloatTypes, Conv2Axis) (out1.Permute({1,2,0}) = conv2d(in1.Permute({1,2,0}), in2.Permute({1,2,0}), MATX_C_MODE_SAME)).run(this->exec); (out2 = conv2d(in1, in2, {2, 0}, MATX_C_MODE_SAME)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { diff --git a/test/00_transform/Copy.cu b/test/00_transform/Copy.cu index 7e17ea8e4..1dd32155e 100755 --- a/test/00_transform/Copy.cu +++ b/test/00_transform/Copy.cu @@ -54,12 +54,6 @@ TYPED_TEST(CopyTestsAll, CopyOutParam) ExecType exec{}; - auto sync = []() constexpr { - if constexpr (std::is_same_v) { - cudaDeviceSynchronize(); - } - }; - const int SZ = 5; TestType DEFAULT, 
TEST_VAL; if constexpr (std::is_same_v) { @@ -81,13 +75,13 @@ TYPED_TEST(CopyTestsAll, CopyOutParam) auto in = make_tensor(dims); \ auto out = make_tensor(dims); \ (in = DEFAULT).run(exec); \ - sync(); \ + exec.sync(); \ std::array inds; \ inds.fill(SZ/2); \ in(inds) = TEST_VAL; \ - sync(); \ + exec.sync(); \ matx::copy(out, in, exec); \ - sync(); \ + exec.sync(); \ ASSERT_EQ(in(inds), out(inds)); \ ASSERT_EQ(out(inds), TEST_VAL); \ inds.fill(0); \ @@ -109,9 +103,9 @@ TYPED_TEST(CopyTestsAll, CopyOutParam) auto in = make_tensor({}); auto out = make_tensor({}); in() = TEST_VAL; - sync(); + exec.sync(); matx::copy(out, in, exec); - sync(); + exec.sync(); ASSERT_EQ(in(), out()); ASSERT_EQ(out(), TEST_VAL); } @@ -121,11 +115,11 @@ TYPED_TEST(CopyTestsAll, CopyOutParam) auto in = make_tensor({SZ, SZ, SZ}); auto out = make_tensor({SZ}); (in = DEFAULT).run(exec); - sync(); + exec.sync(); in(0, SZ/2, 0) = TEST_VAL; - sync(); + exec.sync(); matx::copy(out, slice<1>(in, {0,0,0}, {matxDropDim,matxEnd,matxDropDim}), exec); - sync(); + exec.sync(); ASSERT_EQ(out.Rank(), 1); ASSERT_EQ(out.Size(0), SZ); ASSERT_EQ(out(SZ/2), TEST_VAL); @@ -148,12 +142,6 @@ TYPED_TEST(CopyTestsAll, CopyReturn) ExecType exec{}; - auto sync = []() constexpr { - if constexpr (std::is_same_v) { - cudaDeviceSynchronize(); - } - }; - const int SZ = 5; TestType DEFAULT, TEST_VAL; if constexpr (std::is_same_v) { @@ -174,13 +162,13 @@ TYPED_TEST(CopyTestsAll, CopyReturn) dims.fill(SZ); \ auto in = make_tensor(dims); \ (in = DEFAULT).run(exec); \ - sync(); \ + exec.sync(); \ std::array inds; \ inds.fill(SZ/2); \ in(inds) = TEST_VAL; \ - sync(); \ + exec.sync(); \ auto out = matx::copy(in, exec); \ - sync(); \ + exec.sync(); \ ASSERT_EQ(in(inds), out(inds)); \ ASSERT_EQ(out(inds), TEST_VAL); \ inds.fill(0); \ @@ -201,9 +189,9 @@ TYPED_TEST(CopyTestsAll, CopyReturn) { auto in = make_tensor({}); in() = TEST_VAL; - sync(); + exec.sync(); auto out = matx::copy(in, exec); - sync(); + exec.sync(); ASSERT_EQ(in(), 
out()); ASSERT_EQ(out(), TEST_VAL); } @@ -212,11 +200,11 @@ TYPED_TEST(CopyTestsAll, CopyReturn) { auto in = make_tensor({SZ, SZ, SZ}); (in = DEFAULT).run(exec); - sync(); + exec.sync(); in(0, SZ/2, 0) = TEST_VAL; - sync(); + exec.sync(); auto out = matx::copy(slice<1>(in, {0,0,0}, {matxDropDim,matxEnd,matxDropDim}), exec); - sync(); + exec.sync(); ASSERT_EQ(out.Rank(), 1); ASSERT_EQ(out.Size(0), SZ); ASSERT_EQ(out(SZ/2), TEST_VAL); diff --git a/test/00_transform/Cov.cu b/test/00_transform/Cov.cu index 217797c51..0ca3e7a43 100644 --- a/test/00_transform/Cov.cu +++ b/test/00_transform/Cov.cu @@ -82,7 +82,7 @@ TYPED_TEST(CovarianceTestFloatTypes, SmallCov) // example-begin cov-test-1 (this->cv = cov(this->av)).run(this->exec); // example-end cov-test-1 - cudaDeviceSynchronize(); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, this->cv, "c_cov", this->thresh); MATX_EXIT_HANDLER(); } @@ -107,7 +107,7 @@ TYPED_TEST(CovarianceTestFloatTypes, BatchedCov) (batched_in = clone<5>(this->av, {m, n, k, matxKeepDim, matxKeepDim})).run(this->exec); (batched_out = cov(batched_in)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); for (int im = 0; im < m; im++) { for (int in = 0; in < n; in++) { diff --git a/test/00_transform/FFT.cu b/test/00_transform/FFT.cu index 7a3b35c0b..0ff8a0a65 100644 --- a/test/00_transform/FFT.cu +++ b/test/00_transform/FFT.cu @@ -98,7 +98,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1D1024C2C) // type of the tensors and output size. (avo = fft(av)).run(this->exec); // example-end fft-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -121,7 +121,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1DFWD1024C2C) // type of the tensors and output size. 
(avo = fft(av, fft_dim, FFTNorm::FORWARD)).run(this->exec); // example-end fft-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -144,7 +144,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1DORTHO1024C2C) // type of the tensors and output size. (avo = fft(av, fft_dim, FFTNorm::ORTHO)).run(this->exec); // example-end fft-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -178,7 +178,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out2 = fft(in, {2})).run(this->exec); // example-end fft-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -195,7 +195,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out2 = fft(in, {1})).run(this->exec); // example-end fft-3 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -211,7 +211,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out1 = ifft(in)).run(this->exec); (out2 = ifft(in, {2})).run(this->exec); // example-end ifft-1 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -227,7 +227,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out1.Permute({0,2,1}) = ifft(in.Permute({0,2,1}))).run(this->exec); (out2 = ifft(in, {1})).run(this->exec); // example-end ifft-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -242,7 +242,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1Axis) (out1.Permute({0,2,1}) = fft(permute(in1, {0,2,1}))).run(this->exec); (out2 = fft(in1, {1})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -255,7 +255,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, 
FFT1Axis) (out1.Permute({1,2,0}) = ifft(permute(in1, {1,2,0}))).run(this->exec); (out2 = ifft(in1, {0})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -296,7 +296,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1 = fft2(in)).run(this->exec); (out2 = fft2(in, {1,2})).run(this->exec); // example-end fft2-1 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -312,7 +312,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,2,0}) = fft2(in.Permute({1,2,0}))).run(this->exec); (out2 = fft2(in, {2,0})).run(this->exec); // example-end fft2-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -324,7 +324,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,0,2}) = fft2(in.Permute({1,0,2}))).run(this->exec); (out2 = fft2(in, {0,2})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -340,7 +340,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1 = ifft2(in)).run(this->exec); (out2 = ifft2(in, {1,2})).run(this->exec); // example-end ifft2-1 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -356,7 +356,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,2,0}) = ifft2(in.Permute({1,2,0}))).run(this->exec); (out2 = ifft2(in, {2,0})).run(this->exec); // example-end ifft2-2 - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -368,7 +368,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,0,2}) = ifft2(in.Permute({1,0,2}))).run(this->exec); (out2 = ifft2(in, {0,2})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < 
d1; i++) { for(int j = 0; j < d2; j++) { @@ -383,7 +383,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,0,2}) = fft2(permute(in1, {1,0,2}))).run(this->exec); (out2 = fft2(in1, {0,2})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -396,7 +396,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT2Axis) (out1.Permute({1,0,2}) = ifft2(permute(in1, {1,0,2}))).run(this->exec); (out2 = ifft2(in1, {0,2})).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for(int i = 0; i < d1; i++) { for(int j = 0; j < d2; j++) { @@ -422,7 +422,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT1D1024C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -440,7 +440,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT1DORTHO1024C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft(av, fft_dim, FFTNorm::ORTHO)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -458,7 +458,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT1DFWD1024C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft(av, fft_dim, FFTNorm::FORWARD)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -479,7 +479,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1D1024PadC2C) // Specify the FFT size as bigger than av. 
Thus, av will be zero-padded to the appropriate size (avo = fft(av, fft_dim * 2)).run(this->exec); // example-end fft-4 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -497,7 +497,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1D1024PadBatchedC2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av, fft_dim*2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); @@ -505,7 +505,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT1D1024PadBatchedC2C) // Perform an FFT but force the size to be fft_dim * 2 instead of the output size (avo = fft(av, fft_dim * 2)).run(this->exec); // Force the FFT size // example-end fft-5 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -524,7 +524,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT1D1024PadC2C) // Specify the IFFT size as bigger than av. 
Thus, av will be zero-padded to the appropriate size (avo = ifft(av, fft_dim * 2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -545,7 +545,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1D1024R2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -566,7 +566,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1D1024PadR2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av, fft_dim*2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -588,28 +588,28 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1DSizeChecks) auto t2 = make_tensor({2*N}); // We do not implicitly zero-pad to a larger transform size (t2 = fft(tc)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); // C2C, output size smaller than input size ASSERT_THROW({ auto t2 = make_tensor({(N/2)+1}); (t2 = fft(tc)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); // R2C, output size smaller than N/2 + 1 ASSERT_THROW({ auto t2 = make_tensor({N/2}); (t2 = fft(tr)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); // R2C, output size larger than N/2 + 1 ASSERT_THROW({ auto t2 = make_tensor({N/2+2}); (t2 = fft(tr)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); // C2R, output size smaller than N @@ -617,7 +617,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1DSizeChecks) auto tcs = slice(tc, {0}, {N/2+1}); auto t2 = make_tensor({N-1}); (t2 = fft(tcs)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, 
matx::detail::matxException); // C2R, output size too large @@ -625,7 +625,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1DSizeChecks) auto tcs = slice(tc, {0}, {N/2+1}); auto t2 = make_tensor({N+2}); (t2 = fft(tcs)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); }, matx::detail::matxException); MATX_EXIT_HANDLER(); @@ -645,7 +645,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1D1024PadBatchedR2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av, fft_dim*2)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -664,7 +664,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT2D16C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -683,7 +683,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT2D16x32C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -704,7 +704,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT2D16BatchedC2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -726,7 +726,7 @@ TYPED_TEST(FFTTestComplexTypes, FFT2D16BatchedStridedC2C) const int32_t axes[] = {0, 2}; (avo = fft2(av, axes)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -745,7 +745,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT2D16C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, 
"a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -764,7 +764,7 @@ TYPED_TEST(FFTTestComplexTypes, IFFT2D16x32C2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -784,7 +784,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypes, FFT2D16R2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -804,7 +804,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypes, FFT2D16x32R2C) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -824,7 +824,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypes, IFFT2D16C2R) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -844,7 +844,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypes, IFFT2D16x32C2R) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft2(av)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -864,7 +864,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, FFT1D1024C2CShort) this->pb->NumpyToTensorView(av, "a_in"); (avo = fft(av, fft_dim - 16)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); @@ -883,7 +883,7 @@ TYPED_TEST(FFTTestComplexNonHalfTypesAllExecs, IFFT1D1024C2CShort) this->pb->NumpyToTensorView(av, "a_in"); (avo = ifft(av, fft_dim - 16)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); 
MATX_TEST_ASSERT_COMPARE(this->pb, avo, "a_out", this->thresh); MATX_EXIT_HANDLER(); diff --git a/test/00_transform/MatMul.cu b/test/00_transform/MatMul.cu index 99c78d767..28133f573 100644 --- a/test/00_transform/MatMul.cu +++ b/test/00_transform/MatMul.cu @@ -369,9 +369,9 @@ TYPED_TEST(MatMulTestFloatTypes, MediumRectBatched0StrideA) - (b = b0).run(); + (b = b0).run(this->exec); // Perform a batched gemm with "batches" GEMMs - (c = matmul(a0, b)).run(); + (c = matmul(a0, b)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (int i = 0; i < c.Size(0); i++) { for (int j = 0; j < c.Size(1); j++) { @@ -409,9 +409,9 @@ TYPED_TEST(MatMulTestFloatTypes, MediumRectBatched0StrideB) - (a = a0).run(); + (a = a0).run(this->exec); // Perform a batched gemm with "batches" GEMMs - (c = matmul(a, b0)).run(); + (c = matmul(a, b0)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (int i = 0; i < c.Size(0); i++) { for (int j = 0; j < c.Size(1); j++) { @@ -545,14 +545,14 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulAxis) auto bp = permute(bi, perm); auto cp = permute(ci, perm); - (ap = a3).run(); - (bp = b3).run(); + (ap = a3).run(this->exec); + (bp = b3).run(this->exec); - (ci = matmul(ai, bi, axis)).run(); + (ci = matmul(ai, bi, axis)).run(this->exec); - (c3 = cp).run(); + (c3 = cp).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c3, "c", this->thresh); } @@ -571,17 +571,17 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulAxis) auto cp = permute(ci, perm); // copy data into permuted inputs - (ap = a3).run(); - (bp = b3).run(); + (ap = a3).run(this->exec); + (bp = b3).run(this->exec); // Perform a GEMM with the last two dimensions permuted - (ci = matmul(ai, bi, axis)).run(); + (ci = matmul(ai, bi, axis)).run(this->exec); // example-end matmul-test-6 // copy result from permuted output - (c3 = cp).run(); + (c3 = cp).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c3, "c", this->thresh); } @@ -599,15 +599,15 @@
TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulAxis) auto cp = permute(ci, perm); // copy data into permuted inputs - (ap = a3).run(); - (bp = b3).run(); + (ap = a3).run(this->exec); + (bp = b3).run(this->exec); - (ci = matmul(ai, bi, axis)).run(); + (ci = matmul(ai, bi, axis)).run(this->exec); // copy result from permuted output - (c3 = cp).run(); + (c3 = cp).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c3, "c", this->thresh); } @@ -625,15 +625,15 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulAxis) auto cp = permute(ci, perm); // copy data into permuted inputs - (ap = a3).run(); - (bp = b3).run(); + (ap = a3).run(this->exec); + (bp = b3).run(this->exec); - (ci = matmul(ai, bi, axis)).run(); + (ci = matmul(ai, bi, axis)).run(this->exec); // copy result from permuted output - (c3 = cp).run(); + (c3 = cp).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c3, "c", this->thresh); } @@ -701,7 +701,7 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulBroadcast) // Broadcast eye2, scaling each entry in a5 by 2 (c5 = matmul(eye2, a5)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); for (index_t i0 = 0; i0 < x; i0++) for (index_t i1 = 0; i1 < y; i1++) @@ -720,7 +720,7 @@ TYPED_TEST(MatMulTestFloatNonHalfTypes, MatMulBroadcast) // Broadcast eye2, scaling each entry in a5 by 2 (c5 = matmul(a5, eye2)).run(this->exec); - cudaDeviceSynchronize(); + this->exec.sync(); for (index_t i0 = 0; i0 < x; i0++) for (index_t i1 = 0; i1 < y; i1++) @@ -828,9 +828,9 @@ TYPED_TEST(MatMulTestFloatTypes, MediumMatVecBatch) tensor_t bv{{blocks, k}}; tensor_t cv{{blocks, m}}; - (bv = bs).run(); - (cv = cs).run(); - (cv = matvec(a, bv)).run(); + (bv = bs).run(this->exec); + (cv = cs).run(this->exec); + (cv = matvec(a, bv)).run(this->exec); MATX_TEST_ASSERT_COMPARE(this->pb, c, "c", this->thresh); @@ -859,15 +859,15 @@ TYPED_TEST(MatMulTestFloatTypes, MatVecRowVector) auto cs = 
slice<2>(c, {0,0,0}, {matxEnd, matxEnd, matxDropDim}); auto bs = slice<2>(b, {0,0,0}, {matxEnd, matxEnd, matxDropDim}); - (cs = matvec(a, bs)).run(); + (cs = matvec(a, bs)).run(this->exec); MATX_TEST_ASSERT_COMPARE(this->pb, c, "c", this->thresh); tensor_t bv{{blocks, k}}; tensor_t cv{{blocks, m}}; - (bv = bs).run(); - (cv = cs).run(); - (cv = matvec(a, bv)).run(); + (bv = bs).run(this->exec); + (cv = cs).run(this->exec); + (cv = matvec(a, bv)).run(this->exec); MATX_TEST_ASSERT_COMPARE(this->pb, c, "c", this->thresh); @@ -894,10 +894,10 @@ TYPED_TEST(MatMulTestFloatTypes, OuterProduct) this->pb->NumpyToTensorView(b, "b"); // example-begin outer-test-1 - (c = outer(a, b)).run(); + (c = outer(a, b)).run(this->exec); // example-end outer-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, c, "c", this->thresh); auto ba = make_tensor({batches, an}); @@ -906,9 +906,9 @@ TYPED_TEST(MatMulTestFloatTypes, OuterProduct) this->pb->NumpyToTensorView(bb, "bb"); auto bc = make_tensor({batches, an, bn}); - (bc = outer(ba, bb)).run(); + (bc = outer(ba, bb)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, bc, "bc", this->thresh); MATX_EXIT_HANDLER(); diff --git a/test/00_transform/ResamplePoly.cu b/test/00_transform/ResamplePoly.cu index 1124d4cfe..1dbfc2041 100644 --- a/test/00_transform/ResamplePoly.cu +++ b/test/00_transform/ResamplePoly.cu @@ -128,7 +128,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleOddLength) (b = resample_poly(a, f, up, down)).run(this->exec); // example-end resample_poly-test-1 - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); @@ -136,7 +136,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleOddLength) // so we can inverse-scale the output to compare against the golden outputs. 
(b = resample_poly(static_cast(4.0) * a, f, up, down)).run(this->exec); (b = b * static_cast(0.25)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } @@ -190,7 +190,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleEvenLength) this->pb->NumpyToTensorView(f, "filter_random"); (b = resample_poly(a, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); @@ -198,7 +198,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleEvenLength) // so we can inverse-scale the output to compare against the golden outputs. (b = resample_poly(static_cast(4.0) * a, f, up, down)).run(this->exec); (b = b * static_cast(0.25)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } @@ -244,11 +244,11 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, DefaultFilter) this->pb->NumpyToTensorView(a, "a"); this->pb->NumpyToTensorView(f, "filter_default"); - cudaStreamSynchronize(0); + this->exec.sync(); (b = resample_poly(a, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_default", this->thresh); } @@ -296,11 +296,11 @@ TYPED_TEST(ResamplePolyTestFloatTypes, DefaultFilter) this->pb->NumpyToTensorView(a, "a"); this->pb->NumpyToTensorView(f, "filter_default"); - cudaStreamSynchronize(0); + this->exec.sync(); (b = resample_poly(a, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_default", this->thresh); } @@ -344,7 +344,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Batched) this->pb->NumpyToTensorView(f, "filter_random"); (b = resample_poly(ac, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Verify that the 4D tensor was handled in a batched fashion for (int ia 
= 0; ia < nA; ia++) { @@ -361,11 +361,11 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Batched) (full = ac).run(this->exec); (b = 0).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); (b = resample_poly(ac, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Verify that the 4D tensor was handled in a batched fashion for (int ia = 0; ia < nA; ia++) { @@ -377,7 +377,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Batched) } } - cudaStreamSynchronize(0); + this->exec.sync(); } MATX_EXIT_HANDLER(); @@ -399,7 +399,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Identity) auto zero = make_tensor({1}); (zero = 0).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { const index_t a_len = test_cases[i].a_len; @@ -413,7 +413,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Identity) this->pb->NumpyToTensorView(a, "a"); (b = resample_poly(a, zero, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // The output should equal the input because up == down. for (index_t k = 0; k < a_len; k++) { @@ -442,7 +442,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Downsample) auto seven = make_tensor({1}); (seven = 7).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { const index_t a_len = test_cases[i].a_len; @@ -457,7 +457,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Downsample) this->pb->NumpyToTensorView(a, "a"); (b = resample_poly(a, seven, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); for (index_t j = 0; j < b_len; j++) { double aj, bj; @@ -501,7 +501,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Upsample) // The resample kernel scales the filter by up, so we use 1/up to get an // effective filter of 1. 
(f = 1.0/static_cast(up)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); auto a = make_tensor({a_len}); const index_t b_len = a_len * up; @@ -509,7 +509,7 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Upsample) this->pb->NumpyToTensorView(a, "a"); (b = resample_poly(a, f, up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); // Since the filter is single tapped and == 1, we should get the sequence // [a_0, 0, ..., 0, a_1, 0, ...] with up-1 zeros between successive values @@ -569,11 +569,11 @@ TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, Operators) this->pb->NumpyToTensorView(a, "a"); this->pb->NumpyToTensorView(f, "filter_random"); - cudaStreamSynchronize(0); + this->exec.sync(); (b = resample_poly(shift<0>(shift<0>(a, 8), -8), shift<0>(shift<0>(f, 3), -3), up, down)).run(this->exec); - cudaStreamSynchronize(0); + this->exec.sync(); MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); } diff --git a/test/00_transform/Solve.cu b/test/00_transform/Solve.cu index 6a14dd0a7..02369df8b 100644 --- a/test/00_transform/Solve.cu +++ b/test/00_transform/Solve.cu @@ -83,7 +83,7 @@ TYPED_TEST(SolveTestsFloatNonComplexNonHalf, CGSolve) (X = cgsolve(A, B, .00001, 10)).run(exec); // example-end cgsolve-test-1 (B = matvec(A, X)).run(exec); - cudaDeviceSynchronize(); + exec.sync(); for(int i = 0; i < BATCH; i++) { for(int j = 0; j < N; j++) { diff --git a/test/01_radar/MVDRBeamformer.cu b/test/01_radar/MVDRBeamformer.cu index c2ce47fc2..21c5d430d 100644 --- a/test/01_radar/MVDRBeamformer.cu +++ b/test/01_radar/MVDRBeamformer.cu @@ -48,9 +48,9 @@ TEST(Radar, MVDRBeamformer) index_t data_len = 1000; index_t snap_len = 2 * num_el; - auto mvdr = MVDRBeamformer(num_beams, num_el, data_len, snap_len); + cudaExecutor exec{}; - mvdr.Prefetch(0); + auto mvdr = MVDRBeamformer(num_beams, num_el, data_len, snap_len); auto pb = std::make_unique(); pb->InitAndRunTVGenerator("mvdr_beamformer", "mvdr_beamformer", @@ -64,8 +64,8 @@ 
TEST(Radar, MVDRBeamformer) pb->NumpyToTensorView(in_vec, "in_vec"); pb->NumpyToTensorView(v, "v"); - mvdr.Run(0); - cudaStreamSynchronize(0); + mvdr.Run(exec); + exec.sync(); auto cbf = mvdr.GetCBFView(); diff --git a/test/01_radar/MultiChannelRadarPipeline.cu b/test/01_radar/MultiChannelRadarPipeline.cu index 98fc66418..1abbd4efb 100644 --- a/test/01_radar/MultiChannelRadarPipeline.cu +++ b/test/01_radar/MultiChannelRadarPipeline.cu @@ -93,15 +93,11 @@ TYPED_TEST(MultiChannelRadarPipelineTypes, PulseCompression) // Copy the replicated data into the actual data pointer matx::copy(d, x_clone, 0); - d.PrefetchDevice(0); - auto wfd = p.GetwaveformView(); auto wf = wfd.Slice({0}, {this->waveformLength}); this->pb->NumpyToTensorView(wf, "waveform"); - wfd.PrefetchDevice(0); - p.PulseCompression(); auto xc = d.Slice({0, 0, 0}, {this->numChannels, this->numPulses, @@ -155,8 +151,6 @@ TYPED_TEST(MultiChannelRadarPipelineTypes, Doppler) this->numCompressedSamples}); this->pb->NumpyToTensorView(v, "x_conv2"); - in.PrefetchHost(0); - p.DopplerProcessing(); auto out = p.GetTPCView(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 34e498140..3db9bef96 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -37,7 +37,7 @@ set (test_sources main.cu ) -# Some of <00_io> tests need csv files and the binary 'test.mat' which all +# Some of <00_io> tests need csv files and binaries which all # are located under 'CMAKE_SOURCE_DIR/test/00_io'. When calling the test # executable from its location in 'CMAKE_BINARY_DIR/test' the # search paths according are @@ -57,6 +57,10 @@ file(COPY ${CMAKE_SOURCE_DIR}/test/00_io/test.mat DESTINATION ${CMAKE_BINARY_DIR}/test/00_io ) +file(COPY + ${CMAKE_SOURCE_DIR}/test/00_io/test.npy + DESTINATION ${CMAKE_BINARY_DIR}/test/00_io +) # Find proprietary parameters file (GLOB_RECURSE proprietary_sources ../proprietary/*/tests/*.cu)