@@ -13,6 +13,7 @@ template <typename ValueType>
1313void conv1d_direct_4d_batch (nvbench::state &state,
1414 nvbench::type_list<ValueType>)
1515{
16+ cudaExecutor exec{0 };
1617 auto out = make_tensor<ValueType>({4 , 2 , 14 , 288 + 4096 + 133 - 1 });
1718 auto at = make_tensor<ValueType>({ 4 , 2 , 14 , 133 });
1819 auto bt = make_tensor<ValueType>({ 4 , 2 , 14 , 288 + 4096 });
@@ -21,7 +22,7 @@ void conv1d_direct_4d_batch(nvbench::state &state,
2122 at.PrefetchDevice (0 );
2223 bt.PrefetchDevice (0 );
2324
24- cudaDeviceSynchronize ();
25+ exec. sync ();
2526 MATX_NVTX_START_RANGE ( " Exec" , matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
2627 state.exec (
2728 [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d (at, bt, MATX_C_MODE_FULL)).run (cudaExecutor (launch.get_stream ())); });
@@ -35,7 +36,7 @@ template <typename ValueType>
3536void conv1d_direct_2d_batch (nvbench::state &state,
3637 nvbench::type_list<ValueType>)
3738{
38-
39+ cudaExecutor exec{ 0 };
3940
4041 auto out = make_tensor<ValueType>({4 * 2 * 14 , 288 + 4096 + 133 - 1 });
4142 auto at = make_tensor<ValueType>({ 4 * 2 * 14 , 133 });
@@ -45,7 +46,7 @@ void conv1d_direct_2d_batch(nvbench::state &state,
4546 at.PrefetchDevice (0 );
4647 bt.PrefetchDevice (0 );
4748
48- cudaDeviceSynchronize ();
49+ exec. sync ();
4950
5051 state.exec (
5152 [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d (at, bt, MATX_C_MODE_FULL)).run (cudaExecutor (launch.get_stream ())); });
@@ -56,6 +57,7 @@ template <typename ValueType>
5657void conv1d_direct_large (nvbench::state &state,
5758 nvbench::type_list<ValueType>)
5859{
60+ cudaExecutor exec{0 };
5961 auto at = make_tensor<ValueType>({state.get_int64 (" Signal Size" )});
6062 auto bt = make_tensor<ValueType>({state.get_int64 (" Filter Size" )});
6163 auto out = make_tensor<ValueType>({at.Size (at.Rank ()-1 ) + bt.Size (bt.Rank ()-1 ) - 1 });
@@ -64,9 +66,9 @@ void conv1d_direct_large(nvbench::state &state,
6466 at.PrefetchDevice (0 );
6567 bt.PrefetchDevice (0 );
6668
67- (out = conv1d (at, bt, MATX_C_MODE_FULL)).run ();
69+ (out = conv1d (at, bt, MATX_C_MODE_FULL)).run (exec );
6870
69- cudaDeviceSynchronize ();
71+ exec. sync ();
7072
7173 state.exec (
7274 [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d (at, bt, MATX_C_MODE_FULL)).run (cudaExecutor (launch.get_stream ())); });
@@ -79,17 +81,18 @@ template <typename ValueType>
7981void conv1d_fft_large (nvbench::state &state,
8082 nvbench::type_list<ValueType>)
8183{
84+ cudaExecutor exec{0 };
8285 auto at = make_tensor<ValueType>({state.get_int64 (" Signal Size" )});
8386 auto bt = make_tensor<ValueType>({state.get_int64 (" Filter Size" )});
8487 auto out = make_tensor<ValueType>({at.Size (at.Rank ()-1 ) + bt.Size (bt.Rank ()-1 ) - 1 });
8588
86- (out = conv1d (at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run ();
89+ (out = conv1d (at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run (exec );
8790
8891 out.PrefetchDevice (0 );
8992 at.PrefetchDevice (0 );
9093 bt.PrefetchDevice (0 );
9194
92- cudaDeviceSynchronize ();
95+ exec. sync ();
9396
9497 state.exec (
9598 [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d (at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run (cudaExecutor (launch.get_stream ())); });
@@ -103,6 +106,7 @@ template <typename ValueType>
103106void conv2d_direct_batch (nvbench::state &state,
104107 nvbench::type_list<ValueType>)
105108{
109+ cudaExecutor exec{0 };
106110 auto at = make_tensor<ValueType>({256 , 1024 , 1024 });
107111 auto bt = make_tensor<ValueType>({256 , 16 , 16 });
108112 auto out = make_tensor<ValueType>({256 ,
@@ -113,7 +117,7 @@ void conv2d_direct_batch(nvbench::state &state,
113117 at.PrefetchDevice (0 );
114118 bt.PrefetchDevice (0 );
115119
116- cudaDeviceSynchronize ();
120+ exec. sync ();
117121
118122 state.exec (
119123 [&out, &at, &bt](nvbench::launch &launch) { (out = conv2d (at, bt, MATX_C_MODE_FULL)).run (cudaExecutor (launch.get_stream ())); });
0 commit comments