
Commit db2ae6b

Added synchronization support through .sync().
Updated all tests to call .sync() instead of the CUDA API directly, and removed Prefetch calls.
1 parent 464e4a7 commit db2ae6b
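The change at each call site follows one pattern: work is launched on an executor and synchronized through that executor, rather than through a raw CUDA runtime call. A minimal before/after sketch of the pattern (variable names are illustrative; the calls mirror the diffs below):

    // Before: the expression implicitly runs on stream 0 and is
    // synchronized through the CUDA runtime API
    (y = x + x2).run();
    cudaDeviceSynchronize();

    // After: a cudaExecutor owns the stream, and .sync() replaces
    // the raw CUDA call
    cudaExecutor exec{0};     // executor bound to CUDA stream 0
    (y = x + x2).run(exec);   // launch the expression on the executor's stream
    exec.sync();              // block until the executor's queued work completes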


66 files changed: +904, -876 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 .vscode
+.cache
 build*
 *.pyc

bench/00_operators/operators.cu

Lines changed: 12 additions & 8 deletions
@@ -10,6 +10,7 @@ template <typename ValueType>
 void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)
 {
   // Get current parameters:
+  cudaExecutor exec{0};
   const int x_len = static_cast<int>(state.get_int64("Vector size"));

   state.add_element_count(x_len, "NumElements");
@@ -19,9 +20,9 @@ void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)

   tensor_t<ValueType, 1> xv{{x_len}};
   tensor_t<ValueType, 1> xv2{{x_len}};
-  xv.PrefetchDevice(0);
-  (xv = xv + xv2).run();
-  cudaDeviceSynchronize();
+
+  (xv = xv + xv2).run(exec);
+  exec.sync();

   state.exec(
     [&xv, &xv2](nvbench::launch &launch) {
@@ -38,6 +39,7 @@ using permute_types = nvbench::type_list<float, double, cuda::std::complex<float
 template <typename ValueType>
 void permute(nvbench::state &state, nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto x = make_tensor<ValueType>({1000,200,6,300});
   auto y = make_tensor<ValueType>({300,1000,6,200});

@@ -46,7 +48,7 @@ void permute(nvbench::state &state, nvbench::type_list<ValueType>)
   state.add_global_memory_writes<ValueType>(x.TotalSize());

   x.PrefetchDevice(0);
-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&x, &y](nvbench::launch &launch) {
@@ -61,17 +63,18 @@ using random_types = nvbench::type_list<float, double, cuda::std::complex<float>
 template <typename ValueType>
 void random(nvbench::state &state, nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto x = make_tensor<ValueType>({1966800});
   auto y = make_tensor<ValueType>({1966800});
   x.PrefetchDevice(0);
   y.PrefetchDevice(0);

-  (y = random<float>(x.Shape(), NORMAL)).run();
+  (y = random<float>(x.Shape(), NORMAL)).run(exec);

   state.add_element_count(x.TotalSize(), "NumElements");
   state.add_global_memory_writes<ValueType>(x.TotalSize());

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&x, &y](nvbench::launch &launch) {
@@ -98,6 +101,7 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
   int n = 600;
   ValueType dx = M_PI/n;

+  cudaExecutor exec{};
   auto col = range<0>({n+1},ValueType(0), ValueType(dx));
   auto az = range<0>({2*n+1}, ValueType(0), ValueType(dx));

@@ -122,8 +126,8 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
   auto Y = make_tensor<ValueType>(Ym.Shape());
   auto Z = make_tensor<ValueType>(Zm.Shape());

-  cudaDeviceSynchronize();
-
+  exec.sync();
+
   state.add_element_count(n+1, "Elements");

   state.exec(

bench/00_operators/reduction.cu

Lines changed: 3 additions & 2 deletions
@@ -127,6 +127,7 @@ void reduce_4d(
   nvbench::type_list<ValueType>
 )
 {
+  cudaExecutor exec{0};
   const int size0 = static_cast<int>(state.get_int64("Size0"));
   const int size1 = static_cast<int>(state.get_int64("Size1"));
   const int size2 = static_cast<int>(state.get_int64("Size2"));
@@ -138,8 +139,8 @@
   t1.PrefetchDevice(0);
   t4.PrefetchDevice(0);

-  (t4 = random<float>(t4.Shape(), UNIFORM)).run();
-  cudaDeviceSynchronize();
+  (t4 = random<float>(t4.Shape(), UNIFORM)).run(exec);
+  exec.sync();

   state.exec([&t4, &t1](nvbench::launch &launch) {
     (t1 = matx::sum(t4, {1, 2, 3})).run((cudaStream_t)launch.get_stream()); });

bench/00_transform/conv.cu

Lines changed: 12 additions & 8 deletions
@@ -13,6 +13,7 @@ template <typename ValueType>
 void conv1d_direct_4d_batch(nvbench::state &state,
                             nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto out = make_tensor<ValueType>({4, 2, 14, 288 + 4096 + 133 - 1});
   auto at = make_tensor<ValueType>({ 4, 2, 14, 133});
   auto bt = make_tensor<ValueType>({ 4, 2, 14, 288 + 4096});
@@ -21,7 +22,7 @@ void conv1d_direct_4d_batch(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  cudaDeviceSynchronize();
+  exec.sync();
   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
@@ -35,7 +36,7 @@ template <typename ValueType>
 void conv1d_direct_2d_batch(nvbench::state &state,
                             nvbench::type_list<ValueType>)
 {
-
+  cudaExecutor exec{0};

   auto out = make_tensor<ValueType>({4 * 2* 14, 288 + 4096 + 133 - 1});
   auto at = make_tensor<ValueType>({ 4 * 2* 14, 133});
@@ -45,7 +46,7 @@ void conv1d_direct_2d_batch(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
@@ -56,6 +57,7 @@ template <typename ValueType>
 void conv1d_direct_large(nvbench::state &state,
                          nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
   auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
   auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});
@@ -64,9 +66,9 @@ void conv1d_direct_large(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  (out = conv1d(at, bt, MATX_C_MODE_FULL)).run();
+  (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(exec);

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
@@ -79,17 +81,18 @@ template <typename ValueType>
 void conv1d_fft_large(nvbench::state &state,
                       nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
   auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
   auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});

-  (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run();
+  (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(exec);

   out.PrefetchDevice(0);
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(cudaExecutor(launch.get_stream())); });
@@ -103,6 +106,7 @@ template <typename ValueType>
 void conv2d_direct_batch(nvbench::state &state,
                          nvbench::type_list<ValueType>)
 {
+  cudaExecutor exec{0};
   auto at = make_tensor<ValueType>({256, 1024, 1024});
   auto bt = make_tensor<ValueType>({256, 16, 16});
   auto out = make_tensor<ValueType>({256,
@@ -113,7 +117,7 @@ void conv2d_direct_batch(nvbench::state &state,
   at.PrefetchDevice(0);
   bt.PrefetchDevice(0);

-  cudaDeviceSynchronize();
+  exec.sync();

   state.exec(
     [&out, &at, &bt](nvbench::launch &launch) { (out = conv2d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });

bench/00_transform/cub.cu

Lines changed: 2 additions & 1 deletion
@@ -24,14 +24,15 @@ void sort1d(
   nvbench::type_list<ValueType>
 )
 {
+  cudaExecutor exec{0};
   const int dataSize = static_cast<int>(state.get_int64("Tensor Size"));

   auto sortedData = matx::make_tensor<ValueType>({dataSize});
   auto randomData = matx::make_tensor<ValueType>({dataSize});

   sortedData.PrefetchDevice(0);
   randomData.PrefetchDevice(0);
-  cudaDeviceSynchronize();
+  exec.sync();

   (randomData = random<float>(sortedData.Shape(), NORMAL)).run();

bench/00_transform/qr.cu

Lines changed: 4 additions & 7 deletions
@@ -17,6 +17,7 @@ void qr_batch(nvbench::state &state,

   cudaStream_t stream = 0;
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+  cudaExecutor exec{stream};

   int batch = state.get_int64("batch");
   int m = state.get_int64("rows");
@@ -26,17 +27,13 @@ void qr_batch(nvbench::state &state,
   auto Q = make_tensor<AType>({batch, m, m});
   auto R = make_tensor<AType>({batch, m, n});

-  A.PrefetchDevice(stream);
-  Q.PrefetchDevice(stream);
-  R.PrefetchDevice(stream);
-
-  (A = random<float>({batch, m, n}, NORMAL)).run(stream);
+  (A = random<float>({batch, m, n}, NORMAL)).run(exec);

   // warm up
   nvtxRangePushA("Warmup");
-  (mtie(Q, R) = qr(A)).run(stream);
+  (mtie(Q, R) = qr(A)).run(exec);

-  cudaDeviceSynchronize();
+  exec.sync();
   nvtxRangePop();

   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )

bench/00_transform/svd_power.cu

Lines changed: 15 additions & 23 deletions
@@ -17,6 +17,7 @@ void svdpi_batch(nvbench::state &state,

   cudaStream_t stream = 0;
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+  cudaExecutor exec{stream};

   int batch = state.get_int64("batch");
   int m = state.get_int64("rows");
@@ -30,22 +31,17 @@ void svdpi_batch(nvbench::state &state,

   int iterations = 10;

-  (A = random<float>({batch, m, n}, NORMAL)).run(stream);
-
-  A.PrefetchDevice(stream);
-  U.PrefetchDevice(stream);
-  S.PrefetchDevice(stream);
-  VT.PrefetchDevice(stream);
-
-  (U = 0).run(stream);
-  (S = 0).run(stream);
-  (VT = 0).run(stream);
+  (A = random<float>({batch, m, n}, NORMAL)).run(exec);
+
+  (U = 0).run(exec);
+  (S = 0).run(exec);
+  (VT = 0).run(exec);
   auto x0 = random<float>({batch, r}, NORMAL);

   // warm up
   nvtxRangePushA("Warmup");
-  (mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(stream);
-  cudaDeviceSynchronize();
+  (mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(exec);
+  exec.sync();
   nvtxRangePop();

   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
@@ -70,6 +66,7 @@ void svdbpi_batch(nvbench::state &state,

   cudaStream_t stream = 0;
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+  cudaExecutor exec{stream};

   int batch = state.get_int64("batch");
   int m = state.get_int64("rows");
@@ -83,21 +80,16 @@ void svdbpi_batch(nvbench::state &state,

   int iterations = 10;

-  (A = random<float>({batch, m, n}, NORMAL)).run(stream);
-
-  A.PrefetchDevice(stream);
-  U.PrefetchDevice(stream);
-  S.PrefetchDevice(stream);
-  VT.PrefetchDevice(stream);
+  (A = random<float>({batch, m, n}, NORMAL)).run(exec);

-  (U = 0).run(stream);
-  (S = 0).run(stream);
-  (VT = 0).run(stream);
+  (U = 0).run(exec);
+  (S = 0).run(exec);
+  (VT = 0).run(exec);

   // warm up
   nvtxRangePushA("Warmup");
-  (mtie(U, S, VT) = svdbpi(A, iterations)).run(stream);
-  cudaDeviceSynchronize();
+  (mtie(U, S, VT) = svdbpi(A, iterations)).run(exec);
+  exec.sync();
   nvtxRangePop();

   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )

docs_input/api/index.rst

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ API Reference
    casting/index.rst
    window/index.rst
    signalimage/index.rst
+   synchronization/index.rst
    polynomials/index.rst
    random/random.rst
    dft/index.rst

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+.. _synchronization:
+
+Synchronization
+###############
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   *

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+.. _sync_func:
+
+sync
+====
+
+Wait for any code running on an executor to complete.
+
+.. doxygenfunction:: matx::cudaExecutor::sync()
+.. doxygenfunction:: matx::HostExecutor::sync()
+
+Examples
+~~~~~~~~
+
+.. literalinclude:: ../../../examples/cgsolve.cu
+   :language: cpp
+   :start-after: example-begin sync-test-1
+   :end-before: example-end sync-test-1
+   :dedent:
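
As a usage reference, a typical call site after this commit looks like the sketch below (distilled from the benchmark diffs above; per the doxygen entries, HostExecutor exposes the same sync() entry point for host-side execution):

    cudaStream_t stream = 0;
    cudaExecutor exec{stream};                          // executor bound to a CUDA stream
    auto A = make_tensor<float>({16, 16});
    (A = random<float>(A.Shape(), NORMAL)).run(exec);   // asynchronous launch on the stream
    exec.sync();                                        // wait for the executor's queued work to drain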
