Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.vscode
.cache
build*
*.pyc
20 changes: 12 additions & 8 deletions bench/00_operators/operators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ template <typename ValueType>
void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)
{
// Get current parameters:
cudaExecutor exec{0};
const int x_len = static_cast<int>(state.get_int64("Vector size"));

state.add_element_count(x_len, "NumElements");
Expand All @@ -19,9 +20,9 @@ void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)

tensor_t<ValueType, 1> xv{{x_len}};
tensor_t<ValueType, 1> xv2{{x_len}};
xv.PrefetchDevice(0);
(xv = xv + xv2).run();
cudaDeviceSynchronize();

(xv = xv + xv2).run(exec);
exec.sync();

state.exec(
[&xv, &xv2](nvbench::launch &launch) {
Expand All @@ -38,6 +39,7 @@ using permute_types = nvbench::type_list<float, double, cuda::std::complex<float
template <typename ValueType>
void permute(nvbench::state &state, nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto x = make_tensor<ValueType>({1000,200,6,300});
auto y = make_tensor<ValueType>({300,1000,6,200});

Expand All @@ -46,7 +48,7 @@ void permute(nvbench::state &state, nvbench::type_list<ValueType>)
state.add_global_memory_writes<ValueType>(x.TotalSize());

x.PrefetchDevice(0);
cudaDeviceSynchronize();
exec.sync();

state.exec(
[&x, &y](nvbench::launch &launch) {
Expand All @@ -61,17 +63,18 @@ using random_types = nvbench::type_list<float, double, cuda::std::complex<float>
template <typename ValueType>
void random(nvbench::state &state, nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto x = make_tensor<ValueType>({1966800});
auto y = make_tensor<ValueType>({1966800});
x.PrefetchDevice(0);
y.PrefetchDevice(0);

(y = random<float>(x.Shape(), NORMAL)).run();
(y = random<float>(x.Shape(), NORMAL)).run(exec);

state.add_element_count(x.TotalSize(), "NumElements");
state.add_global_memory_writes<ValueType>(x.TotalSize());

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&x, &y](nvbench::launch &launch) {
Expand All @@ -98,6 +101,7 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
int n = 600;
ValueType dx = M_PI/n;

cudaExecutor exec{};
auto col = range<0>({n+1},ValueType(0), ValueType(dx));
auto az = range<0>({2*n+1}, ValueType(0), ValueType(dx));

Expand All @@ -122,8 +126,8 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
auto Y = make_tensor<ValueType>(Ym.Shape());
auto Z = make_tensor<ValueType>(Zm.Shape());

cudaDeviceSynchronize();

exec.sync();
state.add_element_count(n+1, "Elements");

state.exec(
Expand Down
5 changes: 3 additions & 2 deletions bench/00_operators/reduction.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ void reduce_4d(
nvbench::type_list<ValueType>
)
{
cudaExecutor exec{0};
const int size0 = static_cast<int>(state.get_int64("Size0"));
const int size1 = static_cast<int>(state.get_int64("Size1"));
const int size2 = static_cast<int>(state.get_int64("Size2"));
Expand All @@ -138,8 +139,8 @@ void reduce_4d(
t1.PrefetchDevice(0);
t4.PrefetchDevice(0);

(t4 = random<float>(t4.Shape(), UNIFORM)).run();
cudaDeviceSynchronize();
(t4 = random<float>(t4.Shape(), UNIFORM)).run(exec);
exec.sync();

state.exec([&t4, &t1](nvbench::launch &launch) {
(t1 = matx::sum(t4, {1, 2, 3})).run((cudaStream_t)launch.get_stream()); });
Expand Down
20 changes: 12 additions & 8 deletions bench/00_transform/conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ template <typename ValueType>
void conv1d_direct_4d_batch(nvbench::state &state,
nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto out = make_tensor<ValueType>({4, 2, 14, 288 + 4096 + 133 - 1});
auto at = make_tensor<ValueType>({ 4, 2, 14, 133});
auto bt = make_tensor<ValueType>({ 4, 2, 14, 288 + 4096});
Expand All @@ -21,7 +22,7 @@ void conv1d_direct_4d_batch(nvbench::state &state,
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();
exec.sync();
MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
Expand All @@ -35,7 +36,7 @@ template <typename ValueType>
void conv1d_direct_2d_batch(nvbench::state &state,
nvbench::type_list<ValueType>)
{

cudaExecutor exec{0};

auto out = make_tensor<ValueType>({4 * 2* 14, 288 + 4096 + 133 - 1});
auto at = make_tensor<ValueType>({ 4 * 2* 14, 133});
Expand All @@ -45,7 +46,7 @@ void conv1d_direct_2d_batch(nvbench::state &state,
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
Expand All @@ -56,6 +57,7 @@ template <typename ValueType>
void conv1d_direct_large(nvbench::state &state,
nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});
Expand All @@ -64,9 +66,9 @@ void conv1d_direct_large(nvbench::state &state,
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

(out = conv1d(at, bt, MATX_C_MODE_FULL)).run();
(out = conv1d(at, bt, MATX_C_MODE_FULL)).run(exec);

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
Expand All @@ -79,17 +81,18 @@ template <typename ValueType>
void conv1d_fft_large(nvbench::state &state,
nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});

(out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run();
(out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(exec);

out.PrefetchDevice(0);
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(cudaExecutor(launch.get_stream())); });
Expand All @@ -103,6 +106,7 @@ template <typename ValueType>
void conv2d_direct_batch(nvbench::state &state,
nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto at = make_tensor<ValueType>({256, 1024, 1024});
auto bt = make_tensor<ValueType>({256, 16, 16});
auto out = make_tensor<ValueType>({256,
Expand All @@ -113,7 +117,7 @@ void conv2d_direct_batch(nvbench::state &state,
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv2d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
Expand Down
3 changes: 2 additions & 1 deletion bench/00_transform/cub.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@ void sort1d(
nvbench::type_list<ValueType>
)
{
cudaExecutor exec{0};
const int dataSize = static_cast<int>(state.get_int64("Tensor Size"));

auto sortedData = matx::make_tensor<ValueType>({dataSize});
auto randomData = matx::make_tensor<ValueType>({dataSize});

sortedData.PrefetchDevice(0);
randomData.PrefetchDevice(0);
cudaDeviceSynchronize();
exec.sync();

(randomData = random<float>(sortedData.Shape(), NORMAL)).run();

Expand Down
11 changes: 4 additions & 7 deletions bench/00_transform/qr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ void qr_batch(nvbench::state &state,

cudaStream_t stream = 0;
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
cudaExecutor exec{stream};

int batch = state.get_int64("batch");
int m = state.get_int64("rows");
Expand All @@ -26,17 +27,13 @@ void qr_batch(nvbench::state &state,
auto Q = make_tensor<AType>({batch, m, m});
auto R = make_tensor<AType>({batch, m, n});

A.PrefetchDevice(stream);
Q.PrefetchDevice(stream);
R.PrefetchDevice(stream);

(A = random<float>({batch, m, n}, NORMAL)).run(stream);
(A = random<float>({batch, m, n}, NORMAL)).run(exec);

// warm up
nvtxRangePushA("Warmup");
(mtie(Q, R) = qr(A)).run(stream);
(mtie(Q, R) = qr(A)).run(exec);

cudaDeviceSynchronize();
exec.sync();
nvtxRangePop();

MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
Expand Down
38 changes: 15 additions & 23 deletions bench/00_transform/svd_power.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ void svdpi_batch(nvbench::state &state,

cudaStream_t stream = 0;
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
cudaExecutor exec{stream};

int batch = state.get_int64("batch");
int m = state.get_int64("rows");
Expand All @@ -30,22 +31,17 @@ void svdpi_batch(nvbench::state &state,

int iterations = 10;

(A = random<float>({batch, m, n}, NORMAL)).run(stream);

A.PrefetchDevice(stream);
U.PrefetchDevice(stream);
S.PrefetchDevice(stream);
VT.PrefetchDevice(stream);

(U = 0).run(stream);
(S = 0).run(stream);
(VT = 0).run(stream);
(A = random<float>({batch, m, n}, NORMAL)).run(exec);

(U = 0).run(exec);
(S = 0).run(exec);
(VT = 0).run(exec);
auto x0 = random<float>({batch, r}, NORMAL);

// warm up
nvtxRangePushA("Warmup");
(mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(stream);
cudaDeviceSynchronize();
(mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(exec);
exec.sync();
nvtxRangePop();

MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
Expand All @@ -70,6 +66,7 @@ void svdbpi_batch(nvbench::state &state,

cudaStream_t stream = 0;
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
cudaExecutor exec{stream};

int batch = state.get_int64("batch");
int m = state.get_int64("rows");
Expand All @@ -83,21 +80,16 @@ void svdbpi_batch(nvbench::state &state,

int iterations = 10;

(A = random<float>({batch, m, n}, NORMAL)).run(stream);

A.PrefetchDevice(stream);
U.PrefetchDevice(stream);
S.PrefetchDevice(stream);
VT.PrefetchDevice(stream);
(A = random<float>({batch, m, n}, NORMAL)).run(exec);

(U = 0).run(stream);
(S = 0).run(stream);
(VT = 0).run(stream);
(U = 0).run(exec);
(S = 0).run(exec);
(VT = 0).run(exec);

// warm up
nvtxRangePushA("Warmup");
(mtie(U, S, VT) = svdbpi(A, iterations)).run(stream);
cudaDeviceSynchronize();
(mtie(U, S, VT) = svdbpi(A, iterations)).run(exec);
exec.sync();
nvtxRangePop();

MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
Expand Down
1 change: 1 addition & 0 deletions docs_input/api/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ API Reference
casting/index.rst
window/index.rst
signalimage/index.rst
synchronization/index.rst
polynomials/index.rst
random/random.rst
dft/index.rst
Expand Down
10 changes: 10 additions & 0 deletions docs_input/api/synchronization/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
.. _synchronization:

Synchronization
###############

.. toctree::
:maxdepth: 1
:glob:

*
18 changes: 18 additions & 0 deletions docs_input/api/synchronization/sync.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
.. _sync_func:

sync
====

Wait for any code running on an executor to complete.

.. doxygenfunction:: matx::cudaExecutor::sync()
.. doxygenfunction:: matx::HostExecutor::sync()

Examples
~~~~~~~~

.. literalinclude:: ../../../examples/cgsolve.cu
:language: cpp
:start-after: example-begin sync-test-1
:end-before: example-end sync-test-1
:dedent:
2 changes: 0 additions & 2 deletions docs_input/notebooks/exercises/example1_assignment1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ int main() {
// t2 = ;
/*** End editing ***/

t2.PrefetchDevice(0);

/****************************************************************************************************
* Get a slice of the second and third rows with all columns
* https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#slicing-and-dicing
Expand Down
8 changes: 5 additions & 3 deletions docs_input/notebooks/exercises/example2_assignment1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ int main() {
auto B = make_tensor<float>({2, 3});
auto V = make_tensor<float>({3});

cudaExecutor exec{};

/****************************************************************************************************
* Initialize tensor A with increasing values from 0.5 to 3.0 in steps of 0.4,
*and tensor V from -1 to -3 in steps of -1.
Expand Down Expand Up @@ -83,7 +85,7 @@ int main() {

/*** End editing ***/

cudaStreamSynchronize(0);
exec.sync();

step = 0.5;
for (int row = 0; row < A.Size(0); row++) {
Expand Down Expand Up @@ -111,7 +113,7 @@ int main() {
/// auto tvs = ;
/*** End editing. ***/

// cudaStreamSynchronize(0);
// exec.sync();

// step = 0.5;
// for (int row = 0; row < A.Size(0); row++) {
Expand All @@ -137,7 +139,7 @@ int main() {

/*** End editing ***/

cudaStreamSynchronize(0);
exec.sync();

for (int row = 0; row < B.Size(0); row++) {
for (int col = 0; col < B.Size(1); col++) {
Expand Down
Loading