Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,10 @@ if (MATX_NVTX_FLAGS)
target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS)
endif()
if (MATX_BUILD_32_BIT)
set(INT_TYPE "lp64")
target_compile_definitions(matx INTERFACE INDEX_32_BIT)
set(MATX_NVPL_INT_TYPE "lp64")
target_compile_definitions(matx INTERFACE MATX_INDEX_32_BIT)
else()
set(INT_TYPE "ilp64")
set(MATX_NVPL_INT_TYPE "ilp64")
endif()

# Host support
Expand All @@ -211,13 +211,13 @@ if (MATX_EN_NVPL OR MATX_EN_X86_FFTW OR MATX_EN_BLIS OR MATX_EN_OPENBLAS)
endif()

if (MATX_EN_NVPL)
message(STATUS "Enabling NVPL library support for ARM CPUs with ${INT_TYPE} interface")
message(STATUS "Enabling NVPL library support for ARM CPUs with ${MATX_NVPL_INT_TYPE} interface")
find_package(nvpl REQUIRED COMPONENTS fft blas lapack HINTS ${blas_DIR})
if (NOT MATX_BUILD_32_BIT)
target_compile_definitions(matx INTERFACE NVPL_ILP64)
endif()
target_compile_definitions(matx INTERFACE NVPL_LAPACK_COMPLEX_CUSTOM)
target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${INT_TYPE}_omp nvpl::lapack_${INT_TYPE}_omp)
target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${MATX_NVPL_INT_TYPE}_omp nvpl::lapack_${MATX_NVPL_INT_TYPE}_omp)
target_compile_definitions(matx INTERFACE MATX_EN_NVPL)
else()
# FFTW
Expand Down
28 changes: 14 additions & 14 deletions examples/black_scholes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ using namespace matx;
* instructions. While caching helps, this can have a slight performance impact when compared to native CUDA
* kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some
* boilerplate code around the original expression. This custom operator can then be used either alone or inside
* other arithmetic expressions, and only a single load is issued for each tensor.
*
* other arithmetic expressions, and only a single load is issued for each tensor.
*
* This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and
* shows the performance difference.
*/
Expand Down Expand Up @@ -76,7 +76,7 @@ public:
auto d2 = d1 - VsqrtT;
auto cdf_d1 = normcdf(d1);
auto cdf_d2 = normcdf(d2);
auto expRT = exp(-1 * r * T);
auto expRT = exp(-1 * r * T);

out_(idx) = S * cdf_d1 - K * expRT * cdf_d2;
}
Expand All @@ -87,20 +87,20 @@ public:

/* Arithmetic expression */
template<typename T1>
void compute_black_scholes_matx(tensor_t<T1,1>& K,
tensor_t<T1,1>& S,
tensor_t<T1,1>& V,
tensor_t<T1,1>& r,
tensor_t<T1,1>& T,
tensor_t<T1,1>& output,
void compute_black_scholes_matx(tensor_t<T1,1>& K,
tensor_t<T1,1>& S,
tensor_t<T1,1>& V,
tensor_t<T1,1>& r,
tensor_t<T1,1>& T,
tensor_t<T1,1>& output,
cudaExecutor& exec)
{
auto VsqrtT = V * sqrt(T);
auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ;
auto d2 = d1 - VsqrtT;
auto cdf_d1 = normcdf(d1);
auto cdf_d2 = normcdf(d2);
auto expRT = exp(-1 * r * T);
auto expRT = exp(-1 * r * T);

(output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);
}
Expand All @@ -120,13 +120,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
tensor_t<dtype, 1> V_tensor{{input_size}};
tensor_t<dtype, 1> r_tensor{{input_size}};
tensor_t<dtype, 1> T_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor{{input_size}};

cudaStream_t stream;
cudaStreamCreate(&stream);
cudaExecutor exec{stream};

compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);

cudaEvent_t start, stop;
cudaEventCreate(&start);
Expand Down Expand Up @@ -154,11 +154,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
cudaEventElapsedTime(&time_ms, start, stop);

printf("Time with custom operator = %.2fms per iteration\n",
time_ms / num_iterations);
time_ms / num_iterations);

cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);
CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
2 changes: 1 addition & 1 deletion examples/cgsolve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
// example-end sync-test-1
printf ("max l2 norm: %f\n", (float)sqrt(maxn()));

CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
4 changes: 2 additions & 2 deletions examples/channelize_poly_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop
}
cudaEventRecord(stop, stream);
exec.sync();
CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
cudaEventElapsedTime(&elapsed_ms, start, stop);

const double avg_elapsed_us = (static_cast<double>(elapsed_ms)/NUM_ITERATIONS)*1.0e3;
Expand All @@ -112,7 +112,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop
printf("\n");
}

CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();

cudaEventDestroy(start);
cudaEventDestroy(stop);
Expand Down
16 changes: 8 additions & 8 deletions examples/conv2d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,23 @@ using namespace matx;
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();

index_t iN = 4;
index_t iM = 6;

index_t fN = 4;
index_t fM = 2;

auto in = make_tensor<int>({iN,iM});
auto filter = make_tensor<int>({fN,fM});

in.SetVals({ {1,2,3,4,5,6},
{5,4,3,2,1,0},
{3,4,5,6,7,8},
{1,2,3,4,5,6},
});

filter.SetVals({ {1,2},
filter.SetVals({ {1,2},
{3,4},
{5,6},
{7,8}});
Expand All @@ -73,9 +73,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
index_t oM = iM - fM + 1;
auto mode = MATX_C_MODE_VALID;
#endif

auto out = make_tensor<int>({oN,oM});

(out = conv2d(in, filter, mode)).run();

printf("in:\n");
Expand All @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
print(out);


CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
4 changes: 2 additions & 2 deletions examples/convolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
for (uint32_t i = 0; i < iterations; i++) {
(outView = conv1d(inView, filterView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec);
}


cudaEventRecord(stop, stream);
exec.sync();
Expand Down Expand Up @@ -149,6 +149,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

matxPrintMemoryStatistics();

CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
Loading