From 1969207f7a78218fe53226b6d3cc0f211d68e2be Mon Sep 17 00:00:00 2001
From: Tyler Allen
Date: Mon, 5 Aug 2024 20:23:06 +0000
Subject: [PATCH 1/2] changes all cub caches to use new format. code changes
 are made and things compile, but have not been tested

---
 include/matx/transforms/cub.h | 362 ++++++++++++++++------------------
 1 file changed, 168 insertions(+), 194 deletions(-)

diff --git a/include/matx/transforms/cub.h b/include/matx/transforms/cub.h
index 2d36aef00..c5db8271c 100644
--- a/include/matx/transforms/cub.h
+++ b/include/matx/transforms/cub.h
@@ -1187,11 +1187,10 @@ void cub_reduce(OutputTensor &a_out, const InputOperator &a, typename InputOpera
       }
     );
-  }
 #else
-  auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE, decltype(reduce_params)>{
-      a_out, a, reduce_params, stream};
-  tmp.ExecReduce(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE, decltype(reduce_params)>{
+      a_out, a, reduce_params, stream};
+    tmp.ExecReduce(a_out, a, stream);
 #endif
 #endif
 }
@@ -1222,25 +1221,25 @@ void cub_sum(OutputTensor &a_out, const InputOperator &a,
 
 #ifndef MATX_DISABLE_CUB_CACHE
   auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM>::GetCubParams(a_out, a, stream);
-
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM>{a_out, a, {}, stream};
-    tmp->ExecSum(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM> *>(ret.value());
-    type->ExecSum(a_out, a, stream);
-  }
+      detail::matxCubPlan_t<OutputTensor,
+                            InputOperator,
+                            detail::CUB_OP_REDUCE_SUM>::GetCubParams(a_out, a, stream);
+
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM, int>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      int test; ///\todo TYLER_TODO: this should not be needed
+      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecSum(a_out, a, stream);
+    }
+  );
 #else
-  auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM>{a_out, a, {}, stream};
-  tmp.ExecSum(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM>{a_out, a, {}, stream};
+    tmp.ExecSum(a_out, a, stream);
 #endif
 #endif
 }
@@ -1272,21 +1271,18 @@ void cub_min(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_MIN>::GetCubParams(a_out, a, stream);
 
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN>{
-        a_out, a, {}, stream};
-
-    tmp->ExecMin(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN> *>(
-            ret.value());
-    type->ExecMin(a_out, a, stream);
-  }
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN, int>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      int test; ///\todo TYLER_TODO: this should not be needed
+      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecMin(a_out, a, stream);
+    }
+  );
 #else
   auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN>{
       a_out, a, {}, stream};
   tmp.ExecMin(a_out, a, stream);
@@ -1322,24 +1318,22 @@ void cub_max(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_MAX>::GetCubParams(a_out, a, stream);
 
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX>{
-        a_out, a, {}, stream};
-    tmp->ExecMax(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX> *>(
-            ret.value());
-    type->ExecMax(a_out, a, stream);
-  }
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX, int>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      int test; ///\todo TYLER_TODO: this should not be needed
+      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecMax(a_out, a, stream);
+    }
+  );
 #else
-  auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX>{
-      a_out, a, {}, stream};
-  tmp.ExecMax(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX>{
+      a_out, a, {}, stream};
+    tmp.ExecMax(a_out, a, stream);
 #endif
 #endif
 }
@@ -1383,28 +1377,26 @@ void sort_impl(OutputTensor &a_out, const InputOperator &a,
   detail::SortParams_t p{dir};
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
   auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT>::GetCubParams(a_out, a, stream);
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT,
+                            detail::SortParams_t>::GetCubParams(a_out, a, stream);
 
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT>{
-        a_out, a, p, stream};
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-    tmp->ExecSort(a_out, a, dir, stream);
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT> *>(
-            ret.value());
-    sort_type->ExecSort(a_out, a, dir, stream);
-  }
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT, detail::SortParams_t>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, p, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecSort(a_out, a, dir, stream);
+    }
+  );
 #else
-  auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT>{
-      a_out, a, p, stream};
-  tmp.ExecSort(a_out, a, dir, stream);
+    auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT>{
+      a_out, a, p, stream};
+    tmp.ExecSort(a_out, a, dir, stream);
 #endif
 #endif
 }
@@ -1486,23 +1478,22 @@ void cumsum_impl(OutputTensor &a_out, const InputOperator &a,
   auto params =
       detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>::GetCubParams(a_out, a, stream);
 
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp =
-        new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>{a_out, a, {}, stream};
-    tmp->ExecPrefixScanEx(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM> *>(ret.value());
-    sort_type->ExecPrefixScanEx(a_out, a, stream);
-  }
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM, int>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      int test; ///\todo TYLER_TODO: this should not be needed
+      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecPrefixScanEx(a_out, a, stream);
+    }
+  );
 #else
-  auto tmp =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>{a_out, a, {}, stream};
-  tmp.ExecPrefixScanEx(a_out, a, stream);
+    auto tmp =
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>{a_out, a, {}, stream};
+    tmp.ExecPrefixScanEx(a_out, a, stream);
 #endif
 #endif
 }
@@ -1572,41 +1563,34 @@ void hist_impl(OutputTensor &a_out, const InputOperator &a,
 #ifdef __CUDACC__
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
+  using param_type = typename detail::HistEvenParams_t<typename InputOperator::value_type>;
   detail::HistEvenParams_t<typename InputOperator::value_type> hp{lower, upper};
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
   auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_HIST_EVEN>::GetCubParams(a_out, a, stream);
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_HIST_EVEN,
+                            param_type>::GetCubParams(a_out, a, stream);
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_HIST_EVEN, param_type>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, hp, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecHistEven(a_out, a, lower, upper, stream);
+    }
+  );
 
-  // Don't cache until we have a good plan for hashing parameters here
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t< OutputTensor,
-                                          InputOperator,
-                                          detail::CUB_OP_HIST_EVEN,
-                                          detail::HistEvenParams_t<typename InputOperator::value_type>>{
-        a_out, a, detail::HistEvenParams_t<typename InputOperator::value_type>{hp}, stream};
-
-    tmp->ExecHistEven(a_out, a, lower, upper, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_HIST_EVEN,
                                           detail::HistEvenParams_t<typename InputOperator::value_type>> *>(
-            ret.value());
-    sort_type->ExecHistEven(a_out, a, lower, upper, stream);
-  }
 #else
-  auto tmp = detail::matxCubPlan_t< OutputTensor,
-                                    InputOperator,
-                                    detail::CUB_OP_HIST_EVEN,
-                                    detail::HistEvenParams_t<typename InputOperator::value_type>>{
-      a_out, a, detail::HistEvenParams_t<typename InputOperator::value_type>{hp}, stream};
+    auto tmp = detail::matxCubPlan_t< OutputTensor,
+                                      InputOperator,
+                                      detail::CUB_OP_HIST_EVEN,
+                                      detail::HistEvenParams_t<typename InputOperator::value_type>>{
+        a_out, a, detail::HistEvenParams_t<typename InputOperator::value_type>{hp}, stream};
 
-  tmp.ExecHistEven(a_out, a, lower, upper, stream);
+    tmp.ExecHistEven(a_out, a, lower, upper, stream);
 #endif
 
 #endif
@@ -1716,40 +1700,36 @@ void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator
   static_assert(CountTensor::Rank() == 0, "Num found output tensor rank must be 0");
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
+  using param_type = typename detail::SelectParams_t<SelectType, CountTensor>;
   auto cparams = detail::SelectParams_t<SelectType, CountTensor>{sel, num_found};
   cudaStream_t stream = exec.getStream();
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
-  auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT>::GetCubParams(a_out, a, stream);
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-
-
-  // Don't cache until we have a good plan for hashing parameters here
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t< OutputTensor,
-                                          InputOperator,
-                                          detail::CUB_OP_SELECT,
-                                          decltype(cparams)>{a_out, a, cparams, stream};
-    tmp->ExecSelect(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT,
                                           decltype(cparams)> *>(
-            ret.value());
-    sort_type->ExecSelect(a_out, a, stream);
-  }
+  auto params =
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT,
+                            param_type>::GetCubParams(a_out, a, stream);
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT, param_type>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, cparams, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecSelect(a_out, a, stream);
+    }
+  );
+
 #else
-  auto tmp = detail::matxCubPlan_t< OutputTensor,
-                                    InputOperator,
-                                    detail::CUB_OP_SELECT,
-                                    decltype(cparams)>{a_out, a, cparams, stream};
-  tmp.ExecSelect(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t< OutputTensor,
                                       InputOperator,
+                                      detail::CUB_OP_SELECT,
+                                      decltype(cparams)>{a_out, a, cparams, stream};
+    tmp.ExecSelect(a_out, a, stream);
 #endif
 #endif
 }
@@ -1840,37 +1820,35 @@ void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOpera
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
   cudaStream_t stream = exec.getStream();
+  using param_type = typename detail::SelectParams_t<SelectType, CountTensor>;
   auto cparams = detail::SelectParams_t<SelectType, CountTensor>{sel, num_found};
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
-  auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT_IDX>::GetCubParams(a_out, a, stream);
-
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-
-  // Don't cache until we have a good plan for hashing parameters here
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t< OutputTensor,
-                                          InputOperator,
-                                          detail::CUB_OP_SELECT_IDX,
-                                          decltype(cparams)>{a_out, a, cparams, stream};
-    tmp->ExecSelectIndex(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT_IDX,
                                           decltype(cparams)> *>(
-            ret.value());
-    sort_type->ExecSelectIndex(a_out, a, stream);
-  }
+  auto params =
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT_IDX,
+                            param_type>::GetCubParams(a_out, a, stream);
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT_IDX, param_type>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, cparams, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecSelectIndex(a_out, a, stream);
+    }
+  );
+
 #else
-  auto tmp = detail::matxCubPlan_t< OutputTensor,
-                                    InputOperator,
-                                    detail::CUB_OP_SELECT_IDX,
-                                    decltype(cparams)>{a_out, a, cparams, stream};
-  tmp.ExecSelectIndex(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t< OutputTensor,
+                                      InputOperator,
+                                      detail::CUB_OP_SELECT_IDX,
+                                      decltype(cparams)>{a_out, a, cparams, stream};
+    tmp.ExecSelectIndex(a_out, a, stream);
 #endif
 #endif
 }
@@ -1961,37 +1939,33 @@ void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperato
   matx::sort_impl(sort_tensor, a, SORT_DIR_ASC, stream);
 
+  using param_type = typename detail::UniqueParams_t<CountTensor>;
   auto cparams = detail::UniqueParams_t<CountTensor>{num_found};
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
-  auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_UNIQUE>::GetCubParams(a_out, sort_tensor, stream);
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-
-
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t< OutputTensor,
-                                          InputOperator,
-                                          detail::CUB_OP_UNIQUE,
-                                          decltype(cparams)>{a_out, sort_tensor, cparams, stream};
-    tmp->ExecUnique(a_out, sort_tensor, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_UNIQUE,
                                           decltype(cparams)> *>(
-            ret.value());
-    sort_type->ExecUnique(a_out, sort_tensor, stream);
-  }
+  auto params =
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_UNIQUE,
+                            param_type>::GetCubParams(a_out, a, stream);
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_UNIQUE, param_type>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, cparams, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecUnique(a_out, sort_tensor, stream);
+    }
+  );
 #else
-  auto tmp = detail::matxCubPlan_t< OutputTensor,
-                                    InputOperator,
-                                    detail::CUB_OP_UNIQUE,
-                                    decltype(cparams)>{a_out, sort_tensor, cparams, stream};
-  tmp.ExecUnique(a_out, sort_tensor, stream);
+    auto tmp = detail::matxCubPlan_t< OutputTensor,
+                                      InputOperator,
+                                      detail::CUB_OP_UNIQUE,
+                                      decltype(cparams)>{a_out, sort_tensor, cparams, stream};
+    tmp.ExecUnique(a_out, sort_tensor, stream);
 #endif
 #endif
 }

From b9ce3a57de2f151df7bb82998dbd04a657b6bb2a Mon Sep 17 00:00:00 2001
From: Tyler Allen
Date: Wed, 7 Aug 2024 22:37:43 +0000
Subject: [PATCH 2/2] update to correct empty param for cub plan

---
 include/matx/transforms/cub.h | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/include/matx/transforms/cub.h b/include/matx/transforms/cub.h
index c5db8271c..c71be9afe 100644
--- a/include/matx/transforms/cub.h
+++ b/include/matx/transforms/cub.h
@@ -1225,13 +1225,12 @@ void cub_sum(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_SUM>::GetCubParams(a_out, a, stream);
 
-  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM, int>;
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM, detail::EmptyParams_t>;
   detail::GetCache().LookupAndExec<detail::cub_cache_t>(
     detail::GetCacheIdFromType<detail::cub_cache_t>(),
     params,
     [&]() {
-      int test; ///\todo TYLER_TODO: this should not be needed
-      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+      return std::make_shared<cache_val_type>(a_out, a, EmptyParams_t{}, stream);
     },
     [&](std::shared_ptr<cache_val_type> ctype) {
       ctype->ExecSum(a_out, a, stream);
@@ -1271,13 +1270,12 @@ void cub_min(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_MIN>::GetCubParams(a_out, a, stream);
 
-  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN, int>;
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN, detail::EmptyParams_t>;
   detail::GetCache().LookupAndExec<detail::cub_cache_t>(
     detail::GetCacheIdFromType<detail::cub_cache_t>(),
     params,
     [&]() {
-      int test; ///\todo TYLER_TODO: this should not be needed
-      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+      return std::make_shared<cache_val_type>(a_out, a, EmptyParams_t{}, stream);
     },
     [&](std::shared_ptr<cache_val_type> ctype) {
       ctype->ExecMin(a_out, a, stream);
@@ -1318,13 +1316,12 @@ void cub_max(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_MAX>::GetCubParams(a_out, a, stream);
 
-  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX, int>;
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX, detail::EmptyParams_t>;
   detail::GetCache().LookupAndExec<detail::cub_cache_t>(
     detail::GetCacheIdFromType<detail::cub_cache_t>(),
     params,
     [&]() {
-      int test; ///\todo TYLER_TODO: this should not be needed
-      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+      return std::make_shared<cache_val_type>(a_out, a, EmptyParams_t{}, stream);
     },
     [&](std::shared_ptr<cache_val_type> ctype) {
       ctype->ExecMax(a_out, a, stream);
@@ -1478,13 +1475,12 @@ void cumsum_impl(OutputTensor &a_out, const InputOperator &a,
   auto params =
       detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>::GetCubParams(a_out, a, stream);
 
-  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM, int>;
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM, detail::EmptyParams_t>;
   detail::GetCache().LookupAndExec<detail::cub_cache_t>(
     detail::GetCacheIdFromType<detail::cub_cache_t>(),
     params,
     [&]() {
-      int test; ///\todo TYLER_TODO: this should not be needed
-      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+      return std::make_shared<cache_val_type>(a_out, a, EmptyParams_t{}, stream);
     },
     [&](std::shared_ptr<cache_val_type> ctype) {
       ctype->ExecPrefixScanEx(a_out, a, stream);
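
Note on the pattern these patches converge on: every call site now uses one type-erased plan cache, keyed per transform, that takes a "make" lambda to construct the plan on a cache miss and an "exec" lambda that runs on both the hit and the miss path. The sketch below is a minimal, self-contained model of that idea and is illustrative only -- TransformCache, the string key, and SumPlan are hypothetical stand-ins, not MatX's actual detail::GetCache()/matxCubPlan_t implementation; only the call shape from the diffs above is mirrored.

    // cache_sketch.cpp -- minimal model of a lookup-or-create-then-exec plan cache.
    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>

    class TransformCache {
    public:
      // On a miss, run `make` to build the plan and store it type-erased;
      // on hit or miss, hand the typed plan to `exec`.
      template <typename PlanType, typename MakeFn, typename ExecFn>
      void LookupAndExec(const std::string &key, MakeFn &&make, ExecFn &&exec) {
        auto it = cache_.find(key);
        if (it == cache_.end()) {
          it = cache_.emplace(key, std::static_pointer_cast<void>(make())).first;
        }
        exec(std::static_pointer_cast<PlanType>(it->second));
      }

    private:
      // shared_ptr<void> is a type-erased owning handle; the deleter captured
      // by make_shared still destroys the concrete plan type correctly.
      std::unordered_map<std::string, std::shared_ptr<void>> cache_;
    };

    // Toy stand-in for a matxCubPlan_t-style plan object.
    struct SumPlan {
      explicit SumPlan(int n) : n(n) { std::cout << "building plan, n=" << n << "\n"; }
      void ExecSum() const { std::cout << "exec sum, n=" << n << "\n"; }
      int n;
    };

    int main() {
      TransformCache cache;
      for (int i = 0; i < 2; ++i) {
        // First iteration constructs and caches the plan; second reuses it.
        cache.LookupAndExec<SumPlan>(
            "sum/n=1024",
            [&]() { return std::make_shared<SumPlan>(1024); },
            [&](std::shared_ptr<SumPlan> p) { p->ExecSum(); });
      }
      return 0;
    }

Storing std::shared_ptr<void> is what lets a single container hold plans of many different template instantiations, which is exactly the situation in cub.h where each OutputTensor/InputOperator/op combination produces a distinct matxCubPlan_t type.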