From 1969207f7a78218fe53226b6d3cc0f211d68e2be Mon Sep 17 00:00:00 2001
From: Tyler Allen
Date: Mon, 5 Aug 2024 20:23:06 +0000
Subject: [PATCH 1/2] changes all cub caches to use new format. code changes
 are made and things compile, but have not been tested

---
 include/matx/transforms/cub.h | 362 ++++++++++++++++------------------
 1 file changed, 168 insertions(+), 194 deletions(-)

diff --git a/include/matx/transforms/cub.h b/include/matx/transforms/cub.h
index 2d36aef00..c5db8271c 100644
--- a/include/matx/transforms/cub.h
+++ b/include/matx/transforms/cub.h
@@ -1187,11 +1187,10 @@ void cub_reduce(OutputTensor &a_out, const InputOperator &a, typename InputOpera
       }
     );
-  }
 #else
-  auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE, decltype(reduce_params)>{
-      a_out, a, reduce_params, stream};
-  tmp.ExecReduce(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE, decltype(reduce_params)>{
+      a_out, a, reduce_params, stream};
+    tmp.ExecReduce(a_out, a, stream);
 #endif
 #endif
 }
@@ -1222,25 +1221,25 @@ void cub_sum(OutputTensor &a_out, const InputOperator &a,
 
 #ifndef MATX_DISABLE_CUB_CACHE
   auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM>::GetCubParams(a_out, a, stream);
-
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM>{a_out, a, {}, stream};
-    tmp->ExecSum(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM> *>(ret.value());
-    type->ExecSum(a_out, a, stream);
-  }
+      detail::matxCubPlan_t<OutputTensor,
+                            InputOperator,
+                            detail::CUB_OP_REDUCE_SUM>::GetCubParams(a_out, a, stream);
+
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM, int>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      int test; ///\todo TYLER_TODO: this should not be needed
+      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecSum(a_out, a, stream);
+    }
+  );
 #else
-  auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM>{a_out, a, {}, stream};
-  tmp.ExecSum(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM>{a_out, a, {}, stream};
+    tmp.ExecSum(a_out, a, stream);
 #endif
 #endif
 }
@@ -1272,21 +1271,18 @@ void cub_min(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_MIN>::GetCubParams(a_out, a, stream);
 
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN>{
-        a_out, a, {}, stream};
-
-    tmp->ExecMin(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN> *>(
-            ret.value());
-    type->ExecMin(a_out, a, stream);
-  }
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN, int>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      int test; ///\todo TYLER_TODO: this should not be needed
+      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecMin(a_out, a, stream);
+    }
+  );
 #else
   auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN>{
       a_out, a, {}, stream};
   tmp.ExecMin(a_out, a, stream);
@@ -1322,24 +1318,22 @@ void cub_max(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_MAX>::GetCubParams(a_out, a, stream);
 
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX>{
-        a_out, a, {}, stream};
-    tmp->ExecMax(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX> *>(
-            ret.value());
-    type->ExecMax(a_out, a, stream);
-  }
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX, int>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      int test; ///\todo TYLER_TODO: this should not be needed
+      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecMax(a_out, a, stream);
+    }
+  );
 #else
-  auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX>{
-      a_out, a, {}, stream};
-  tmp.ExecMax(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX>{
+      a_out, a, {}, stream};
+    tmp.ExecMax(a_out, a, stream);
 #endif
 #endif
 }
@@ -1383,28 +1377,26 @@ void sort_impl(OutputTensor &a_out, const InputOperator &a,
   detail::SortParams_t p{dir};
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
   auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT>::GetCubParams(a_out, a, stream);
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT,
+                            detail::SortParams_t>::GetCubParams(a_out, a, stream);
 
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT>{
-        a_out, a, p, stream};
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-    tmp->ExecSort(a_out, a, dir, stream);
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT> *>(
-            ret.value());
-    sort_type->ExecSort(a_out, a, dir, stream);
-  }
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT, detail::SortParams_t>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, p, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecSort(a_out, a, dir, stream);
+    }
+  );
 #else
-  auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT>{
-      a_out, a, p, stream};
-  tmp.ExecSort(a_out, a, dir, stream);
+    auto tmp = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_RADIX_SORT>{
+      a_out, a, p, stream};
+    tmp.ExecSort(a_out, a, dir, stream);
 #endif
 #endif
 }
@@ -1486,23 +1478,22 @@ void cumsum_impl(OutputTensor &a_out, const InputOperator &a,
   auto params =
       detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>::GetCubParams(a_out, a, stream);
 
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp =
-        new detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>{a_out, a, {}, stream};
-    tmp->ExecPrefixScanEx(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM> *>(ret.value());
-    sort_type->ExecPrefixScanEx(a_out, a, stream);
-  }
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM, int>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      int test; ///\todo TYLER_TODO: this should not be needed
+      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecPrefixScanEx(a_out, a, stream);
+    }
+  );
 #else
-  auto tmp =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>{a_out, a, {}, stream};
-  tmp.ExecPrefixScanEx(a_out, a, stream);
+    auto tmp =
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>{a_out, a, {}, stream};
+    tmp.ExecPrefixScanEx(a_out, a, stream);
 #endif
 #endif
 }
@@ -1572,41 +1563,34 @@ void hist_impl(OutputTensor &a_out, const InputOperator &a,
 #ifdef __CUDACC__
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
+  using param_type = typename detail::HistEvenParams_t<typename InputOperator::value_type>;
   detail::HistEvenParams_t<typename InputOperator::value_type> hp{lower, upper};
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
   auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_HIST_EVEN>::GetCubParams(a_out, a, stream);
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_HIST_EVEN,
+                            param_type>::GetCubParams(a_out, a, stream);
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_HIST_EVEN, param_type>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, hp, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecHistEven(a_out, a, lower, upper, stream);
+    }
+  );
 
-  // Don't cache until we have a good plan for hashing parameters here
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t< OutputTensor,
-                                          InputOperator,
-                                          detail::CUB_OP_HIST_EVEN,
-                                          detail::HistEvenParams_t<typename InputOperator::value_type>>{
-        a_out, a, detail::HistEvenParams_t<typename InputOperator::value_type>{hp}, stream};
-
-    tmp->ExecHistEven(a_out, a, lower, upper, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_HIST_EVEN,
                                           detail::HistEvenParams_t<typename InputOperator::value_type>> *>(
-            ret.value());
-    sort_type->ExecHistEven(a_out, a, lower, upper, stream);
-  }
 #else
-  auto tmp = detail::matxCubPlan_t< OutputTensor,
-                                    InputOperator,
-                                    detail::CUB_OP_HIST_EVEN,
-                                    detail::HistEvenParams_t<typename InputOperator::value_type>>{
-      a_out, a, detail::HistEvenParams_t<typename InputOperator::value_type>{hp}, stream};
+    auto tmp = detail::matxCubPlan_t< OutputTensor,
+                                      InputOperator,
+                                      detail::CUB_OP_HIST_EVEN,
+                                      detail::HistEvenParams_t<typename InputOperator::value_type>>{
+        a_out, a, detail::HistEvenParams_t<typename InputOperator::value_type>{hp}, stream};
 
-  tmp.ExecHistEven(a_out, a, lower, upper, stream);
+    tmp.ExecHistEven(a_out, a, lower, upper, stream);
 #endif
 
 #endif
@@ -1716,40 +1700,36 @@ void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator
   static_assert(CountTensor::Rank() == 0, "Num found output tensor rank must be 0");
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
+  using param_type = typename detail::SelectParams_t<SelectType, CountTensor>;
   auto cparams = detail::SelectParams_t<SelectType, CountTensor>{sel, num_found};
   cudaStream_t stream = exec.getStream();
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
-  auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT>::GetCubParams(a_out, a, stream);
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-
-
-  // Don't cache until we have a good plan for hashing parameters here
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t< OutputTensor,
-                                          InputOperator,
-                                          detail::CUB_OP_SELECT,
-                                          decltype(cparams)>{a_out, a, cparams, stream};
-    tmp->ExecSelect(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT,
                                           decltype(cparams)> *>(
-            ret.value());
-    sort_type->ExecSelect(a_out, a, stream);
-  }
+  auto params =
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT,
+                            param_type>::GetCubParams(a_out, a, stream);
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT, param_type>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, cparams, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecSelect(a_out, a, stream);
+    }
+  );
+
 #else
-  auto tmp = detail::matxCubPlan_t< OutputTensor,
-                                    InputOperator,
-                                    detail::CUB_OP_SELECT,
-                                    decltype(cparams)>{a_out, a, cparams, stream};
-  tmp.ExecSelect(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t< OutputTensor,
                                       InputOperator,
+                                      detail::CUB_OP_SELECT,
+                                      decltype(cparams)>{a_out, a, cparams, stream};
+    tmp.ExecSelect(a_out, a, stream);
 #endif
 #endif
 }
@@ -1840,37 +1820,35 @@ void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOpera
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
   cudaStream_t stream = exec.getStream();
+  using param_type = typename detail::SelectParams_t<SelectType, CountTensor>;
   auto cparams = detail::SelectParams_t<SelectType, CountTensor>{sel, num_found};
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
-  auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT_IDX>::GetCubParams(a_out, a, stream);
-
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-
-  // Don't cache until we have a good plan for hashing parameters here
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t< OutputTensor,
-                                          InputOperator,
-                                          detail::CUB_OP_SELECT_IDX,
-                                          decltype(cparams)>{a_out, a, cparams, stream};
-    tmp->ExecSelectIndex(a_out, a, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT_IDX,
                                           decltype(cparams)> *>(
-            ret.value());
-    sort_type->ExecSelectIndex(a_out, a, stream);
-  }
+  auto params =
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT_IDX,
+                            param_type>::GetCubParams(a_out, a, stream);
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT_IDX, param_type>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, cparams, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecSelectIndex(a_out, a, stream);
+    }
+  );
+
 #else
-  auto tmp = detail::matxCubPlan_t< OutputTensor,
-                                    InputOperator,
-                                    detail::CUB_OP_SELECT_IDX,
-                                    decltype(cparams)>{a_out, a, cparams, stream};
-  tmp.ExecSelectIndex(a_out, a, stream);
+    auto tmp = detail::matxCubPlan_t< OutputTensor,
+                                      InputOperator,
+                                      detail::CUB_OP_SELECT_IDX,
+                                      decltype(cparams)>{a_out, a, cparams, stream};
+    tmp.ExecSelectIndex(a_out, a, stream);
 #endif
 #endif
 }
@@ -1961,37 +1939,33 @@ void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperato
   matx::sort_impl(sort_tensor, a, SORT_DIR_ASC, stream);
 
+  using param_type = typename detail::UniqueParams_t<CountTensor>;
   auto cparams = detail::UniqueParams_t<CountTensor>{num_found};
 
 #ifndef MATX_DISABLE_CUB_CACHE
-  // Get parameters required by these tensors
-  auto params =
-      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_UNIQUE>::GetCubParams(a_out, sort_tensor, stream);
-  // Get cache or new Sort plan if it doesn't exist
-  auto ret = detail::cub_cache.Lookup(params);
-
-
-  if (ret == std::nullopt) {
-    auto tmp = new detail::matxCubPlan_t< OutputTensor,
-                                          InputOperator,
-                                          detail::CUB_OP_UNIQUE,
-                                          decltype(cparams)>{a_out, sort_tensor, cparams, stream};
-    tmp->ExecUnique(a_out, sort_tensor, stream);
-    detail::cub_cache.Insert(params, static_cast<void *>(tmp));
-  }
-  else {
-    auto sort_type =
-        static_cast<detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_UNIQUE,
                                           decltype(cparams)> *>(
-            ret.value());
-    sort_type->ExecUnique(a_out, sort_tensor, stream);
-  }
+  auto params =
+      detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_UNIQUE,
+                            param_type>::GetCubParams(a_out, a, stream);
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_UNIQUE, param_type>;
+  detail::GetCache().LookupAndExec<detail::cub_cache_t>(
+    detail::GetCacheIdFromType<detail::cub_cache_t>(),
+    params,
+    [&]() {
+      return std::make_shared<cache_val_type>(a_out, a, cparams, stream);
+    },
+    [&](std::shared_ptr<cache_val_type> ctype) {
+      ctype->ExecUnique(a_out, sort_tensor, stream);
+    }
+  );
 #else
-  auto tmp = detail::matxCubPlan_t< OutputTensor,
-                                    InputOperator,
-                                    detail::CUB_OP_UNIQUE,
-                                    decltype(cparams)>{a_out, sort_tensor, cparams, stream};
-  tmp.ExecUnique(a_out, sort_tensor, stream);
+    auto tmp = detail::matxCubPlan_t< OutputTensor,
+                                      InputOperator,
+                                      detail::CUB_OP_UNIQUE,
+                                      decltype(cparams)>{a_out, sort_tensor, cparams, stream};
+    tmp.ExecUnique(a_out, sort_tensor, stream);
 #endif
 #endif
 }

From b9ce3a57de2f151df7bb82998dbd04a657b6bb2a Mon Sep 17 00:00:00 2001
From: Tyler Allen
Date: Wed, 7 Aug 2024 22:37:43 +0000
Subject: [PATCH 2/2] update to correct empty param for cub plan

---
 include/matx/transforms/cub.h | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/include/matx/transforms/cub.h b/include/matx/transforms/cub.h
index c5db8271c..c71be9afe 100644
--- a/include/matx/transforms/cub.h
+++ b/include/matx/transforms/cub.h
@@ -1225,13 +1225,12 @@ void cub_sum(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_SUM>::GetCubParams(a_out, a, stream);
 
-  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM, int>;
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_SUM, detail::EmptyParams_t>;
   detail::GetCache().LookupAndExec<detail::cub_cache_t>(
     detail::GetCacheIdFromType<detail::cub_cache_t>(),
     params,
     [&]() {
-      int test; ///\todo TYLER_TODO: this should not be needed
-      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+      return std::make_shared<cache_val_type>(a_out, a, EmptyParams_t{}, stream);
     },
     [&](std::shared_ptr<cache_val_type> ctype) {
       ctype->ExecSum(a_out, a, stream);
@@ -1271,13 +1270,12 @@ void cub_min(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_MIN>::GetCubParams(a_out, a, stream);
 
-  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN, int>;
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MIN, detail::EmptyParams_t>;
   detail::GetCache().LookupAndExec<detail::cub_cache_t>(
     detail::GetCacheIdFromType<detail::cub_cache_t>(),
     params,
     [&]() {
-      int test; ///\todo TYLER_TODO: this should not be needed
-      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+      return std::make_shared<cache_val_type>(a_out, a, EmptyParams_t{}, stream);
     },
     [&](std::shared_ptr<cache_val_type> ctype) {
       ctype->ExecMin(a_out, a, stream);
@@ -1318,13 +1316,12 @@ void cub_max(OutputTensor &a_out, const InputOperator &a,
                             InputOperator,
                             detail::CUB_OP_REDUCE_MAX>::GetCubParams(a_out, a, stream);
 
-  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX, int>;
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_REDUCE_MAX, detail::EmptyParams_t>;
   detail::GetCache().LookupAndExec<detail::cub_cache_t>(
     detail::GetCacheIdFromType<detail::cub_cache_t>(),
     params,
     [&]() {
-      int test; ///\todo TYLER_TODO: this should not be needed
-      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+      return std::make_shared<cache_val_type>(a_out, a, EmptyParams_t{}, stream);
     },
     [&](std::shared_ptr<cache_val_type> ctype) {
       ctype->ExecMax(a_out, a, stream);
@@ -1478,13 +1475,12 @@ void cumsum_impl(OutputTensor &a_out, const InputOperator &a,
   auto params =
       detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM>::GetCubParams(a_out, a, stream);
 
-  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM, int>;
+  using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_INC_SUM, detail::EmptyParams_t>;
   detail::GetCache().LookupAndExec<detail::cub_cache_t>(
     detail::GetCacheIdFromType<detail::cub_cache_t>(),
     params,
     [&]() {
-      int test; ///\todo TYLER_TODO: this should not be needed
-      return std::make_shared<cache_val_type>(a_out, a, test, stream);
+      return std::make_shared<cache_val_type>(a_out, a, EmptyParams_t{}, stream);
     },
     [&](std::shared_ptr<cache_val_type> ctype) {
       ctype->ExecPrefixScanEx(a_out, a, stream);
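
Note on the pattern these patches converge on: every call site now uses one type-erased plan cache, keyed per transform, that takes a "make" lambda to construct the plan on a cache miss and an "exec" lambda that runs on both the hit and the miss path. The sketch below is a minimal, self-contained model of that idea and is illustrative only -- TransformCache, the string key, and SumPlan are hypothetical stand-ins, not MatX's actual detail::GetCache()/matxCubPlan_t implementation; only the call shape from the diffs above is mirrored.

    // cache_sketch.cpp -- minimal model of a lookup-or-create-then-exec plan cache.
    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>

    class TransformCache {
    public:
      // On a miss, run `make` to build the plan and store it type-erased;
      // on hit or miss, hand the typed plan to `exec`.
      template <typename PlanType, typename MakeFn, typename ExecFn>
      void LookupAndExec(const std::string &key, MakeFn &&make, ExecFn &&exec) {
        auto it = cache_.find(key);
        if (it == cache_.end()) {
          it = cache_.emplace(key, std::static_pointer_cast<void>(make())).first;
        }
        exec(std::static_pointer_cast<PlanType>(it->second));
      }

    private:
      // shared_ptr<void> is a type-erased owning handle; the deleter captured
      // by make_shared still destroys the concrete plan type correctly.
      std::unordered_map<std::string, std::shared_ptr<void>> cache_;
    };

    // Toy stand-in for a matxCubPlan_t-style plan object.
    struct SumPlan {
      explicit SumPlan(int n) : n(n) { std::cout << "building plan, n=" << n << "\n"; }
      void ExecSum() const { std::cout << "exec sum, n=" << n << "\n"; }
      int n;
    };

    int main() {
      TransformCache cache;
      for (int i = 0; i < 2; ++i) {
        // First iteration constructs and caches the plan; second reuses it.
        cache.LookupAndExec<SumPlan>(
            "sum/n=1024",
            [&]() { return std::make_shared<SumPlan>(1024); },
            [&](std::shared_ptr<SumPlan> p) { p->ExecSum(); });
      }
      return 0;
    }

Storing std::shared_ptr<void> is what lets a single container hold plans of many different template instantiations, which is exactly the situation in cub.h where each OutputTensor/InputOperator/op combination produces a distinct matxCubPlan_t type.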