2 changes: 1 addition & 1 deletion include/matx/operators/argmax.h
@@ -115,7 +115,7 @@ __MATX_INLINE__ auto argmax(const InType &in, const int (&dims)[D])
}

/**
- * Compute min reduction of a tensor and returns value + index
+ * Compute max reduction of an operator and returns value + index
*
* @tparam InType
* Input data type
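A minimal usage sketch of the value + index form documented above (tensor names and shapes here are illustrative, not from the PR; mtie() is MatX's multi-output assignment):

    // Sketch only: reduce a 1D operator to its max value and the flattened
    // index of that value. The 0-D output tensors are an assumption.
    auto t_in  = matx::make_tensor<float>({100});
    auto t_val = matx::make_tensor<float>({});
    auto t_idx = matx::make_tensor<matx::index_t>({});
    (matx::mtie(t_val, t_idx) = matx::argmax(t_in)).run(matx::cudaExecutor{});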
4 changes: 2 additions & 2 deletions include/matx/operators/argmin.h
@@ -91,7 +91,7 @@ namespace detail {
}

/**
- * Compute min reduction of a tensor and returns value + index along specified axes
+ * Compute min reduction of an operator and returns value + index along specified axes
*
* @tparam InType
* Input data type
@@ -115,7 +115,7 @@ __MATX_INLINE__ auto argmin(const InType &in, const int (&dims)[D])
}

/**
- * Compute min reduction of a tensor and returns value + index
+ * Compute min reduction of an operator and returns value + index
*
* @tparam InType
* Input data type
8 changes: 4 additions & 4 deletions include/matx/operators/argminmax.h
@@ -91,7 +91,7 @@ namespace detail {
}

/**
- * Compute min and max reduction of a tensor and returns value + index along specified axes
+ * Compute min and max reduction of an operator and returns value + index along specified axes
*
* @tparam InType
* Input data type
@@ -102,7 +102,7 @@ namespace detail {
* Input data to reduce
* @param dims
* Array containing dimensions to reduce over
- * @returns Operator with reduced values of argmin-reduce computed
+ * @returns Operator with reduced values of argminmax-reduce computed
*/
template <typename InType, int D>
__MATX_INLINE__ auto argminmax(const InType &in, const int (&dims)[D])
@@ -115,14 +115,14 @@ __MATX_INLINE__ auto argminmax(const InType &in, const int (&dims)[D])
}

/**
- * Compute min and max reduction of a tensor and returns value + index
+ * Compute min and max reduction of an operator and returns value + index
*
* @tparam InType
* Input data type
*
* @param in
* Input data to reduce
- * @returns Operator with reduced values of argmin-reduce computed
+ * @returns Operator with reduced values of argminmax-reduce computed
*/
template <typename InType>
__MATX_INLINE__ auto argminmax(const InType &in)
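Since argminmax produces four outputs, a hedged usage sketch (placeholder tensor names, same mtie() assumption as above):

    // Sketch: one call yields min value/index and max value/index together.
    (matx::mtie(t_minv, t_mini, t_maxv, t_maxi) = matx::argminmax(t_in)).run(exec);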
127 changes: 81 additions & 46 deletions include/matx/transforms/reduce.h
@@ -1976,13 +1976,10 @@ void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, [[maybe_unused]]


/**
- * Compute max reduction of a tensor
+ * Compute max reduction of an operator
*
* Returns a tensor representing the max of all numbers in the reduction
*
- * @note This function uses the name rmax instead of max to not collide with the
- * element-wise operator max.
- *
* @tparam OutType
* Output data type
* @tparam InType
@@ -2007,13 +2004,10 @@ void __MATX_INLINE__ max_impl(OutType dest, const InType &in, cudaExecutor exec
}

/**
- * Compute max reduction of a tensor
+ * Compute max reduction of an operator
*
* Returns a tensor representing the max of all numbers in the reduction
*
- * @note This function uses the name rmax instead of max to not collide with the
- * element-wise operator max.
- *
* @tparam OutType
* Output data type
* @tparam InType
@@ -2036,8 +2030,9 @@ void __MATX_INLINE__ max_impl(OutType dest, const InType &in, [[maybe_unused]] c
*lout = *std::max_element(lin, lin + TotalSize(in));
}
else {
- auto els = lend[1] - lbegin[0];
- for (index_t b = 0; b < els; b++) {
+ const index_t BATCHES = TotalSize(dest);
+ const index_t els = lend[0] - lbegin[0];
+ for (index_t b = 0; b < BATCHES; b++) {
lout[b] = *std::max_element(lin + lbegin[b], lin + lend[b]);
}
}
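The fix above replaces a loop bound derived from lend[1] - lbegin[0] (an element span, which also reads past the end of lend for single-batch inputs) with the actual batch count, TotalSize(dest). A standalone sketch of the corrected pattern, under the assumption that lbegin/lend hold per-batch offsets into the flattened input; the same fix is applied to the min/argmin/argmax host paths below:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    int main() {
      // Two batches of three elements, flattened the way ReduceInput is
      // assumed to present the input to the lambda.
      std::vector<float> in = {3, 1, 4, 1, 5, 9};
      std::vector<std::ptrdiff_t> lbegin = {0, 3}; // batch b spans [lbegin[b], lend[b])
      std::vector<std::ptrdiff_t> lend   = {3, 6};
      std::vector<float> out(2);                   // one slot per batch, like TotalSize(dest)
      for (std::size_t b = 0; b < out.size(); b++) {
        out[b] = *std::max_element(in.begin() + lbegin[b], in.begin() + lend[b]);
      }
      // out == {4, 9}
      return 0;
    }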
@@ -2084,9 +2079,9 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
}

/**
- * Compute maxn reduction of a tensor and returns value + index
+ * Compute max reduction of an operator and returns value + index
 *
- * Returns a tensor with maximums and indices
+ * Returns a tensor with maximums and a tensor with indices
*
* @tparam OutType
* Output data type
@@ -2114,8 +2109,9 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
*lout = cuda::std::max_element(lin, lin + TotalSize(in)) - lin;
}
else {
- auto els = lend[0] - lbegin[0];
- for (index_t b = 0; b < els; b++) {
+ const index_t BATCHES = TotalSize(dest);
+ const index_t els = lend[0] - lbegin[0];
+ for (index_t b = 0; b < BATCHES; b++) {
lout[b] = cuda::std::max_element(lin + lbegin[b], lin + lend[b]) - lin;
}
}
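Note on the index arithmetic in these arg-reductions: subtracting lin makes the stored result an offset into the whole flattened input, not into batch b. A small illustration of the assumed semantics, reusing the names from the lambda above:

    // What the implementation stores (a global offset into the flattened input):
    auto it = cuda::std::max_element(lin + lbegin[b], lin + lend[b]);
    index_t idx_global = it - lin;
    // A batch-relative index would subtract the batch start instead:
    index_t idx_local = idx_global - lbegin[b];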
@@ -2130,7 +2126,7 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT


/**
- * Compute min reduction of a tensor
+ * Compute min reduction of an operator
*
* Returns a tensor representing the min of all numbers in the reduction
*
@@ -2158,13 +2154,10 @@ void __MATX_INLINE__ min_impl(OutType dest, const InType &in, cudaExecutor exec
}

/**
- * Compute min reduction of a tensor
+ * Compute min reduction of an operator
*
* Returns a tensor representing the min of all numbers in the reduction
*
- * @note This function uses the name rmin instead of min to not collide with the
- * element-wise operator min.
- *
* @tparam OutType
* Output data type
* @tparam InType
@@ -2186,8 +2179,9 @@ void __MATX_INLINE__ min_impl(OutType dest, const InType &in, [[maybe_unused]] c
*lout = *std::min_element(lin, lin + TotalSize(in));
}
else {
- auto els = lend[1] - lbegin[0];
- for (index_t b = 0; b < els; b++) {
+ const index_t BATCHES = TotalSize(dest);
+ const index_t els = lend[0] - lbegin[0];
+ for (index_t b = 0; b < BATCHES; b++) {
lout[b] = *std::min_element(lin + lbegin[b], lin + lend[b]);
}
}
@@ -2234,6 +2228,53 @@ void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InT
#endif
}

+/**
+ * Compute min reduction of an operator and returns value + index
+ *
+ * Returns a tensor with minimums and indices
+ *
+ * @tparam OutType
+ * Output data type
+ * @tparam TensorIndexType
+ * Output type storing indices
+ * @tparam InType
+ * Input data type
+ * @tparam MODE
+ * Host executor threads mode
+ *
+ * @param dest
+ * Destination view of reduction
+ * @param idest
+ * Destination for indices
+ * @param in
+ * Input data to reduce
+ * @param exec
+ * Single host executor
+ */
+template <typename OutType, typename TensorIndexType, typename InType, ThreadsMode MODE>
+void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] const HostExecutor<MODE> &exec)
+{
+ MATX_NVTX_START("argmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
+
+ auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
+ if constexpr (OutType::Rank() == 0) {
+ *lout = cuda::std::min_element(lin, lin + TotalSize(in)) - lin;
+ }
+ else {
+ const index_t BATCHES = TotalSize(dest);
+ const index_t els = lend[0] - lbegin[0];
+ for (index_t b = 0; b < BATCHES; b++) {
+ lout[b] = cuda::std::min_element(lin + lbegin[b], lin + lend[b]) - lin;
+ }
+ }
+ };
+
+ // This could be more efficient by not running two reductions to find the same values, but
+ // for brevity this is faster to implement
+ ReduceInput(ft, idest, in);
+ min_impl(dest, in, exec);
+}
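With this overload in place, the same argmin call sites also run on a host executor. A hedged sketch (the SINGLE enumerator and executor spelling are assumptions based on the ThreadsMode template parameter above; tensor names are placeholders):

    // Sketch: run the min value + index reduction on a single-threaded host executor.
    matx::HostExecutor<matx::ThreadsMode::SINGLE> exec{};
    (matx::mtie(t_val, t_idx) = matx::argmin(t_in)).run(exec);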

/**
* Compute min and max reduction of an operator and returns value + index
*
Expand Down Expand Up @@ -2281,51 +2322,45 @@ void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin,
}

/**
- * Compute min reduction of a tensor and returns value + index
+ * Compute min and max reduction of an operator and returns value + index
*
- * Returns a tensor with minimums and indices
+ * Returns tensors with minimums and indices, and maximums and indices
*
* @tparam OutType
* Output data type
* @tparam TensorIndexType
 * Output type storing indices
* @tparam InType
* Input data type
* @tparam MODE
* Host executor threads mode
*
- * @param dest
- * Destination view of reduction
- * @param idest
- * Destination for indices
+ * @param destmin
+ * Destination view of min reduction
+ * @param idestmin
+ * Destination for min indices
+ * @param destmax
+ * Destination view of max reduction
+ * @param idestmax
+ * Destination for max indices
* @param in
* Input data to reduce
* @param exec
- * SIngle host executor
+ * Single host executor
*/
template <typename OutType, typename TensorIndexType, typename InType, ThreadsMode MODE>
- void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] const HostExecutor<MODE> &exec)
+ void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin, OutType destmax, TensorIndexType &idestmax, const InType &in, [[maybe_unused]] const HostExecutor<MODE> &exec)
{
MATX_NVTX_START("argmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

- auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
- if constexpr (OutType::Rank() == 0) {
- *lout = cuda::std::min_element(lin, lin + TotalSize(in)) - lin;
- }
- else {
- auto els = lend[1] - lbegin[0];
- for (index_t b = 0; b < els; b++) {
- lout[b] = cuda::std::min_element(lin + lbegin[b], lin + lend[b]) - lin;
- }
- }
- };
+ static_assert(OutType::Rank() == TensorIndexType::Rank());
+ MATX_NVTX_START("argminmax_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

- // This could be more efficient by not running two reductions to find the same values, but
+ // This could be more efficient by not running argmin and argmax separately, but
 // for brevity this is faster to implement
- ReduceInput(ft, idest, in);
- min_impl(dest, in, exec);
+ argmin_impl(destmin, idestmin, in, exec);
+ argmax_impl(destmax, idestmax, in, exec);
}
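The host path composes the two single reductions, so the input is traversed twice, and the static_assert pins the value and index outputs to the same rank. A shape sketch for a batched call (sizes illustrative; argminmax(in, {1}) follows the dims overload shown in argminmax.h above):

    // Sketch: reduce each of 6 rows; all four outputs are rank-1 tensors of size 6.
    auto in   = matx::make_tensor<float>({6, 1000});
    auto minv = matx::make_tensor<float>({6});
    auto mini = matx::make_tensor<matx::index_t>({6});
    auto maxv = matx::make_tensor<float>({6});
    auto maxi = matx::make_tensor<matx::index_t>({6});
    (matx::mtie(minv, mini, maxv, maxi) = matx::argminmax(in, {1})).run(exec);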



/**
* Find if any value is != 0
*
7 changes: 0 additions & 7 deletions test/00_operators/ReductionTests.cu
@@ -940,8 +940,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMax)
EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5)));
}

- // Limit to cudaExecutor for now, core dump observed for matx::HostExecutor
- if (std::is_same_v<ExecType, matx::cudaExecutor>)
{
ExecType exec{};
const int BATCHES = 6;
@@ -1013,7 +1011,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMin)
EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(1)));
}

- if (std::is_same_v<ExecType, matx::cudaExecutor>)
{
ExecType exec{};
const int BATCHES = 6;
@@ -1055,8 +1052,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMinMax)
using TestType = cuda::std::tuple_element_t<0, TypeParam>;
using ExecType = cuda::std::tuple_element_t<1, TypeParam>;

- // Limit to cudaExecutor for now, not yet implemented for matx::HostExecutor
- if constexpr (std::is_same_v<ExecType, matx::cudaExecutor>)
{
ExecType exec{};
using T = TestType;
@@ -1097,8 +1092,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMinMax)
EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5)));
}

- // Limit to cudaExecutor for now, not yet implemented for matx::HostExecutor
- if constexpr (std::is_same_v<ExecType, matx::cudaExecutor>)
{
ExecType exec{};
const int BATCHES = 6;