Fix matx::HostExecutor segfault with argmin/argmax. Add matx::HostExecutor implementation for argminmax.

tmartin-gh · tmartin-gh · commit ae94f4497e80 · 2024-10-22T13:40:31.000-07:00
diff --git a/include/matx/operators/argmax.h b/include/matx/operators/argmax.h
@@ -115,7 +115,7 @@ __MATX_INLINE__ auto argmax(const InType &in, const int (&dims)[D])
 }
 
 /**
- * Compute min reduction of a tensor and returns value + index
+ * Compute max reduction of an operator and returns value + index
  *
  * @tparam InType
  *   Input data type
diff --git a/include/matx/operators/argmin.h b/include/matx/operators/argmin.h
@@ -91,7 +91,7 @@ namespace detail {
 }
 
 /**
- * Compute min reduction of a tensor and returns value + index along specified axes
+ * Compute min reduction of an operator and returns value + index along specified axes
  *
  * @tparam InType
  *   Input data type
@@ -115,7 +115,7 @@ __MATX_INLINE__ auto argmin(const InType &in, const int (&dims)[D])
 }
 
 /**
- * Compute min reduction of a tensor and returns value + index
+ * Compute min reduction of an operator and returns value + index
  *
  * @tparam InType
  *   Input data type
diff --git a/include/matx/operators/argminmax.h b/include/matx/operators/argminmax.h
@@ -91,7 +91,7 @@ namespace detail {
 }
 
 /**
- * Compute min and max reduction of a tensor and returns value + index along specified axes
+ * Compute min and max reduction of an operator and returns value + index along specified axes
  *
  * @tparam InType
  *   Input data type
@@ -102,7 +102,7 @@ namespace detail {
  *   Input data to reduce
  * @param dims
  *   Array containing dimensions to reduce over
- * @returns Operator with reduced values of argmin-reduce computed
+ * @returns Operator with reduced values of argminmax-reduce computed
  */
 template <typename InType, int D>
 __MATX_INLINE__ auto argminmax(const InType &in, const int (&dims)[D])
@@ -115,14 +115,14 @@ __MATX_INLINE__ auto argminmax(const InType &in, const int (&dims)[D])
 }
 
 /**
- * Compute min and max reduction of a tensor and returns value + index
+ * Compute min and max reduction of an operator and returns value + index
  *
  * @tparam InType
  *   Input data type
  *
  * @param in
  *   Input data to reduce
- * @returns Operator with reduced values of argmin-reduce computed
+ * @returns Operator with reduced values of argminmax-reduce computed
  */
 template <typename InType>
 __MATX_INLINE__ auto argminmax(const InType &in)
diff --git a/include/matx/transforms/reduce.h b/include/matx/transforms/reduce.h
@@ -1976,13 +1976,10 @@ void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, [[maybe_unused]]
 
 
 /**
- * Compute max reduction of a tensor
+ * Compute max reduction of an operator
  *
  * Returns a tensor representing the max of all numbers in the reduction
  *
- * @note This function uses the name rmax instead of max to not collide with the
- * element-wise operator max.
- *
  * @tparam OutType
  *   Output data type
  * @tparam InType
@@ -2007,13 +2004,10 @@ void __MATX_INLINE__ max_impl(OutType dest, const InType &in, cudaExecutor exec
 }
 
 /**
- * Compute max reduction of a tensor
+ * Compute max reduction of an operator
  *
  * Returns a tensor representing the max of all numbers in the reduction
  *
- * @note This function uses the name rmax instead of max to not collide with the
- * element-wise operator max.
- *
  * @tparam OutType
  *   Output data type
  * @tparam InType
@@ -2036,8 +2030,9 @@ void __MATX_INLINE__ max_impl(OutType dest, const InType &in, [[maybe_unused]] c
       *lout = *std::max_element(lin, lin + TotalSize(in));
     }
     else {
-      auto els = lend[1] - lbegin[0];
-      for (index_t b = 0; b < els; b++) {
+      const index_t BATCHES = TotalSize(dest);
+      const index_t els = lend[0] - lbegin[0];
+      for (index_t b = 0; b < BATCHES; b++) {
         lout[b] = *std::max_element(lin + lbegin[b], lin + lend[b]);
       }
     }
@@ -2084,9 +2079,9 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
 }
 
 /**
- * Compute maxn reduction of a tensor and returns value + index
+ * Compute max reduction of an operator and returns value + index
  *
- * Returns a tensor with maximums and indices
+ * Returns a tensor with maximums and a tensor with indices
  *
  * @tparam OutType
  *   Output data type
@@ -2114,8 +2109,9 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
       *lout = cuda::std::max_element(lin, lin + TotalSize(in)) - lin;
     }
     else {
-      auto els = lend[0] - lbegin[0];
-      for (index_t b = 0; b < els; b++) {
+      const index_t BATCHES = TotalSize(dest);
+      const index_t els = lend[0] - lbegin[0];
+      for (index_t b = 0; b < BATCHES; b++) {
         lout[b] = cuda::std::max_element(lin + lbegin[b], lin + lend[b]) - lin;
       }
     }
@@ -2130,7 +2126,7 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
 
 
 /**
- * Compute min reduction of a tensor
+ * Compute min reduction of an operator
  *
  * Returns a tensor representing the min of all numbers in the reduction
  *
@@ -2158,13 +2154,10 @@ void __MATX_INLINE__ min_impl(OutType dest, const InType &in, cudaExecutor exec
 }
 
 /**
- * Compute min reduction of a tensor
+ * Compute min reduction of an operator
  *
  * Returns a tensor representing the min of all numbers in the reduction
  *
- * @note This function uses the name rmin instead of min to not collide with the
- * element-wise operator min.
- *
  * @tparam OutType
  *   Output data type
  * @tparam InType
@@ -2186,8 +2179,9 @@ void __MATX_INLINE__ min_impl(OutType dest, const InType &in, [[maybe_unused]] c
       *lout = *std::min_element(lin, lin + TotalSize(in));
     }
     else {
-      auto els = lend[1] - lbegin[0];
-      for (index_t b = 0; b < els; b++) {
+      const index_t BATCHES = TotalSize(dest);
+      const index_t els = lend[0] - lbegin[0];
+      for (index_t b = 0; b < BATCHES; b++) {
         lout[b] = *std::min_element(lin + lbegin[b], lin + lend[b]);
       }
     }
@@ -2234,6 +2228,53 @@ void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InT
 #endif
 }
 
+/**
+ * Compute min reduction of an operator and returns value + index
+ *
+ * Returns a tensor with minimums and a tensor with indices
+ *
+ * @tparam OutType
+ *   Output data type
+ * @tparam TensorIndexType
+ *   Output type storing indices
+ * @tparam InType
+ *   Input data type
+ * @tparam MODE
+ *   Host executor threads mode
+ *
+ * @param dest
+ *   Destination view of reduction
+ * @param idest
+ *   Destination for indices
+ * @param in
+ *   Input data to reduce
+ * @param exec
+ *   Single host executor
+ */
+template <typename OutType, typename TensorIndexType, typename InType, ThreadsMode MODE>
+void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] const HostExecutor<MODE> &exec)
+{
+  MATX_NVTX_START("argmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
+
+  auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
+    if constexpr (OutType::Rank() == 0) {
+      *lout = cuda::std::min_element(lin, lin + TotalSize(in)) - lin;
+    }
+    else {
+      const index_t BATCHES = TotalSize(dest);
+      const index_t els = lend[0] - lbegin[0];
+      for (index_t b = 0; b < BATCHES; b++) {
+        lout[b] = cuda::std::min_element(lin + lbegin[b], lin + lend[b]) - lin;
+      }
+    }
+  };
+
+  // A single fused pass could produce both the values and the indices, but running
+  // two separate reductions keeps this implementation short
+  ReduceInput(ft, idest, in);
+  min_impl(dest, in, exec);
+}
+
 /**
  * Compute min and max reduction of an operator and returns value + index
  *
@@ -2281,51 +2322,45 @@ void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin,
 }
 
 /**
- * Compute min reduction of a tensor and returns value + index
+ * Compute min and max reduction of an operator and returns value + index
  *
- * Returns a tensor with minimums and indices
+ * Returns tensors with minimums and indices, and maximums and indices
  *
  * @tparam OutType
  *   Output data type
  * @tparam TensorIndexType
  *   Output type stpring indices
  * @tparam InType
  *   Input data type
+ * @tparam MODE
+ *   Host executor threads mode
  *
- * @param dest
- *   Destination view of reduction
- * @param idest
- *   Destination for indices
+ * @param destmin
+ *   Destination view of min reduction
+ * @param idestmin
+ *   Destination for min indices
+ * @param destmax
+ *   Destination view of max reduction
+ * @param idestmax
+ *   Destination for max indices
  * @param in
  *   Input data to reduce
  * @param exec
- *   SIngle host executor
+ *   Single host executor
  */
 template <typename OutType, typename TensorIndexType, typename InType, ThreadsMode MODE>
-void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] const HostExecutor<MODE> &exec)
+void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin, OutType destmax, TensorIndexType &idestmax, const InType &in, [[maybe_unused]] const HostExecutor<MODE> &exec)
 {
-  MATX_NVTX_START("argmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
-
-  auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
-    if constexpr (OutType::Rank() == 0) {
-      *lout = cuda::std::min_element(lin, lin + TotalSize(in)) - lin;
-    }
-    else {
-      auto els = lend[1] - lbegin[0];
-      for (index_t b = 0; b < els; b++) {
-        lout[b] = cuda::std::min_element(lin + lbegin[b], lin + lend[b]) - lin;
-      }
-    }
-  };
+  static_assert(OutType::Rank() == TensorIndexType::Rank());
+  MATX_NVTX_START("argminmax_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
 
-  // This could be more efficient by not running two reductions to find the same values, but
+  // This could be more efficient by not running argmin and argmax separately but
   // for brevity this is faster
-  ReduceInput(ft, idest, in);
-  min_impl(dest, in, exec);
+  argmin_impl(destmin, idestmin, in, exec);
+  argmax_impl(destmax, idestmax, in, exec);
 }
 
 
-
 /**
  * Find if any value is != 0
  *
diff --git a/test/00_operators/ReductionTests.cu b/test/00_operators/ReductionTests.cu
@@ -940,8 +940,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMax)
     EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5)));
   }
 
-  // Limit to cudaExecutor for now, core dump observed for matx::HostExecutor
-  if (std::is_same_v<ExecType, matx::cudaExecutor>)
   {
     ExecType exec{};
     const int BATCHES = 6;
@@ -1013,7 +1011,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMin)
     EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(1)));  
   }
 
-  if (std::is_same_v<ExecType, matx::cudaExecutor>)
   {
     ExecType exec{};
     const int BATCHES = 6;
@@ -1055,8 +1052,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMinMax)
   using TestType = cuda::std::tuple_element_t<0, TypeParam>;
   using ExecType = cuda::std::tuple_element_t<1, TypeParam>;
 
-  // Limit to cudaExecutor for now, not yet implemented for matx::HostExecutor
-  if constexpr (std::is_same_v<ExecType, matx::cudaExecutor>)
   {
     ExecType exec{};
     using T = TestType;
@@ -1097,8 +1092,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMinMax)
     EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5)));
   }
 
-  // Limit to cudaExecutor for now, not yet implemented for matx::HostExecutor
-  if constexpr (std::is_same_v<ExecType, matx::cudaExecutor>)
   {
     ExecType exec{};
     const int BATCHES = 6;

Original file line number	Diff line number	Diff line change
`@@ -115,7 +115,7 @@ __MATX_INLINE__ auto argmax(const InType &in, const int (&dims)[D])`
`115`	`115`	`}`
`116`	`116`
`117`	`117`	`/**`
`118`		`- * Compute min reduction of a tensor and returns value + index`
	`118`	`+ * Compute max reduction of an operator and returns value + index`
`119`	`119`	`*`
`120`	`120`	`* @tparam InType`
`121`	`121`	`* Input data type`
Original file line number	Diff line number	Diff line change
`@@ -91,7 +91,7 @@ namespace detail {`
`91`	`91`	`}`
`92`	`92`
`93`	`93`	`/**`
`94`		`- * Compute min reduction of a tensor and returns value + index along specified axes`
	`94`	`+ * Compute min reduction of an operator and returns value + index along specified axes`
`95`	`95`	`*`
`96`	`96`	`* @tparam InType`
`97`	`97`	`* Input data type`
`@@ -115,7 +115,7 @@ __MATX_INLINE__ auto argmin(const InType &in, const int (&dims)[D])`
`115`	`115`	`}`
`116`	`116`
`117`	`117`	`/**`
`118`		`- * Compute min reduction of a tensor and returns value + index`
	`118`	`+ * Compute min reduction of an operator and returns value + index`
`119`	`119`	`*`
`120`	`120`	`* @tparam InType`
`121`	`121`	`* Input data type`
Original file line number	Diff line number	Diff line change
`@@ -940,8 +940,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMax)`
`940`	`940`	`EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5)));`
`941`	`941`	`}`
`942`	`942`
`943`		`- // Limit to cudaExecutor for now, core dump observed for matx::HostExecutor`
`944`		`- if (std::is_same_v<ExecType, matx::cudaExecutor>)`
`945`	`943`	`{`
`946`	`944`	`ExecType exec{};`
`947`	`945`	`const int BATCHES = 6;`
`@@ -1013,7 +1011,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMin)`
`1013`	`1011`	`EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(1)));`
`1014`	`1012`	`}`
`1015`	`1013`
`1016`		`- if (std::is_same_v<ExecType, matx::cudaExecutor>)`
`1017`	`1014`	`{`
`1018`	`1015`	`ExecType exec{};`
`1019`	`1016`	`const int BATCHES = 6;`
`@@ -1055,8 +1052,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMinMax)`
`1055`	`1052`	`using TestType = cuda::std::tuple_element_t<0, TypeParam>;`
`1056`	`1053`	`using ExecType = cuda::std::tuple_element_t<1, TypeParam>;`
`1057`	`1054`
`1058`		`- // Limit to cudaExecutor for now, not yet implemented for matx::HostExecutor`
`1059`		`- if constexpr (std::is_same_v<ExecType, matx::cudaExecutor>)`
`1060`	`1055`	`{`
`1061`	`1056`	`ExecType exec{};`
`1062`	`1057`	`using T = TestType;`
`@@ -1097,8 +1092,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMinMax)`
`1097`	`1092`	`EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5)));`
`1098`	`1093`	`}`
`1099`	`1094`
`1100`		`- // Limit to cudaExecutor for now, not yet implemented for matx::HostExecutor`
`1101`		`- if constexpr (std::is_same_v<ExecType, matx::cudaExecutor>)`
`1102`	`1095`	`{`
`1103`	`1096`	`ExecType exec{};`
`1104`	`1097`	`const int BATCHES = 6;`