2 changes: 1 addition & 1 deletion include/matx/operators/argmax.h
@@ -115,7 +115,7 @@ __MATX_INLINE__ auto argmax(const InType &in, const int (&dims)[D])
}

/**
- * Compute min reduction of a tensor and returns value + index
+ * Compute max reduction of an operator and returns value + index
*
* @tparam InType
* Input data type
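A minimal usage sketch of the value + index form documented above (tensor names and shapes here are illustrative, not from the PR; mtie() is MatX's multi-output assignment):

    // Sketch only: reduce a 1D operator to its max value and the flattened
    // index of that value. The 0-D output tensors are an assumption.
    auto t_in  = matx::make_tensor<float>({100});
    auto t_val = matx::make_tensor<float>({});
    auto t_idx = matx::make_tensor<matx::index_t>({});
    (matx::mtie(t_val, t_idx) = matx::argmax(t_in)).run(matx::cudaExecutor{});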
4 changes: 2 additions & 2 deletions include/matx/operators/argmin.h
@@ -91,7 +91,7 @@ namespace detail {
}

/**
- * Compute min reduction of a tensor and returns value + index along specified axes
+ * Compute min reduction of an operator and returns value + index along specified axes
*
* @tparam InType
* Input data type
@@ -115,7 +115,7 @@ __MATX_INLINE__ auto argmin(const InType &in, const int (&dims)[D])
}

/**
- * Compute min reduction of a tensor and returns value + index
+ * Compute min reduction of an operator and returns value + index
*
* @tparam InType
* Input data type
8 changes: 4 additions & 4 deletions include/matx/operators/argminmax.h
@@ -91,7 +91,7 @@ namespace detail {
}

/**
- * Compute min and max reduction of a tensor and returns value + index along specified axes
+ * Compute min and max reduction of an operator and returns value + index along specified axes
*
* @tparam InType
* Input data type
@@ -102,7 +102,7 @@ namespace detail {
* Input data to reduce
* @param dims
* Array containing dimensions to reduce over
- * @returns Operator with reduced values of argmin-reduce computed
+ * @returns Operator with reduced values of argminmax-reduce computed
*/
template <typename InType, int D>
__MATX_INLINE__ auto argminmax(const InType &in, const int (&dims)[D])
@@ -115,14 +115,14 @@ __MATX_INLINE__ auto argminmax(const InType &in, const int (&dims)[D])
}

/**
- * Compute min and max reduction of a tensor and returns value + index
+ * Compute min and max reduction of an operator and returns value + index
*
* @tparam InType
* Input data type
*
* @param in
* Input data to reduce
- * @returns Operator with reduced values of argmin-reduce computed
+ * @returns Operator with reduced values of argminmax-reduce computed
*/
template <typename InType>
__MATX_INLINE__ auto argminmax(const InType &in)
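Since argminmax produces four outputs, a hedged usage sketch (placeholder tensor names, same mtie() assumption as above):

    // Sketch: one call yields min value/index and max value/index together.
    (matx::mtie(t_minv, t_mini, t_maxv, t_maxi) = matx::argminmax(t_in)).run(exec);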
127 changes: 81 additions & 46 deletions include/matx/transforms/reduce.h
@@ -1976,13 +1976,10 @@ void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, [[maybe_unused]]


/**
- * Compute max reduction of a tensor
+ * Compute max reduction of an operator
*
* Returns a tensor representing the max of all numbers in the reduction
*
- * @note This function uses the name rmax instead of max to not collide with the
- * element-wise operator max.
- *
* @tparam OutType
* Output data type
* @tparam InType
@@ -2007,13 +2004,10 @@ void __MATX_INLINE__ max_impl(OutType dest, const InType &in, cudaExecutor exec
}

/**
- * Compute max reduction of a tensor
+ * Compute max reduction of an operator
*
* Returns a tensor representing the max of all numbers in the reduction
*
- * @note This function uses the name rmax instead of max to not collide with the
- * element-wise operator max.
- *
* @tparam OutType
* Output data type
* @tparam InType
@@ -2036,8 +2030,9 @@ void __MATX_INLINE__ max_impl(OutType dest, const InType &in, [[maybe_unused]] c
*lout = *std::max_element(lin, lin + TotalSize(in));
}
else {
- auto els = lend[1] - lbegin[0];
- for (index_t b = 0; b < els; b++) {
+ const index_t BATCHES = TotalSize(dest);
+ const index_t els = lend[0] - lbegin[0];
+ for (index_t b = 0; b < BATCHES; b++) {
lout[b] = *std::max_element(lin + lbegin[b], lin + lend[b]);
}
}
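The fix above replaces a loop bound derived from lend[1] - lbegin[0] (an element span, which also reads past the end of lend for single-batch inputs) with the actual batch count, TotalSize(dest). A standalone sketch of the corrected pattern, under the assumption that lbegin/lend hold per-batch offsets into the flattened input; the same fix is applied to the min/argmin/argmax host paths below:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    int main() {
      // Two batches of three elements, flattened the way ReduceInput is
      // assumed to present the input to the lambda.
      std::vector<float> in = {3, 1, 4, 1, 5, 9};
      std::vector<std::ptrdiff_t> lbegin = {0, 3}; // batch b spans [lbegin[b], lend[b])
      std::vector<std::ptrdiff_t> lend   = {3, 6};
      std::vector<float> out(2);                   // one slot per batch, like TotalSize(dest)
      for (std::size_t b = 0; b < out.size(); b++) {
        out[b] = *std::max_element(in.begin() + lbegin[b], in.begin() + lend[b]);
      }
      // out == {4, 9}
      return 0;
    }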
@@ -2084,9 +2079,9 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
}

/**
- * Compute maxn reduction of a tensor and returns value + index
+ * Compute max reduction of an operator and returns value + index
 *
- * Returns a tensor with maximums and indices
+ * Returns a tensor with maximums and a tensor with indices
*
* @tparam OutType
* Output data type
@@ -2114,8 +2109,9 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
*lout = cuda::std::max_element(lin, lin + TotalSize(in)) - lin;
}
else {
- auto els = lend[0] - lbegin[0];
- for (index_t b = 0; b < els; b++) {
+ const index_t BATCHES = TotalSize(dest);
+ const index_t els = lend[0] - lbegin[0];
+ for (index_t b = 0; b < BATCHES; b++) {
lout[b] = cuda::std::max_element(lin + lbegin[b], lin + lend[b]) - lin;
}
}
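Note on the index arithmetic in these arg-reductions: subtracting lin makes the stored result an offset into the whole flattened input, not into batch b. A small illustration of the assumed semantics, reusing the names from the lambda above:

    // What the implementation stores (a global offset into the flattened input):
    auto it = cuda::std::max_element(lin + lbegin[b], lin + lend[b]);
    index_t idx_global = it - lin;
    // A batch-relative index would subtract the batch start instead:
    index_t idx_local = idx_global - lbegin[b];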
@@ -2130,7 +2126,7 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT


/**
- * Compute min reduction of a tensor
+ * Compute min reduction of an operator
*
* Returns a tensor representing the min of all numbers in the reduction
*
@@ -2158,13 +2154,10 @@ void __MATX_INLINE__ min_impl(OutType dest, const InType &in, cudaExecutor exec
}

/**
- * Compute min reduction of a tensor
+ * Compute min reduction of an operator
*
* Returns a tensor representing the min of all numbers in the reduction
*
- * @note This function uses the name rmin instead of min to not collide with the
- * element-wise operator min.
- *
* @tparam OutType
* Output data type
* @tparam InType
@@ -2186,8 +2179,9 @@ void __MATX_INLINE__ min_impl(OutType dest, const InType &in, [[maybe_unused]] c
*lout = *std::min_element(lin, lin + TotalSize(in));
}
else {
- auto els = lend[1] - lbegin[0];
- for (index_t b = 0; b < els; b++) {
+ const index_t BATCHES = TotalSize(dest);
+ const index_t els = lend[0] - lbegin[0];
+ for (index_t b = 0; b < BATCHES; b++) {
lout[b] = *std::min_element(lin + lbegin[b], lin + lend[b]);
}
}
@@ -2234,6 +2228,53 @@ void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InT
#endif
}

+/**
+ * Compute min reduction of an operator and returns value + index
+ *
+ * Returns a tensor with minimums and indices
+ *
+ * @tparam OutType
+ * Output data type
+ * @tparam TensorIndexType
+ * Output type storing indices
+ * @tparam InType
+ * Input data type
+ * @tparam MODE
+ * Host executor threads mode
+ *
+ * @param dest
+ * Destination view of reduction
+ * @param idest
+ * Destination for indices
+ * @param in
+ * Input data to reduce
+ * @param exec
+ * Single host executor
+ */
+template <typename OutType, typename TensorIndexType, typename InType, ThreadsMode MODE>
+void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] const HostExecutor<MODE> &exec)
+{
+ MATX_NVTX_START("argmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
+
+ auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
+ if constexpr (OutType::Rank() == 0) {
+ *lout = cuda::std::min_element(lin, lin + TotalSize(in)) - lin;
+ }
+ else {
+ const index_t BATCHES = TotalSize(dest);
+ const index_t els = lend[0] - lbegin[0];
+ for (index_t b = 0; b < BATCHES; b++) {
+ lout[b] = cuda::std::min_element(lin + lbegin[b], lin + lend[b]) - lin;
+ }
+ }
+ };
+
+ // This could be more efficient by not running two reductions to find the same values, but
+ // for brevity this is faster to implement
+ ReduceInput(ft, idest, in);
+ min_impl(dest, in, exec);
+}
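With this overload in place, the same argmin call sites also run on a host executor. A hedged sketch (the SINGLE enumerator and executor spelling are assumptions based on the ThreadsMode template parameter above; tensor names are placeholders):

    // Sketch: run the min value + index reduction on a single-threaded host executor.
    matx::HostExecutor<matx::ThreadsMode::SINGLE> exec{};
    (matx::mtie(t_val, t_idx) = matx::argmin(t_in)).run(exec);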

/**
* Compute min and max reduction of an operator and returns value + index
*
Expand Down Expand Up @@ -2281,51 +2322,45 @@ void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin,
}

/**
- * Compute min reduction of a tensor and returns value + index
+ * Compute min and max reduction of an operator and returns value + index
*
- * Returns a tensor with minimums and indices
+ * Returns tensors with minimums and indices, and maximums and indices
*
* @tparam OutType
* Output data type
* @tparam TensorIndexType
 * Output type storing indices
* @tparam InType
* Input data type
* @tparam MODE
* Host executor threads mode
*
- * @param dest
- * Destination view of reduction
- * @param idest
- * Destination for indices
+ * @param destmin
+ * Destination view of min reduction
+ * @param idestmin
+ * Destination for min indices
+ * @param destmax
+ * Destination view of max reduction
+ * @param idestmax
+ * Destination for max indices
* @param in
* Input data to reduce
* @param exec
- * SIngle host executor
+ * Single host executor
*/
template <typename OutType, typename TensorIndexType, typename InType, ThreadsMode MODE>
- void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] const HostExecutor<MODE> &exec)
+ void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin, OutType destmax, TensorIndexType &idestmax, const InType &in, [[maybe_unused]] const HostExecutor<MODE> &exec)
{
MATX_NVTX_START("argmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

- auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
- if constexpr (OutType::Rank() == 0) {
- *lout = cuda::std::min_element(lin, lin + TotalSize(in)) - lin;
- }
- else {
- auto els = lend[1] - lbegin[0];
- for (index_t b = 0; b < els; b++) {
- lout[b] = cuda::std::min_element(lin + lbegin[b], lin + lend[b]) - lin;
- }
- }
- };
+ static_assert(OutType::Rank() == TensorIndexType::Rank());
+ MATX_NVTX_START("argminmax_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

- // This could be more efficient by not running two reductions to find the same values, but
+ // This could be more efficient by not running argmin and argmax separately, but
 // for brevity this is faster to implement
- ReduceInput(ft, idest, in);
- min_impl(dest, in, exec);
+ argmin_impl(destmin, idestmin, in, exec);
+ argmax_impl(destmax, idestmax, in, exec);
}
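The host path composes the two single reductions, so the input is traversed twice, and the static_assert pins the value and index outputs to the same rank. A shape sketch for a batched call (sizes illustrative; argminmax(in, {1}) follows the dims overload shown in argminmax.h above):

    // Sketch: reduce each of 6 rows; all four outputs are rank-1 tensors of size 6.
    auto in   = matx::make_tensor<float>({6, 1000});
    auto minv = matx::make_tensor<float>({6});
    auto mini = matx::make_tensor<matx::index_t>({6});
    auto maxv = matx::make_tensor<float>({6});
    auto maxi = matx::make_tensor<matx::index_t>({6});
    (matx::mtie(minv, mini, maxv, maxi) = matx::argminmax(in, {1})).run(exec);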



/**
* Find if any value is != 0
*
7 changes: 0 additions & 7 deletions test/00_operators/ReductionTests.cu
@@ -940,8 +940,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMax)
EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5)));
}

- // Limit to cudaExecutor for now, core dump observed for matx::HostExecutor
- if (std::is_same_v<ExecType, matx::cudaExecutor>)
{
ExecType exec{};
const int BATCHES = 6;
@@ -1013,7 +1011,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMin)
EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(1)));
}

- if (std::is_same_v<ExecType, matx::cudaExecutor>)
{
ExecType exec{};
const int BATCHES = 6;
@@ -1055,8 +1052,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMinMax)
using TestType = cuda::std::tuple_element_t<0, TypeParam>;
using ExecType = cuda::std::tuple_element_t<1, TypeParam>;

- // Limit to cudaExecutor for now, not yet implemented for matx::HostExecutor
- if constexpr (std::is_same_v<ExecType, matx::cudaExecutor>)
{
ExecType exec{};
using T = TestType;
@@ -1097,8 +1092,6 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, ArgMinMax)
EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TestType)(5)));
}

- // Limit to cudaExecutor for now, not yet implemented for matx::HostExecutor
- if constexpr (std::is_same_v<ExecType, matx::cudaExecutor>)
{
ExecType exec{};
const int BATCHES = 6;