Skip to content

Commit db3fef2

Browse files
authored
Added find_peaks operator (#1029)
* Added find_peaks operator
1 parent 75a279c commit db3fef2

File tree

10 files changed

+423
-25
lines changed

10 files changed

+423
-25
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
.. _find_peaks_func:
2+
3+
find_peaks
4+
##########
5+
6+
Finds the peaks in a 1D input operator.
7+
8+
Currently only the `height` and `threshold` parameters are supported.
9+
10+
.. doxygenfunction:: find_peaks
11+
12+
Examples
13+
~~~~~~~~
14+
15+
.. literalinclude:: ../../../test/00_operators/find_peaks.cu
16+
:language: cpp
17+
:start-after: example-begin findpeaks-test-1
18+
:end-before: example-end findpeaks-test-1
19+
:dedent:
20+
21+
22+

include/matx/core/type_utils.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,26 @@ inline constexpr bool has_shape_type_v = detail::has_shape_type<typename remove_
677677

678678

679679

680+
// Detect presence of nested alias `index_cmp_op` and verify it equals bool
681+
namespace detail {
682+
template <typename T, typename = void>
683+
struct has_index_cmp_op : std::false_type {};
684+
685+
template <typename T>
686+
struct has_index_cmp_op<T, std::void_t<typename T::index_cmp_op>>
687+
: std::bool_constant<std::is_same_v<typename T::index_cmp_op, bool>> {};
688+
}
689+
690+
/**
691+
* @brief Determine if a type defines `using index_cmp_op = bool;`
692+
*
693+
* @tparam T Type to test
694+
*/
695+
template <typename T>
696+
inline constexpr bool has_index_cmp_op_v = detail::has_index_cmp_op<typename remove_cvref<T>::type>::value;
697+
698+
699+
680700
namespace detail {
681701
template <typename T>
682702
struct is_complex_half
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
////////////////////////////////////////////////////////////////////////////////
2+
// BSD 3-Clause License
3+
//
4+
// Copyright (c) 2021, NVIDIA Corporation
5+
// All rights reserved.
6+
//
7+
// Redistribution and use in source and binary forms, with or without
8+
// modification, are permitted provided that the following conditions are met:
9+
//
10+
// 1. Redistributions of source code must retain the above copyright notice, this
11+
// list of conditions and the following disclaimer.
12+
//
13+
// 2. Redistributions in binary form must reproduce the above copyright notice,
14+
// this list of conditions and the following disclaimer in the documentation
15+
// and/or other materials provided with the distribution.
16+
//
17+
// 3. Neither the name of the copyright holder nor the names of its
18+
// contributors may be used to endorse or promote products derived from
19+
// this software without specific prior written permission.
20+
//
21+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24+
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25+
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
/////////////////////////////////////////////////////////////////////////////////
32+
33+
#pragma once
34+
35+
36+
#include "matx/core/type_utils.h"
37+
#include "matx/operators/base_operator.h"
38+
#include "matx/operators/permute.h"
39+
#include "matx/transforms/find_peaks.h"
40+
41+
namespace matx {
42+
43+
44+
45+
namespace detail {
46+
template<typename OpA>
47+
class FindPeaksOp : public BaseOp<FindPeaksOp<OpA>>
48+
{
49+
private:
50+
typename detail::base_type_t<OpA> a_;
51+
typename remove_cvref_t<OpA>::value_type height_;
52+
typename remove_cvref_t<OpA>::value_type threshold_;
53+
54+
public:
55+
using matxop = bool;
56+
using value_type = typename remove_cvref_t<OpA>::value_type;
57+
using matx_transform_op = bool;
58+
using find_peaks_xform_op = bool;
59+
60+
__MATX_INLINE__ std::string str() const { return "find_peaks(" + get_type_str(a_) + ")"; }
61+
__MATX_INLINE__ FindPeaksOp(const OpA &a, value_type height,
62+
value_type threshold) :
63+
a_(a), height_(height), threshold_(threshold) {
64+
}
65+
66+
template <typename... Is>
67+
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const = delete;
68+
69+
template <OperatorCapability Cap>
70+
__MATX_INLINE__ __MATX_HOST__ auto get_capability() const {
71+
auto self_has_cap = capability_attributes<Cap>::default_value;
72+
return combine_capabilities<Cap>(self_has_cap, detail::get_operator_capability<Cap>(a_));
73+
}
74+
75+
template <typename Out, typename Executor>
76+
void Exec(Out &&out, Executor &&ex) const {
77+
static_assert(cuda::std::tuple_size_v<remove_cvref_t<Out>> == 3, "Must use mtie with 2 outputs on find_peaks(). ie: (mtie(O, num_found) = find_peaks(A, height, threshold))");
78+
static_assert(remove_cvref_t<decltype(cuda::std::get<1>(out))>::Rank() == 0 &&
79+
std::is_same_v<typename remove_cvref_t<decltype(cuda::std::get<1>(out))>::value_type, int>,
80+
"Num elements output must be a scalar integer tensor");
81+
static_assert(std::is_same_v<typename remove_cvref_t<decltype(cuda::std::get<0>(out))>::value_type, index_t>,
82+
"Peak indices output must be a 1D matx::index_t tensor");
83+
find_peaks_impl(cuda::std::get<0>(out), cuda::std::get<1>(out), a_, height_, threshold_, ex);
84+
}
85+
86+
static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
87+
{
88+
return remove_cvref_t<OpA>::Rank();
89+
}
90+
91+
template <typename ShapeType, typename Executor>
92+
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
93+
{
94+
if constexpr (is_matx_op<OpA>()) {
95+
a_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
96+
}
97+
}
98+
99+
template <typename ShapeType, typename Executor>
100+
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
101+
{
102+
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
103+
}
104+
105+
template <typename ShapeType, typename Executor>
106+
__MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept
107+
{
108+
if constexpr (is_matx_op<OpA>()) {
109+
a_.PostRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
110+
}
111+
}
112+
113+
constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const
114+
{
115+
return a_.Size(dim);
116+
}
117+
118+
};
119+
}
120+
121+
122+
/**
123+
* Compute peak search of input
124+
*
125+
* Returns a tensor representing the indices of peaks found in the input operator. The first output parameter holds the indices
126+
* while the second holds the number of indices/peaks found. The output index tensor must be large enough to hold all of the peaks
127+
* found or the behavior is undefined.
128+
*
129+
* @tparam InType
130+
* Input data type
131+
* @tparam D
132+
* Number of right-most dimensions to reduce over
133+
*
134+
* @param in
135+
* Input data to reduce
136+
* @param height
137+
* Height threshold for peak detection. Values below this threshold are not considered peaks.
138+
* @param threshold
139+
* Threshold for peak detection. Neighboring values must be larger in vertical distance than this threshold
140+
* @returns Operator with reduced values of peak search computed
141+
*/
142+
template <typename InType>
143+
__MATX_INLINE__ auto find_peaks(const InType &in,
144+
typename InType::value_type height,
145+
typename InType::value_type threshold)
146+
{
147+
static_assert(InType::Rank() == 1, "Input to find_peaks() must be rank 1");
148+
return detail::FindPeaksOp<decltype(in)>(in, height, threshold);
149+
}
150+
151+
}

include/matx/operators/operators.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
#include "matx/operators/fft.h"
6464
#include "matx/operators/fftshift.h"
6565
#include "matx/operators/filter.h"
66+
#include "matx/operators/find_peaks.h"
6667
#include "matx/operators/flatten.h"
6768
#include "matx/operators/frexp.h"
6869
#include "matx/operators/hermitian.h"

include/matx/transforms/cub.h

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ typedef enum {
7272
CUB_OP_REDUCE_SUM,
7373
CUB_OP_REDUCE_MIN,
7474
CUB_OP_REDUCE_MAX,
75-
CUB_OP_SELECT,
75+
CUB_OP_SELECT_VALS,
7676
CUB_OP_SELECT_IDX,
7777
CUB_OP_UNIQUE,
7878
CUB_OP_SINGLE_ARG_REDUCE,
@@ -178,7 +178,7 @@ class matxCubPlan_t {
178178
else if constexpr (op == CUB_OP_REDUCE_MAX) {
179179
ExecMax(a_out, a, stream);
180180
}
181-
else if constexpr (op == CUB_OP_SELECT) {
181+
else if constexpr (op == CUB_OP_SELECT_VALS) {
182182
ExecSelect(a_out, a, stream);
183183
}
184184
else if constexpr (op == CUB_OP_SELECT_IDX) {
@@ -819,8 +819,6 @@ inline void ExecSort(OutputTensor &a_out,
819819
#endif
820820
}
821821

822-
823-
824822
/**
825823
* Execute a selection reduction on a tensor
826824
*
@@ -922,16 +920,30 @@ inline void ExecSort(OutputTensor &a_out,
922920
#ifdef __CUDACC__
923921
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
924922

925-
if constexpr (is_tensor_view_v<InputOperator>) {
926-
if (a.IsContiguous()) {
927-
cub::DeviceSelect::If(d_temp,
928-
temp_storage_bytes,
929-
thrust::counting_iterator<index_t>(0),
930-
a_out.Data(),
931-
cparams_.num_found.Data(),
932-
static_cast<int>(TotalSize(a)),
933-
IndexToSelectOp<decltype(a.Data()), decltype(cparams_.op)>{a.Data(), cparams_.op},
934-
stream);
923+
if (!has_index_cmp_op_v<decltype(cparams_.op)>) {
924+
if constexpr (is_tensor_view_v<InputOperator>) {
925+
if (a.IsContiguous()) {
926+
cub::DeviceSelect::If(d_temp,
927+
temp_storage_bytes,
928+
thrust::counting_iterator<index_t>(0),
929+
a_out.Data(),
930+
cparams_.num_found.Data(),
931+
static_cast<int>(TotalSize(a)),
932+
IndexToSelectOp<decltype(a.Data()), decltype(cparams_.op)>{a.Data(), cparams_.op},
933+
stream);
934+
}
935+
else {
936+
tensor_impl_t<typename InputOperator::value_type, InputOperator::Rank(), typename InputOperator::desc_type> base = a;
937+
cub::DeviceSelect::If(d_temp,
938+
temp_storage_bytes,
939+
thrust::counting_iterator<index_t>(0),
940+
a_out.Data(),
941+
cparams_.num_found.Data(),
942+
static_cast<int>(TotalSize(a)),
943+
IndexToSelectOp<decltype(RandomOperatorIterator{base}), decltype(cparams_.op)>
944+
{RandomOperatorIterator{base}, cparams_.op},
945+
stream);
946+
}
935947
}
936948
else {
937949
tensor_impl_t<typename InputOperator::value_type, InputOperator::Rank(), typename InputOperator::desc_type> base = a;
@@ -947,16 +959,18 @@ inline void ExecSort(OutputTensor &a_out,
947959
}
948960
}
949961
else {
962+
// Custom compare op that only takes an index. This can be more powerful for users by allowing them to define whatever
963+
// they want inside the op and not be limited to simple binary comparisons.
950964
cub::DeviceSelect::If(d_temp,
951-
temp_storage_bytes,
952-
thrust::counting_iterator<index_t>(0),
953-
a_out.Data(),
954-
cparams_.num_found.Data(),
955-
static_cast<int>(TotalSize(a)),
956-
IndexToSelectOp<decltype(RandomOperatorIterator{a}), decltype(cparams_.op)>
957-
{RandomOperatorIterator{a}, cparams_.op},
958-
stream);
965+
temp_storage_bytes,
966+
thrust::counting_iterator<index_t>(0),
967+
a_out.Data(),
968+
cparams_.num_found.Data(),
969+
static_cast<int>(TotalSize(a)),
970+
cparams_.op,
971+
stream);
959972
}
973+
960974
#endif
961975
}
962976

@@ -2399,6 +2413,7 @@ struct GTE
23992413
};
24002414

24012415

2416+
24022417
/**
24032418
* Reduce values that meet a certain criteria
24042419
*
@@ -2443,9 +2458,9 @@ void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator
24432458
auto params =
24442459
detail::matxCubPlan_t<OutputTensor,
24452460
InputOperator,
2446-
detail::CUB_OP_SELECT,
2461+
detail::CUB_OP_SELECT_VALS,
24472462
param_type>::GetCubParams(a_out, a, stream);
2448-
using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT, param_type>;
2463+
using cache_val_type = detail::matxCubPlan_t<OutputTensor, InputOperator, detail::CUB_OP_SELECT_VALS, param_type>;
24492464
detail::GetCache().LookupAndExec<detail::cub_cache_t>(
24502465
detail::GetCacheIdFromType<detail::cub_cache_t>(),
24512466
params,
@@ -2461,7 +2476,7 @@ void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator
24612476
#else
24622477
auto tmp = detail::matxCubPlan_t< OutputTensor,
24632478
InputOperator,
2464-
detail::CUB_OP_SELECT,
2479+
detail::CUB_OP_SELECT_VALS,
24652480
decltype(cparams)>{a_out, a, cparams, stream};
24662481
tmp.ExecSelect(a_out, a, stream);
24672482
#endif

0 commit comments

Comments
 (0)