Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 1 addition & 14 deletions include/matx/operators/clone.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,20 +107,7 @@ IGNORE_WARNING_POP_GCC
template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices)
{

// convert variadic type to tuple so we can read/update
IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized")
cuda::std::array<index_t, Rank()> sind{indices...};
cuda::std::array<index_t, T::Rank()> gind;
IGNORE_WARNING_POP_GCC

// gather indices
for(int i = 0; i < T::Rank(); i++) {
auto idx = dims_[i];
gind[i] = sind[idx];
}

return cuda::std::apply(op_, gind);
return std::as_const(*this).template operator()(indices...);
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
Expand Down
170 changes: 66 additions & 104 deletions include/matx/operators/collapse.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,70 +55,51 @@ namespace matx

__MATX_INLINE__ std::string str() const { return "lcollapse<" + std::to_string(DIM) + ">(" + op_.str() + ")"; }
__MATX_INLINE__ LCollapseOp(const T1 &op) : op_(op)
{
static_assert(DIM <= T1::Rank(), "Collapse DIM must be less than or equal to Rank() of operator");
static_assert(DIM > 1, "Must collapse multiple dims");
static_assert(T1::Rank() >= 2, "Collapse must be called on operators with rank >= 2");
{
static_assert(DIM <= T1::Rank(), "Collapse DIM must be less than or equal to Rank() of operator");
static_assert(DIM > 1, "Must collapse multiple dims");
static_assert(T1::Rank() >= 2, "Collapse must be called on operators with rank >= 2");

// compute size of collapsed dimension
size_ = 1;
// compute size of collapsed dimension
size_ = 1;

// Collapse left-most dims
#pragma unroll
for(int i = 0 ; i < DIM; i++) {
size_ *= op_.Size(i);
// Collapse left-most dims
#pragma unroll
for(int i = 0 ; i < DIM; i++) {
size_ *= op_.Size(i);
}
}
}

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const
{
// indices coming in
cuda::std::array<index_t, Rank()> in{indices...}; // index coming in
cuda::std::array<index_t, T1::Rank()> out; // index going out
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const
{
// indices coming in
cuda::std::array<index_t, Rank()> in{indices...}; // index coming in
cuda::std::array<index_t, T1::Rank()> out; // index going out

#pragma unroll
for(int i = 1; i < Rank(); i++) {
// copy all but first input index into out array
out[DIM + i - 1] = in[i];
}
for(int i = 1; i < Rank(); i++) {
// copy all but first input index into out array
out[DIM + i - 1] = in[i];
}

// expand first input index into DIM indices
auto ind = in[0];
// expand first input index into DIM indices
auto ind = in[0];
#pragma unroll
for(int i = 0; i < DIM; i++) {
int d = DIM - i - 1;
out[d] = ind % op_.Size(d);
ind /= op_.Size(d);
}
for(int i = 0; i < DIM; i++) {
int d = DIM - i - 1;
out[d] = ind % op_.Size(d);
ind /= op_.Size(d);
}

return cuda::std::apply(op_, out);
}
return cuda::std::apply(op_, out);
}

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices)
{
// indices coming in
cuda::std::array<index_t, Rank()> in{indices...}; // index coming in
cuda::std::array<index_t, T1::Rank()> out; // index going out

#pragma unroll
for(int i = 1; i < Rank(); i++) {
// copy all but first input index into out array
out[DIM + i - 1] = in[i];
}

// expand first input index into DIM indices
auto ind = in[0];
#pragma unroll
for(int i = 0; i < DIM; i++) {
int d = DIM - i - 1;
out[d] = ind % op_.Size(d);
ind /= op_.Size(d);
}

return cuda::std::apply(op_, out);
}
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices)
{
return std::as_const(*this).template operator()(indices...);
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
{
Expand Down Expand Up @@ -211,70 +192,51 @@ namespace matx
__MATX_INLINE__ std::string str() const { return "rcollapse<" + std::to_string(DIM) + ">(" + op_.str() + ")"; }

__MATX_INLINE__ RCollapseOp(const T1 op) : op_(op)
{
static_assert(DIM <= T1::Rank(), "Collapse DIM must be less than or equal to Rank() of operator");
static_assert(DIM > 1, "Collapse DIM must have be greater than 1");
static_assert(T1::Rank() >= 2, "Collapse must be called on operators with rank >= 2");
{
static_assert(DIM <= T1::Rank(), "Collapse DIM must be less than or equal to Rank() of operator");
static_assert(DIM > 1, "Collapse DIM must have be greater than 1");
static_assert(T1::Rank() >= 2, "Collapse must be called on operators with rank >= 2");

// compute size of collapsed dimension
size_ = 1;
// compute size of collapsed dimension
size_ = 1;

// Collapse right-most dims
#pragma unroll
for(int i = 0 ; i < DIM; i++) {
size_ *= op_.Size(T1::Rank() - 1 - i);
// Collapse right-most dims
#pragma unroll
for(int i = 0 ; i < DIM; i++) {
size_ *= op_.Size(T1::Rank() - 1 - i);
}
}
}

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const
{
// indices coming in
cuda::std::array<index_t, Rank()> in{indices...}; // index coming in
cuda::std::array<index_t, T1::Rank()> out; // index going out
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const
{
// indices coming in
cuda::std::array<index_t, Rank()> in{indices...}; // index coming in
cuda::std::array<index_t, T1::Rank()> out; // index going out

#pragma unroll
for(int i = 0 ; i < Rank() - 1; i++) {
// copy all but last index into out array
out[i] = in[i];
}
for(int i = 0 ; i < Rank() - 1; i++) {
// copy all but last index into out array
out[i] = in[i];
}

// expand last index into DIM indices
auto ind = in[Rank() - 1];
// expand last index into DIM indices
auto ind = in[Rank() - 1];
#pragma unroll
for(int i = 0; i < DIM; i++) {
int d = T1::Rank() - 1 - i;
out[d] = ind % op_.Size(d);
ind /= op_.Size(d);
}
for(int i = 0; i < DIM; i++) {
int d = T1::Rank() - 1 - i;
out[d] = ind % op_.Size(d);
ind /= op_.Size(d);
}

return cuda::std::apply(op_, out);
}
return cuda::std::apply(op_, out);
}

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices)
{
// indices coming in
cuda::std::array<index_t, Rank()> in{indices...}; // index coming in
cuda::std::array<index_t, T1::Rank()> out; // index going out

#pragma unroll
for(int i = 0 ; i < Rank() - 1; i++) {
// copy all but last index into out array
out[i] = in[i];
}

// expand last index into DIM indices
auto ind = in[Rank() - 1];
#pragma unroll
for(int i = 0; i < DIM; i++) {
int d = T1::Rank() - 1 - i;
out[d] = ind % op_.Size(d);
ind /= op_.Size(d);
}

return cuda::std::apply(op_, out);
}
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices)
{
return std::as_const(*this).template operator()(indices...);
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
{
Expand Down
52 changes: 26 additions & 26 deletions include/matx/operators/diag.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,41 +63,41 @@ namespace matx
__MATX_INLINE__ DiagOp(const T1 &op, index_t k) : op_(op), k_(k) { }

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const
{
static_assert(RANK != 0, "Cannot make get diagonals from 0D tensor");
using tt = cuda::std::tuple_element_t<0, cuda::std::tuple<Is...>>;
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const
{
static_assert(RANK != 0, "Cannot make get diagonals from 0D tensor");
using tt = cuda::std::tuple_element_t<0, cuda::std::tuple<Is...>>;

if constexpr (RANK == 1) {
static_assert(sizeof...(Is) == 2, "Indexing of diag() on a 1D input must be 2 indices");
if (((pp_get<0>(indices...) == indices) && ...)) {
return (value_type)(pp_get<0>(indices...));
}
else {
return (value_type)(0);
}
if constexpr (RANK == 1) {
static_assert(sizeof...(Is) == 2, "Indexing of diag() on a 1D input must be 2 indices");
if (((pp_get<0>(indices...) == indices) && ...)) {
return (value_type)(pp_get<0>(indices...));
}
else {
static_assert(sizeof...(Is) == RANK - 1, "Diagonal operator must have one fewer op() index than rank of operator");

// Offset either the rows or columns by k_, depending on if it's negative
if (k_ < 0) {
auto tup = cuda::std::make_tuple(indices..., static_cast<tt>(0));
cuda::std::get<RANK - 1>(tup) = pp_get<RANK-2>(indices...) ;
return (value_type)(0);
}
}
else {
static_assert(sizeof...(Is) == RANK - 1, "Diagonal operator must have one fewer op() index than rank of operator");

// Offset either the rows or columns by k_, depending on if it's negative
if (k_ < 0) {
auto tup = cuda::std::make_tuple(indices..., static_cast<tt>(0));
cuda::std::get<RANK - 1>(tup) = pp_get<RANK-2>(indices...) ;
IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized")
cuda::std::get<RANK - 2>(tup) = cuda::std::get<RANK - 2>(tup) - k_;
cuda::std::get<RANK - 2>(tup) = cuda::std::get<RANK - 2>(tup) - k_;
IGNORE_WARNING_POP_GCC
return cuda::std::apply(op_, tup);
}
else {
auto tup = cuda::std::make_tuple(indices..., static_cast<tt>(0));
return cuda::std::apply(op_, tup);
}
else {
auto tup = cuda::std::make_tuple(indices..., static_cast<tt>(0));
IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized")
cuda::std::get<RANK - 1>(tup) = pp_get<RANK-2>(indices...) + k_;
cuda::std::get<RANK - 1>(tup) = pp_get<RANK-2>(indices...) + k_;
IGNORE_WARNING_POP_GCC
return cuda::std::apply(op_, tup);
}
return cuda::std::apply(op_, tup);
}
}
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
{
Expand Down
7 changes: 2 additions & 5 deletions include/matx/operators/fftshift.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,11 @@ namespace matx
}

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices)
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices)
{
auto tup = cuda::std::make_tuple(indices...);
cuda::std::get<Rank()-1>(tup) = (cuda::std::get<Rank()-1>(tup) + (Size(Rank()-1) + 1) / 2) % Size(Rank()-1);
return cuda::std::apply(op_, tup);
return std::as_const(*this).template operator()(indices...);
}


static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
{
return detail::get_rank<T1>();
Expand Down
16 changes: 8 additions & 8 deletions include/matx/operators/hermitian.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,14 @@ namespace matx
}

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const
{
auto tup = cuda::std::make_tuple(indices...);
auto stl = cuda::std::get<Rank()-2>(tup);
cuda::std::get<Rank()-2>(tup) = cuda::std::get<Rank()-1>(tup);
cuda::std::get<Rank()-1>(tup) = stl;
return conj(cuda::std::apply(op_, tup));
}
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const
{
auto tup = cuda::std::make_tuple(indices...);
auto stl = cuda::std::get<Rank()-2>(tup);
cuda::std::get<Rank()-2>(tup) = cuda::std::get<Rank()-1>(tup);
cuda::std::get<Rank()-1>(tup) = stl;
return conj(cuda::std::apply(op_, tup));
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
{
Expand Down
11 changes: 2 additions & 9 deletions include/matx/operators/interleaved.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,9 @@ namespace matx
}

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ complex_type operator()(Is... indices)
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices)
{
auto real = op_(indices...);

constexpr size_t rank_idx = (Rank() == 1) ? 0 : (Rank() - 2);
auto tup = cuda::std::make_tuple(indices...);
cuda::std::get<rank_idx>(tup) += op_.Size(rank_idx) / 2;

auto imag = cuda::std::apply(op_, tup);
return complex_type{real, imag};
return std::as_const(*this).template operator()(indices...);
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
Expand Down
10 changes: 1 addition & 9 deletions include/matx/operators/kronecker.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,7 @@ namespace matx
template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices)
{
auto tup1 = cuda::std::make_tuple(indices...);
auto tup2 = cuda::std::make_tuple(indices...);
cuda::std::get<Rank() - 2>(tup2) = pp_get<Rank() - 2>(indices...) % op2_.Size(Rank() - 2);
cuda::std::get<Rank() - 1>(tup2) = pp_get<Rank() - 1>(indices...) % op2_.Size(Rank() - 1);

cuda::std::get<Rank() - 2>(tup1) = pp_get<Rank() - 2>(indices...) / op2_.Size(Rank() - 2);
cuda::std::get<Rank() - 1>(tup1) = pp_get<Rank() - 1>(indices...) / op2_.Size(Rank() - 1);

return cuda::std::apply(op2_, tup2) * cuda::std::apply(op1_, tup1);
return std::as_const(*this).template operator()(indices...);
}

template <typename ShapeType, typename Executor>
Expand Down
Loading