Merge branch 'main' into cccl_std_conversion

cliffburdick · web-flow · commit 8b75eae2f5db · 2024-05-21T15:23:37.000-07:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -23,6 +23,7 @@ option(MATX_EN_CUTENSOR OFF)
 option(MATX_EN_FILEIO OFF)
 option(MATX_EN_NVPL OFF, "Enable NVIDIA Performance Libraries for optimized ARM CPU support")
 option(MATX_DISABLE_CUB_CACHE "Disable caching for CUB allocations" ON)
+option(MATX_EN_COVERAGE "Enable code coverage reporting" OFF)
 
 set(MATX_EN_PYBIND11 OFF CACHE BOOL "Enable pybind11 support")
 
@@ -181,6 +182,11 @@ if (MATX_DISABLE_CUB_CACHE)
     target_compile_definitions(matx INTERFACE MATX_DISABLE_CUB_CACHE=1)
 endif()
 
+if (MATX_EN_COVERAGE)  # only when the MATX_EN_COVERAGE option is enabled
+    target_compile_options(matx INTERFACE -fprofile-arcs -ftest-coverage)  # INTERFACE: applied to targets that link matx
+    target_link_options(matx INTERFACE -lgcov --coverage)  # link-time counterpart of the compile flags above
+endif()
+
 # Get the tensor libraries if we need them
 if (MATX_EN_CUTENSOR)
     set(CUTENSORNET_VERSION 24.03.0.4)
diff --git a/bench/00_transform/svd_power.cu b/bench/00_transform/svd_power.cu
@@ -102,7 +102,7 @@ void svdbpi_batch(nvbench::state &state,
 
   MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
   state.exec(
-   [&U, &S, &VT, &A, &iterations, &r](nvbench::launch &launch) {
+   [&U, &S, &VT, &A, &iterations](nvbench::launch &launch) {
       (mtie(U, S, VT) = svdbpi(A, iterations)).run(cudaExecutor{launch.get_stream()}); });
   MATX_NVTX_END_RANGE( 1 )
 }
diff --git a/include/matx/core/stacktrace.h b/include/matx/core/stacktrace.h
@@ -47,23 +47,25 @@
 #include <iostream>
 #include <sstream>
 #include <string>
+#include <array>
 
 namespace matx {
 namespace detail{
 
+static constexpr int MAX_FRAMES = 63;
+
 /** Print a demangled stack backtrace of the caller function to FILE* out. */
-static inline void printStackTrace(std::ostream &eout = std::cerr,
-                                   unsigned int max_frames = 63)
+static inline void printStackTrace(std::ostream &eout = std::cerr)
 {
 #ifdef _WIN32
   // TODO add code for windows stack trace
 #else
   std::stringstream out;
   // storage array for stack trace address data
-  void *addrlist[max_frames + 1];
+  std::array<void *, MAX_FRAMES + 1> addrlist;
   // retrieve current stack addresses
   int addrlen =
-      backtrace(addrlist, static_cast<int>(sizeof(addrlist) / sizeof(void *)));
+      backtrace(addrlist.data(), static_cast<int>(addrlist.size())); // data() is the array's void** buffer
 
   if (addrlen == 0) {
     out << "  <empty, possibly corrupt>\n";
@@ -72,7 +74,7 @@ static inline void printStackTrace(std::ostream &eout = std::cerr,
 
   // resolve addresses into strings containing "filename(function+address)",
   // this array must be free()-ed
-  char **symbollist = backtrace_symbols(addrlist, addrlen);
+  char **symbollist = backtrace_symbols(addrlist.data(), addrlen); // void** converts implicitly to void *const *
   // allocate string which will be filled with the demangled function name
   size_t funcnamesize = 256;
   char *funcname = (char *)malloc(funcnamesize);
diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h
@@ -1466,7 +1466,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
     int d = 0;
     bool def_stride = (strides[0] == -1);
 
-    int end_count = 0;
+    [[maybe_unused]] int end_count = 0;
     for (int i = 0; i < RANK; i++) {
       if (ends[i] == matxDropDim) {
         end_count++;
diff --git a/include/matx/core/tensor_utils.h b/include/matx/core/tensor_utils.h
@@ -186,13 +186,6 @@ namespace detail {
     else {
       return cuda::std::invoke(std::forward<Func>(f), cuda::std::get<S>(std::forward<Tuple>(tuple))...);
     }
-
-    if constexpr (!(is_std_tuple<remove_cvref_t<Tuple>>::value || is_std_array<remove_cvref_t<Tuple>>::value)) {
-            return cuda::std::invoke(std::forward<Func>(f), cuda::std::get<S>(std::forward<Tuple>(tuple))...);
-    }
-    else {
-      return cuda::std::invoke(std::forward<Func>(f), cuda::std::get<S>(std::forward<Tuple>(tuple))...);
-    }
   }
 
   template <class Func, class Tuple>
@@ -208,17 +201,6 @@ namespace detail {
           std::forward<Func>(f), std::forward<Tuple>(t),
           std::make_index_sequence<cuda::std::tuple_size_v<remove_cvref_t<Tuple>>>{});
     }
-
-    if constexpr (!(is_std_tuple<remove_cvref_t<Tuple>>::value || is_std_array<remove_cvref_t<Tuple>>::value)) {
-      return apply_impl(
-          std::forward<Func>(f), std::forward<Tuple>(t),
-          std::make_index_sequence<cuda::std::tuple_size_v<remove_cvref_t<Tuple>>>{});
-    }
-    else {
-      return apply_impl(
-          std::forward<Func>(f), std::forward<Tuple>(t),
-          std::make_index_sequence<cuda::std::tuple_size_v<remove_cvref_t<Tuple>>>{});
-    }
   }
 
   template <class Func, class Tuple>
@@ -234,17 +216,6 @@ namespace detail {
           std::forward<Func>(f), std::forward<Tuple>(t),
           make_index_sequence_rev<cuda::std::tuple_size_v<remove_cvref_t<Tuple>>>{});
     }
-
-    if constexpr (!(is_std_tuple<remove_cvref_t<Tuple>>::value || is_std_array<remove_cvref_t<Tuple>>::value)) {
-      return apply_impl(
-          std::forward<Func>(f), std::forward<Tuple>(t),
-          make_index_sequence_rev<cuda::std::tuple_size_v<remove_cvref_t<Tuple>>>{});
-    }
-    else {
-      return apply_impl(
-          std::forward<Func>(f), std::forward<Tuple>(t),
-          make_index_sequence_rev<cuda::std::tuple_size_v<remove_cvref_t<Tuple>>>{});
-    }
   }
 
   template <typename T0, typename T1, typename... Tn>
@@ -332,20 +303,6 @@ namespace detail {
         return i(args...);
       }, sliced_tup);
     }
-
-    if constexpr (!(T::Rank() == int(sizeof...(Is)) || T::Rank() == matxNoRank)) {
-      // Construct an integer sequence of the length of the tuple, but only using the last indices
-      using seq = offset_sequence_t<sizeof...(Is) - T::Rank(), std::make_index_sequence<T::Rank()>>;
-      auto tup = cuda::std::make_tuple(indices...);
-      auto sliced_tup = select_tuple(std::forward<decltype(tup)>(tup), seq{});
-      return cuda::std::apply([&](auto... args) {
-        return i(args...);
-      }, sliced_tup);
-    }
-    else
-    {
-      return i(indices...);
-    }
   }
 
 
@@ -360,15 +317,6 @@ namespace detail {
     {
       return i;
     }
-
-    if constexpr (!is_matx_op<T>())
-    {
-      return i;
-    }
-    else
-    {
-      return get_matx_value(i, indices...);
-    }
   }
 
   template <typename T> __MATX_INLINE__ std::string to_short_str() {
@@ -1184,6 +1132,21 @@ void print(const Op &op, [[maybe_unused]] Args... dims) {
   cuda::std::apply([&](auto &&...args) { fprint(stdout, op, args...); }, tp);
 }
 
+/**
+ * @brief Print the value of a 0D (rank-0) operator to stdout
+ *
+ * This overload of `print()` is enabled only when `Op::Rank() == 0` and forwards to `fprint(stdout, op)`.
+ *
+ * @tparam Op Operator input type
+ * @param op Operator input
+ */
+template <typename Op, 
+         std::enable_if_t<(Op::Rank() == 0), bool> = true>
+void print(const Op &op) 
+{
+  fprint(stdout, op);
+}
+
 #endif // not DOXYGEN_ONLY
 
 template <typename Op>
diff --git a/test/00_io/PrintTests.cu b/test/00_io/PrintTests.cu
@@ -187,6 +187,21 @@ TEST_F(PrintTest, DefaultTest4)
   MATX_EXIT_HANDLER();
 }
 
+TEST_F(PrintTest, DefaultTest5) // default print format, rank-0 slice
+{
+  MATX_ENTER_HANDLER();
+  auto pft = get_print_format_type();
+  ASSERT_EQ(MATX_PRINT_FORMAT_DEFAULT, pft); // this test never sets a format, so the default is expected
+  
+  auto testSlice = matx::slice<0>(A1, {0}, {matx::matxDropDim}); // slice<0> with a matxDropDim end; header below expects Rank: 0
+
+  print_checker(testSlice,
+      "Tensor{complex<double>} Rank: 0, Sizes:[], Strides:[]\n"
+      "-9.2466e-01+9.9114e-01j \n");
+
+  MATX_EXIT_HANDLER();
+}
+
 TEST_F(PrintTest, MlabTest1)
 {
   MATX_ENTER_HANDLER();
@@ -281,6 +296,22 @@ TEST_F(PrintTest, MlabTest4)
   MATX_EXIT_HANDLER();
 }
 
+TEST_F(PrintTest, MlabTest5) // MLAB print format, rank-0 slice
+{
+  MATX_ENTER_HANDLER();
+  set_print_format_type(MATX_PRINT_FORMAT_MLAB);
+  auto pft = get_print_format_type();
+  ASSERT_EQ(MATX_PRINT_FORMAT_MLAB, pft); // confirm the setter took effect before printing
+
+  auto testSlice = matx::slice<0>(A1, {0}, {matx::matxDropDim}); // slice<0> with a matxDropDim end; header below expects Rank: 0
+
+  print_checker(testSlice, // NOTE(review): expected text matches DefaultTest5 -- presumably rank-0 output ignores the format; confirm
+      "Tensor{complex<double>} Rank: 0, Sizes:[], Strides:[]\n"
+      "-9.2466e-01+9.9114e-01j \n");
+
+  MATX_EXIT_HANDLER();
+}
+
 TEST_F(PrintTest, PythonTest1)
 {
   MATX_ENTER_HANDLER();
@@ -370,3 +401,21 @@ TEST_F(PrintTest, PythonTest4)
 
   MATX_EXIT_HANDLER();
 }
+
+TEST_F(PrintTest, PythonTest5) // Python print format, rank-0 slice
+{
+  MATX_ENTER_HANDLER();
+  set_print_format_type(MATX_PRINT_FORMAT_PYTHON);
+  auto pft = get_print_format_type();
+  ASSERT_EQ(MATX_PRINT_FORMAT_PYTHON, pft); // confirm the setter took effect before printing
+
+  auto testSlice = matx::slice<0>(A1, {0}, {matx::matxDropDim}); // slice<0> with a matxDropDim end; header below expects Rank: 0
+
+  print_checker(testSlice, // NOTE(review): expected text matches DefaultTest5 -- presumably rank-0 output ignores the format; confirm
+      "Tensor{complex<double>} Rank: 0, Sizes:[], Strides:[]\n"
+      "-9.2466e-01+9.9114e-01j \n");
+
+  MATX_EXIT_HANDLER();
+}
+
+

Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@ void svdbpi_batch(nvbench::state &state,`
`102`	`102`
`103`	`103`	`MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )`
`104`	`104`	`state.exec(`
`105`		`- [&U, &S, &VT, &A, &iterations, &r](nvbench::launch &launch) {`
	`105`	`+ [&U, &S, &VT, &A, &iterations](nvbench::launch &launch) {`
`106`	`106`	`(mtie(U, S, VT) = svdbpi(A, iterations)).run(cudaExecutor{launch.get_stream()}); });`
`107`	`107`	`MATX_NVTX_END_RANGE( 1 )`
`108`	`108`	`}`