@@ -48,25 +48,29 @@ using namespace matx;
4848 * boilerplate code around the original expression. This custom operator can then be used either alone or inside
4949 * other arithmetic expressions, and only a single load is issued for each tensor.
5050 *
51- * This example uses the Black-Scholes equtation to demonstrate the two ways to implement the equation in MatX, and
52- * shows the performance difference.
51+ * This example uses the Black-Scholes equation to demonstrate three ways to implement the equation in MatX, and
52+ * shows the performance difference between them. The three ways are:
53+ * 1. Using a custom operator
54+ * 2. Using a lambda function via apply()
55+ * 3. Using a MatX expression
56+ *
57+ * Which method to use depends on the use case, but the lambda function is preferred for simplicity and readability.
5358 */
5459
5560/* Custom operator */
56- template <class O , class I1 >
57- class BlackScholes : public BaseOp <BlackScholes<O, I1>> {
61+ template <class I1 >
62+ class BlackScholes : public BaseOp <BlackScholes<I1>> {
5863private:
59- O out_;
6064 I1 V_, S_, K_, r_, T_;
6165
6266public:
6367 using matxop = bool ;
6468
65- BlackScholes (O out, I1 K, I1 V, I1 S, I1 r, I1 T)
66- : out_(out), V_(V), S_(S), K_(K), r_(r), T_(T) {}
69+ BlackScholes (I1 K, I1 V, I1 S, I1 r, I1 T)
70+ : V_(V), S_(S), K_(K), r_(r), T_(T) {}
6771
6872 template <detail::ElementsPerThread EPT>
69- __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ void operator ()(index_t idx)
73+ __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto operator ()(index_t idx) const
7074 {
7175 auto V = V_ (idx);
7276 auto K = K_ (idx);
@@ -81,27 +85,32 @@ public:
8185 auto cdf_d2 = normcdff (d2);
8286 auto expRT = exp (-1 .f * r * T);
8387
84- out_ (idx) = S * cdf_d1 - K * expRT * cdf_d2;
88+ return S * cdf_d1 - K * expRT * cdf_d2;
8589 }
8690
8791 __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ void operator ()(index_t idx) {
8892 return this ->operator ()<detail::ElementsPerThread::ONE>(idx);
8993 }
9094
91- __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size (uint32_t i) const { return out_ .Size (i); }
92- static constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ int32_t Rank () { return O ::Rank (); }
95+ __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size (uint32_t i) const { return V_ .Size (i); }
96+ static constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ int32_t Rank () { return I1 ::Rank (); }
9397
9498 template <detail::OperatorCapability Cap>
9599 __MATX_INLINE__ __MATX_HOST__ auto get_capability () const {
96- auto self_has_cap = detail::capability_attributes<Cap>::default_value;
97- return detail::combine_capabilities<Cap>(
98- self_has_cap,
99- detail::get_operator_capability<Cap>(V_),
100- detail::get_operator_capability<Cap>(S_),
101- detail::get_operator_capability<Cap>(K_),
102- detail::get_operator_capability<Cap>(r_),
103- detail::get_operator_capability<Cap>(T_)
104- );
100+ // Don't support vectorization yet
101+ if constexpr (Cap == detail::OperatorCapability::ELEMENTS_PER_THREAD) {
102+ return detail::ElementsPerThread::ONE;
103+ } else {
104+ auto self_has_cap = detail::capability_attributes<Cap>::default_value;
105+ return detail::combine_capabilities<Cap>(
106+ self_has_cap,
107+ detail::get_operator_capability<Cap>(V_),
108+ detail::get_operator_capability<Cap>(S_),
109+ detail::get_operator_capability<Cap>(K_),
110+ detail::get_operator_capability<Cap>(r_),
111+ detail::get_operator_capability<Cap>(T_)
112+ );
113+ }
105114 }
106115};
107116
@@ -132,7 +141,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
132141 using dtype = float ;
133142
134143 index_t input_size = 100000000 ;
135- constexpr uint32_t num_iterations = 1 ;
144+ constexpr uint32_t num_iterations = 100 ;
136145 float time_ms;
137146
138147 tensor_t <dtype, 1 > K_tensor{{input_size}};
@@ -141,12 +150,20 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
141150 tensor_t <dtype, 1 > r_tensor{{input_size}};
142151 tensor_t <dtype, 1 > T_tensor{{input_size}};
143152 tensor_t <dtype, 1 > output_tensor{{input_size}};
153+ tensor_t <dtype, 1 > output_tensor2{{input_size}};
154+ tensor_t <dtype, 1 > output_tensor3{{input_size}};
155+
156+ (K_tensor = random<float >({input_size}, UNIFORM)).run ();
157+ (S_tensor = random<float >({input_size}, UNIFORM)).run ();
158+ (V_tensor = random<float >({input_size}, UNIFORM)).run ();
159+ (r_tensor = random<float >({input_size}, UNIFORM)).run ();
160+ (T_tensor = random<float >({input_size}, UNIFORM)).run ();
144161
145162 cudaStream_t stream;
146163 cudaStreamCreate (&stream);
147164 cudaExecutor exec{stream};
148165
149- compute_black_scholes_matx (K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
166+ // compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
150167
151168 cudaEvent_t start, stop;
152169 cudaEventCreate (&start);
@@ -159,6 +176,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
159176 }
160177 cudaEventRecord (stop, stream);
161178 exec.sync ();
179+
162180 cudaEventElapsedTime (&time_ms, start, stop);
163181
164182 printf (" Time without custom operator = %.2fms per iteration\n " ,
@@ -167,10 +185,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
167185 cudaEventRecord (start, stream);
168186 // Time non-operator version
169187 for (uint32_t i = 0 ; i < num_iterations; i++) {
170- BlackScholes (output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run (exec);
188+ (output_tensor2 = BlackScholes ( K_tensor, V_tensor, S_tensor, r_tensor, T_tensor) ).run (exec);
171189 }
172190 cudaEventRecord (stop, stream);
173191 exec.sync ();
192+
174193 cudaEventElapsedTime (&time_ms, start, stop);
175194 printf (" Time with custom operator = %.2fms per iteration\n " ,
176195 time_ms / num_iterations);
@@ -192,15 +211,36 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
192211
193212 cudaEventRecord (start, stream);
194213 for (uint32_t i = 0 ; i < num_iterations; i++) {
195- (output_tensor = matx::apply (bs_lambda, K_tensor, S_tensor, V_tensor, r_tensor, T_tensor)).run (exec);
214+ (output_tensor3 = matx::apply (bs_lambda, K_tensor, S_tensor, V_tensor, r_tensor, T_tensor)).run (exec);
196215 }
216+
197217 cudaEventRecord (stop, stream);
198218 exec.sync ();
219+
199220 cudaEventElapsedTime (&time_ms, start, stop);
200221 printf (" Time with lambda = %.2fms per iteration\n " ,
201222 time_ms / num_iterations);
202223
203-
224+ // Verify all 3 outputs match within 1e-6 using operator() (Managed Memory)
225+ bool all_match = true ;
226+ constexpr float tol = 1e-6f ;
227+ auto n = K_tensor.Size (0 );
228+
229+ for (index_t i = 0 ; i < n; i++) {
230+ float v1 = output_tensor (i);
231+ float v2 = output_tensor2 (i);
232+ float v3 = output_tensor3 (i);
233+ if (fabsf (v1 - v2) > tol || fabsf (v1 - v3) > tol || fabsf (v2 - v3) > tol) {
234+ printf (" Mismatch at idx %lld: v1=%.8f v2=%.8f v3=%.8f\n " , i, v1, v2, v3);
235+ all_match = false ;
236+ break ;
237+ }
238+ }
239+ if (all_match) {
240+ printf (" All outputs match within %.1e tolerance.\n " , tol);
241+ } else {
242+ printf (" Outputs do NOT match within %.1e tolerance!\n " , tol);
243+ }
204244
205245 cudaEventDestroy (start);
206246 cudaEventDestroy (stop);
0 commit comments