Skip to content

Commit 1c15c44

Browse files
committed
Add mutex around cache lookup
1 parent f753513 commit 1c15c44

File tree

7 files changed

+36
-26
lines changed

7 files changed

+36
-26
lines changed

examples/fft_conv.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
7373
{
7474
MATX_ENTER_HANDLER();
7575
using complex = cuda::std::complex<float>;
76-
cudaExecutor exec{};
7776

7877
index_t signal_size = 1ULL << 16;
7978
index_t filter_size = 16;
@@ -87,6 +86,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
8786
cudaEvent_t start, stop;
8887
cudaEventCreate(&start);
8988
cudaEventCreate(&stop);
89+
cudaExecutor exec{stream};
9090

9191
// Create time domain buffers
9292
auto sig_time = make_tensor<complex>({batches, signal_size});

include/matx/core/cache.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include <functional>
3737
#include <optional>
3838
#include <any>
39+
#include <shared_mutex>
3940
#include <unordered_map>
4041
#include <cuda/atomic>
4142

@@ -50,6 +51,7 @@ using CacheId = uint64_t;
5051
__attribute__ ((visibility ("default")))
5152
#endif
5253
inline cuda::std::atomic<CacheId> CacheIdCounter{0};
54+
inline std::shared_mutex cache_mtx; ///< Mutex protecting updates to the cache map
5355

5456
template<typename CacheType>
5557
__attribute__ ((visibility ("default")))
@@ -83,6 +85,8 @@ class matxCache_t {
8385
*/
8486
template <typename CacheType>
8587
void Clear(const CacheId &id) {
88+
[[maybe_unused]] std::unique_lock lck(cache_mtx);
89+
8690
auto el = cache.find(id);
8791
MATX_ASSERT_STR(el != cache.end(), matxInvalidType, "Cache type not found");
8892

@@ -91,6 +95,8 @@ class matxCache_t {
9195

9296
template <typename CacheType, typename InParams, typename MakeFun, typename ExecFun>
9397
void LookupAndExec(const CacheId &id, const InParams &params, const MakeFun &mfun, const ExecFun &efun) {
98+
[[maybe_unused]] std::unique_lock lck(cache_mtx);
99+
94100
// Create named cache if it doesn't exist
95101
auto el = cache.find(id);
96102
if (el == cache.end()) {

include/matx/transforms/chol/chol_cuda.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,8 @@ class matxDnCholCUDAPlan_t : matxDnCUDASolver_t {
103103
MATX_STATIC_ASSERT_STR(!is_half_v<T1>, matxInvalidType, "Cholesky solver does not support half precision");
104104
MATX_STATIC_ASSERT_STR((std::is_same_v<T1, typename OutTensor_t::value_type>), matxInavlidType, "Input and Output types must match");
105105

106-
params = GetCholParams(a, uplo);
107-
params.exec = exec;
106+
params = GetCholParams(a, uplo, exec);
107+
108108
this->GetWorkspaceSize();
109109
this->AllocateWorkspace(params.batch_size, false, exec);
110110
}
@@ -120,13 +120,15 @@ class matxDnCholCUDAPlan_t : matxDnCUDASolver_t {
120120
}
121121

122122
static DnCholCUDAParams_t GetCholParams(const ATensor &a,
123-
cublasFillMode_t uplo)
123+
cublasFillMode_t uplo,
124+
const cudaExecutor &exec)
124125
{
125126
DnCholCUDAParams_t params;
126127
params.batch_size = GetNumBatches(a);
127128
params.n = a.Size(RANK - 1);
128129
params.A = a.Data();
129130
params.uplo = uplo;
131+
params.exec = exec;
130132
params.dtype = TypeToInt<T1>();
131133

132134
return params;
@@ -298,7 +300,7 @@ void chol_impl(OutputTensor &&out, const ATensor &a,
298300
cublasFillMode_t uplo_cusolver = (uplo == SolverFillMode::UPPER)? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
299301

300302
// Get parameters required by these tensors
301-
auto params = detail::matxDnCholCUDAPlan_t<OutputTensor, decltype(tmp_out)>::GetCholParams(tmp_out, uplo_cusolver);
303+
auto params = detail::matxDnCholCUDAPlan_t<OutputTensor, decltype(tmp_out)>::GetCholParams(tmp_out, uplo_cusolver, exec);
302304

303305
using cache_val_type = detail::matxDnCholCUDAPlan_t<OutputTensor, decltype(tmp_out)>;
304306
detail::GetCache().LookupAndExec<detail::chol_cuda_cache_t>(

include/matx/transforms/eig/eig_cuda.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,7 @@ class matxDnEigCUDAPlan_t : matxDnCUDASolver_t {
115115
MATX_STATIC_ASSERT_STR(!is_complex_v<T2>, matxInvalidType, "W type must be real");
116116
MATX_STATIC_ASSERT_STR((std::is_same_v<typename inner_op_type_t<T1>::type, T2>), matxInvalidType, "Out and W inner types must match");
117117

118-
params = GetEigParams(w, a, jobz, uplo);
119-
params.exec = exec;
118+
params = GetEigParams(w, a, jobz, uplo, exec);
120119
this->GetWorkspaceSize();
121120
#if CUSOLVER_VERSION > 11701 || (CUSOLVER_VERSION == 11701 && CUSOLVER_VER_BUILD >= 2)
122121
this->AllocateWorkspace(params.batch_size, true, exec);
@@ -150,7 +149,8 @@ class matxDnEigCUDAPlan_t : matxDnCUDASolver_t {
150149
static DnEigCUDAParams_t GetEigParams(WTensor &w,
151150
const ATensor &a,
152151
cusolverEigMode_t jobz,
153-
cublasFillMode_t uplo)
152+
cublasFillMode_t uplo,
153+
const cudaExecutor &exec)
154154
{
155155
DnEigCUDAParams_t params;
156156
params.batch_size = GetNumBatches(a);
@@ -159,6 +159,8 @@ class matxDnEigCUDAPlan_t : matxDnCUDASolver_t {
159159
params.W = w.Data();
160160
params.jobz = jobz;
161161
params.uplo = uplo;
162+
params.exec = exec;
163+
162164
params.dtype = TypeToInt<T1>();
163165

164166
return params;
@@ -342,7 +344,7 @@ void eig_impl(OutputTensor &&out, WTensor &&w,
342344

343345
// Get parameters required by these tensors
344346
auto params = detail::matxDnEigCUDAPlan_t<OutputTensor, decltype(w_new), decltype(a_new)>::
345-
GetEigParams(w_new, tv, jobz_cusolver, uplo_cusolver);
347+
GetEigParams(w_new, tv, jobz_cusolver, uplo_cusolver, exec);
346348

347349
// Get cache or new eigen plan if it doesn't exist
348350
using cache_val_type = detail::matxDnEigCUDAPlan_t<OutputTensor, decltype(w_new), decltype(a_new)>;

include/matx/transforms/lu/lu_cuda.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,7 @@ class matxDnLUCUDAPlan_t : matxDnCUDASolver_t {
106106
MATX_STATIC_ASSERT_STR((std::is_same_v<T1, typename OutTensor_t::value_type>), matxInavlidType, "Input and Output types must match");
107107
MATX_STATIC_ASSERT_STR((std::is_same_v<T2, int64_t>), matxInavlidType, "Pivot tensor type must be int64_t");
108108

109-
params = GetLUParams(piv, a);
110-
params.exec = exec;
109+
params = GetLUParams(piv, a, exec);
111110
this->GetWorkspaceSize();
112111
this->AllocateWorkspace(params.batch_size, false, exec);
113112
}
@@ -123,7 +122,8 @@ class matxDnLUCUDAPlan_t : matxDnCUDASolver_t {
123122
}
124123

125124
static DnLUCUDAParams_t GetLUParams(PivotTensor &piv,
126-
const ATensor &a) noexcept
125+
const ATensor &a,
126+
const cudaExecutor &exec) noexcept
127127
{
128128
DnLUCUDAParams_t params;
129129
params.batch_size = GetNumBatches(a);
@@ -132,7 +132,7 @@ class matxDnLUCUDAPlan_t : matxDnCUDASolver_t {
132132
params.A = a.Data();
133133
params.piv = piv.Data();
134134
params.dtype = TypeToInt<T1>();
135-
135+
params.exec = exec;
136136
return params;
137137
}
138138

@@ -287,7 +287,7 @@ void lu_impl(OutputTensor &&out, PivotTensor &&piv,
287287
auto tvt = tv.PermuteMatrix();
288288

289289
// Get parameters required by these tensors
290-
auto params = detail::matxDnLUCUDAPlan_t<OutputTensor, decltype(piv_new), decltype(a_new)>::GetLUParams(piv_new, tvt);
290+
auto params = detail::matxDnLUCUDAPlan_t<OutputTensor, decltype(piv_new), decltype(a_new)>::GetLUParams(piv_new, tvt, exec);
291291

292292
// Get cache or new LU plan if it doesn't exist
293293
using cache_val_type = detail::matxDnLUCUDAPlan_t<OutputTensor, decltype(piv_new), decltype(a_new)>;

include/matx/transforms/qr/qr_cuda.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -295,8 +295,7 @@ class matxDnQRCUDAPlan_t : matxDnCUDASolver_t {
295295
MATX_STATIC_ASSERT_STR((std::is_same_v<T1, typename OutTensor_t::value_type>), matxInavlidType, "Input and Output types must match");
296296
MATX_STATIC_ASSERT_STR((std::is_same_v<T1, T2>), matxInavlidType, "A and Tau types must match");
297297

298-
params = GetQRParams(tau, a);
299-
params.exec = exec;
298+
params = GetQRParams(tau, a, exec);
300299
this->GetWorkspaceSize();
301300
this->AllocateWorkspace(params.batch_size, false, exec);
302301
}
@@ -311,7 +310,8 @@ class matxDnQRCUDAPlan_t : matxDnCUDASolver_t {
311310
}
312311

313312
static DnQRCUDAParams_t GetQRParams(TauTensor &tau,
314-
const ATensor &a)
313+
const ATensor &a,
314+
const cudaExecutor &exec)
315315
{
316316
DnQRCUDAParams_t params;
317317

@@ -321,7 +321,7 @@ class matxDnQRCUDAPlan_t : matxDnCUDASolver_t {
321321
params.A = a.Data();
322322
params.tau = tau.Data();
323323
params.dtype = TypeToInt<T1>();
324-
324+
params.exec = exec;
325325
return params;
326326
}
327327

@@ -468,7 +468,7 @@ void qr_solver_impl(OutTensor &&out, TauTensor &&tau,
468468
auto tvt = tv.PermuteMatrix();
469469

470470
// Get parameters required by these tensors
471-
auto params = detail::matxDnQRCUDAPlan_t<OutTensor, decltype(tau_new), decltype(a_new)>::GetQRParams(tau_new, tvt);
471+
auto params = detail::matxDnQRCUDAPlan_t<OutTensor, decltype(tau_new), decltype(a_new)>::GetQRParams(tau_new, tvt, exec);
472472

473473
// Get cache or new QR plan if it doesn't exist
474474
using cache_val_type = detail::matxDnQRCUDAPlan_t<OutTensor, decltype(tau_new), decltype(a_new)>;

include/matx/transforms/svd/svd_cuda.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -644,8 +644,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
644644
MATX_STATIC_ASSERT_STR(!is_complex_v<T3>, matxInvalidType, "S type must be real");
645645
MATX_STATIC_ASSERT_STR((std::is_same_v<typename inner_op_type_t<T1>::type, T3>), matxInvalidType, "A and S inner types must match");
646646

647-
params = GetSVDParams(u, s, vt, a, jobz);
648-
params.exec = exec;
647+
params = GetSVDParams(u, s, vt, a, jobz, exec);
649648
params.method = method;
650649

651650
if (params.method == SVDMethod::GESVDJ_BATCHED) {
@@ -725,8 +724,8 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
725724

726725
static DnSVDCUDAParams_t
727726
GetSVDParams(UTensor &u, STensor &s,
728-
VtTensor &vt, const ATensor &a,
729-
const char jobz = 'A')
727+
VtTensor &vt, const ATensor &a,
728+
const char jobz, const cudaExecutor &exec)
730729
{
731730
DnSVDCUDAParams_t params;
732731
params.batch_size = GetNumBatches(a);
@@ -738,6 +737,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
738737
params.S = s.Data();
739738
params.jobz = jobz;
740739
params.dtype = TypeToInt<T1>();
740+
params.exec = exec;
741741
return params;
742742
}
743743

@@ -997,7 +997,7 @@ void svd_impl(UTensor &&u, STensor &&s,
997997

998998
// Get parameters required by these tensors
999999
auto params = detail::matxDnSVDCUDAPlan_t<decltype(u_in), decltype(s_new), decltype(vt_in), decltype(at_col_maj)>::
1000-
GetSVDParams(u_in, s_new, vt_in, at_col_maj, job_cusolver);
1000+
GetSVDParams(u_in, s_new, vt_in, at_col_maj, job_cusolver, exec);
10011001

10021002
// Get cache or new SVD plan if it doesn't exist
10031003
using cache_val_type = detail::matxDnSVDCUDAPlan_t<decltype(u_in), decltype(s_new), decltype(vt_in), decltype(at_col_maj)>;
@@ -1034,15 +1034,15 @@ void svd_impl(UTensor &&u, STensor &&s,
10341034

10351035
// Get parameters required by these tensors
10361036
auto params = detail::matxDnSVDCUDAPlan_t<decltype(u_col_maj), decltype(s_new), decltype(vt_col_maj), decltype(tvt)>::
1037-
GetSVDParams(u_col_maj, s_new, vt_col_maj, tvt, job_cusolver);
1037+
GetSVDParams(u_col_maj, s_new, vt_col_maj, tvt, job_cusolver, exec);
10381038

10391039
// Get cache or new SVD plan if it doesn't exist
10401040
using cache_val_type = detail::matxDnSVDCUDAPlan_t<decltype(u_col_maj), decltype(s_new), decltype(vt_col_maj), decltype(tvt)>;
10411041
detail::GetCache().LookupAndExec<detail::svd_cuda_cache_t>(
10421042
detail::GetCacheIdFromType<detail::svd_cuda_cache_t>(),
10431043
params,
10441044
[&]() {
1045-
return std::make_shared<cache_val_type>(u_col_maj, s_new, vt_col_maj, tvt, method, stream, job_cusolver);
1045+
return std::make_shared<cache_val_type>(u_col_maj, s_new, vt_col_maj, tvt, method, exec, job_cusolver);
10461046
},
10471047
[&](std::shared_ptr<cache_val_type> ctype) {
10481048
ctype->Exec(u_col_maj, s_new, vt_col_maj, tvt, exec, job_cusolver);

0 commit comments

Comments (0)