Skip to content

Commit 4e8bdae

Browse files
authored
[FIX] Fix high GPU memory usage (#351)
Closes #349. compute_related_variables was heuristically allocating memory based on A100/H100 GPUs with >=40GB of VRAM. This is now automatically adjusted based on the total VRAM of the device. A command-line option has also been added to solve_MPS_file to specify device memory allocation limits for ease of testing. Authors: - Alice Boucher (https://github.com/aliceb-nv) - Nicolas L. Guidotti (https://github.com/nguidotti) Approvers: - Nicolas L. Guidotti (https://github.com/nguidotti) URL: #351
1 parent 8d3a438 commit 4e8bdae

File tree

3 files changed

+66
-9
lines changed

3 files changed

+66
-9
lines changed

benchmarks/linear_programming/cuopt/run_mip.cpp

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@
2828
#include <raft/core/handle.hpp>
2929

3030
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
31+
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
32+
#include <rmm/mr/device/logging_resource_adaptor.hpp>
3133
#include <rmm/mr/device/pool_memory_resource.hpp>
34+
#include <rmm/mr/device/tracking_resource_adaptor.hpp>
3235

3336
#include <rmm/mr/device/owning_wrapper.hpp>
3437

@@ -256,7 +259,9 @@ void run_single_file_mp(std::string file_path,
256259
{
257260
std::cout << "running file " << file_path << " on gpu : " << device << std::endl;
258261
auto memory_resource = make_async();
259-
rmm::mr::set_current_device_resource(memory_resource.get());
262+
auto limiting_adaptor =
263+
rmm::mr::limiting_resource_adaptor(memory_resource.get(), 6ULL * 1024ULL * 1024ULL * 1024ULL);
264+
rmm::mr::set_current_device_resource(&limiting_adaptor);
260265
int sol_found = run_single_file(file_path,
261266
device,
262267
batch_id,
@@ -340,6 +345,15 @@ int main(int argc, char* argv[])
340345
.scan<'g', double>()
341346
.default_value(std::numeric_limits<double>::max());
342347

348+
program.add_argument("--memory-limit")
349+
.help("memory limit in MB")
350+
.scan<'g', double>()
351+
.default_value(0.0);
352+
353+
program.add_argument("--track-allocations")
354+
.help("track allocations (t/f)")
355+
.default_value(std::string("f"));
356+
343357
// Parse arguments
344358
try {
345359
program.parse_args(argc, argv);
@@ -362,10 +376,12 @@ int main(int argc, char* argv[])
362376
std::string result_file;
363377
int batch_num = -1;
364378

365-
bool heuristics_only = program.get<std::string>("--heuristics-only")[0] == 't';
366-
int num_cpu_threads = program.get<int>("--num-cpu-threads");
367-
bool write_log_file = program.get<std::string>("--write-log-file")[0] == 't';
368-
bool log_to_console = program.get<std::string>("--log-to-console")[0] == 't';
379+
bool heuristics_only = program.get<std::string>("--heuristics-only")[0] == 't';
380+
int num_cpu_threads = program.get<int>("--num-cpu-threads");
381+
bool write_log_file = program.get<std::string>("--write-log-file")[0] == 't';
382+
bool log_to_console = program.get<std::string>("--log-to-console")[0] == 't';
383+
double memory_limit = program.get<double>("--memory-limit");
384+
bool track_allocations = program.get<std::string>("--track-allocations")[0] == 't';
369385

370386
if (program.is_used("--out-dir")) {
371387
out_dir = program.get<std::string>("--out-dir");
@@ -469,7 +485,17 @@ int main(int argc, char* argv[])
469485
merge_result_files(out_dir, result_file, n_gpus, batch_num);
470486
} else {
471487
auto memory_resource = make_async();
472-
rmm::mr::set_current_device_resource(memory_resource.get());
488+
if (memory_limit > 0) {
489+
auto limiting_adaptor =
490+
rmm::mr::limiting_resource_adaptor(memory_resource.get(), memory_limit * 1024ULL * 1024ULL);
491+
rmm::mr::set_current_device_resource(&limiting_adaptor);
492+
} else if (track_allocations) {
493+
rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource.get(),
494+
/*capture_stacks=*/true);
495+
rmm::mr::set_current_device_resource(&tracking_adaptor);
496+
} else {
497+
rmm::mr::set_current_device_resource(memory_resource.get());
498+
}
473499
run_single_file(path,
474500
0,
475501
0,

cpp/src/mip/problem/problem.cu

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "problem_kernels.cuh"
2222

2323
#include <utilities/copy_helpers.hpp>
24+
#include <utilities/cuda_helpers.cuh>
2425
#include <utilities/macros.cuh>
2526

2627
#include <linear_programming/utils.cuh>
@@ -798,16 +799,21 @@ void problem_t<i_t, f_t>::compute_related_variables(double time_limit)
798799

799800
handle_ptr->sync_stream();
800801

802+
// previously used constants were based on 40GB of memory. Scale accordingly on smaller GPUs
803+
// We can't rely on querying free memory or allocation try/catch
804+
// since this would break determinism guarantees (GPU may be shared by other processes)
805+
f_t size_factor = std::min(1.0, cuopt::get_device_memory_size() / 1e9 / 40.0);
806+
801807
// TODO: determine optimal number of slices based on available GPU memory? This used to be 2e9 /
802808
// n_variables
803-
i_t max_slice_size = 6e8 / n_variables;
809+
i_t max_slice_size = 6e8 * size_factor / n_variables;
804810

805811
rmm::device_uvector<i_t> varmap(max_slice_size * n_variables, handle_ptr->get_stream());
806812
rmm::device_uvector<i_t> offsets(max_slice_size * n_variables, handle_ptr->get_stream());
807813

808814
related_variables.resize(0, handle_ptr->get_stream());
809815
// TODO: this used to be 1e8
810-
related_variables.reserve(1e8, handle_ptr->get_stream()); // reserve space
816+
related_variables.reserve(1e8 * size_factor, handle_ptr->get_stream()); // reserve space
811817
related_variables_offsets.resize(n_variables + 1, handle_ptr->get_stream());
812818
related_variables_offsets.set_element_to_zero_async(0, handle_ptr->get_stream());
813819

@@ -851,7 +857,7 @@ void problem_t<i_t, f_t>::compute_related_variables(double time_limit)
851857
auto current_time = std::chrono::high_resolution_clock::now();
852858
// if the related variable array would wind up being too large for available memory, abort
853859
// TODO this used to be 1e9
854-
if (related_variables.size() > 1e9 ||
860+
if (related_variables.size() > 1e9 * size_factor ||
855861
std::chrono::duration_cast<std::chrono::seconds>(current_time - start_time).count() >
856862
time_limit) {
857863
CUOPT_LOG_DEBUG(

cpp/src/utilities/cuda_helpers.cuh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
#include <raft/util/cuda_utils.cuh>
2525
#include <raft/util/cudart_utils.hpp>
2626
#include <rmm/device_uvector.hpp>
27+
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
28+
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
2729

2830
namespace cuopt {
2931

@@ -208,4 +210,27 @@ DI void sorted_insert(T* array, T item, int curr_size, int max_size)
208210
array[0] = item;
209211
}
210212

213+
inline size_t get_device_memory_size()
214+
{
215+
// Otherwise, we need to get the free memory from the device
216+
size_t free_mem, total_mem;
217+
cudaMemGetInfo(&free_mem, &total_mem);
218+
219+
auto res = rmm::mr::get_current_device_resource();
220+
auto limiting_adaptor =
221+
dynamic_cast<rmm::mr::limiting_resource_adaptor<rmm::mr::cuda_async_memory_resource>*>(res);
222+
// Did we specifiy an explicit memory limit?
223+
if (limiting_adaptor) {
224+
printf("limiting_adaptor->get_allocation_limit(): %fMiB\n",
225+
limiting_adaptor->get_allocation_limit() / (double)1e6);
226+
printf("used_mem: %fMiB\n", limiting_adaptor->get_allocated_bytes() / (double)1e6);
227+
printf("free_mem: %fMiB\n",
228+
(limiting_adaptor->get_allocation_limit() - limiting_adaptor->get_allocated_bytes()) /
229+
(double)1e6);
230+
return std::min(total_mem, limiting_adaptor->get_allocation_limit());
231+
} else {
232+
return total_mem;
233+
}
234+
}
235+
211236
} // namespace cuopt

0 commit comments

Comments
 (0)