Skip to content

Commit 4e8bdae

Browse files
authored
[FIX] Fix high GPU memory usage (#351)
Closes #349. compute_related_variables was heuristically allocating memory based on A100/H100 GPUs with >=40GB of VRAM. This is now automatically adjusted based on the total VRAM of the device. A command-line option has also been added to solve_MPS_file to specify device memory allocation limits for ease of testing. Authors: - Alice Boucher (https://github.com/aliceb-nv) - Nicolas L. Guidotti (https://github.com/nguidotti) Approvers: - Nicolas L. Guidotti (https://github.com/nguidotti) URL: #351
1 parent 8d3a438 commit 4e8bdae

File tree

3 files changed

+66
-9
lines changed

3 files changed

+66
-9
lines changed

benchmarks/linear_programming/cuopt/run_mip.cpp

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@
2828
#include <raft/core/handle.hpp>
2929

3030
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
31+
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
32+
#include <rmm/mr/device/logging_resource_adaptor.hpp>
3133
#include <rmm/mr/device/pool_memory_resource.hpp>
34+
#include <rmm/mr/device/tracking_resource_adaptor.hpp>
3235

3336
#include <rmm/mr/device/owning_wrapper.hpp>
3437

@@ -256,7 +259,9 @@ void run_single_file_mp(std::string file_path,
256259
{
257260
std::cout << "running file " << file_path << " on gpu : " << device << std::endl;
258261
auto memory_resource = make_async();
259-
rmm::mr::set_current_device_resource(memory_resource.get());
262+
auto limiting_adaptor =
263+
rmm::mr::limiting_resource_adaptor(memory_resource.get(), 6ULL * 1024ULL * 1024ULL * 1024ULL);
264+
rmm::mr::set_current_device_resource(&limiting_adaptor);
260265
int sol_found = run_single_file(file_path,
261266
device,
262267
batch_id,
@@ -340,6 +345,15 @@ int main(int argc, char* argv[])
340345
.scan<'g', double>()
341346
.default_value(std::numeric_limits<double>::max());
342347

348+
program.add_argument("--memory-limit")
349+
.help("memory limit in MB")
350+
.scan<'g', double>()
351+
.default_value(0.0);
352+
353+
program.add_argument("--track-allocations")
354+
.help("track allocations (t/f)")
355+
.default_value(std::string("f"));
356+
343357
// Parse arguments
344358
try {
345359
program.parse_args(argc, argv);
@@ -362,10 +376,12 @@ int main(int argc, char* argv[])
362376
std::string result_file;
363377
int batch_num = -1;
364378

365-
bool heuristics_only = program.get<std::string>("--heuristics-only")[0] == 't';
366-
int num_cpu_threads = program.get<int>("--num-cpu-threads");
367-
bool write_log_file = program.get<std::string>("--write-log-file")[0] == 't';
368-
bool log_to_console = program.get<std::string>("--log-to-console")[0] == 't';
379+
bool heuristics_only = program.get<std::string>("--heuristics-only")[0] == 't';
380+
int num_cpu_threads = program.get<int>("--num-cpu-threads");
381+
bool write_log_file = program.get<std::string>("--write-log-file")[0] == 't';
382+
bool log_to_console = program.get<std::string>("--log-to-console")[0] == 't';
383+
double memory_limit = program.get<double>("--memory-limit");
384+
bool track_allocations = program.get<std::string>("--track-allocations")[0] == 't';
369385

370386
if (program.is_used("--out-dir")) {
371387
out_dir = program.get<std::string>("--out-dir");
@@ -469,7 +485,17 @@ int main(int argc, char* argv[])
469485
merge_result_files(out_dir, result_file, n_gpus, batch_num);
470486
} else {
471487
auto memory_resource = make_async();
472-
rmm::mr::set_current_device_resource(memory_resource.get());
488+
if (memory_limit > 0) {
489+
auto limiting_adaptor =
490+
rmm::mr::limiting_resource_adaptor(memory_resource.get(), memory_limit * 1024ULL * 1024ULL);
491+
rmm::mr::set_current_device_resource(&limiting_adaptor);
492+
} else if (track_allocations) {
493+
rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource.get(),
494+
/*capture_stacks=*/true);
495+
rmm::mr::set_current_device_resource(&tracking_adaptor);
496+
} else {
497+
rmm::mr::set_current_device_resource(memory_resource.get());
498+
}
473499
run_single_file(path,
474500
0,
475501
0,

cpp/src/mip/problem/problem.cu

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "problem_kernels.cuh"
2222

2323
#include <utilities/copy_helpers.hpp>
24+
#include <utilities/cuda_helpers.cuh>
2425
#include <utilities/macros.cuh>
2526

2627
#include <linear_programming/utils.cuh>
@@ -798,16 +799,21 @@ void problem_t<i_t, f_t>::compute_related_variables(double time_limit)
798799

799800
handle_ptr->sync_stream();
800801

802+
// previously used constants were based on 40GB of memory. Scale accordingly on smaller GPUs
803+
// We can't rely on querying free memory or allocation try/catch
804+
// since this would break determinism guarantees (GPU may be shared by other processes)
805+
f_t size_factor = std::min(1.0, cuopt::get_device_memory_size() / 1e9 / 40.0);
806+
801807
// TODO: determine optimal number of slices based on available GPU memory? This used to be 2e9 /
802808
// n_variables
803-
i_t max_slice_size = 6e8 / n_variables;
809+
i_t max_slice_size = 6e8 * size_factor / n_variables;
804810

805811
rmm::device_uvector<i_t> varmap(max_slice_size * n_variables, handle_ptr->get_stream());
806812
rmm::device_uvector<i_t> offsets(max_slice_size * n_variables, handle_ptr->get_stream());
807813

808814
related_variables.resize(0, handle_ptr->get_stream());
809815
// TODO: this used to be 1e8
810-
related_variables.reserve(1e8, handle_ptr->get_stream()); // reserve space
816+
related_variables.reserve(1e8 * size_factor, handle_ptr->get_stream()); // reserve space
811817
related_variables_offsets.resize(n_variables + 1, handle_ptr->get_stream());
812818
related_variables_offsets.set_element_to_zero_async(0, handle_ptr->get_stream());
813819

@@ -851,7 +857,7 @@ void problem_t<i_t, f_t>::compute_related_variables(double time_limit)
851857
auto current_time = std::chrono::high_resolution_clock::now();
852858
// if the related variable array would wind up being too large for available memory, abort
853859
// TODO this used to be 1e9
854-
if (related_variables.size() > 1e9 ||
860+
if (related_variables.size() > 1e9 * size_factor ||
855861
std::chrono::duration_cast<std::chrono::seconds>(current_time - start_time).count() >
856862
time_limit) {
857863
CUOPT_LOG_DEBUG(

cpp/src/utilities/cuda_helpers.cuh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
#include <raft/util/cuda_utils.cuh>
2525
#include <raft/util/cudart_utils.hpp>
2626
#include <rmm/device_uvector.hpp>
27+
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
28+
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
2729

2830
namespace cuopt {
2931

@@ -208,4 +210,27 @@ DI void sorted_insert(T* array, T item, int curr_size, int max_size)
208210
array[0] = item;
209211
}
210212

213+
inline size_t get_device_memory_size()
214+
{
215+
// Otherwise, we need to get the free memory from the device
216+
size_t free_mem, total_mem;
217+
cudaMemGetInfo(&free_mem, &total_mem);
218+
219+
auto res = rmm::mr::get_current_device_resource();
220+
auto limiting_adaptor =
221+
dynamic_cast<rmm::mr::limiting_resource_adaptor<rmm::mr::cuda_async_memory_resource>*>(res);
222+
// Did we specifiy an explicit memory limit?
223+
if (limiting_adaptor) {
224+
printf("limiting_adaptor->get_allocation_limit(): %fMiB\n",
225+
limiting_adaptor->get_allocation_limit() / (double)1e6);
226+
printf("used_mem: %fMiB\n", limiting_adaptor->get_allocated_bytes() / (double)1e6);
227+
printf("free_mem: %fMiB\n",
228+
(limiting_adaptor->get_allocation_limit() - limiting_adaptor->get_allocated_bytes()) /
229+
(double)1e6);
230+
return std::min(total_mem, limiting_adaptor->get_allocation_limit());
231+
} else {
232+
return total_mem;
233+
}
234+
}
235+
211236
} // namespace cuopt

0 commit comments

Comments
 (0)