NVIDIA · aliceb-nv · Aug 25, 2025 · Aug 25, 2025 · Aug 25, 2025 · Aug 26, 2025
@@ -0,0 +1,225 @@
+<?xml version="1.0" encoding="utf-8"?>
+<ComputeSanitizerOutput>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 4 bytes</text>
+      <size>4</size>
+    </what>
+    <where>
+      <func>.*</func>
+    </where>
+    <hostStack>
+      <frame>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <func>cusparseCsr2cscEx2</func>
+        <module>.*libcusparse.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 4 bytes</text>
+      <size>4</size>
+    </what>
+    <where>
+      <func>ThreadLoad</func>
+    </where>
+    <hostStack>
+      <frame>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>libcudart.*</module>
+      </frame>
+      <frame>
+        <func>cudaLaunchKernel</func>
+      </frame>
+      <frame>
+        <func>.*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.*</func>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 2 bytes</text>
+      <size>2</size>
+    </what>
+    <where>
+      <func>ThreadLoad</func>
+    </where>
+    <hostStack>
+      <frame>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>libcudart.*</module>
+      </frame>
+      <frame>
+        <func>cudaLaunchKernel</func>
+      </frame>
+      <frame>
+        <func>.*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.*</func>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 8 bytes</text>
+      <size>8</size>
+    </what>
+    <where>
+      <func>DeviceSegmentedReduceKernel</func>
+    </where>
+  </record>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 4 bytes</text>
+      <size>4</size>
+    </what>
+    <where>
+      <func>ThreadLoad</func>
+    </where>
+    <hostStack>
+      <frame>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>libcudart.*</module>
+      </frame>
+      <frame>
+        <module>libcudart.*</module>
+      </frame>
+      <frame>
+        <module>.*libcuopt.*</module>
+      </frame>
+      <frame>
+        <func>.*Device(Reduce|Scan)Kernel.*</func>
+      </frame>
+    </hostStack>
+  </record>
+  <!-- Rule matching cccl's pattern of copying tuples back to host after reduce_by_keys, which contain uninitialized padding  -->
+  <!-- Because of aggressive inlining, thrust calls are elided out of the host stack, which prevents a more finely grained rule. In practice this is good enough -->
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+      <accessSize>16</accessSize>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyDtoHAsync.*</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <!-- Suppress uninit copies on rmm::device_vector copy constructor - often vector members are allocated but not filled -->
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyAsync</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*librmm.so.*</module>
+      </frame>
+      <frame>
+        <func>rmm::device_buffer::device_buffer</func>
+        <module>.*librmm.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyAsync</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*librmm.so.*</module>
+      </frame>
+      <frame>
+        <module>.*librmm.so.*</module>
+      </frame>
+      <frame>
+        <func>rmm::device_uvector.*::device_uvector</func>
+        <module>.*libcuopt.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <!-- Uninitialized device-to-device copies are usually harmless - if actualy bogus, errors may be caught later on -->
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyDtoDAsync.*</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyAsync</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <func>cudaMemcpyAsync</func>
+      </frame>
+      <frame>
+        <func>rmm::device_buffer::resize</func>
+        <module>.*librmm.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+</ComputeSanitizerOutput>
@@ -1139,7 +1139,7 @@ void mps_parser_t<i_t, f_t>::parse_bounds(std::string_view line)
     c_values.emplace_back(f_t(0));
     variable_lower_bounds.emplace_back(0);
     variable_upper_bounds.emplace_back(+std::numeric_limits<f_t>::infinity());
-    var_types.resize(var_types.size() + 1);
+    var_types.emplace_back('C');
     itr = var_names_map.find(std::string(var_name));
   }
   i_t var_id = itr->second;

@@ -19,6 +19,8 @@
 #include <linear_programming/utilities/ping_pong_graph.cuh>
 #include <linear_programming/utils.cuh>
 #include <mip/mip_constants.hpp>
+#include <utilities/copy_helpers.hpp>
+#include <utilities/cuda_helpers.cuh>
 
 #include <raft/sparse/detail/cusparse_macros.h>
 #include <raft/sparse/detail/cusparse_wrappers.h>
@@ -122,6 +124,10 @@ void pdhg_solver_t<i_t, f_t>::compute_At_y()
 {
   // A_t @ y
 
+  // cusparse flags a false positive here on the destination tmp buffer, silence it
+  cuopt::mark_span_as_initialized(make_span(current_saddle_point_state_.get_current_AtY()),
+                                  handle_ptr_->get_stream());
+
   RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
                                                        CUSPARSE_OPERATION_NON_TRANSPOSE,
                                                        reusable_device_scalar_value_1_.data(),

@@ -22,6 +22,8 @@
 #include <linear_programming/pdlp.cuh>
 #include <linear_programming/utils.cuh>
 #include <mip/mip_constants.hpp>
+#include <utilities/copy_helpers.hpp>
+#include <utilities/cuda_helpers.cuh>
 #include "cuopt/linear_programming/pdlp/solver_solution.hpp"
 
 #include <raft/common/nvtx.hpp>
@@ -1048,6 +1050,9 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(
                             primal_size_h_,
                             clamp<f_t>(),
                             stream_view_);
+    // Triggers a false positive in compute-sanitizer otherwise (lack of initialization doesn't
+    // matter here)
+    cuopt::mark_span_as_initialized(make_span(unscaled_primal_avg_solution_), stream_view_);
     raft::linalg::ternaryOp(unscaled_primal_avg_solution_.data(),
                             unscaled_primal_avg_solution_.data(),
                             op_problem_scaled_.variable_lower_bounds.data(),

@@ -19,6 +19,8 @@
 #include <linear_programming/pdlp_constants.hpp>
 #include <linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp>
 #include <mip/mip_constants.hpp>
+#include <utilities/copy_helpers.hpp>
+#include <utilities/cuda_helpers.cuh>
 #include <utilities/unique_pinned_ptr.hpp>
 
 #include <raft/sparse/detail/cusparse_macros.h>
@@ -275,6 +277,10 @@ void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
 
   // Compute A_t @ (y' - y) = A_t @ y' - 1 * current_AtY
 
+  // cusparse flags a false positive here on the destination tmp buffer, silence it
+  cuopt::mark_span_as_initialized(make_span(current_saddle_point_state.get_next_AtY()),
+                                  handle_ptr_->get_stream());
+
   // First compute Ay' to be reused as Ay in next PDHG iteration (if found step size if valid)
   RAFT_CUSPARSE_TRY(
     raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),

@@ -19,6 +19,8 @@
 #include <linear_programming/termination_strategy/convergence_information.hpp>
 #include <linear_programming/utils.cuh>
 #include <mip/mip_constants.hpp>
+#include <utilities/copy_helpers.hpp>
+#include <utilities/cuda_helpers.cuh>
 
 #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
 
@@ -223,6 +225,8 @@ void convergence_information_t<i_t, f_t>::compute_primal_residual(
   cusparse_view_t<i_t, f_t>& cusparse_view, rmm::device_uvector<f_t>& tmp_dual)
 {
   raft::common::nvtx::range fun_scope("compute_primal_residual");
+  // cusparse flags a false positive here on the destination tmp buffer, silence it
+  cuopt::mark_span_as_initialized(make_span(tmp_dual), handle_ptr_->get_stream());
 
   // primal_product
   RAFT_CUSPARSE_TRY(

@@ -21,6 +21,7 @@
 #include <cuopt/linear_programming/optimization_problem.hpp>
 #include <mip/mip_constants.hpp>
 #include <utilities/copy_helpers.hpp>
+#include <utilities/cuda_helpers.cuh>
 
 #include <thrust/functional.h>
 #include <thrust/logical.h>

@@ -450,6 +450,7 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
     lp_settings.return_first_feasible = false;
     lp_settings.save_state            = true;
     lp_settings.concurrent_halt       = &global_concurrent_halt;
+    lp_settings.has_initial_primal    = false;
     rmm::device_uvector<f_t> lp_optimal_solution_copy(lp_optimal_solution.size(),
                                                       problem_ptr->handle_ptr->get_stream());
     auto lp_result =

@@ -745,6 +745,7 @@ void population_t<i_t, f_t>::print()
     if (index.first == 0 && solutions[0].first) {
       CUOPT_LOG_DEBUG(" Best feasible: %f", solutions[index.first].second.get_user_objective());
     }
+    if (index.first == 0 && !solutions[0].first) continue;
     CUOPT_LOG_DEBUG("%d :  %f\t%f\t%f\t%d",
                     i,
                     index.second,

@@ -115,6 +115,7 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
     auto other_view     = other.view();
     auto offspring_view = offspring.view();
     const f_t int_tol   = guiding.problem_ptr->tolerances.integrality_tolerance;
+    cuopt_assert(variable_map.size() == probing_values.size(), "The number of vars should match!");
     thrust::for_each(
       guiding.handle_ptr->get_thrust_policy(),
       thrust::make_counting_iterator(0lu),
@@ -183,6 +184,7 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
     if (guiding_solution.get_feasible()) {
       this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding);
       auto [fixed_problem, fixed_assignment, variable_map] = offspring.fix_variables(vars_to_fix);
+      probing_values.resize(fixed_problem.n_variables, a.handle_ptr->get_stream());
       timer_t timer(bp_recombiner_config_t::bounds_prop_time_limit);
       rmm::device_uvector<f_t> old_assignment(offspring.assignment,
                                               offspring.handle_ptr->get_stream());

@@ -506,6 +506,9 @@ void fj_t<i_t, f_t>::climber_init(i_t climber_idx, const rmm::cuda_stream_view&
 
   view = climber->view();
 
+  cuopt::mark_span_as_initialized(view.row_size_bin_prefix_sum, climber_stream);
+  cuopt::mark_span_as_initialized(view.row_size_nonbin_prefix_sum, climber_stream);
+
   if (pb_ptr->related_variables.size() > 0) {
     // for each variable, compute the number of nnzs that would be examined during a FJ move update
     // pass to help determine whether to run load balancing or not

@@ -67,6 +67,10 @@ feasibility_pump_t<i_t, f_t>::feasibility_pump_t(
     rng(cuopt::seed_generator::get_seed()),
     timer(20.)
 {
+  thrust::fill(context.problem_ptr->handle_ptr->get_thrust_policy(),
+               last_projection.begin(),
+               last_projection.end(),
+               (f_t)0);
 }
 
 template <typename Iter_T>

@@ -816,6 +816,7 @@ bool constraint_prop_t<i_t, f_t>::is_problem_ii(problem_t<i_t, f_t>& problem)
 {
   bounds_update.calculate_activity_on_problem_bounds(problem);
   bounds_update.calculate_infeasible_redundant_constraints(problem);
+  multi_probe.calculate_activity(problem, problem.handle_ptr);
   bool problem_ii = bounds_update.infeas_constraints_count > 0;
   return problem_ii;
 }