Refactor cudaq_host_dispatch_loop_ctx_t to compose public API structs

wsttiger · wsttiger · commit 842ae9fd445a · 2026-03-25T20:00:00.000Z
Embed cudaq_ringbuffer_t, cudaq_dispatcher_config_t, and
cudaq_function_table_t as members instead of flattening their fields.
This eliminates field duplication, makes the data provenance clear,
and simplifies construction to struct copies. Host-specific runtime
state (workers, idle_mask, io_ctxs, etc.) remains as direct fields.

Signed-off-by: Scott Thornton <wsttiger@gmail.com>
diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h
@@ -42,33 +42,22 @@ typedef struct {
 } cudaq_host_dispatch_worker_t;
 
 typedef struct {
-  void *rx_flags; ///< opaque cuda::std::atomic<uint64_t>*
-  void *tx_flags; ///< opaque cuda::std::atomic<uint64_t>*
-  uint8_t *rx_data_host;
-  uint8_t *rx_data_dev;
-  uint8_t *tx_data_host;
-  uint8_t *tx_data_dev;
-  size_t tx_stride_sz;
-  void **h_mailbox_bank;
-  size_t num_slots;
-  size_t slot_size;
+  // Composed public API structs
+  cudaq_ringbuffer_t ringbuffer;
+  cudaq_dispatcher_config_t config;
+  cudaq_function_table_t function_table;
+
+  // Host dispatch runtime state
   cudaq_host_dispatch_worker_t *workers;
   size_t num_workers;
-  /// Host-visible function table for lookup by function_id (GRAPH_LAUNCH only;
-  /// others dropped).
-  cudaq_function_entry_t *function_table;
-  size_t function_table_count;
-  void *shutdown_flag; ///< opaque cuda::std::atomic<int>*
+  void **h_mailbox_bank;
+  void *shutdown_flag;   ///< opaque cuda::std::atomic<int>*
   uint64_t *stats_counter;
   void *live_dispatched; ///< opaque cuda::std::atomic<uint64_t>*
   void *idle_mask;       ///< opaque cuda::std::atomic<uint64_t>*, 1=free 0=busy
   int *inflight_slot_tags; ///< worker_id -> origin FPGA slot for tx_flags
                            ///< routing
 
-  /// Device view of tx_flags (needed for GraphIOContext.tx_flag).
-  /// NULL when tx_flags is already a device-accessible pointer.
-  volatile uint64_t *tx_flags_dev;
-
   /// Per-worker GraphIOContext array for separate RX/TX buffer support.
   /// When non-NULL, launch_graph_worker fills a GraphIOContext per dispatch
   /// and writes its device address into h_mailbox_bank[worker_id].
diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher.cu b/realtime/lib/daemon/dispatcher/host_dispatcher.cu
@@ -17,6 +17,9 @@ using atomic_int_sys = cuda::std::atomic<int>;
 static inline atomic_uint64_sys *as_atomic_u64(void *p) {
   return static_cast<atomic_uint64_sys *>(p);
 }
+static inline atomic_uint64_sys *as_atomic_u64(volatile uint64_t *p) {
+  return reinterpret_cast<atomic_uint64_sys *>(const_cast<uint64_t *>(p));
+}
 static inline atomic_int_sys *as_atomic_int(void *p) {
   return static_cast<atomic_int_sys *>(p);
 }
@@ -36,13 +39,13 @@ lookup_function(cudaq_function_entry_t *table, size_t count,
 }
 
 static int
-find_idle_graph_worker_for_function(const cudaq_host_dispatch_loop_ctx_t *config,
+find_idle_graph_worker_for_function(const cudaq_host_dispatch_loop_ctx_t *ctx,
                                     uint32_t function_id) {
-  uint64_t mask = as_atomic_u64(config->idle_mask)->load(
+  uint64_t mask = as_atomic_u64(ctx->idle_mask)->load(
       cuda::std::memory_order_acquire);
   while (mask != 0) {
     int worker_id = __builtin_ffsll(static_cast<long long>(mask)) - 1;
-    if (config->workers[static_cast<size_t>(worker_id)].function_id ==
+    if (ctx->workers[static_cast<size_t>(worker_id)].function_id ==
         function_id)
       return worker_id;
     mask &= ~(1ULL << worker_id);
@@ -58,104 +61,101 @@ struct ParsedSlot {
 
 static ParsedSlot
 parse_slot_with_function_table(void *slot_host,
-                               const cudaq_host_dispatch_loop_ctx_t *config) {
+                               const cudaq_host_dispatch_loop_ctx_t *ctx) {
   ParsedSlot out;
   const RPCHeader *header = static_cast<const RPCHeader *>(slot_host);
   if (header->magic != RPC_MAGIC_REQUEST) {
     out.drop = true;
     return out;
   }
   out.function_id = header->function_id;
-  out.entry = lookup_function(config->function_table,
-                              config->function_table_count, out.function_id);
+  out.entry = lookup_function(ctx->function_table.entries,
+                              ctx->function_table.count, out.function_id);
   if (!out.entry)
     out.drop = true;
   return out;
 }
 
-static void finish_slot_and_advance(const cudaq_host_dispatch_loop_ctx_t *config,
+static void finish_slot_and_advance(const cudaq_host_dispatch_loop_ctx_t *ctx,
                                     size_t &current_slot, size_t num_slots,
                                     uint64_t &packets_dispatched) {
-  as_atomic_u64(config->rx_flags)[current_slot].store(
+  as_atomic_u64(ctx->ringbuffer.rx_flags_host)[current_slot].store(
       0, cuda::std::memory_order_release);
   packets_dispatched++;
-  if (config->live_dispatched)
-    as_atomic_u64(config->live_dispatched)
+  if (ctx->live_dispatched)
+    as_atomic_u64(ctx->live_dispatched)
         ->fetch_add(1, cuda::std::memory_order_relaxed);
   current_slot = (current_slot + 1) % num_slots;
 }
 
-static int acquire_graph_worker(const cudaq_host_dispatch_loop_ctx_t *config,
+static int acquire_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
                                 bool use_function_table,
                                 const cudaq_function_entry_t *entry,
                                 uint32_t function_id) {
   if (use_function_table && entry &&
       entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH)
-    return find_idle_graph_worker_for_function(config, function_id);
+    return find_idle_graph_worker_for_function(ctx, function_id);
   uint64_t mask =
-      as_atomic_u64(config->idle_mask)->load(cuda::std::memory_order_acquire);
+      as_atomic_u64(ctx->idle_mask)->load(cuda::std::memory_order_acquire);
   if (mask == 0)
     return -1;
   return __builtin_ffsll(static_cast<long long>(mask)) - 1;
 }
 
-static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *config,
+static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *ctx,
                                 int worker_id, void *slot_host,
                                 size_t current_slot) {
-  as_atomic_u64(config->idle_mask)
+  as_atomic_u64(ctx->idle_mask)
       ->fetch_and(~(1ULL << worker_id), cuda::std::memory_order_release);
-  config->inflight_slot_tags[worker_id] = static_cast<int>(current_slot);
+  ctx->inflight_slot_tags[worker_id] = static_cast<int>(current_slot);
 
   ptrdiff_t offset =
-      static_cast<uint8_t *>(slot_host) - config->rx_data_host;
-  void *data_dev = static_cast<void *>(config->rx_data_dev + offset);
+      static_cast<uint8_t *>(slot_host) - ctx->ringbuffer.rx_data_host;
+  void *data_dev = static_cast<void *>(ctx->ringbuffer.rx_data + offset);
 
-  if (config->io_ctxs_host != nullptr) {
-    // GraphIOContext mode: fill per-worker context with separate RX/TX info.
-    auto *h_ctxs = static_cast<GraphIOContext *>(config->io_ctxs_host);
-    auto *d_ctxs = static_cast<uint8_t *>(config->io_ctxs_dev);
+  if (ctx->io_ctxs_host != nullptr) {
+    auto *h_ctxs = static_cast<GraphIOContext *>(ctx->io_ctxs_host);
+    auto *d_ctxs = static_cast<uint8_t *>(ctx->io_ctxs_dev);
     GraphIOContext *h_ctx = &h_ctxs[worker_id];
 
     h_ctx->rx_slot = data_dev;
-    h_ctx->tx_slot = config->tx_data_dev + current_slot * config->tx_stride_sz;
-    h_ctx->tx_flag = &config->tx_flags_dev[current_slot];
+    h_ctx->tx_slot = ctx->ringbuffer.tx_data +
+                     current_slot * ctx->ringbuffer.tx_stride_sz;
+    h_ctx->tx_flag = &ctx->ringbuffer.tx_flags[current_slot];
     h_ctx->tx_flag_value =
         reinterpret_cast<uint64_t>(h_ctx->tx_slot);
-    h_ctx->tx_stride_sz = config->tx_stride_sz;
+    h_ctx->tx_stride_sz = ctx->ringbuffer.tx_stride_sz;
 
     void *d_ctx = d_ctxs + worker_id * sizeof(GraphIOContext);
-    config->h_mailbox_bank[worker_id] = d_ctx;
+    ctx->h_mailbox_bank[worker_id] = d_ctx;
 
-    // In GraphIOContext mode the graph kernel writes tx_flag_value (READY)
-    // to tx_flags from the GPU.  Set the in-flight marker BEFORE launch so
-    // the kernel's READY write is never clobbered by a late host write.
-    as_atomic_u64(config->tx_flags)[current_slot].store(
+    as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
         CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
     __sync_synchronize();
   } else {
-    config->h_mailbox_bank[worker_id] = data_dev;
+    ctx->h_mailbox_bank[worker_id] = data_dev;
   }
   __sync_synchronize();
 
   const size_t w = static_cast<size_t>(worker_id);
-  if (config->workers[w].pre_launch_fn)
-    config->workers[w].pre_launch_fn(config->workers[w].pre_launch_data,
-                                     data_dev, config->workers[w].stream);
-  cudaError_t err = cudaGraphLaunch(config->workers[w].graph_exec,
-                                    config->workers[w].stream);
+  if (ctx->workers[w].pre_launch_fn)
+    ctx->workers[w].pre_launch_fn(ctx->workers[w].pre_launch_data,
+                                     data_dev, ctx->workers[w].stream);
+  cudaError_t err = cudaGraphLaunch(ctx->workers[w].graph_exec,
+                                    ctx->workers[w].stream);
 
   if (err != cudaSuccess) {
     uint64_t error_val = CUDAQ_TX_FLAG_ERROR_TAG << 48 | (uint64_t)err;
-    as_atomic_u64(config->tx_flags)[current_slot].store(
+    as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
         error_val, cuda::std::memory_order_release);
-    as_atomic_u64(config->idle_mask)
+    as_atomic_u64(ctx->idle_mask)
         ->fetch_or(1ULL << worker_id, cuda::std::memory_order_release);
   } else {
-    if (config->workers[w].post_launch_fn)
-      config->workers[w].post_launch_fn(config->workers[w].post_launch_data,
-                                        data_dev, config->workers[w].stream);
-    if (config->io_ctxs_host == nullptr) {
-      as_atomic_u64(config->tx_flags)[current_slot].store(
+    if (ctx->workers[w].post_launch_fn)
+      ctx->workers[w].post_launch_fn(ctx->workers[w].post_launch_data,
+                                        data_dev, ctx->workers[w].stream);
+    if (ctx->io_ctxs_host == nullptr) {
+      as_atomic_u64(ctx->ringbuffer.tx_flags_host)[current_slot].store(
           CUDAQ_TX_FLAG_IN_FLIGHT, cuda::std::memory_order_release);
     }
   }
@@ -164,17 +164,18 @@ static void launch_graph_worker(const cudaq_host_dispatch_loop_ctx_t *config,
 } // anonymous namespace
 
 extern "C" void
-cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *config) {
+cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *ctx) {
   size_t current_slot = 0;
-  const size_t num_slots = config->num_slots;
+  const size_t num_slots = ctx->config.num_slots;
   uint64_t packets_dispatched = 0;
   const bool use_function_table =
-      (config->function_table != nullptr && config->function_table_count > 0);
+      (ctx->function_table.entries != nullptr && ctx->function_table.count > 0);
 
-  while (as_atomic_int(config->shutdown_flag)
+  while (as_atomic_int(ctx->shutdown_flag)
              ->load(cuda::std::memory_order_acquire) == 0) {
-    uint64_t rx_value = as_atomic_u64(config->rx_flags)[current_slot].load(
-        cuda::std::memory_order_acquire);
+    uint64_t rx_value =
+        as_atomic_u64(ctx->ringbuffer.rx_flags_host)[current_slot].load(
+            cuda::std::memory_order_acquire);
 
     if (rx_value == 0) {
       CUDAQ_REALTIME_CPU_RELAX();
@@ -187,9 +188,9 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *config) {
 
     // TODO: Remove non-function-table path; RPC framing is always required.
     if (use_function_table) {
-      ParsedSlot parsed = parse_slot_with_function_table(slot_host, config);
+      ParsedSlot parsed = parse_slot_with_function_table(slot_host, ctx);
       if (parsed.drop) {
-        as_atomic_u64(config->rx_flags)[current_slot].store(
+        as_atomic_u64(ctx->ringbuffer.rx_flags_host)[current_slot].store(
             0, cuda::std::memory_order_release);
         current_slot = (current_slot + 1) % num_slots;
         continue;
@@ -199,29 +200,29 @@ cudaq_host_dispatcher_loop(const cudaq_host_dispatch_loop_ctx_t *config) {
     }
 
     if (entry && entry->dispatch_mode != CUDAQ_DISPATCH_GRAPH_LAUNCH) {
-      as_atomic_u64(config->rx_flags)[current_slot].store(
+      as_atomic_u64(ctx->ringbuffer.rx_flags_host)[current_slot].store(
           0, cuda::std::memory_order_release);
       current_slot = (current_slot + 1) % num_slots;
       continue;
     }
 
     int worker_id =
-        acquire_graph_worker(config, use_function_table, entry, function_id);
+        acquire_graph_worker(ctx, use_function_table, entry, function_id);
     if (worker_id < 0) {
       CUDAQ_REALTIME_CPU_RELAX();
       continue;
     }
 
-    launch_graph_worker(config, worker_id, slot_host, current_slot);
-    finish_slot_and_advance(config, current_slot, num_slots,
+    launch_graph_worker(ctx, worker_id, slot_host, current_slot);
+    finish_slot_and_advance(ctx, current_slot, num_slots,
                             packets_dispatched);
   }
 
-  for (size_t i = 0; i < config->num_workers; ++i) {
-    cudaStreamSynchronize(config->workers[i].stream);
+  for (size_t i = 0; i < ctx->num_workers; ++i) {
+    cudaStreamSynchronize(ctx->workers[i].stream);
   }
 
-  if (config->stats_counter) {
-    *config->stats_counter = packets_dispatched;
+  if (ctx->stats_counter) {
+    *ctx->stats_counter = packets_dispatched;
   }
 }
diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu b/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu
@@ -147,37 +147,28 @@ extern "C" cudaq_host_dispatcher_handle_t *cudaq_host_dispatcher_start_thread(
   }
   handle->io_ctxs_pinned = io_ctxs_host_ptr;
 
-  cudaq_host_dispatch_loop_ctx_t host_config;
-  std::memset(&host_config, 0, sizeof(host_config));
-  host_config.rx_flags = (void *)(uintptr_t)ringbuffer->rx_flags_host;
-  host_config.tx_flags = (void *)(uintptr_t)ringbuffer->tx_flags_host;
-  host_config.rx_data_host = ringbuffer->rx_data_host;
-  host_config.rx_data_dev = ringbuffer->rx_data;
-  host_config.tx_data_host = ringbuffer->tx_data_host;
-  host_config.tx_data_dev = ringbuffer->tx_data;
-  host_config.tx_stride_sz = ringbuffer->tx_stride_sz;
-  host_config.h_mailbox_bank = handle->h_mailbox_bank;
-  host_config.num_slots = config->num_slots;
-  host_config.slot_size = config->slot_size;
-  host_config.workers = handle->workers;
-  host_config.num_workers = num_workers;
-  host_config.function_table = table->entries;
-  host_config.function_table_count = table->count;
+  cudaq_host_dispatch_loop_ctx_t ctx;
+  std::memset(&ctx, 0, sizeof(ctx));
+  ctx.ringbuffer = *ringbuffer;
+  ctx.config = *config;
+  ctx.function_table = *table;
+  ctx.workers = handle->workers;
+  ctx.num_workers = num_workers;
+  ctx.h_mailbox_bank = handle->h_mailbox_bank;
   // The C API takes volatile int* for ABI stability; internally the dispatch
   // loop accesses it via cuda::std::atomic<int>* for acquire semantics.
   // This is safe: cuda::std::atomic<int> is lock-free and layout-compatible
   // with int on all CUDA-supported platforms.
-  host_config.shutdown_flag = (void *)(uintptr_t)shutdown_flag;
-  host_config.stats_counter = stats;
-  host_config.live_dispatched = nullptr;
-  host_config.idle_mask = handle->idle_mask;
-  host_config.inflight_slot_tags = handle->inflight_slot_tags;
-  host_config.tx_flags_dev = ringbuffer->tx_flags;
-  host_config.io_ctxs_host = io_ctxs_host_ptr;
-  host_config.io_ctxs_dev = io_ctxs_dev_ptr;
+  ctx.shutdown_flag = (void *)(uintptr_t)shutdown_flag;
+  ctx.stats_counter = stats;
+  ctx.live_dispatched = nullptr;
+  ctx.idle_mask = handle->idle_mask;
+  ctx.inflight_slot_tags = handle->inflight_slot_tags;
+  ctx.io_ctxs_host = io_ctxs_host_ptr;
+  ctx.io_ctxs_dev = io_ctxs_dev_ptr;
 
   handle->thread = std::thread(
-      [cfg = host_config]() { cudaq_host_dispatcher_loop(&cfg); });
+      [cfg = ctx]() { cudaq_host_dispatcher_loop(&cfg); });
   return handle;
 }
 
diff --git a/realtime/unittests/test_host_dispatcher.cu b/realtime/unittests/test_host_dispatcher.cu
@@ -349,20 +349,21 @@ protected:
     idle_mask_->store((1ULL << workers_.size()) - 1,
                       cuda::std::memory_order_release);
 
-    config_.rx_flags = (void *)(uintptr_t)rx_flags_host_;
-    config_.tx_flags = (void *)(uintptr_t)tx_flags_host_;
-    config_.rx_data_host = rx_data_host_;
-    config_.rx_data_dev = rx_data_dev_;
-    config_.tx_data_host = tx_data_host_;
-    config_.tx_data_dev = tx_data_dev_;
-    config_.tx_stride_sz = slot_size_;
-    config_.h_mailbox_bank = h_mailbox_bank_;
-    config_.num_slots = num_slots_;
-    config_.slot_size = slot_size_;
+    config_.ringbuffer.rx_flags_host = rx_flags_host_;
+    config_.ringbuffer.tx_flags_host = tx_flags_host_;
+    config_.ringbuffer.rx_data_host = rx_data_host_;
+    config_.ringbuffer.rx_data = rx_data_dev_;
+    config_.ringbuffer.tx_data_host = tx_data_host_;
+    config_.ringbuffer.tx_data = tx_data_dev_;
+    config_.ringbuffer.tx_stride_sz = slot_size_;
+    config_.ringbuffer.tx_flags = tx_flags_dev_;
+    config_.config.num_slots = static_cast<uint32_t>(num_slots_);
+    config_.config.slot_size = static_cast<uint32_t>(slot_size_);
+    config_.function_table.entries = function_table_;
+    config_.function_table.count = static_cast<uint32_t>(function_table_count_);
     config_.workers = workers_.data();
     config_.num_workers = workers_.size();
-    config_.function_table = function_table_;
-    config_.function_table_count = function_table_count_;
+    config_.h_mailbox_bank = h_mailbox_bank_;
     config_.shutdown_flag = shutdown_flag_;
     config_.stats_counter = &stats_counter_;
     config_.live_dispatched = live_dispatched_;