Fix DMA chronological inconsistency

mikex86 · mikex86 · commit 2ed31f1b2d0d · 2024-09-01T15:05:07.000+02:00
diff --git a/driverapi/include/librecuda.h b/driverapi/include/librecuda.h
@@ -48,7 +48,7 @@ LIBRECUDA_EXPORT libreCudaStatus_t libreCuCtxGetCurrent(LibreCUcontext *pCtxOut)
 
 LIBRECUDA_EXPORT libreCudaStatus_t libreCuMemAlloc(void **pDevicePointer, size_t bytesize, bool mapToCpu = false);
 
-LIBRECUDA_EXPORT libreCudaStatus_t libreCuMemCpy(void *dst, void *src, size_t byteCount, LibreCUstream stream);
+LIBRECUDA_EXPORT libreCudaStatus_t libreCuMemCpy(void *dst, void *src, size_t byteCount, LibreCUstream stream, bool async = false);
 
 LIBRECUDA_EXPORT libreCudaStatus_t libreCuMemFree(void *devicePointer);
 
@@ -84,6 +84,7 @@ LIBRECUDA_EXPORT libreCudaStatus_t libreCuLaunchKernel(LibreCUFunction function,
                                                        void **kernelParams, size_t numParams,
                                                        void **extra,
                                                        bool async=false);
+
 /**
  * Submits the built up command buffer to the gpu.
  * Operations performed on streams fall into two types: "compute" (eg. launch kernel) and "dma".
diff --git a/driverapi/internal/cmdqueue.h b/driverapi/internal/cmdqueue.h
@@ -53,6 +53,7 @@ struct CommandBufSplit {
     std::vector<NvU32> commandBuffer{};
     QueueType queueType;
     NvU32 timelineCtr;
+    bool timelineNotifyPending;
 };
 
 class NvCommandQueue {
@@ -115,6 +116,13 @@ class NvCommandQueue {
      */
     NvU32 timelineCtr = 0;
 
+    /**
+     * Tracks whether a signalNotify() is still outstanding, i.e. the last command that incremented the
+     * timeline did NOT issue a signalNotify() itself (as is the case for async launches/copies).
+     * If this flag is true when the stream is commenced, a trailing signalNotify() has to be inserted.
+     * Otherwise this is not necessary, as the last COMPUTE/DMA command already issued it.
+     */
+    bool timelineNotifyPending = false;
+
     // TODO: To my knowledge there is no way to interleave COMPUTE and DMA queues with synchronization primitives.
     //  You can release semaphores on a DMA queue, but not acquire it. You need both for bi-directional sync.
     /**
@@ -180,7 +188,7 @@ class NvCommandQueue {
                    uint32_t sharedMemBytes,
                    void **params, size_t numParams, bool async);
 
-    libreCudaStatus_t gpuMemcpy(void *dst, void *src, size_t numBytes);
+    libreCudaStatus_t gpuMemcpy(void *dst, void *src, size_t numBytes, bool async);
 
 private:
 
diff --git a/driverapi/internal/memcopy.h b/driverapi/internal/memcopy.h
@@ -5,4 +5,4 @@
 
 libreCudaStatus_t loadMemcpyKernelsIfNeeded();
 
-libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream);
+libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream, bool async);
diff --git a/driverapi/src/cmdqueue.cpp b/driverapi/src/cmdqueue.cpp
@@ -435,8 +435,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function)
                 },
                 COMPUTE
         ));
-        timelineCtr++;
-        LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
     }
 
     LIBRECUDA_SUCCEED();
@@ -455,17 +453,6 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
     LIBRECUDA_VALIDATE(function != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
     LIBRECUDA_VALIDATE(numParams == function->param_info.size(), LIBRECUDA_ERROR_INVALID_VALUE);
 
-    bool local_mem_changed;
-    {
-        auto pre_ctr = timelineCtr;
-        LIBRECUDA_ERR_PROPAGATE(ensureEnoughLocalMem(function));
-        local_mem_changed = timelineCtr > pre_ctr;
-    }
-
-    if (!async || local_mem_changed) {
-        LIBRECUDA_ERR_PROPAGATE(signalWaitGpu(timelineSignal, timelineCtr));
-    }
-
     if (dmaCommandBuffer.empty()) {
         currentQueueType = COMPUTE;
     }
@@ -474,6 +461,18 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
         currentQueueType = COMPUTE;
     }
 
+    LIBRECUDA_ERR_PROPAGATE(ensureEnoughLocalMem(function));
+
+
+    if (!async && timelineNotifyPending) {
+        LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
+        timelineNotifyPending = false;
+    }
+
+    if (!async) {
+        LIBRECUDA_ERR_PROPAGATE(signalWaitGpu(timelineSignal, timelineCtr));
+    }
+
     // prepare constbuf0
     NvU32 constbuf0_data[88] = {};
     {
@@ -688,21 +687,29 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
     timelineCtr++;
     if (!async) {
         LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
+        timelineNotifyPending = false;
+    } else {
+        // when async, we cannot do a signalNotify! This prevents parallelism, so we only do one
+        // signalNotify at the end to advance the timelineSignal to the timelineCtr at once, which
+        // may have been incremented multiple times.
+        // timelineNotifyPending tells startExecution that we have to issue a signalNotify at the end
+        // because a previous async kernel launch will not have issued this.
+        // Without this, we would wait forever because the gpu would never modify the timelineSignal at all.
+        timelineNotifyPending = true;
     }
     LIBRECUDA_SUCCEED();
 }
 
 
-libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numBytes) {
+libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numBytes, bool async) {
     LIBRECUDA_VALIDATE(dst != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
     LIBRECUDA_VALIDATE(src != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
     LIBRECUDA_VALIDATE(numBytes < UINT32_MAX, LIBRECUDA_ERROR_INVALID_VALUE);
 
-    if (computeCommandBuffer.empty() && currentQueueType == COMPUTE) {
+    if (computeCommandBuffer.empty()) {
         currentQueueType = DMA;
     }
 
-    // sync with compute queue
     if (currentQueueType == COMPUTE) {
         backlogCurrentCmdBuffer(COMPUTE);
         currentQueueType = DMA;
@@ -738,11 +745,15 @@ libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numByte
             DMA
     ));
     timelineCtr++;
-    // TODO: THERE SEEM TO BE SERIOUS PROBLEMS WITH DMA CHRONOLOGY GIVEN THERE IS NO WAY TO WAIT FOR SEMAPHORES...
-    //  NEED MORE TESTING!
-    //  This signalNotify might also not be needed at all, try to design a similar async system as in COMPUTE
-    //  for DMA if possible..., else more CPU involvement is required for chronological DMA operations
-    LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
+
+    // same logic as for COMPUTE applies to DMA.
+    if (!async) {
+        LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
+        timelineNotifyPending = false;
+    } else {
+        timelineNotifyPending = true;
+    }
+
     LIBRECUDA_SUCCEED();
 }
 
@@ -755,7 +766,8 @@ libreCudaStatus_t NvCommandQueue::backlogCurrentCmdBuffer(QueueType queueType) {
             commandBufBacklog.push_back(CommandBufSplit{
                     .commandBuffer=computeCommandBuffer,
                     .queueType=COMPUTE,
-                    .timelineCtr=timelineCtr
+                    .timelineCtr=timelineCtr,
+                    .timelineNotifyPending=timelineNotifyPending
             });
             computeCommandBuffer.clear();
             break;
@@ -767,12 +779,14 @@ libreCudaStatus_t NvCommandQueue::backlogCurrentCmdBuffer(QueueType queueType) {
             commandBufBacklog.push_back(CommandBufSplit{
                     .commandBuffer=dmaCommandBuffer,
                     .queueType=DMA,
-                    .timelineCtr=timelineCtr
+                    .timelineCtr=timelineCtr,
+                    .timelineNotifyPending=timelineNotifyPending
             });
             dmaCommandBuffer.clear();
             break;
         }
     }
+    timelineNotifyPending = false;
     LIBRECUDA_SUCCEED();
 }
 
@@ -792,22 +806,35 @@ libreCudaStatus_t NvCommandQueue::startExecution() {
                     break;
                 }
             }
-            LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, backlog_entry.timelineCtr, backlog_entry.queueType));
+            if (backlog_entry.timelineNotifyPending) {
+                LIBRECUDA_ERR_PROPAGATE(
+                        signalNotify(timelineSignal, backlog_entry.timelineCtr, backlog_entry.queueType)
+                );
+            }
             LIBRECUDA_ERR_PROPAGATE(submitToFifo(backlog_entry.queueType));
             LIBRECUDA_ERR_PROPAGATE(signalWaitCpu(timelineSignal, backlog_entry.timelineCtr));
         }
         commandBufBacklog.clear();
     } else {
-        if (!computeCommandBuffer.empty()) {
+        if (currentQueueType == COMPUTE) {
             LIBRECUDA_VALIDATE(dmaCommandBuffer.empty(), LIBRECUDA_ERROR_UNKNOWN);
-            LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
+
+            // only issue signalNotify if last command didn't already do that
+            if (timelineNotifyPending) {
+                LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
+            }
             LIBRECUDA_ERR_PROPAGATE(startExecution(COMPUTE));
         }
-        if (!dmaCommandBuffer.empty()) {
+        if (currentQueueType == DMA) {
             LIBRECUDA_VALIDATE(computeCommandBuffer.empty(), LIBRECUDA_ERROR_UNKNOWN);
-            LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
+
+            // only issue signalNotify if last command didn't already do that
+            if (timelineNotifyPending) {
+                LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
+            }
             LIBRECUDA_ERR_PROPAGATE(startExecution(DMA));
         }
+        timelineNotifyPending = false;
     }
     LIBRECUDA_SUCCEED();
 }
diff --git a/driverapi/src/librecuda.cpp b/driverapi/src/librecuda.cpp
@@ -793,16 +793,16 @@ libreCudaStatus_t libreCuMemFree(void *devicePointer) {
     LIBRECUDA_SUCCEED();
 }
 
-libreCudaStatus_t libreCuMemCpy(void *dst, void *src, size_t byteCount, LibreCUstream stream) {
+libreCudaStatus_t libreCuMemCpy(void *dst, void *src, size_t byteCount, LibreCUstream stream, bool async) {
     LIBRECUDA_VALIDATE(dst != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
     LIBRECUDA_VALIDATE(src != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
     LIBRECUDA_VALIDATE(stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
     LIBRECUDA_ENSURE_CTX_VALID();
     if (isDevicePtr(dst) && isDevicePtr(src)) {
         // is d2d copy
-        memcpyD2D(dst, src, byteCount, stream);
+        memcpyD2D(dst, src, byteCount, stream, async);
     } else {
-        stream->command_queue->gpuMemcpy(dst, src, byteCount);
+        stream->command_queue->gpuMemcpy(dst, src, byteCount, async);
     }
     LIBRECUDA_SUCCEED();
 }
@@ -1386,6 +1386,7 @@ libreCudaStatus_t libreCuStreamCreate(LibreCUstream *pStreamOut, uint32_t flags)
     LIBRECUDA_SUCCEED();
 }
 
+
 libreCudaStatus_t libreCuStreamCommence(LibreCUstream stream) {
     LIBRECUDA_VALIDATE(stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
     LIBRECUDA_ERR_PROPAGATE(stream->command_queue->startExecution());
@@ -1498,4 +1499,4 @@ libreCudaStatus_t libreCuFuncSetAttribute(LibreCUFunction function, LibreCuFunct
         default: LIBRECUDA_FAIL(LIBRECUDA_ERROR_INVALID_VALUE);
     }
     LIBRECUDA_SUCCEED();
-}
+}
diff --git a/driverapi/src/memcopy.cpp b/driverapi/src/memcopy.cpp
@@ -32,11 +32,22 @@ libreCudaStatus_t loadMemcpyKernelsIfNeeded() {
     LIBRECUDA_SUCCEED();
 }
 
-libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream) {
+libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream, bool async) {
     uint32_t blockSizeX = MEMCPY_BLOCK_SIZE;
 
     bool use_high_bw = size > (1024 * 1024); // 1 MiB
-    if (use_high_bw) {
+
+
+    // Technically the three kernels can run concurrently, so that's what we want to do.
+    // If the entire memcpyD2D command is logically !async, only one of the kernels launched here is issued
+    // synchronously; has_synced tracks whether that has already happened so the remaining ones launch async.
+    bool launch_kernel_1 = use_high_bw;
+    bool launch_kernel_2 = !use_high_bw || size % MEMCPY_HIGHBW_BYTE_GRANULARITY != 0;
+    bool launch_kernel_3 = size % MEMCPY_FINISH_BYTE_GRANULARITY != 0;
+
+    bool has_synced = false;
+
+    if (launch_kernel_1) {
         auto gridSizeX = (size /
                           (MEMCPY_THREAD_COPYSIZE * MEMCPY_BLOCK_SIZE));
         auto gridSizeY = CEIL_DIV(gridSizeX, MAX_GRID_SIZE_X);
@@ -49,10 +60,14 @@ libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream str
                                     gridSizeX, gridSizeY, 1,
                                     blockSizeX, 1, 1,
                                     0, stream, params,
-                                    sizeof(params) / sizeof(void *), nullptr)
+                                    sizeof(params) / sizeof(void *), nullptr, async ? true : has_synced);
         );
+        if (!async) {
+            has_synced = true;
+        }
     }
-    if (!use_high_bw || size % MEMCPY_HIGHBW_BYTE_GRANULARITY != 0) {
+
+    if (launch_kernel_2) {
         size_t bytes_copied = use_high_bw ? ((size / MEMCPY_HIGHBW_BYTE_GRANULARITY) * MEMCPY_HIGHBW_BYTE_GRANULARITY)
                                           : 0;
         size_t bytes_remaining = size - bytes_copied;
@@ -69,11 +84,14 @@ libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream str
                                     gridSizeX, 1, 1,
                                     blockSizeX, 1, 1,
                                     0, stream, params,
-                                    sizeof(params) / sizeof(void *), nullptr)
+                                    sizeof(params) / sizeof(void *), nullptr, async ? true : has_synced)
         );
+        if (!async) {
+            has_synced = true;
+        }
     }
 
-    if (size % MEMCPY_FINISH_BYTE_GRANULARITY != 0) {
+    if (launch_kernel_3) {
         size_t bytes_copied = ((size / MEMCPY_HIGHBW_BYTE_GRANULARITY) * MEMCPY_HIGHBW_BYTE_GRANULARITY);
         size_t bytes_remaining = size - bytes_copied;
         auto gridSizeX = CEIL_DIV(bytes_remaining, MEMCPY_BLOCK_SIZE);
@@ -88,7 +106,7 @@ libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream str
                                     gridSizeX, 1, 1,
                                     blockSizeX, 1, 1,
                                     0, stream, params,
-                                    sizeof(params) / sizeof(void *), nullptr)
+                                    sizeof(params) / sizeof(void *), nullptr, async ? true : has_synced)
         );
     }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -2,4 +2,5 @@ add_subdirectory(write_float)
 add_subdirectory(memcopy)
 add_subdirectory(dynamic_shared_mem)
 add_subdirectory(compute_chronological_consistency)
-add_subdirectory(test_async_kernels)
+add_subdirectory(test_async_kernels)
+add_subdirectory(dma_chronological_consistency)
diff --git a/tests/dma_chronological_consistency/CMakeLists.txt b/tests/dma_chronological_consistency/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(
+        test_dma_chronological_consistency
+        main.cpp
+)
+target_link_libraries(
+        test_dma_chronological_consistency
+        PRIVATE
+        driverapi
+)
diff --git a/tests/dma_chronological_consistency/main.cpp b/tests/dma_chronological_consistency/main.cpp
diff --git a/tests/memcopy/main.cpp b/tests/memcopy/main.cpp
diff --git a/tests/test_async_kernels/main.cpp b/tests/test_async_kernels/main.cpp

Original file line number	Diff line number	Diff line change
`@@ -5,4 +5,4 @@`
`5`	`5`
`6`	`6`	`libreCudaStatus_t loadMemcpyKernelsIfNeeded();`
`7`	`7`
`8`		`-libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream);`
	`8`	`+libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream, bool async);`