Skip to content

Commit 2ed31f1

Browse files
committed
Fix DMA chronological inconsistency
1 parent 86c56c3 commit 2ed31f1

File tree

11 files changed

+233
-45
lines changed

11 files changed

+233
-45
lines changed

driverapi/include/librecuda.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ LIBRECUDA_EXPORT libreCudaStatus_t libreCuCtxGetCurrent(LibreCUcontext *pCtxOut)
4848

4949
LIBRECUDA_EXPORT libreCudaStatus_t libreCuMemAlloc(void **pDevicePointer, size_t bytesize, bool mapToCpu = false);
5050

51-
LIBRECUDA_EXPORT libreCudaStatus_t libreCuMemCpy(void *dst, void *src, size_t byteCount, LibreCUstream stream);
51+
LIBRECUDA_EXPORT libreCudaStatus_t libreCuMemCpy(void *dst, void *src, size_t byteCount, LibreCUstream stream, bool async = false);
5252

5353
LIBRECUDA_EXPORT libreCudaStatus_t libreCuMemFree(void *devicePointer);
5454

@@ -84,6 +84,7 @@ LIBRECUDA_EXPORT libreCudaStatus_t libreCuLaunchKernel(LibreCUFunction function,
8484
void **kernelParams, size_t numParams,
8585
void **extra,
8686
bool async=false);
87+
8788
/**
8889
* Submits the built up command buffer to the gpu.
8990
* Operations performed on streams fall into two types: "compute" (eg. launch kernel) and "dma".

driverapi/internal/cmdqueue.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ struct CommandBufSplit {
5353
std::vector<NvU32> commandBuffer{};
5454
QueueType queueType;
5555
NvU32 timelineCtr;
56+
bool timelineNotifyPending;
5657
};
5758

5859
class NvCommandQueue {
@@ -115,6 +116,13 @@ class NvCommandQueue {
115116
*/
116117
NvU32 timelineCtr = 0;
117118

119+
/**
120+
* State whether the last command that incremented the timeline also issued a signalNotify() command.
121+
* If this flag is true and the stream is commenced, a trailing signalNotify() has to be inserted.
122+
* Otherwise this is not necessary as the last COMPUTE/DMA command already issued it.
123+
*/
124+
bool timelineNotifyPending = false;
125+
118126
// TODO: To my knowledge there is no way to interleave COMPUTE and DMA queues with synchronization primitives.
119127
// You can release semaphores on a DMA queue, but not acquire it. You need both for bi-directional sync.
120128
/**
@@ -180,7 +188,7 @@ class NvCommandQueue {
180188
uint32_t sharedMemBytes,
181189
void **params, size_t numParams, bool async);
182190

183-
libreCudaStatus_t gpuMemcpy(void *dst, void *src, size_t numBytes);
191+
libreCudaStatus_t gpuMemcpy(void *dst, void *src, size_t numBytes, bool async);
184192

185193
private:
186194

driverapi/internal/memcopy.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@
55

66
libreCudaStatus_t loadMemcpyKernelsIfNeeded();
77

8-
libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream);
8+
libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream, bool async);

driverapi/src/cmdqueue.cpp

Lines changed: 55 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -435,8 +435,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function)
435435
},
436436
COMPUTE
437437
));
438-
timelineCtr++;
439-
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
440438
}
441439

442440
LIBRECUDA_SUCCEED();
@@ -455,17 +453,6 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
455453
LIBRECUDA_VALIDATE(function != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
456454
LIBRECUDA_VALIDATE(numParams == function->param_info.size(), LIBRECUDA_ERROR_INVALID_VALUE);
457455

458-
bool local_mem_changed;
459-
{
460-
auto pre_ctr = timelineCtr;
461-
LIBRECUDA_ERR_PROPAGATE(ensureEnoughLocalMem(function));
462-
local_mem_changed = timelineCtr > pre_ctr;
463-
}
464-
465-
if (!async || local_mem_changed) {
466-
LIBRECUDA_ERR_PROPAGATE(signalWaitGpu(timelineSignal, timelineCtr));
467-
}
468-
469456
if (dmaCommandBuffer.empty()) {
470457
currentQueueType = COMPUTE;
471458
}
@@ -474,6 +461,18 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
474461
currentQueueType = COMPUTE;
475462
}
476463

464+
LIBRECUDA_ERR_PROPAGATE(ensureEnoughLocalMem(function));
465+
466+
467+
if (!async && timelineNotifyPending) {
468+
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
469+
timelineNotifyPending = false;
470+
}
471+
472+
if (!async) {
473+
LIBRECUDA_ERR_PROPAGATE(signalWaitGpu(timelineSignal, timelineCtr));
474+
}
475+
477476
// prepare constbuf0
478477
NvU32 constbuf0_data[88] = {};
479478
{
@@ -688,21 +687,29 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
688687
timelineCtr++;
689688
if (!async) {
690689
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
690+
timelineNotifyPending = false;
691+
} else {
692+
// when async, we cannot do a signalNotify! This prevents parallelism, so we only do one
693+
// signalNotify at the end to advance the timelineSignal to the timelineCtr at once, which
694+
// may have been incremented multiple times.
695+
// timelineNotifyPending tells startExecution that we have to issue a signalNotify at the end
696+
// because a previous async kernel launch will not have issued this.
697+
// Without this, we would wait forever because the gpu would never modify the timelineSignal at all.
698+
timelineNotifyPending = true;
691699
}
692700
LIBRECUDA_SUCCEED();
693701
}
694702

695703

696-
libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numBytes) {
704+
libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numBytes, bool async) {
697705
LIBRECUDA_VALIDATE(dst != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
698706
LIBRECUDA_VALIDATE(src != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
699707
LIBRECUDA_VALIDATE(numBytes < UINT32_MAX, LIBRECUDA_ERROR_INVALID_VALUE);
700708

701-
if (computeCommandBuffer.empty() && currentQueueType == COMPUTE) {
709+
if (computeCommandBuffer.empty()) {
702710
currentQueueType = DMA;
703711
}
704712

705-
// sync with compute queue
706713
if (currentQueueType == COMPUTE) {
707714
backlogCurrentCmdBuffer(COMPUTE);
708715
currentQueueType = DMA;
@@ -738,11 +745,15 @@ libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numByte
738745
DMA
739746
));
740747
timelineCtr++;
741-
// TODO: THERE SEEM TO BE SERIOUS PROBLEMS WITH DMA CHRONOLOGY GIVEN THERE IS NO WAY TO WAIT FOR SEMAPHORES...
742-
// NEED MORE TESTING!
743-
// This signalNotify might also not be needed at all, try to design a similar async system as in COMPUTE
744-
// for DMA if possible..., else more CPU involvement is required for chronological DMA operations
745-
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
748+
749+
// same logic as for COMPUTE applies to DMA.
750+
if (!async) {
751+
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
752+
timelineNotifyPending = false;
753+
} else {
754+
timelineNotifyPending = true;
755+
}
756+
746757
LIBRECUDA_SUCCEED();
747758
}
748759

@@ -755,7 +766,8 @@ libreCudaStatus_t NvCommandQueue::backlogCurrentCmdBuffer(QueueType queueType) {
755766
commandBufBacklog.push_back(CommandBufSplit{
756767
.commandBuffer=computeCommandBuffer,
757768
.queueType=COMPUTE,
758-
.timelineCtr=timelineCtr
769+
.timelineCtr=timelineCtr,
770+
.timelineNotifyPending=timelineNotifyPending
759771
});
760772
computeCommandBuffer.clear();
761773
break;
@@ -767,12 +779,14 @@ libreCudaStatus_t NvCommandQueue::backlogCurrentCmdBuffer(QueueType queueType) {
767779
commandBufBacklog.push_back(CommandBufSplit{
768780
.commandBuffer=dmaCommandBuffer,
769781
.queueType=DMA,
770-
.timelineCtr=timelineCtr
782+
.timelineCtr=timelineCtr,
783+
.timelineNotifyPending=timelineNotifyPending
771784
});
772785
dmaCommandBuffer.clear();
773786
break;
774787
}
775788
}
789+
timelineNotifyPending = false;
776790
LIBRECUDA_SUCCEED();
777791
}
778792

@@ -792,22 +806,35 @@ libreCudaStatus_t NvCommandQueue::startExecution() {
792806
break;
793807
}
794808
}
795-
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, backlog_entry.timelineCtr, backlog_entry.queueType));
809+
if (backlog_entry.timelineNotifyPending) {
810+
LIBRECUDA_ERR_PROPAGATE(
811+
signalNotify(timelineSignal, backlog_entry.timelineCtr, backlog_entry.queueType)
812+
);
813+
}
796814
LIBRECUDA_ERR_PROPAGATE(submitToFifo(backlog_entry.queueType));
797815
LIBRECUDA_ERR_PROPAGATE(signalWaitCpu(timelineSignal, backlog_entry.timelineCtr));
798816
}
799817
commandBufBacklog.clear();
800818
} else {
801-
if (!computeCommandBuffer.empty()) {
819+
if (currentQueueType == COMPUTE) {
802820
LIBRECUDA_VALIDATE(dmaCommandBuffer.empty(), LIBRECUDA_ERROR_UNKNOWN);
803-
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
821+
822+
// only issue signalNotify if last command didn't already do that
823+
if (timelineNotifyPending) {
824+
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
825+
}
804826
LIBRECUDA_ERR_PROPAGATE(startExecution(COMPUTE));
805827
}
806-
if (!dmaCommandBuffer.empty()) {
828+
if (currentQueueType == DMA) {
807829
LIBRECUDA_VALIDATE(computeCommandBuffer.empty(), LIBRECUDA_ERROR_UNKNOWN);
808-
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
830+
831+
// only issue signalNotify if last command didn't already do that
832+
if (timelineNotifyPending) {
833+
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
834+
}
809835
LIBRECUDA_ERR_PROPAGATE(startExecution(DMA));
810836
}
837+
timelineNotifyPending = false;
811838
}
812839
LIBRECUDA_SUCCEED();
813840
}

driverapi/src/librecuda.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -793,16 +793,16 @@ libreCudaStatus_t libreCuMemFree(void *devicePointer) {
793793
LIBRECUDA_SUCCEED();
794794
}
795795

796-
libreCudaStatus_t libreCuMemCpy(void *dst, void *src, size_t byteCount, LibreCUstream stream) {
796+
libreCudaStatus_t libreCuMemCpy(void *dst, void *src, size_t byteCount, LibreCUstream stream, bool async) {
797797
LIBRECUDA_VALIDATE(dst != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
798798
LIBRECUDA_VALIDATE(src != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
799799
LIBRECUDA_VALIDATE(stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
800800
LIBRECUDA_ENSURE_CTX_VALID();
801801
if (isDevicePtr(dst) && isDevicePtr(src)) {
802802
// is d2d copy
803-
memcpyD2D(dst, src, byteCount, stream);
803+
memcpyD2D(dst, src, byteCount, stream, async);
804804
} else {
805-
stream->command_queue->gpuMemcpy(dst, src, byteCount);
805+
stream->command_queue->gpuMemcpy(dst, src, byteCount, async);
806806
}
807807
LIBRECUDA_SUCCEED();
808808
}
@@ -1386,6 +1386,7 @@ libreCudaStatus_t libreCuStreamCreate(LibreCUstream *pStreamOut, uint32_t flags)
13861386
LIBRECUDA_SUCCEED();
13871387
}
13881388

1389+
13891390
libreCudaStatus_t libreCuStreamCommence(LibreCUstream stream) {
13901391
LIBRECUDA_VALIDATE(stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
13911392
LIBRECUDA_ERR_PROPAGATE(stream->command_queue->startExecution());
@@ -1498,4 +1499,4 @@ libreCudaStatus_t libreCuFuncSetAttribute(LibreCUFunction function, LibreCuFunct
14981499
default: LIBRECUDA_FAIL(LIBRECUDA_ERROR_INVALID_VALUE);
14991500
}
15001501
LIBRECUDA_SUCCEED();
1501-
}
1502+
}

driverapi/src/memcopy.cpp

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,22 @@ libreCudaStatus_t loadMemcpyKernelsIfNeeded() {
3232
LIBRECUDA_SUCCEED();
3333
}
3434

35-
libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream) {
35+
libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream, bool async) {
3636
uint32_t blockSizeX = MEMCPY_BLOCK_SIZE;
3737

3838
bool use_high_bw = size > (1024 * 1024); // 1 MiB
39-
if (use_high_bw) {
39+
40+
41+
// technically the three kernels can run concurrently, so that's what we want to do.
42+
// if the entire memcpyD2D command is logically !asymc, we only want to sync at the last kernel of the three
43+
// that runs
44+
bool launch_kernel_1 = use_high_bw;
45+
bool launch_kernel_2 = !use_high_bw || size % MEMCPY_HIGHBW_BYTE_GRANULARITY != 0;
46+
bool launch_kernel_3 = size % MEMCPY_FINISH_BYTE_GRANULARITY != 0;
47+
48+
bool has_synced = false;
49+
50+
if (launch_kernel_1) {
4051
auto gridSizeX = (size /
4152
(MEMCPY_THREAD_COPYSIZE * MEMCPY_BLOCK_SIZE));
4253
auto gridSizeY = CEIL_DIV(gridSizeX, MAX_GRID_SIZE_X);
@@ -49,10 +60,14 @@ libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream str
4960
gridSizeX, gridSizeY, 1,
5061
blockSizeX, 1, 1,
5162
0, stream, params,
52-
sizeof(params) / sizeof(void *), nullptr)
63+
sizeof(params) / sizeof(void *), nullptr, async ? true : has_synced);
5364
);
65+
if (!async) {
66+
has_synced = true;
67+
}
5468
}
55-
if (!use_high_bw || size % MEMCPY_HIGHBW_BYTE_GRANULARITY != 0) {
69+
70+
if (launch_kernel_2) {
5671
size_t bytes_copied = use_high_bw ? ((size / MEMCPY_HIGHBW_BYTE_GRANULARITY) * MEMCPY_HIGHBW_BYTE_GRANULARITY)
5772
: 0;
5873
size_t bytes_remaining = size - bytes_copied;
@@ -69,11 +84,14 @@ libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream str
6984
gridSizeX, 1, 1,
7085
blockSizeX, 1, 1,
7186
0, stream, params,
72-
sizeof(params) / sizeof(void *), nullptr)
87+
sizeof(params) / sizeof(void *), nullptr, async ? true : has_synced)
7388
);
89+
if (!async) {
90+
has_synced = true;
91+
}
7492
}
7593

76-
if (size % MEMCPY_FINISH_BYTE_GRANULARITY != 0) {
94+
if (launch_kernel_3) {
7795
size_t bytes_copied = ((size / MEMCPY_HIGHBW_BYTE_GRANULARITY) * MEMCPY_HIGHBW_BYTE_GRANULARITY);
7896
size_t bytes_remaining = size - bytes_copied;
7997
auto gridSizeX = CEIL_DIV(bytes_remaining, MEMCPY_BLOCK_SIZE);
@@ -88,7 +106,7 @@ libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream str
88106
gridSizeX, 1, 1,
89107
blockSizeX, 1, 1,
90108
0, stream, params,
91-
sizeof(params) / sizeof(void *), nullptr)
109+
sizeof(params) / sizeof(void *), nullptr, async ? true : has_synced)
92110
);
93111
}
94112

tests/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ add_subdirectory(write_float)
22
add_subdirectory(memcopy)
33
add_subdirectory(dynamic_shared_mem)
44
add_subdirectory(compute_chronological_consistency)
5-
add_subdirectory(test_async_kernels)
5+
add_subdirectory(test_async_kernels)
6+
add_subdirectory(dma_chronological_consistency)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
add_executable(
2+
test_dma_chronological_consistency
3+
main.cpp
4+
)
5+
target_link_libraries(
6+
test_dma_chronological_consistency
7+
PRIVATE
8+
driverapi
9+
)

0 commit comments

Comments
 (0)