Skip to content

Commit b6a9ef0

Browse files
committed
fix chronological consistency in compute queue with explicit wait
1 parent a6b7a90 commit b6a9ef0

File tree

12 files changed

+69
-29
lines changed

12 files changed

+69
-29
lines changed

driverapi/include/librecuda.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,8 @@ LIBRECUDA_EXPORT libreCudaStatus_t libreCuLaunchKernel(LibreCUFunction function,
8282
uint32_t blockDimZ,
8383
uint32_t sharedMemBytes, LibreCUstream stream,
8484
void **kernelParams, size_t numParams,
85-
void **extra);
85+
void **extra,
86+
bool async=false);
8687
/**
8788
* Submits the built up command buffer to the gpu.
8889
* Operations performed on streams fall into two types: "compute" (eg. launch kernel) and "dma".

driverapi/internal/cmdqueue.h

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,7 @@ class NvCommandQueue {
173173
/**
174174
* Waits for the pending operations in the currently executing command queue to complete
175175
*/
176-
libreCudaStatus_t awaitExecution(QueueType queueType);
176+
libreCudaStatus_t awaitExecution();
177177

178178
~NvCommandQueue();
179179

@@ -184,7 +184,7 @@ class NvCommandQueue {
184184
uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
185185
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
186186
uint32_t sharedMemBytes,
187-
void **params, size_t numParams);
187+
void **params, size_t numParams, bool async);
188188

189189
libreCudaStatus_t gpuMemcpy(void *dst, void *src, size_t numBytes);
190190

@@ -200,6 +200,14 @@ class NvCommandQueue {
200200

201201
libreCudaStatus_t signalWaitCpu(NvSignal *pSignal, NvU32 signalTarget);
202202

203+
/**
204+
* waits on the gpu compute queue until the signal reaches the specified target
205+
* @param pSignal the signal
206+
* @param signalTarget the target value
207+
* @return status
208+
*/
209+
libreCudaStatus_t signalWaitGpu(NvSignal *pSignal, NvU32 signalTarget);
210+
203211
libreCudaStatus_t submitToFifo(QueueType type);
204212

205213
libreCudaStatus_t allocKernArgs(NvU64 *pMemOut, size_t size);

driverapi/src/cmdqueue.cpp

Lines changed: 40 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -131,20 +131,22 @@ libreCudaStatus_t NvCommandQueue::initializeQueue() {
131131
)
132132
);
133133
timelineCtr++;
134+
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
134135
}
135136
LIBRECUDA_ERR_PROPAGATE(startExecution(COMPUTE));
136-
LIBRECUDA_ERR_PROPAGATE(awaitExecution(COMPUTE));
137+
LIBRECUDA_ERR_PROPAGATE(awaitExecution());
137138

138139
// setup copy queue
139140
{
140141
LIBRECUDA_ERR_PROPAGATE(
141142
enqueue(makeNvMethod(4, NVC6C0_SET_OBJECT, 1), {AMPERE_DMA_COPY_B}, DMA)
142143
);
143144
timelineCtr++;
145+
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
144146
}
145147

146148
LIBRECUDA_ERR_PROPAGATE(startExecution(DMA));
147-
LIBRECUDA_ERR_PROPAGATE(awaitExecution(DMA));
149+
LIBRECUDA_ERR_PROPAGATE(awaitExecution());
148150

149151
// allocate kernargs page
150152
{
@@ -250,9 +252,10 @@ NvCommandQueue::~NvCommandQueue() {
250252
gpuFree(ctx, reinterpret_cast<NvU64>(kernArgsPageVa));
251253
}
252254

253-
libreCudaStatus_t NvCommandQueue::awaitExecution(QueueType queueType) {
255+
libreCudaStatus_t NvCommandQueue::awaitExecution() {
254256
LIBRECUDA_VALIDATE(timelineSignal != nullptr, LIBRECUDA_ERROR_NOT_INITIALIZED);
255257
LIBRECUDA_ERR_PROPAGATE(signalWaitCpu(timelineSignal, timelineCtr));
258+
kernArgsWriteIdx = 0; // reset kern args write idx
256259
LIBRECUDA_SUCCEED();
257260
}
258261

@@ -339,12 +342,9 @@ libreCudaStatus_t NvCommandQueue::signalWaitCpu(NvSignal *pSignal, NvU32 signalT
339342
}
340343

341344
libreCudaStatus_t NvCommandQueue::startExecution(QueueType queueType) {
342-
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, queueType));
343345
LIBRECUDA_ERR_PROPAGATE(submitToFifo(
344346
queueType
345347
));
346-
347-
kernArgsWriteIdx = 0; // reset kern args write idx
348348
LIBRECUDA_SUCCEED();
349349
}
350350

@@ -437,6 +437,8 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(NvU32 localMemReq) {
437437
timelineCtr++;
438438
}
439439

440+
//LIBRECUDA_ERR_PROPAGATE(signalWaitGpu(timelineSignal, timelineCtr));
441+
440442
LIBRECUDA_SUCCEED();
441443
}
442444

@@ -448,10 +450,13 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
448450
uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
449451
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
450452
uint32_t sharedMemBytes,
451-
void **params, size_t numParams) {
453+
void **params, size_t numParams,
454+
bool async) {
452455
LIBRECUDA_VALIDATE(function != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
453456
LIBRECUDA_VALIDATE(numParams == function->param_info.size(), LIBRECUDA_ERROR_INVALID_VALUE);
454-
457+
if (!async) {
458+
LIBRECUDA_ERR_PROPAGATE(signalWaitGpu(timelineSignal, timelineCtr));
459+
}
455460
LIBRECUDA_ERR_PROPAGATE(ensureEnoughLocalMem(function->local_mem_req));
456461

457462
if (dmaCommandBuffer.empty()) {
@@ -673,6 +678,7 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
673678
));
674679
}
675680
timelineCtr++;
681+
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
676682
LIBRECUDA_SUCCEED();
677683
}
678684

@@ -722,7 +728,7 @@ libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numByte
722728
DMA
723729
));
724730
timelineCtr++;
725-
731+
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
726732
LIBRECUDA_SUCCEED();
727733
}
728734

@@ -772,21 +778,42 @@ libreCudaStatus_t NvCommandQueue::startExecution() {
772778
break;
773779
}
774780
}
775-
LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, backlog_entry.timelineCtr, backlog_entry.queueType));
776781
LIBRECUDA_ERR_PROPAGATE(submitToFifo(backlog_entry.queueType));
777782
LIBRECUDA_ERR_PROPAGATE(signalWaitCpu(timelineSignal, backlog_entry.timelineCtr));
778783
}
779784
commandBufBacklog.clear();
780785
} else {
781786
if (!computeCommandBuffer.empty()) {
782787
LIBRECUDA_ERR_PROPAGATE(startExecution(COMPUTE));
783-
LIBRECUDA_ERR_PROPAGATE(awaitExecution(COMPUTE));
784788
}
785789
if (!dmaCommandBuffer.empty()) {
786790
LIBRECUDA_ERR_PROPAGATE(startExecution(DMA));
787-
LIBRECUDA_ERR_PROPAGATE(awaitExecution(DMA));
788791
}
789792
}
790-
kernArgsWriteIdx = 0; // reset kern args write idx
793+
LIBRECUDA_SUCCEED();
794+
}
795+
796+
libreCudaStatus_t NvCommandQueue::signalWaitGpu(NvSignal *pSignal, NvU32 signalTarget) {
797+
if (pSignal->value == signalTarget) {
798+
// no need to wait, if cpu can confirm.
799+
800+
return LIBRECUDA_SUCCESS;
801+
}
802+
LIBRECUDA_ERR_PROPAGATE(enqueue(
803+
makeNvMethod(0, NVC56F_SEM_ADDR_LO, 5),
804+
{
805+
// little endian
806+
U64_LO_32_BITS(pSignal),
807+
U64_HI_32_BITS(pSignal),
808+
809+
// little endian
810+
signalTarget,
811+
0,
812+
813+
(NVC56F_SEM_EXECUTE_OPERATION_ACQUIRE) |
814+
(NVC56F_SEM_EXECUTE_PAYLOAD_SIZE_64BIT << 24)
815+
},
816+
COMPUTE
817+
));
791818
LIBRECUDA_SUCCEED();
792819
}

driverapi/src/librecuda.cpp

Lines changed: 14 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1336,16 +1336,21 @@ libreCudaStatus_t libreCuLaunchKernel(LibreCUFunction function,
13361336
uint32_t sharedMemBytes,
13371337
LibreCUstream stream,
13381338
void **kernelParams, size_t numParams,
1339-
void **extra) {
1339+
void **extra,
1340+
bool async) {
13401341
LIBRECUDA_VALIDATE(function != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
13411342
LIBRECUDA_VALIDATE(stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
1342-
LIBRECUDA_ERR_PROPAGATE(stream->command_queue->launchFunction(function,
1343-
gridDimX, gridDimY, gridDimZ,
1344-
blockDimX, blockDimY, blockDimZ,
1345-
sharedMemBytes,
1346-
kernelParams,
1347-
numParams
1348-
));
1343+
LIBRECUDA_ERR_PROPAGATE(
1344+
stream->command_queue->launchFunction(
1345+
function,
1346+
gridDimX, gridDimY, gridDimZ,
1347+
blockDimX, blockDimY, blockDimZ,
1348+
sharedMemBytes,
1349+
kernelParams,
1350+
numParams,
1351+
async
1352+
)
1353+
);
13491354
LIBRECUDA_SUCCEED();
13501355
}
13511356

@@ -1385,8 +1390,7 @@ libreCudaStatus_t libreCuStreamCommence(LibreCUstream stream) {
13851390

13861391
libreCudaStatus_t libreCuStreamAwait(LibreCUstream stream) {
13871392
LIBRECUDA_VALIDATE(stream != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
1388-
LIBRECUDA_ERR_PROPAGATE(stream->command_queue->awaitExecution(COMPUTE));
1389-
LIBRECUDA_ERR_PROPAGATE(stream->command_queue->awaitExecution(DMA));
1393+
LIBRECUDA_ERR_PROPAGATE(stream->command_queue->awaitExecution());
13901394
LIBRECUDA_SUCCEED();
13911395
}
13921396

tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
add_subdirectory(write_float)
22
add_subdirectory(memcopy)
33
add_subdirectory(dynamic_shared_mem)
4-
add_subdirectory(compute_dma_iterleave)
4+
add_subdirectory(compute_chronological_consistency)

tests/compute_dma_iterleave/CMakeLists.txt renamed to tests/compute_chronological_consistency/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
add_executable(
2-
test_compute_dma_interleave
2+
test_chronological_consistency
33
main.cpp
44
)
55
target_link_libraries(
6-
test_compute_dma_interleave
6+
test_chronological_consistency
77
PRIVATE
88
driverapi
99
)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)