@@ -131,20 +131,22 @@ libreCudaStatus_t NvCommandQueue::initializeQueue() {
131
131
)
132
132
);
133
133
timelineCtr++;
134
+ LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, COMPUTE));
134
135
}
135
136
LIBRECUDA_ERR_PROPAGATE (startExecution (COMPUTE));
136
- LIBRECUDA_ERR_PROPAGATE (awaitExecution (COMPUTE ));
137
+ LIBRECUDA_ERR_PROPAGATE (awaitExecution ());
137
138
138
139
// setup copy queue
139
140
{
140
141
LIBRECUDA_ERR_PROPAGATE (
141
142
enqueue (makeNvMethod (4 , NVC6C0_SET_OBJECT, 1 ), {AMPERE_DMA_COPY_B}, DMA)
142
143
);
143
144
timelineCtr++;
145
+ LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, DMA));
144
146
}
145
147
146
148
LIBRECUDA_ERR_PROPAGATE (startExecution (DMA));
147
- LIBRECUDA_ERR_PROPAGATE (awaitExecution (DMA ));
149
+ LIBRECUDA_ERR_PROPAGATE (awaitExecution ());
148
150
149
151
// allocate kernargs page
150
152
{
@@ -250,9 +252,10 @@ NvCommandQueue::~NvCommandQueue() {
250
252
gpuFree (ctx, reinterpret_cast <NvU64>(kernArgsPageVa));
251
253
}
252
254
253
- libreCudaStatus_t NvCommandQueue::awaitExecution (QueueType queueType ) {
255
+ libreCudaStatus_t NvCommandQueue::awaitExecution () {
254
256
LIBRECUDA_VALIDATE (timelineSignal != nullptr , LIBRECUDA_ERROR_NOT_INITIALIZED);
255
257
LIBRECUDA_ERR_PROPAGATE (signalWaitCpu (timelineSignal, timelineCtr));
258
+ kernArgsWriteIdx = 0 ; // reset kern args write idx
256
259
LIBRECUDA_SUCCEED ();
257
260
}
258
261
@@ -339,12 +342,9 @@ libreCudaStatus_t NvCommandQueue::signalWaitCpu(NvSignal *pSignal, NvU32 signalT
339
342
}
340
343
341
344
// Hands the commands recorded for `queueType` over to the GPU FIFO.
//
// This overload only submits; it neither enqueues a timeline signalNotify nor
// touches kernArgsWriteIdx. Any error from submitToFifo is propagated.
// NOTE(review): callers appear to enqueue signalNotify themselves right after
// bumping timelineCtr - confirm every enqueue site does so before relying on
// awaitExecution().
libreCudaStatus_t NvCommandQueue::startExecution(QueueType queueType) {
    LIBRECUDA_ERR_PROPAGATE(submitToFifo(queueType));
    LIBRECUDA_SUCCEED();
}
350
350
@@ -437,6 +437,8 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(NvU32 localMemReq) {
437
437
timelineCtr++;
438
438
}
439
439
440
+ // LIBRECUDA_ERR_PROPAGATE(signalWaitGpu(timelineSignal, timelineCtr));
441
+
440
442
LIBRECUDA_SUCCEED ();
441
443
}
442
444
@@ -448,10 +450,13 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
448
450
uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
449
451
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
450
452
uint32_t sharedMemBytes,
451
- void **params, size_t numParams) {
453
+ void **params, size_t numParams,
454
+ bool async) {
452
455
LIBRECUDA_VALIDATE (function != nullptr , LIBRECUDA_ERROR_INVALID_VALUE);
453
456
LIBRECUDA_VALIDATE (numParams == function->param_info .size (), LIBRECUDA_ERROR_INVALID_VALUE);
454
-
457
+ if (!async) {
458
+ LIBRECUDA_ERR_PROPAGATE (signalWaitGpu (timelineSignal, timelineCtr));
459
+ }
455
460
LIBRECUDA_ERR_PROPAGATE (ensureEnoughLocalMem (function->local_mem_req ));
456
461
457
462
if (dmaCommandBuffer.empty ()) {
@@ -673,6 +678,7 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
673
678
));
674
679
}
675
680
timelineCtr++;
681
+ LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, COMPUTE));
676
682
LIBRECUDA_SUCCEED ();
677
683
}
678
684
@@ -722,7 +728,7 @@ libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numByte
722
728
DMA
723
729
));
724
730
timelineCtr++;
725
-
731
+ LIBRECUDA_ERR_PROPAGATE ( signalNotify (timelineSignal, timelineCtr, DMA));
726
732
LIBRECUDA_SUCCEED ();
727
733
}
728
734
@@ -772,21 +778,42 @@ libreCudaStatus_t NvCommandQueue::startExecution() {
772
778
break ;
773
779
}
774
780
}
775
- LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, backlog_entry.timelineCtr , backlog_entry.queueType ));
776
781
LIBRECUDA_ERR_PROPAGATE (submitToFifo (backlog_entry.queueType ));
777
782
LIBRECUDA_ERR_PROPAGATE (signalWaitCpu (timelineSignal, backlog_entry.timelineCtr ));
778
783
}
779
784
commandBufBacklog.clear ();
780
785
} else {
781
786
if (!computeCommandBuffer.empty ()) {
782
787
LIBRECUDA_ERR_PROPAGATE (startExecution (COMPUTE));
783
- LIBRECUDA_ERR_PROPAGATE (awaitExecution (COMPUTE));
784
788
}
785
789
if (!dmaCommandBuffer.empty ()) {
786
790
LIBRECUDA_ERR_PROPAGATE (startExecution (DMA));
787
- LIBRECUDA_ERR_PROPAGATE (awaitExecution (DMA));
788
791
}
789
792
}
790
- kernArgsWriteIdx = 0 ; // reset kern args write idx
793
+ LIBRECUDA_SUCCEED ();
794
+ }
795
+
796
+ libreCudaStatus_t NvCommandQueue::signalWaitGpu (NvSignal *pSignal, NvU32 signalTarget) {
797
+ if (pSignal->value == signalTarget) {
798
+ // no need to wait, if cpu can confirm.
799
+
800
+ return LIBRECUDA_SUCCESS;
801
+ }
802
+ LIBRECUDA_ERR_PROPAGATE (enqueue (
803
+ makeNvMethod (0 , NVC56F_SEM_ADDR_LO, 5 ),
804
+ {
805
+ // little endian
806
+ U64_LO_32_BITS (pSignal),
807
+ U64_HI_32_BITS (pSignal),
808
+
809
+ // little endian
810
+ signalTarget,
811
+ 0 ,
812
+
813
+ (NVC56F_SEM_EXECUTE_OPERATION_ACQUIRE) |
814
+ (NVC56F_SEM_EXECUTE_PAYLOAD_SIZE_64BIT << 24 )
815
+ },
816
+ COMPUTE
817
+ ));
791
818
LIBRECUDA_SUCCEED ();
792
819
}
0 commit comments