@@ -435,8 +435,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function)
435
435
},
436
436
COMPUTE
437
437
));
438
- timelineCtr++;
439
- LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, COMPUTE));
440
438
}
441
439
442
440
LIBRECUDA_SUCCEED ();
@@ -455,17 +453,6 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
455
453
LIBRECUDA_VALIDATE (function != nullptr , LIBRECUDA_ERROR_INVALID_VALUE);
456
454
LIBRECUDA_VALIDATE (numParams == function->param_info .size (), LIBRECUDA_ERROR_INVALID_VALUE);
457
455
458
- bool local_mem_changed;
459
- {
460
- auto pre_ctr = timelineCtr;
461
- LIBRECUDA_ERR_PROPAGATE (ensureEnoughLocalMem (function));
462
- local_mem_changed = timelineCtr > pre_ctr;
463
- }
464
-
465
- if (!async || local_mem_changed) {
466
- LIBRECUDA_ERR_PROPAGATE (signalWaitGpu (timelineSignal, timelineCtr));
467
- }
468
-
469
456
if (dmaCommandBuffer.empty ()) {
470
457
currentQueueType = COMPUTE;
471
458
}
@@ -474,6 +461,18 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
474
461
currentQueueType = COMPUTE;
475
462
}
476
463
464
+ LIBRECUDA_ERR_PROPAGATE (ensureEnoughLocalMem (function));
465
+
466
+
467
+ if (!async && timelineNotifyPending) {
468
+ LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, COMPUTE));
469
+ timelineNotifyPending = false ;
470
+ }
471
+
472
+ if (!async) {
473
+ LIBRECUDA_ERR_PROPAGATE (signalWaitGpu (timelineSignal, timelineCtr));
474
+ }
475
+
477
476
// prepare constbuf0
478
477
NvU32 constbuf0_data[88 ] = {};
479
478
{
@@ -688,21 +687,29 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
688
687
timelineCtr++;
689
688
if (!async) {
690
689
LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, COMPUTE));
690
+ timelineNotifyPending = false ;
691
+ } else {
692
+ // when async, we must not issue a signalNotify per launch, as that would prevent parallelism, so we only do one
693
+ // signalNotify at the end to advance the timelineSignal to the timelineCtr at once, which
694
+ // may have been incremented multiple times.
695
+ // timelineNotifyPending tells startExecution that we have to issue a signalNotify at the end
696
+ // because a previous async kernel launch will not have issued this.
697
+ // Without this, we would wait forever because the GPU would never modify the timelineSignal at all.
698
+ timelineNotifyPending = true ;
691
699
}
692
700
LIBRECUDA_SUCCEED ();
693
701
}
694
702
695
703
696
- libreCudaStatus_t NvCommandQueue::gpuMemcpy (void *dst, void *src, size_t numBytes) {
704
+ libreCudaStatus_t NvCommandQueue::gpuMemcpy (void *dst, void *src, size_t numBytes, bool async ) {
697
705
LIBRECUDA_VALIDATE (dst != nullptr , LIBRECUDA_ERROR_INVALID_VALUE);
698
706
LIBRECUDA_VALIDATE (src != nullptr , LIBRECUDA_ERROR_INVALID_VALUE);
699
707
LIBRECUDA_VALIDATE (numBytes < UINT32_MAX, LIBRECUDA_ERROR_INVALID_VALUE);
700
708
701
- if (computeCommandBuffer.empty () && currentQueueType == COMPUTE ) {
709
+ if (computeCommandBuffer.empty ()) {
702
710
currentQueueType = DMA;
703
711
}
704
712
705
- // sync with compute queue
706
713
if (currentQueueType == COMPUTE) {
707
714
backlogCurrentCmdBuffer (COMPUTE);
708
715
currentQueueType = DMA;
@@ -738,11 +745,15 @@ libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numByte
738
745
DMA
739
746
));
740
747
timelineCtr++;
741
- // TODO: THERE SEEM TO BE SERIOUS PROBLEMS WITH DMA CHRONOLOGY GIVEN THERE IS NO WAY TO WAIT FOR SEMAPHORES...
742
- // NEED MORE TESTING!
743
- // This signalNotify might also not be needed at all, try to design a similar async system as in COMPUTE
744
- // for DMA if possible..., else more CPU involvement is required for chronological DMA operations
745
- LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, DMA));
748
+
749
+ // same logic as for COMPUTE applies to DMA: notify immediately only when not async, otherwise mark the notify as pending.
750
+ if (!async) {
751
+ LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, DMA));
752
+ timelineNotifyPending = false ;
753
+ } else {
754
+ timelineNotifyPending = true ;
755
+ }
756
+
746
757
LIBRECUDA_SUCCEED ();
747
758
}
748
759
@@ -755,7 +766,8 @@ libreCudaStatus_t NvCommandQueue::backlogCurrentCmdBuffer(QueueType queueType) {
755
766
commandBufBacklog.push_back (CommandBufSplit{
756
767
.commandBuffer =computeCommandBuffer,
757
768
.queueType =COMPUTE,
758
- .timelineCtr =timelineCtr
769
+ .timelineCtr =timelineCtr,
770
+ .timelineNotifyPending =timelineNotifyPending
759
771
});
760
772
computeCommandBuffer.clear ();
761
773
break ;
@@ -767,12 +779,14 @@ libreCudaStatus_t NvCommandQueue::backlogCurrentCmdBuffer(QueueType queueType) {
767
779
commandBufBacklog.push_back (CommandBufSplit{
768
780
.commandBuffer =dmaCommandBuffer,
769
781
.queueType =DMA,
770
- .timelineCtr =timelineCtr
782
+ .timelineCtr =timelineCtr,
783
+ .timelineNotifyPending =timelineNotifyPending
771
784
});
772
785
dmaCommandBuffer.clear ();
773
786
break ;
774
787
}
775
788
}
789
+ timelineNotifyPending = false ;
776
790
LIBRECUDA_SUCCEED ();
777
791
}
778
792
@@ -792,22 +806,35 @@ libreCudaStatus_t NvCommandQueue::startExecution() {
792
806
break ;
793
807
}
794
808
}
795
- LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, backlog_entry.timelineCtr , backlog_entry.queueType ));
809
+ if (backlog_entry.timelineNotifyPending ) {
810
+ LIBRECUDA_ERR_PROPAGATE (
811
+ signalNotify (timelineSignal, backlog_entry.timelineCtr , backlog_entry.queueType )
812
+ );
813
+ }
796
814
LIBRECUDA_ERR_PROPAGATE (submitToFifo (backlog_entry.queueType ));
797
815
LIBRECUDA_ERR_PROPAGATE (signalWaitCpu (timelineSignal, backlog_entry.timelineCtr ));
798
816
}
799
817
commandBufBacklog.clear ();
800
818
} else {
801
- if (!computeCommandBuffer. empty () ) {
819
+ if (currentQueueType == COMPUTE ) {
802
820
LIBRECUDA_VALIDATE (dmaCommandBuffer.empty (), LIBRECUDA_ERROR_UNKNOWN);
803
- LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, COMPUTE));
821
+
822
+ // only issue signalNotify if last command didn't already do that
823
+ if (timelineNotifyPending) {
824
+ LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, COMPUTE));
825
+ }
804
826
LIBRECUDA_ERR_PROPAGATE (startExecution (COMPUTE));
805
827
}
806
- if (!dmaCommandBuffer. empty () ) {
828
+ if (currentQueueType == DMA ) {
807
829
LIBRECUDA_VALIDATE (computeCommandBuffer.empty (), LIBRECUDA_ERROR_UNKNOWN);
808
- LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, DMA));
830
+
831
+ // only issue signalNotify if last command didn't already do that
832
+ if (timelineNotifyPending) {
833
+ LIBRECUDA_ERR_PROPAGATE (signalNotify (timelineSignal, timelineCtr, DMA));
834
+ }
809
835
LIBRECUDA_ERR_PROPAGATE (startExecution (DMA));
810
836
}
837
+ timelineNotifyPending = false ;
811
838
}
812
839
LIBRECUDA_SUCCEED ();
813
840
}
0 commit comments