
Commit 86c56c3

No signalNotify() for async kernel launches, as it destroys parallelism
1 parent 1f6566c commit 86c56c3
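
In short: the async flag of libreCuLaunchKernel() now controls whether a per-launch timeline fence is recorded. A condensed usage sketch, based on the new tests/test_async_kernels/main.cpp added in this commit (CUDA_CHECK, func, stream and the two destination pointers are set up earlier in that test):

```cpp
// Five launches are queued with async = true, so launchFunction() records no
// per-launch signalNotify() and the kernels may overlap on the GPU. The timeline
// notify is recorded once when the stream is commenced, and libreCuStreamAwait()
// waits on that signal from the CPU.
void *params[] = {&float_dst_compute_va, &float_dst_dma_va};
for (int i = 0; i < 5; i++) {
    CUDA_CHECK(libreCuLaunchKernel(func,
                                   1, 1, 1,   // grid dimensions
                                   1, 1, 1,   // block dimensions
                                   0,         // dynamic shared memory bytes
                                   stream,
                                   params, sizeof(params) / sizeof(void *),
                                   nullptr,
                                   true));    // async launch
}
CUDA_CHECK(libreCuStreamCommence(stream)); // submit the batch; single fence recorded here
CUDA_CHECK(libreCuStreamAwait(stream));    // CPU waits for the timeline signal
```

With async = false each launch is fenced individually as before; with async = true the five launches are batched behind one fence, which is the case the new test times against a single synchronous launch.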

12 files changed: +1014 −21 lines

driverapi/internal/cmdqueue.h

Lines changed: 1 addition & 7 deletions

@@ -133,12 +133,6 @@ class NvCommandQueue {
      */
     std::vector<CommandBufSplit> commandBufBacklog{};
 
-    /**
-     * Virtual address of shader local memory used for shaders/kernels.
-     * Only needs one instance > max required local memory of any kernels, as only one kernels can run at a time
-     */
-    void *shaderLocalMemoryVa = nullptr;
-
     /**
      * Current shader local memory per thread. NOT equal to the allocated size of localMemoryVa, but correlates.
      */
@@ -177,7 +171,7 @@
 
     ~NvCommandQueue();
 
-    libreCudaStatus_t ensureEnoughLocalMem(NvU32 localMemReq);
+    libreCudaStatus_t ensureEnoughLocalMem(LibreCUFunction function);
 
     libreCudaStatus_t
     launchFunction(LibreCUFunction function,

driverapi/internal/librecuda_internal.h

Lines changed: 6 additions & 0 deletions

@@ -82,6 +82,12 @@ struct LibreCUFunction_ {
     NvU64 function_size;
     std::vector<KernelConstantInfo> constants;
     std::vector<KernelParamInfo> param_info;
+
+    /**
+     * Virtual address of shader local memory used for shaders/kernels.
+     */
+    NvU64 shader_local_memory_va{};
+
 };
 
 struct LibreCUstream_ {
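
The per-function shader_local_memory_va replaces the single queue-wide shaderLocalMemoryVa removed from cmdqueue.h above: the deleted comment's assumption that only one kernel runs at a time no longer holds once async launches may overlap, so each function now carries its own shader local memory backing. A sketch of the field's lifecycle as implied by the rest of this commit (paraphrase, not verbatim source):

```cpp
// Per-function shader local memory (SLM) lifecycle implied by this commit:
//  - starts at 0, meaning "no SLM buffer allocated yet",
//  - (re)allocated in NvCommandQueue::ensureEnoughLocalMem(function) when a launch
//    of this function needs more local memory than currently provisioned,
//    freeing any previous allocation first,
//  - bound via the NVC6C0_SET_SHADER_LOCAL_MEMORY_A method before the launch,
//  - freed per function in libreCuModuleUnload().
NvU64 shader_local_memory_va{};
```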

driverapi/src/cmdqueue.cpp

Lines changed: 22 additions & 9 deletions

@@ -385,13 +385,14 @@ static inline NvU32 roundUp(NvU32 a, NvU32 b) {
 }
 
 
-libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(NvU32 localMemReq) {
+libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function) {
+    NvU32 localMemReq = function->local_mem_req;
     if (localMemReq <= currentSlmPerThread) {
         return LIBRECUDA_SUCCESS; // no action required, local memory is enough
     }
 
-    if (shaderLocalMemoryVa != nullptr) {
-        LIBRECUDA_ERR_PROPAGATE(gpuFree(ctx, reinterpret_cast<NvU64>(shaderLocalMemoryVa)));
+    if (function->shader_local_memory_va != 0) {
+        LIBRECUDA_ERR_PROPAGATE(gpuFree(ctx, function->shader_local_memory_va));
     }
 
     currentSlmPerThread = ceilDiv(localMemReq, 32u) * 32; // round up
@@ -407,7 +408,7 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(NvU32 localMemReq) {
                     true,
                     false,
                     0,
-                    reinterpret_cast<NvU64 *>(&shaderLocalMemoryVa)
+                    &function->shader_local_memory_va
             )
     );
 
@@ -417,8 +418,8 @@
             makeNvMethod(1, NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2),
             {
                     // weird half big and little endian along int borders again...
-                    U64_HI_32_BITS(shaderLocalMemoryVa),
-                    U64_LO_32_BITS(shaderLocalMemoryVa)
+                    U64_HI_32_BITS(function->shader_local_memory_va),
+                    U64_LO_32_BITS(function->shader_local_memory_va)
             },
             COMPUTE
     ));
@@ -457,7 +458,7 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
     bool local_mem_changed;
     {
         auto pre_ctr = timelineCtr;
-        LIBRECUDA_ERR_PROPAGATE(ensureEnoughLocalMem(function->local_mem_req));
+        LIBRECUDA_ERR_PROPAGATE(ensureEnoughLocalMem(function));
         local_mem_changed = timelineCtr > pre_ctr;
     }
 
@@ -518,7 +519,8 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
 
     size_t j = 0;
     for (size_t i = 0; i < numParams; i++) {
-        switch (function->param_info[i].param_size) {
+        size_t param_size = function->param_info[i].param_size;
+        switch (param_size) {
             case 8: {
                 auto *param_ptr = reinterpret_cast<NvU64 *>(params[i]);
                 auto param_value = *param_ptr;
@@ -684,7 +686,9 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
         ));
     }
     timelineCtr++;
-    LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
+    if (!async) {
+        LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
+    }
     LIBRECUDA_SUCCEED();
 }
 
@@ -734,6 +738,10 @@ libreCudaStatus_t NvCommandQueue::gpuMemcpy(void *dst, void *src, size_t numByte
             DMA
     ));
     timelineCtr++;
+    // TODO: THERE SEEM TO BE SERIOUS PROBLEMS WITH DMA CHRONOLOGY GIVEN THERE IS NO WAY TO WAIT FOR SEMAPHORES...
+    // NEED MORE TESTING!
+    // This signalNotify might also not be needed at all, try to design a similar async system as in COMPUTE
+    // for DMA if possible..., else more CPU involvement is required for chronological DMA operations
    LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
     LIBRECUDA_SUCCEED();
 }
@@ -784,15 +792,20 @@ libreCudaStatus_t NvCommandQueue::startExecution() {
                     break;
                 }
             }
+            LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, backlog_entry.timelineCtr, backlog_entry.queueType));
             LIBRECUDA_ERR_PROPAGATE(submitToFifo(backlog_entry.queueType));
             LIBRECUDA_ERR_PROPAGATE(signalWaitCpu(timelineSignal, backlog_entry.timelineCtr));
         }
         commandBufBacklog.clear();
     } else {
         if (!computeCommandBuffer.empty()) {
+            LIBRECUDA_VALIDATE(dmaCommandBuffer.empty(), LIBRECUDA_ERROR_UNKNOWN);
+            LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, COMPUTE));
             LIBRECUDA_ERR_PROPAGATE(startExecution(COMPUTE));
         }
         if (!dmaCommandBuffer.empty()) {
+            LIBRECUDA_VALIDATE(computeCommandBuffer.empty(), LIBRECUDA_ERROR_UNKNOWN);
+            LIBRECUDA_ERR_PROPAGATE(signalNotify(timelineSignal, timelineCtr, DMA));
             LIBRECUDA_ERR_PROPAGATE(startExecution(DMA));
         }
     }
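
Taken together, these cmdqueue.cpp hunks move the timeline fence out of the per-launch path: launchFunction() only records signalNotify() when async is false, while startExecution() now records one notify per queue (or one per backlog entry in the mixed COMPUTE/DMA case) immediately before submitting to the FIFO, so a batch of async launches shares a single fence instead of serializing on a fence per kernel. ensureEnoughLocalMem() still rounds the per-thread requirement up to a multiple of 32 (e.g. a 40-byte request provisions 64 bytes per thread), but now against the launched function's own shader_local_memory_va. A rough paraphrase of the resulting flow (comments only, not verbatim source):

```cpp
// NvCommandQueue::launchFunction(function, ..., async)
//     ensureEnoughLocalMem(function);        // may (re)allocate function->shader_local_memory_va
//     ... enqueue the kernel launch methods ...
//     timelineCtr++;
//     if (!async)
//         signalNotify(timelineSignal, timelineCtr, COMPUTE);   // per-launch fence only for sync launches
//
// NvCommandQueue::startExecution()   // non-backlog path: only one queue may have pending work
//     if (!computeCommandBuffer.empty()) {
//         signalNotify(timelineSignal, timelineCtr, COMPUTE);   // single fence for the whole batch
//         startExecution(COMPUTE);
//     }
//     if (!dmaCommandBuffer.empty()) {
//         signalNotify(timelineSignal, timelineCtr, DMA);
//         startExecution(DMA);
//     }
```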

driverapi/src/librecuda.cpp

Lines changed: 9 additions & 4 deletions

@@ -1354,11 +1354,16 @@ libreCudaStatus_t libreCuLaunchKernel(LibreCUFunction function,
     LIBRECUDA_SUCCEED();
 }
 
-libreCudaStatus_t libreCuModuleUnload(LibreCUmodule function) {
+libreCudaStatus_t libreCuModuleUnload(LibreCUmodule module) {
     LIBRECUDA_ENSURE_CTX_VALID();
-    LIBRECUDA_VALIDATE(function != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
-    LIBRECUDA_ERR_PROPAGATE(gpuFree(current_ctx, function->module_va_addr));
-    delete function;
+    LIBRECUDA_VALIDATE(module != nullptr, LIBRECUDA_ERROR_INVALID_VALUE);
+    LIBRECUDA_ERR_PROPAGATE(gpuFree(current_ctx, module->module_va_addr));
+    for (auto &function: module->functions) {
+        if (function.shader_local_memory_va != 0) {
+            LIBRECUDA_ERR_PROPAGATE(gpuFree(current_ctx, function.shader_local_memory_va));
+        }
+    }
+    delete module;
     LIBRECUDA_SUCCEED();
 }
 

tests/CMakeLists.txt

Lines changed: 2 additions & 1 deletion

@@ -1,4 +1,5 @@
 add_subdirectory(write_float)
 add_subdirectory(memcopy)
 add_subdirectory(dynamic_shared_mem)
-add_subdirectory(compute_chronological_consistency)
+add_subdirectory(compute_chronological_consistency)
+add_subdirectory(test_async_kernels)
tests/test_async_kernels/CMakeLists.txt

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+add_executable(
+        test_async_kernels
+        main.cpp
+)
+target_link_libraries(
+        test_async_kernels
+        PRIVATE
+        driverapi
+)
+
+configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/write_float COPYONLY)

tests/test_async_kernels/main.cpp

Lines changed: 148 additions & 0 deletions

@@ -0,0 +1,148 @@
+#include <librecuda.h>
+
+#include <iostream>
+#include <vector>
+#include <fstream>
+#include <cstring>
+#include <chrono>
+
+inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
+    if (error != LIBRECUDA_SUCCESS) {
+        const char *error_string;
+        libreCuGetErrorString(error, &error_string);
+        printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
+        exit(EXIT_FAILURE);
+    }
+};
+#define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
+
+int main() {
+    CUDA_CHECK(libreCuInit(0));
+
+    int device_count{};
+    CUDA_CHECK(libreCuDeviceGetCount(&device_count));
+    std::cout << "Device count: " + std::to_string(device_count) << std::endl;
+
+    LibreCUdevice device{};
+    CUDA_CHECK(libreCuDeviceGet(&device, 0));
+
+    LibreCUcontext ctx{};
+    CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
+
+    char name_buffer[256] = {};
+    libreCuDeviceGetName(name_buffer, 256, device);
+    std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
+
+    LibreCUmodule module{};
+
+    // read cubin file
+    uint8_t *image;
+    size_t n_bytes;
+    {
+        std::ifstream input("write_float.cubin", std::ios::binary);
+        std::vector<uint8_t> bytes(
+                (std::istreambuf_iterator<char>(input)),
+                (std::istreambuf_iterator<char>()));
+        input.close();
+        image = new uint8_t[bytes.size()];
+        memcpy(image, bytes.data(), bytes.size());
+        n_bytes = bytes.size();
+    }
+    CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes));
+
+    // read functions
+    uint32_t num_funcs{};
+    CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
+    std::cout << "Num functions: " << num_funcs << std::endl;
+
+    auto *functions = new LibreCUFunction[num_funcs];
+    CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));
+
+    for (size_t i = 0; i < num_funcs; i++) {
+        LibreCUFunction func = functions[i];
+        const char *func_name{};
+        CUDA_CHECK(libreCuFuncGetName(&func_name, func));
+        std::cout << " function \"" << func_name << "\"" << std::endl;
+    }
+
+    delete[] functions;
+
+    // find function
+    LibreCUFunction func{};
+    CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float"));
+
+    // create stream
+    LibreCUstream stream{};
+    CUDA_CHECK(libreCuStreamCreate(&stream, 0));
+
+    void *float_dst_compute_va{};
+    void *float_dst_dma_va{};
+    CUDA_CHECK(libreCuMemAlloc(&float_dst_compute_va, sizeof(float), true));
+    CUDA_CHECK(libreCuMemAlloc(&float_dst_dma_va, sizeof(float), true));
+    *(float *) float_dst_compute_va = 0.0f;
+    *(float *) float_dst_dma_va = 0.0f;
+
+    // first time execution of the kernel
+    auto start = std::chrono::high_resolution_clock::now();
+    {
+        void *params[] = {
+                &float_dst_compute_va, &float_dst_dma_va
+        };
+        CUDA_CHECK(
+                libreCuLaunchKernel(func,
+                                    1, 1, 1,
+                                    1, 1, 1,
+                                    0,
+                                    stream,
+                                    params, sizeof(params) / sizeof(void *),
+                                    nullptr,
+                                    false
+                )
+        );
+    }
+    CUDA_CHECK(libreCuStreamCommence(stream));
+    CUDA_CHECK(libreCuStreamAwait(stream));
+    auto end = std::chrono::high_resolution_clock::now();
+    std::cout << "Single kernel took: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
+              << "ms" << std::endl;
+
+    start = std::chrono::high_resolution_clock::now();
+    {
+        void *params[] = {
+                &float_dst_compute_va, &float_dst_dma_va
+        };
+        for (int i = 0; i < 5; i++) {
+            CUDA_CHECK(
+                    libreCuLaunchKernel(func,
+                                        1, 1, 1,
+                                        1, 1, 1,
+                                        0,
+                                        stream,
+                                        params, sizeof(params) / sizeof(void *),
+                                        nullptr,
+                                        true
+                    )
+            );
+        }
+    }
+    CUDA_CHECK(libreCuStreamCommence(stream));
+    CUDA_CHECK(libreCuStreamAwait(stream));
+    end = std::chrono::high_resolution_clock::now();
+    std::cout << "5xParallel kernel took: "
+              << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
+              << "ms" << std::endl;
+
+    // free memory
+    CUDA_CHECK(libreCuMemFree(float_dst_compute_va));
+    CUDA_CHECK(libreCuMemFree(float_dst_dma_va));
+
+    // destroy stream
+    CUDA_CHECK(libreCuStreamDestroy(stream));
+
+    // unload module
+    CUDA_CHECK(libreCuModuleUnload(module));
+
+    // destroy ctx
+    CUDA_CHECK(libreCuCtxDestroy(ctx));
+    return 0;
+}
