-
Notifications
You must be signed in to change notification settings - Fork 244
Description
Sanity checks (read this first, then remove this section)
-
Make sure you're reporting a bug; for general questions, please use Discourse or
Slack. -
If you're dealing with a performance issue, make sure you disable scalar iteration
(CUDA.allowscalar(false)
). Only file an issue if that shows scalar iteration happening
in CUDA.jl or Base Julia, as opposed to your own code. -
If you're seeing an error message, follow the error message instructions, if any
(e.g.inspect code with @device_code_warntype
). If you can't solve the problem using
that information, make sure to post it as part of the issue. -
Always ensure you're using the latest version of CUDA.jl, and if possible, please
check the master branch to see whether your issue has already been resolved.
If your bug is still valid, please go ahead and fill out the template below.
Describe the bug
I am facing a weird problem in our application. We have a Julia function calling a C function, which creates a pthread and calls a Julia CUDA kernel. I have created a small example to illustrate and reproduce the problem. Unfortunately, we are not able to make it any simpler; this is the simplest example that reproduces the issue.
The "call_c_function_direct
” Julia function calls the C function “c_function_direct
”, which calls Julia CUDA kernel “ccall_saxpy() -> saxpy_kernel()
”. It works without any issue.
However, when I create a pthread inside C function and call Julia CUDA kernel, it hangs the execution and no useful stack trace is available.
The "call_c_function_pthread
” Julia function calls the C function “c_function_pthread
”, which creates a pthread and calls Julia CUDA kernel “ccall_saxpy() -> saxpy_kernel()
”. It hangs the execution when it calls “@cuda saxpy_kernel”.
To control the execution of Julia CUDA kernel either through direct or pthread based, a Julia variable is added in the file “julia_cuda.jl” with “direct =
true”. You can set it to false to run using pthread.
To reproduce
The Minimal Working Example (MWE) for this bug:
File: julia_cuda.jl
using CUDA
# CUDA device kernel computing C[i] = alpha * A[i] + B[i], one element per thread.
function saxpy_kernel(A, B, C, alpha)
    idx = threadIdx().x
    # Guard clause: threads past the end of the arrays do nothing.
    idx > length(A) && return
    C[idx] = alpha * A[idx] + B[idx]
    return
end
# Entry point invoked from C through a @cfunction pointer.
# `ctx` points at the C-side CUcontext, `device` is the CUDA device ordinal,
# and A/B/C are device pointers to `n` Float32 values each.
# Launches saxpy_kernel (C .= alpha .* A .+ B) and blocks until it finishes.
export ccall_saxpy
function ccall_saxpy(ctx::Ptr{Cvoid}, device::Cint, A::Ptr{Float32}, B::Ptr{Float32}, C::Ptr{Float32}, alpha::Cfloat, n::Cint)::Cvoid
# Dereference the C-side CUcontext handle and make it active for this task.
cu_ctx = unsafe_load(reinterpret(Ptr{CuContext}, ctx))
c_println("CUDA ctx: $cu_ctx Device:$device")
CUDA.context!(cu_ctx)
CUDA.device!(device)
size_dims=Tuple(Int64[n])
# Wrap the raw device pointers as CuArrays without taking ownership;
# the C side remains responsible for freeing this memory.
A_array = unsafe_wrap(CuArray, reinterpret(CuPtr{Float32}, A), size_dims, own=false)
B_array = unsafe_wrap(CuArray, reinterpret(CuPtr{Float32}, B), size_dims, own=false)
C_array = unsafe_wrap(CuArray, reinterpret(CuPtr{Float32}, C), size_dims, own=false)
c_println("A: $A_array, B:$B_array, C:$C_array Alpha:$alpha")
c_println("Calling CUDA function")
# One thread per element; CUDA.@sync blocks until the kernel completes.
# NOTE(review): per the issue description, this is the line that hangs when
# reached from a foreign pthread instead of a Julia-owned thread.
CUDA.@sync @cuda threads=size_dims saxpy_kernel(A_array, B_array, C_array, alpha)
c_println("CUDA call completed")
end
# Hand the C library a @cfunction pointer to ccall_saxpy.
function initialize_c_function()
    callback = @cfunction(ccall_saxpy, Cvoid,
                          (Ptr{Cvoid}, Cint, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cfloat, Cint))
    # Keep a global reference so the callback pointer stays rooted.
    global stored_func_ptr = callback
    ccall((:initialize_c, "libcfunction"), Cvoid, (Ptr{Cvoid},), callback)
end
# Print `data` through the C library's c_println.
# Fix: pass the String directly as `Cstring` so ccall roots the argument for
# the duration of the call. The original `pointer(data)` form is unsafe
# without `GC.@preserve` (see the Julia "Calling C" manual chapter).
function c_println(data::String)::Cvoid
    ccall((:c_println, "libcfunction"), Cvoid, (Cstring,), data)
end
# Invoke the pthread-based C entry point (reproduces the hang).
function call_c_function_pthread(A::Vector{Float32}, B::Vector{Float32}, C::Vector{Float32}, alpha::Float32, n::Cint)
    ccall((:c_function_pthread, "./libcfunction.so"), Cvoid,
          (Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cfloat, Cint),
          A, B, C, alpha, n)
end
# Invoke the direct (same-thread) C entry point.
function call_c_function_direct(A::Vector{Float32}, B::Vector{Float32}, C::Vector{Float32}, alpha::Float32, n::Cint)
    ccall((:c_function_direct, "./libcfunction.so"), Cvoid,
          (Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cfloat, Cint),
          A, B, C, alpha, n)
end
# Example usage: a small saxpy problem (n = 3).
A = Float32[1.0, 2.0, 3.0]
B = Float32[4.0, 5.0, 6.0]
C = Float32[0.0, 0.0, 0.0]
alpha = 2.0f0
n = 3
# Set `direct = true` to use the working same-thread path instead of the
# pthread path that reproduces the hang.
direct = false
initialize_c_function()
# Select and call the appropriate C entry point.
caller = direct ? call_c_function_direct : call_c_function_pthread
caller(A, B, C, alpha, Int32(n))
# Print the result (expected: C = alpha .* A .+ B)
println("Result: $C")
C File: c_cuda.c
#include <pthread.h>
#include <stdio.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <julia.h>
// Arguments forwarded to the worker pthread.
typedef struct {
float *A;
float *B;
float *C;
float alpha;
int n;
} thread_data_t;
// Signature of the Julia ccall_saxpy entry point obtained via @cfunction.
typedef void (*julia_saxpy_t)(void *ctx, int device, float *A, float *B, float *C, float alpha, int n);
julia_saxpy_t ccall_saxpy = NULL; /* set by initialize_c */
CUcontext cuContext;              /* driver-API context created in initialize_c */
CUdevice cuDevice;                /* device ordinal 0 (see initialize_c) */
void initialize_c(void *func) {
jl_init();
ccall_saxpy = (julia_saxpy_t) func;
cuInit(0);
cuDeviceGet(&cuDevice, 0);
cuCtxCreate(&cuContext, 0, cuDevice);
printf("C CUDA ctx: %p\n", cuContext);
}
// Print a NUL-terminated string followed by a newline.
// Exported to Julia via ccall for logging from the Julia side.
void c_println(char *data) {
printf("%s\n", data);
}
// Stage the host arrays on the device, invoke the Julia saxpy callback with
// device pointers, and copy the result back into C.
// Runs either on the caller's thread (direct path) or on a foreign pthread.
// NOTE(review): no CUDA calls are error-checked here; failures would go
// unnoticed.
void call_cuda_saxpy(float *A, float *B, float *C, float alpha, int n) {
// Allocate device memory
float *d_A, *d_B, *d_C;
// Bind the driver-API context from initialize_c to this thread so the
// runtime-API calls below use the same context.
cuCtxSetCurrent(cuContext);
cudaMalloc((void**)&d_A, n * sizeof(float));
cudaMalloc((void**)&d_B, n * sizeof(float));
cudaMalloc((void**)&d_C, n * sizeof(float));
// Copy data from host to device
cudaMemcpy(d_A, A, n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, n * sizeof(float), cudaMemcpyHostToDevice);
// Call the Julia function; it synchronizes (CUDA.@sync) before returning,
// so d_C holds the result afterwards.
ccall_saxpy((void *)&cuContext, (int)cuDevice, d_A, d_B, d_C, alpha, n);
// Copy result from device to host
cudaMemcpy(C, d_C, n * sizeof(float), cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
// pthread entry point: unpack the saxpy arguments and run the kernel launch
// from this foreign (non-Julia) thread.
void *thread_function(void *arg) {
    thread_data_t *params = (thread_data_t *)arg;
    call_cuda_saxpy(params->A, params->B, params->C, params->alpha, params->n);
    return NULL;
}
// Run the saxpy on a dedicated foreign pthread and wait for it to finish.
// `data` lives on this stack frame; that is safe only because we join the
// thread before returning.
void c_function_pthread(float *A, float *B, float *C, float alpha, int n) {
    pthread_t thread;
    thread_data_t data = {A, B, C, alpha, n};
    // Fix: check pthread_create; joining an uninitialized pthread_t after a
    // failed create is undefined behavior.
    int rc = pthread_create(&thread, NULL, thread_function, &data);
    if (rc != 0) {
        fprintf(stderr, "pthread_create failed: %d\n", rc);
        return;
    }
    pthread_join(thread, NULL);
}
// Same work as c_function_pthread, but on the caller's thread.
// Per the issue description, this path completes while the pthread path hangs.
void c_function_direct(float *A, float *B, float *C, float alpha, int n) {
call_cuda_saxpy(A, B, C, alpha, n);
}
Build command and run ``` $ gcc -g -O0 -fPIC -shared -o libcfunction.so c_cuda.c -I$(JULIA)/include/julia -L$(JULIA)/lib -ljulia -lpthread -I$(NVHPC_ROOT)/cuda/include -L$(NVHPC_ROOT)/cuda/lib64 -lcuda -lcudart $ julia julia_cuda.jl ```
Expected behavior
The pthread-based path should launch the CUDA kernel and complete just like the direct path, printing `Result: Float32[6.0, 9.0, 12.0]` (C = alpha .* A .+ B).
Version info
Details on Julia: 1.10.4
julia> versioninfo()
Julia Version 1.10.4
Commit 48d4fd48430 (2024-06-04 10:41 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 64 × AMD Ryzen Threadripper 3970X 32-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-15.0.7 (ORCJIT, znver2)
Threads: 1 default, 0 interactive, 1 GC (on 64 virtual cores)
Environment:
JULIA_VERSION = 1.10.3
LD_LIBRARY_PATH = /opt/nvidia/hpc_sdk/Linux_x86_64/24.3/comm_libs/nvshmem/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/24.3/comm_libs/nccl/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/24.3/math_libs/lib64:/opt/nvidia/hpc_sdk/Linux_x86_64/24.3/compilers/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/24.3/compilers/extras/qd/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/24.3/cuda/extras/CUPTI/lib64:/opt/nvidia/hpc_sdk/Linux_x86_64/24.3/cuda/lib64:
Details on CUDA:
# please post the output of:
julia> CUDA.versioninfo()
CUDA runtime 12.5, artifact installation
CUDA driver 12.4
NVIDIA driver 550.54.15
CUDA libraries:
- CUBLAS: 12.3.4
- CURAND: 10.3.6
- CUFFT: 11.2.3
- CUSOLVER: 11.6.3
- CUSPARSE: 12.5.1
- CUPTI: 23.0.0
- NVML: 12.0.0+550.54.15
Julia packages:
- CUDA: 5.4.2
- CUDA_Driver_jll: 0.9.1+1
- CUDA_Runtime_jll: 0.14.1+0
Toolchain:
- Julia: 1.10.4
- LLVM: 15.0.7
1 device:
0: NVIDIA GeForce RTX 3090 (sm_86, 22.880 GiB / 24.000 GiB available)