Skip to content
This repository was archived by the owner on Mar 12, 2021. It is now read-only.

Commit 7408dd9

Browse files
authored
Merge pull request #645 from JuliaGPU/tb/multitasking
Support for Julia's multitasking.
2 parents 138ece7 + c3ba8b8 commit 7408dd9

File tree

18 files changed

+201
-151
lines changed

18 files changed

+201
-151
lines changed

.gitlab-ci.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,7 @@ pool:split:
138138
- nvidia
139139
variables:
140140
CUARRAYS_MEMORY_POOL: 'split'
141+
allow_failure: true
141142

142143
debug:
143144
extends:

Manifest.toml

Lines changed: 38 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -39,12 +39,28 @@ uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
3939
version = "6.2.0"
4040

4141
[[CUDAnative]]
42-
deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"]
43-
git-tree-sha1 = "e6742ce88d11f1fdf6a9357ba738735f86ce67b5"
44-
repo-rev = "58c6755445c05ff26f1bdc5c12c7ae0aa6c39bc2"
45-
repo-url = "https://github.com/JuliaGPU/CUDAnative.jl.git"
42+
deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"]
43+
git-tree-sha1 = "1ee71ece4332185ad49b93f7b6cf9d51017e40ef"
4644
uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
47-
version = "2.10.2"
45+
version = "3.0.0"
46+
47+
[[CodeTracking]]
48+
deps = ["InteractiveUtils", "UUIDs"]
49+
git-tree-sha1 = "0becdab7e6fbbcb7b88d8de5b72e5bb2f28239f3"
50+
uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
51+
version = "0.5.8"
52+
53+
[[Compat]]
54+
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
55+
git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb"
56+
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
57+
version = "2.2.0"
58+
59+
[[Cthulhu]]
60+
deps = ["CodeTracking", "InteractiveUtils", "TerminalMenus", "Unicode"]
61+
git-tree-sha1 = "5e0f928ccaab1fa2911fc4e204e8a6f5b0213eaf"
62+
uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f"
63+
version = "1.0.0"
4864

4965
[[DataStructures]]
5066
deps = ["InteractiveUtils", "OrderedCollections"]
@@ -56,6 +72,10 @@ version = "0.17.10"
5672
deps = ["Printf"]
5773
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
5874

75+
[[DelimitedFiles]]
76+
deps = ["Mmap"]
77+
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
78+
5979
[[Distributed]]
6080
deps = ["Random", "Serialization", "Sockets"]
6181
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
@@ -100,6 +120,9 @@ version = "0.5.4"
100120
deps = ["Base64"]
101121
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
102122

123+
[[Mmap]]
124+
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
125+
103126
[[NNlib]]
104127
deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "Requires", "Statistics"]
105128
git-tree-sha1 = "d9f196d911f55aeaff11b11f681b135980783824"
@@ -146,6 +169,10 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
146169
[[Serialization]]
147170
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
148171

172+
[[SharedArrays]]
173+
deps = ["Distributed", "Mmap", "Random", "Serialization"]
174+
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
175+
149176
[[Sockets]]
150177
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
151178

@@ -157,6 +184,12 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
157184
deps = ["LinearAlgebra", "SparseArrays"]
158185
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
159186

187+
[[TerminalMenus]]
188+
deps = ["Compat", "REPL", "Test"]
189+
git-tree-sha1 = "9ae6ed0c94eee4d898e049820942af21daf15efc"
190+
uuid = "dc548174-15c3-5faf-af27-7997cfbde655"
191+
version = "0.1.0"
192+
160193
[[Test]]
161194
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
162195
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ Adapt = "1.0"
3030
CEnum = "0.2"
3131
CUDAapi = "3.0, 4.0"
3232
CUDAdrv = "6.0.1"
33-
CUDAnative = "2.10"
33+
CUDAnative = "3.0"
3434
DataStructures = "0.17"
3535
GPUArrays = "3.1"
3636
MacroTools = "0.5"

src/blas/CUBLAS.jl

Lines changed: 45 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -25,68 +25,74 @@ include("wrappers.jl")
2525
# high-level integrations
2626
include("linalg.jl")
2727

28-
const handles_lock = ReentrantLock()
29-
const created_handles = Dict{Tuple{UInt,Int},cublasHandle_t}()
30-
const created_xt_handles = Dict{Tuple{UInt,Int},cublasXtHandle_t}()
31-
const active_handles = Vector{Union{Nothing,cublasHandle_t}}()
32-
const active_xt_handles = Vector{Union{Nothing,cublasXtHandle_t}}()
28+
# thread cache for task-local library handles
29+
const thread_handles = Vector{Union{Nothing,cublasHandle_t}}()
30+
const thread_xt_handles = Vector{Union{Nothing,cublasXtHandle_t}}()
3331

3432
function handle()
3533
tid = Threads.threadid()
36-
if @inbounds active_handles[tid] === nothing
34+
if @inbounds thread_handles[tid] === nothing
3735
ctx = context()
38-
key = (objectid(ctx), tid)
39-
lock(handles_lock) do
40-
active_handles[tid] = get!(created_handles, key) do
41-
handle = cublasCreate_v2()
42-
atexit(()->CUDAdrv.isvalid(ctx) && cublasDestroy_v2(handle))
43-
44-
# enable tensor math mode if our device supports it, and fast math is enabled
45-
dev = CUDAdrv.device()
46-
if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0" && version() >= v"9"
47-
cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)
36+
thread_handles[tid] = get!(task_local_storage(), (:CUBLAS, ctx)) do
37+
handle = cublasCreate_v2()
38+
finalizer(current_task()) do task
39+
CUDAdrv.isvalid(ctx) || return
40+
context!(ctx) do
41+
cublasDestroy_v2(handle)
4842
end
43+
end
4944

50-
handle
45+
# enable tensor math mode if our device supports it, and fast math is enabled
46+
dev = CUDAdrv.device()
47+
if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0" && version() >= v"9"
48+
cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)
5149
end
50+
51+
handle
5252
end
5353
end
54-
@inbounds active_handles[tid]
54+
@inbounds thread_handles[tid]
5555
end
5656

5757
function xt_handle()
5858
tid = Threads.threadid()
59-
if @inbounds active_xt_handles[tid] === nothing
59+
if @inbounds thread_xt_handles[tid] === nothing
6060
ctx = context()
61-
key = (objectid(ctx), tid)
62-
lock(handles_lock) do
63-
active_xt_handles[tid] = get!(created_xt_handles, key) do
64-
handle = cublasXtCreate()
65-
atexit(()->CUDAdrv.isvalid(ctx) && cublasXtDestroy(handle))
66-
67-
# select the devices
68-
# TODO: this is weird, since we typically use a single device per thread/context
69-
devs = convert.(Cint, CUDAdrv.devices())
70-
cublasXtDeviceSelect(handle, length(devs), devs)
71-
72-
handle
61+
thread_xt_handles[tid] = get!(task_local_storage(), (:CUBLASxt, ctx)) do
62+
handle = cublasXtCreate()
63+
finalizer(current_task()) do task
64+
CUDAdrv.isvalid(ctx) || return
65+
context!(ctx) do
66+
cublasXtDestroy(handle)
67+
end
7368
end
69+
70+
# select the devices
71+
# TODO: this is weird, since we typically use a single device per thread/context
72+
devs = convert.(Cint, CUDAdrv.devices())
73+
cublasXtDeviceSelect(handle, length(devs), devs)
74+
75+
handle
7476
end
7577
end
76-
@inbounds active_xt_handles[tid]
78+
@inbounds thread_xt_handles[tid]
7779
end
7880

7981
function __init__()
80-
resize!(active_handles, Threads.nthreads())
81-
fill!(active_handles, nothing)
82+
resize!(thread_handles, Threads.nthreads())
83+
fill!(thread_handles, nothing)
8284

83-
resize!(active_xt_handles, Threads.nthreads())
84-
fill!(active_xt_handles, nothing)
85+
resize!(thread_xt_handles, Threads.nthreads())
86+
fill!(thread_xt_handles, nothing)
8587

8688
CUDAnative.atcontextswitch() do tid, ctx
87-
# we don't eagerly initialize handles, but do so lazily when requested
88-
active_handles[tid] = nothing
89-
active_xt_handles[tid] = nothing
89+
thread_handles[tid] = nothing
90+
thread_xt_handles[tid] = nothing
91+
end
92+
93+
CUDAnative.attaskswitch() do tid, task
94+
thread_handles[tid] = nothing
95+
thread_xt_handles[tid] = nothing
9096
end
9197
end
9298

src/blas/error.jl

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -48,8 +48,7 @@ end
4848
end
4949

5050
function initialize_api()
51-
# make sure the calling thread has an active context
52-
CUDAnative.initialize_context()
51+
CUDAnative.prepare_cuda_call()
5352
end
5453

5554
macro check(ex)

src/dnn/CUDNN.jl

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -39,33 +39,38 @@ include("nnlib.jl")
3939

4040
include("compat.jl")
4141

42-
const handles_lock = ReentrantLock()
43-
const created_handles = Dict{Tuple{UInt,Int},cudnnHandle_t}()
44-
const active_handles = Vector{Union{Nothing,cudnnHandle_t}}()
42+
# thread cache for task-local library handles
43+
const thread_handles = Vector{Union{Nothing,cudnnHandle_t}}()
4544

4645
function handle()
4746
tid = Threads.threadid()
48-
if @inbounds active_handles[tid] === nothing
47+
if @inbounds thread_handles[tid] === nothing
4948
ctx = context()
50-
key = (objectid(ctx), tid)
51-
lock(handles_lock) do
52-
active_handles[tid] = get!(created_handles, key) do
53-
handle = cudnnCreate()
54-
atexit(()->CUDAdrv.isvalid(ctx) && cudnnDestroy(handle))
55-
handle
49+
thread_handles[tid] = get!(task_local_storage(), (:CUDNN, ctx)) do
50+
handle = cudnnCreate()
51+
finalizer(current_task()) do task
52+
CUDAdrv.isvalid(ctx) || return
53+
context!(ctx) do
54+
cudnnDestroy(handle)
55+
end
5656
end
57+
58+
handle
5759
end
5860
end
59-
@inbounds active_handles[tid]
61+
@inbounds thread_handles[tid]
6062
end
6163

6264
function __init__()
63-
resize!(active_handles, Threads.nthreads())
64-
fill!(active_handles, nothing)
65+
resize!(thread_handles, Threads.nthreads())
66+
fill!(thread_handles, nothing)
6567

6668
CUDAnative.atcontextswitch() do tid, ctx
67-
# we don't eagerly initialize handles, but do so lazily when requested
68-
active_handles[tid] = nothing
69+
thread_handles[tid] = nothing
70+
end
71+
72+
CUDAnative.attaskswitch() do tid, task
73+
thread_handles[tid] = nothing
6974
end
7075
end
7176

src/dnn/error.jl

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,7 @@ name(err::CUDNNError) = unsafe_string(cudnnGetErrorString(err))
2020
end
2121

2222
function initialize_api()
23-
# make sure the calling thread has an active context
24-
CUDAnative.initialize_context()
23+
CUDAnative.prepare_cuda_call()
2524
end
2625

2726
macro check(ex)

src/fft/error.jl

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -62,8 +62,7 @@ end
6262
end
6363

6464
function initialize_api()
65-
# make sure the calling thread has an active context
66-
CUDAnative.initialize_context()
65+
CUDAnative.prepare_cuda_call()
6766
end
6867

6968
macro check(ex)

src/memory.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -299,7 +299,7 @@ synchronized right before and after executing `ex` to exclude any external effec
299299
macro time(ex)
300300
quote
301301
# @time might surround an application, so be sure to initialize CUDA before that
302-
CUDAnative.initialize_context()
302+
CUDAnative.prepare_cuda_call()
303303

304304
# coarse synchronization to exclude effects from previously-executed code
305305
CUDAdrv.synchronize()

src/rand/CURAND.jl

Lines changed: 15 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -23,33 +23,32 @@ include("wrappers.jl")
2323
# high-level integrations
2424
include("random.jl")
2525

26-
const handles_lock = ReentrantLock()
27-
const created_generators = Dict{Tuple{UInt,Int},RNG}()
28-
const active_generators = Vector{Union{Nothing,RNG}}()
26+
# thread cache for task-local library handles
27+
const thread_generators = Vector{Union{Nothing,RNG}}()
2928

3029
function generator()
3130
tid = Threads.threadid()
32-
if @inbounds active_generators[tid] === nothing
31+
if @inbounds thread_generators[tid] === nothing
3332
ctx = context()
34-
key = (objectid(ctx), tid)
35-
lock(handles_lock) do
36-
active_generators[tid] = get!(created_generators, key) do
37-
rng = RNG()
38-
Random.seed!(rng)
39-
rng
40-
end
33+
thread_generators[tid] = get!(task_local_storage(), (:CURAND, ctx)) do
34+
rng = RNG()
35+
Random.seed!(rng)
36+
rng
4137
end
4238
end
43-
@inbounds active_generators[tid]
39+
@inbounds thread_generators[tid]
4440
end
4541

4642
function __init__()
47-
resize!(active_generators, Threads.nthreads())
48-
fill!(active_generators, nothing)
43+
resize!(thread_generators, Threads.nthreads())
44+
fill!(thread_generators, nothing)
4945

5046
CUDAnative.atcontextswitch() do tid, ctx
51-
# we don't eagerly initialize handles, but do so lazily when requested
52-
active_generators[tid] = nothing
47+
thread_generators[tid] = nothing
48+
end
49+
50+
CUDAnative.attaskswitch() do tid, task
51+
thread_generators[tid] = nothing
5352
end
5453
end
5554

0 commit comments

Comments (0)