
Commit 03744a6
Author: Michael Abbott
Parent: cd1191f

allow PermutedDimsArray in gemm_strided_batched

copied from JuliaGPU/CuArrays.jl#664, needs FluxML/NNlib.jl#191
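In short, the CUBLAS strided-batched wrappers now accept any 3-dimensional strided array of the right element type, so a lazy wrapper such as a PermutedDimsArray of a CuArray can be multiplied on the GPU without first being copied to a contiguous array. A minimal sketch of the user-facing effect, assuming a functional CUDA device and an NNlib that already contains FluxML/NNlib.jl#191 (the sizes below are arbitrary):

    using CUDA, NNlib

    x = CUDA.rand(Float32, 4, 5, 8)   # a batch of 8 matrices
    y = CUDA.rand(Float32, 4, 6, 8)

    # PermutedDimsArray is a lazy, strided view, not a copy; with this
    # commit batched_mul can keep it on the CUBLAS fast path:
    z = batched_mul(PermutedDimsArray(x, (2, 1, 3)), y)   # size (5, 6, 8)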

File tree: 4 files changed, +37 -24 lines

    Project.toml
    lib/cublas/wrappers.jl
    src/nnlib.jl
    test/nnlib.jl

Project.toml: 2 additions, 0 deletions

@@ -7,6 +7,7 @@ AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
+Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
@@ -31,6 +32,7 @@ AbstractFFTs = "0.4, 0.5"
 Adapt = "2.2"
 BFloat16s = "0.1"
 CEnum = "0.2, 0.3, 0.4"
+Compat = "3.9"
 DataStructures = "0.17, 0.18"
 ExprTools = "0.1"
 GPUArrays = "6.1.0"

lib/cublas/wrappers.jl: 13 additions, 12 deletions

@@ -923,15 +923,16 @@ for (fname, elty) in
         function gemm_strided_batched!(transA::Char,
                                        transB::Char,
                                        alpha::Number,
-                                       A::DenseCuArray{$elty, 3},
-                                       B::DenseCuArray{$elty, 3},
+                                       A::AbstractArray{$elty, 3},
+                                       B::AbstractArray{$elty, 3},
                                        beta::Number,
-                                       C::DenseCuArray{$elty, 3})
+                                       C::AbstractArray{$elty, 3})
             m = size(A, transA == 'N' ? 1 : 2)
             k = size(A, transA == 'N' ? 2 : 1)
             n = size(B, transB == 'N' ? 2 : 1)
 
-            @assert size(A, 3) == size(B, 3) == size(C, 3) "Batch size mismatch"
+            @assert size(A, 3) == size(C, 3) || size(A, 3) == 1 "batch size mismatch: A != C"
+            @assert size(B, 3) == size(C, 3) || size(B, 3) == 1 "batch size mismatch: B != C"
 
             if m != size(C,1) || n != size(C,2) || k != size(B, transB == 'N' ? 1 : 2)
                 throw(DimensionMismatch(""))
@@ -940,26 +941,26 @@ for (fname, elty) in
             ldb = max(1,stride(B,2))
             ldc = max(1,stride(C,2))
 
-            strideA = stride(A, 3)
-            strideB = stride(B, 3)
+            strideA = size(A, 3) == 1 ? 0 : stride(A, 3)
+            strideB = size(B, 3) == 1 ? 0 : stride(B, 3)
             strideC = stride(C, 3)
-            batchCount = size(A, 3)
+            batchCount = size(C, 3)
             $fname(handle(), transA, transB, m, n, k, alpha, A, lda, strideA, B,
                    ldb, strideB, beta, C, ldc, strideC, batchCount)
             C
         end
         function gemm_strided_batched(transA::Char,
                                       transB::Char,
                                       alpha::Number,
-                                      A::DenseCuArray{$elty, 3},
-                                      B::DenseCuArray{$elty, 3})
-            C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3)))
+                                      A::AbstractArray{$elty, 3},
+                                      B::AbstractArray{$elty, 3})
+            C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), max(size(A, 3), size(B, 3))))
             gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C )
         end
         function gemm_strided_batched(transA::Char,
                                       transB::Char,
-                                      A::DenseCuArray{$elty, 3},
-                                      B::DenseCuArray{$elty, 3})
+                                      A::AbstractArray{$elty, 3},
+                                      B::AbstractArray{$elty, 3})
             gemm_strided_batched(transA, transB, one($elty), A, B)
         end
     end
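Two behavioural changes here deserve a note: the batch dimension of A or B may now be 1, in which case a zero stride tells cuBLAS to reuse that single matrix for every element of the batch, and batchCount is now taken from C rather than A. A minimal sketch of both relaxations, assuming a functional CUDA device (the sizes are made up):

    using CUDA
    using CUDA.CUBLAS

    A = CUDA.rand(Float32, 4, 5, 8)   # a batch of 8 matrices
    B = CUDA.rand(Float32, 5, 6, 1)   # one matrix, reused across the whole batch

    C = CUBLAS.gemm_strided_batched('N', 'N', A, B)   # size (4, 6, 8)

    # The relaxed AbstractArray signature also admits strided wrappers.
    # This permutation only moves the batch dimension, so dims 1 and 2
    # remain column-major:
    A2 = PermutedDimsArray(CUDA.rand(Float32, 4, 8, 5), (1, 3, 2))   # a 4×5×8 view
    C2 = CUBLAS.gemm_strided_batched('N', 'N', A2, B)                # size (4, 6, 8)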

src/nnlib.jl: 3 additions, 12 deletions

@@ -23,16 +23,7 @@ end
 
 
 # Batched matrix multiplication
+# Using storage_type from https://github.com/FluxML/NNlib.jl/pull/191
 
-const batched_gemm_args = [
-    (:(CuArray{T, 3}), 'N'),
-    (:(NNlib.BatchedTranspose{T, <:CuArray{T, 3}}), 'T'),
-    (:(NNlib.BatchedAdjoint{T, <:CuArray{T, 3}}), 'C')
-]
-
-for (TA, transA) in batched_gemm_args, (TB, transB) in batched_gemm_args
-    @eval function NNlib.batched_mul!(C::CuArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat}
-        CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C)
-        C
-    end
-end
+NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number, A, B, β::Number, C) =
+    CUBLAS.gemm_strided_batched!(transA, transB, α, A, B, β, C)

test/nnlib.jl: 19 additions, 0 deletions

@@ -16,6 +16,25 @@ using NNlib
     @test CuArray(Ca) ≈ batched_mul(CuArray(A), batched_adjoint(CuArray(B)))
 end
 
+@testset "NNlib storage_type etc." begin
+    using LinearAlgebra
+    using NNlib: is_strided, are_strided, storage_type
+
+    M = cu(ones(10,10))
+
+    @test is_strided(M)
+    @test is_strided(view(M, 1:2:5,:))
+    @test is_strided(PermutedDimsArray(M, (2,1)))
+
+    @test !is_strided(reshape(view(M, 1:2:10,:), 10,:))
+    @test !is_strided((M .+ im)')
+    @test !is_strided(Diagonal(cu(ones(3))))
+
+    @test storage_type(M) == CuArray{Float32,2,Nothing}
+    @test storage_type(reshape(view(M, 1:2:10,:), 10,:)) == CuArray{Float32,2,Nothing}
+
+end
+
 @testset "Broadcast Fix" begin
     if CUDA.has_cudnn()
         @test testf(x -> logσ.(x), rand(5))
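These tests pin down the two traits the new dispatch relies on: is_strided, which accepts lazy layout wrappers but rejects anything lacking a pointer-plus-strides layout (a reshaped non-contiguous view, the conjugating adjoint of a complex array, a Diagonal), and storage_type, which unwraps nested wrappers down to the array that owns the memory. A rough, CPU-only sketch of the idea behind the latter (my_storage_type is hypothetical, not NNlib's actual implementation):

    # Unwrap `parent` until it reaches a fixed point, i.e. an array that
    # is its own parent, then report that array's type.
    my_storage_type(A::AbstractArray) =
        parent(A) === A ? typeof(A) : my_storage_type(parent(A))

    M = ones(Float32, 10, 10)
    my_storage_type(PermutedDimsArray(view(M, 1:2:9, :), (2, 1)))   # Matrix{Float32}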
