
Commit bb9ca6d

Merge pull request #367 from JuliaGPU/tb/occupancy
Rework occupancy, re-enable grid-stride broadcast.
2 parents: e180162 + f299182

File tree: 9 files changed (+65 −48 lines)

src/device/execution.jl (+35 −26)

@@ -28,38 +28,47 @@ host to influence how the kernel is executed. The following keyword arguments are
 - `target::AbstractArray`: specify which array object to use for determining execution
   properties (defaults to the first argument `arg0`).
-- `total_threads::Int`: how many threads should be launched _in total_. The actual number of
-  threads and blocks is determined using a heuristic. Defaults to the length of `arg0` if
-  no other keyword arguments that influence the launch configuration are specified.
+- `elements::Int`: how many elements will be processed by this kernel. In most
+  circumstances, this will correspond to the total number of threads that needs to be
+  launched, unless the kernel supports a variable number of elements to process per
+  iteration. Defaults to the length of `arg0` if no other keyword arguments that influence
+  the launch configuration are specified.
 - `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are
-  launched. This cannot be used in combination with the `total_threads` argument.
-- `name::String`: inform the back end about the name of the kernel to be executed.
-  This can be used to emit better diagnostics, and is useful with anonymous kernels.
+  launched. This cannot be used in combination with the `elements` argument.
+- `name::String`: inform the back end about the name of the kernel to be executed. This can
+  be used to emit better diagnostics, and is useful with anonymous kernels.
 """
 function gpu_call(kernel::F, args::Vararg{Any,N};
                   target::AbstractArray=first(args),
-                  total_threads::Union{Int,Nothing}=nothing,
+                  elements::Union{Int,Nothing}=nothing,
                   threads::Union{Int,Nothing}=nothing,
                   blocks::Union{Int,Nothing}=nothing,
                   name::Union{String,Nothing}=nothing) where {F,N}
     # non-trivial default values for launch configuration
-    if total_threads===nothing && threads===nothing && blocks===nothing
-        total_threads = length(target)
-    elseif total_threads===nothing
+    if elements===nothing && threads===nothing && blocks===nothing
+        elements = length(target)
+    elseif elements===nothing
         if threads === nothing
             threads = 1
         end
         if blocks === nothing
             blocks = 1
         end
     elseif threads!==nothing || blocks!==nothing
-        error("Cannot specify both total_threads and threads/blocks configuration")
+        error("Cannot specify both elements and threads/blocks configuration")
     end

-    if total_threads !== nothing
-        @assert total_threads > 0
-        heuristic = launch_heuristic(backend(target), kernel, args...)
-        config = launch_configuration(backend(target), heuristic, total_threads)
+    # the number of elements to process needs to be passed to the kernel somehow, so there's
+    # no easy way to do this without passing additional arguments or changing the context.
+    # both are expensive, so require manual use of `launch_heuristic` for those kernels.
+    elements_per_thread = 1
+
+    if elements !== nothing
+        @assert elements > 0
+        heuristic = launch_heuristic(backend(target), kernel, args...;
+                                     elements, elements_per_thread)
+        config = launch_configuration(backend(target), heuristic;
+                                      elements, elements_per_thread)
         gpu_call(backend(target), kernel, args, config.threads, config.blocks; name=name)
     else
         @assert threads > 0
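
For orientation, a minimal usage sketch of the reworked keyword interface follows. The kernel body and the array `xs` are hypothetical placeholders, not part of this commit; only the `gpu_call` keywords and `linear_index` come from the code above.

    # assumed: `xs` is a GPU array provided by some back-end package
    function fill_two_kernel!(ctx, xs)
        i = linear_index(ctx)
        i > length(xs) && return
        @inbounds xs[i] = 2
        return
    end

    # let the back-end heuristic pick threads/blocks for `length(xs)` work items
    gpu_call(fill_two_kernel!, xs; elements=length(xs))

    # or pin the launch configuration explicitly (cannot be combined with `elements`)
    gpu_call(fill_two_kernel!, xs; threads=256, blocks=cld(length(xs), 256))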
@@ -68,29 +77,29 @@ function gpu_call(kernel::F, args::Vararg{Any,N};
     end
 end

-# how many threads and blocks this kernel need to fully saturate the GPU.
-# this can be specialised if more sophisticated heuristics are available.
+# how many threads and blocks `kernel` needs to be launched with, passing arguments `args`,
+# to fully saturate the GPU. `elements` indicates the number of elements that needs to be
+# processed, while `elements_per_threads` indicates the number of elements this kernel can
+# process (i.e. if it's a grid-stride kernel, or 1 if otherwise).
 #
-# the `maximize_blocksize` indicates whether the kernel benifits from a large block size
+# this heuristic should be specialized for the back-end, ideally using an API for maximizing
+# the occupancy of the launch configuration (like CUDA's occupancy API).
 function launch_heuristic(backend::AbstractGPUBackend, kernel, args...;
-                          maximize_blocksize=false)
+                          elements::Int, elements_per_thread::Int)
     return (threads=256, blocks=32)
 end

 # determine how many threads and blocks to actually launch given upper limits.
 # returns a tuple of blocks, threads, and elements_per_thread (which is always 1
 # unless specified that the kernel can handle a number of elements per thread)
-function launch_configuration(backend::AbstractGPUBackend, heuristic,
-                              elements::Int, elements_per_thread::Int=1)
+function launch_configuration(backend::AbstractGPUBackend, heuristic;
+                              elements::Int, elements_per_thread::Int)
     threads = clamp(elements, 1, heuristic.threads)
     blocks = max(cld(elements, threads), 1)

-    # FIXME: use grid-stride loop when we can't launch the number of blocks we need
-    if false && elements_per_thread > 1 && blocks > heuristic.blocks
+    if elements_per_thread > 1 && blocks > heuristic.blocks
         # we want to launch more blocks than required, so prefer a grid-stride loop instead
-        # NOTE: this does not seem to improve performance
-        nelem = clamp(cld(blocks, heuristic.blocks), 1, elements_per_thread)
+        nelem = clamp(fld(blocks, heuristic.blocks), 1, elements_per_thread)
         blocks = cld(blocks, nelem)
         (threads=threads, blocks=blocks, elements_per_thread=nelem)
     else
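
To make the grid-stride fallback concrete, here is a worked trace of the generic `launch_configuration` above, assuming the default `(threads=256, blocks=32)` heuristic and a hypothetical workload; the numbers are illustrative only.

    heuristic = (threads=256, blocks=32)
    elements = 1_000_000                        # hypothetical element count
    elements_per_thread = typemax(Int)          # kernel supports a grid-stride loop

    threads = clamp(elements, 1, heuristic.threads)    # 256
    blocks  = max(cld(elements, threads), 1)           # 3907 blocks would be required
    if elements_per_thread > 1 && blocks > heuristic.blocks
        nelem  = clamp(fld(blocks, heuristic.blocks), 1, elements_per_thread)  # 122
        blocks = cld(blocks, nelem)                                            # 33
    end
    # result: launch 33 blocks of 256 threads, each thread striding over up to 122 elements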

src/host/abstractarray.jl (+2 −2)

@@ -116,7 +116,7 @@ function Base.copyto!(dest::AnyGPUArray, dstart::Integer,
     gpu_call(linear_copy_kernel!,
              dest, dstart, src, sstart, n;
-             total_threads=n)
+             elements=n)
     return dest
 end

@@ -188,7 +188,7 @@ function Base.copyto!(dest::AnyGPUArray{<:Any, N}, destcrange::CartesianIndices{
     src_offsets = first(srccrange) - oneunit(CartesianIndex{N})
     gpu_call(cartesian_copy_kernel!,
              dest, dest_offsets, src, src_offsets, shape, len;
-             total_threads=len)
+             elements=len)
     dest
 end

src/host/base.jl (+2 −2)

@@ -6,7 +6,7 @@ function Base.repeat(a::AbstractGPUVecOrMat, m::Int, n::Int = 1)
     if length(b) == 0
         return b
     end
-    gpu_call(b, a, o, p, m, n; total_threads=n) do ctx, b, a, o, p, m, n
+    gpu_call(b, a, o, p, m, n; elements=n) do ctx, b, a, o, p, m, n
         j = linear_index(ctx)
         j > n && return
         d = (j - 1) * p + 1

@@ -29,7 +29,7 @@ function Base.repeat(a::AbstractGPUVector, m::Int)
     if length(b) == 0
         return b
     end
-    gpu_call(b, a, o, m; total_threads=m) do ctx, b, a, o, m
+    gpu_call(b, a, o, m; elements=m) do ctx, b, a, o, m
         i = linear_index(ctx)
         i > m && return
         c = (i - 1)*o + 1
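
For reference, the index arithmetic in the second `repeat` kernel lays the copies out back-to-back. A small worked sketch, assuming `o = length(a)` and that `b` holds `o*m` elements (the allocation happens outside the shown hunk):

    a = [10, 20, 30]      # o = length(a) = 3
    m = 2                 # repeat twice, so length(b) = o*m = 6
    # thread i copies `a` into b[c:c+o-1] with c = (i - 1)*o + 1,
    # i.e. thread 1 fills b[1:3] and thread 2 fills b[4:6]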

src/host/broadcast.jl (+12 −4)

@@ -60,8 +60,12 @@ end
         end
         return
     end
-    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1)
-    config = launch_configuration(backend(dest), heuristic, length(dest), typemax(Int))
+    elements = length(dest)
+    elements_per_thread = typemax(Int)
+    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1;
+                                 elements, elements_per_thread)
+    config = launch_configuration(backend(dest), heuristic;
+                                  elements, elements_per_thread)
     gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread;
              threads=config.threads, blocks=config.blocks)

@@ -121,8 +125,12 @@ function Base.map!(f, dest::BroadcastGPUArray, xs::AbstractArray...)
         end
         return
     end
-    heuristic = launch_heuristic(backend(dest), map_kernel, dest, bc, 1)
-    config = launch_configuration(backend(dest), heuristic, common_length, typemax(Int))
+    elements = common_length
+    elements_per_thread = typemax(Int)
+    heuristic = launch_heuristic(backend(dest), map_kernel, dest, bc, 1;
+                                 elements, elements_per_thread)
+    config = launch_configuration(backend(dest), heuristic;
+                                  elements, elements_per_thread)
     gpu_call(map_kernel, dest, bc, config.elements_per_thread;
              threads=config.threads, blocks=config.blocks)
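
The `typemax(Int)` value advertises that the broadcast and map kernels can handle any number of elements per thread via a grid-stride loop. A minimal sketch of that pattern is shown below; it is illustrative rather than the exact kernel from this file, and it assumes a `global_size(ctx)` helper returning the total number of launched threads.

    function grid_stride_kernel(ctx, dest, src, nelem)
        i = linear_index(ctx)
        stride = global_size(ctx)   # assumed helper: threads per block × number of blocks
        for _ in 1:nelem
            i > length(dest) && return
            @inbounds dest[i] = src[i]
            i += stride
        end
        return
    end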

src/host/construction.jl (+3 −3)

@@ -24,7 +24,7 @@ end
 function (T::Type{<: AnyGPUArray{U}})(s::UniformScaling, dims::Dims{2}) where {U}
     res = similar(T, dims)
     fill!(res, zero(U))
-    gpu_call(identity_kernel, res, size(res, 1), s.λ; total_threads=minimum(dims))
+    gpu_call(identity_kernel, res, size(res, 1), s.λ; elements=minimum(dims))
     res
 end

@@ -34,7 +34,7 @@ end

 function Base.copyto!(A::AbstractGPUMatrix{T}, s::UniformScaling) where T
     fill!(A, zero(T))
-    gpu_call(identity_kernel, A, size(A, 1), s.λ; total_threads=minimum(size(A)))
+    gpu_call(identity_kernel, A, size(A, 1), s.λ; elements=minimum(size(A)))
     A
 end

@@ -43,7 +43,7 @@ function _one(unit::T, x::AbstractGPUMatrix) where {T}
     m==n || throw(DimensionMismatch("multiplicative identity defined only for square matrices"))
     I = similar(x, T)
     fill!(I, zero(T))
-    gpu_call(identity_kernel, I, m, unit; total_threads=m)
+    gpu_call(identity_kernel, I, m, unit; elements=m)
     I
 end

src/host/indexing.jl (+1 −1)

@@ -149,7 +149,7 @@ function _setindex!(dest::AbstractGPUArray, src, Is...)
     AT = typeof(dest).name.wrapper
     # NOTE: we are pretty liberal here supporting non-GPU sources and indices...
     gpu_call(setindex_kernel, dest, adapt(AT, src), idims, len, adapt(AT, Is)...;
-             total_threads=len)
+             elements=len)
     return dest
 end

src/host/random.jl (+1 −1)

@@ -94,7 +94,7 @@ end
 function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
     threads = (length(A) - 1) ÷ 2 + 1
     length(A) == 0 && return
-    gpu_call(A, rng.state; total_threads = threads) do ctx, a, randstates
+    gpu_call(A, rng.state; elements = threads) do ctx, a, randstates
         idx = 2*(linear_index(ctx) - 1) + 1
         U1 = gpu_rand(T, ctx, randstates)
         U2 = gpu_rand(T, ctx, randstates)
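
Each invocation draws two uniforms (`U1`, `U2`) and writes two values, presumably a Box–Muller-style pair, which is why only half as many elements as `length(A)` are requested. A quick check of the index arithmetic with a hypothetical length:

    length_A = 7
    elements = (length_A - 1) ÷ 2 + 1    # 4 kernel invocations requested
    # invocation i starts at idx = 2*(i - 1) + 1, i.e. indices 1, 3, 5, 7;
    # the second write at idx + 1 must be guarded for odd lengths (not shown in this hunk)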

src/host/uniformscaling.jl (+8 −8)

@@ -34,15 +34,15 @@ for (t1, t2) in unittriangularwrappers
         B = similar(parent(A), typeof(oneunit(T) + J))
         copyto!(B, parent(A))
         min_size = minimum(size(B))
-        gpu_call(kernel_unittriangular, B, J, one(eltype(B)), min_size; total_threads=min_size)
+        gpu_call(kernel_unittriangular, B, J, one(eltype(B)), min_size; elements=min_size)
         return $t2(B)
     end

     function (-)(J::UniformScaling, A::$t1{T, <:AbstractGPUMatrix}) where T
         B = similar(parent(A), typeof(J - oneunit(T)))
         B .= .- parent(A)
         min_size = minimum(size(B))
-        gpu_call(kernel_unittriangular, B, J, -one(eltype(B)), min_size; total_threads=min_size)
+        gpu_call(kernel_unittriangular, B, J, -one(eltype(B)), min_size; elements=min_size)
         return $t2(B)
     end
 end

@@ -54,15 +54,15 @@ for t in genericwrappers
         B = similar(parent(A), typeof(oneunit(T) + J))
         copyto!(B, parent(A))
         min_size = minimum(size(B))
-        gpu_call(kernel_generic, B, J, min_size; total_threads=min_size)
+        gpu_call(kernel_generic, B, J, min_size; elements=min_size)
         return $t(B)
     end

     function (-)(J::UniformScaling, A::$t{T, <:AbstractGPUMatrix}) where T
         B = similar(parent(A), typeof(J - oneunit(T)))
         B .= .- parent(A)
         min_size = minimum(size(B))
-        gpu_call(kernel_generic, B, J, min_size; total_threads=min_size)
+        gpu_call(kernel_generic, B, J, min_size; elements=min_size)
         return $t(B)
     end
 end

@@ -73,15 +73,15 @@ function (+)(A::Hermitian{T,<:AbstractGPUMatrix}, J::UniformScaling{<:Complex})
     B = similar(parent(A), typeof(oneunit(T) + J))
     copyto!(B, parent(A))
     min_size = minimum(size(B))
-    gpu_call(kernel_generic, B, J, min_size; total_threads=min_size)
+    gpu_call(kernel_generic, B, J, min_size; elements=min_size)
     return B
 end

 function (-)(J::UniformScaling{<:Complex}, A::Hermitian{T,<:AbstractGPUMatrix}) where T
     B = similar(parent(A), typeof(J - oneunit(T)))
     B .= .-parent(A)
     min_size = minimum(size(B))
-    gpu_call(kernel_generic, B, J, min_size; total_threads=min_size)
+    gpu_call(kernel_generic, B, J, min_size; elements=min_size)
     return B
 end

@@ -90,14 +90,14 @@ function (+)(A::AbstractGPUMatrix{T}, J::UniformScaling) where T
     B = similar(A, typeof(oneunit(T) + J))
     copyto!(B, A)
     min_size = minimum(size(B))
-    gpu_call(kernel_generic, B, J, min_size; total_threads=min_size)
+    gpu_call(kernel_generic, B, J, min_size; elements=min_size)
     return B
 end

 function (-)(J::UniformScaling, A::AbstractGPUMatrix{T}) where T
     B = similar(A, typeof(J - oneunit(T)))
     B .= .-A
     min_size = minimum(size(B))
-    gpu_call(kernel_generic, B, J, min_size; total_threads=min_size)
+    gpu_call(kernel_generic, B, J, min_size; elements=min_size)
     return B
 end

test/testsuite/gpuinterface.jl (+1 −1)

@@ -10,7 +10,7 @@
     end
     @test all(x-> x == 2, Array(x))

-    gpu_call(x; total_threads=N) do ctx, x
+    gpu_call(x; elements=N) do ctx, x
         x[linear_index(ctx)] = 2
         return
     end
