@@ -28,38 +28,47 @@ host to influence how the kernel is executed. The following keyword arguments ar
28
28
29
29
- `target::AbstractArray`: specify which array object to use for determining execution
30
30
properties (defaults to the first argument `arg0`).
31
- - `total_threads::Int`: how many threads should be launched _in total_. The actual number of
32
- threads and blocks is determined using a heuristic. Defaults to the length of `arg0` if
33
- no other keyword arguments that influence the launch configuration are specified.
31
+ - `elements::Int`: how many elements will be processed by this kernel. In most
32
+ circumstances, this will correspond to the total number of threads that needs to be
33
+ launched, unless the kernel supports a variable number of elements to process per
34
+ iteration. Defaults to the length of `arg0` if no other keyword arguments that influence
35
+ the launch configuration are specified.
34
36
- `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are
35
- launched. This cannot be used in combination with the `total_threads ` argument.
36
- - `name::String`: inform the back end about the name of the kernel to be executed.
37
- This can be used to emit better diagnostics, and is useful with anonymous kernels.
37
+ launched. This cannot be used in combination with the `elements ` argument.
38
+ - `name::String`: inform the back end about the name of the kernel to be executed. This can
39
+ be used to emit better diagnostics, and is useful with anonymous kernels.
38
40
"""
39
41
function gpu_call (kernel:: F , args:: Vararg{Any,N} ;
40
42
target:: AbstractArray = first (args),
41
- total_threads :: Union{Int,Nothing} = nothing ,
43
+ elements :: Union{Int,Nothing} = nothing ,
42
44
threads:: Union{Int,Nothing} = nothing ,
43
45
blocks:: Union{Int,Nothing} = nothing ,
44
46
name:: Union{String,Nothing} = nothing ) where {F,N}
45
47
# non-trivial default values for launch configuration
46
- if total_threads === nothing && threads=== nothing && blocks=== nothing
47
- total_threads = length (target)
48
- elseif total_threads === nothing
48
+ if elements === nothing && threads=== nothing && blocks=== nothing
49
+ elements = length (target)
50
+ elseif elements === nothing
49
51
if threads === nothing
50
52
threads = 1
51
53
end
52
54
if blocks === nothing
53
55
blocks = 1
54
56
end
55
57
elseif threads!= = nothing || blocks!= = nothing
56
- error (" Cannot specify both total_threads and threads/blocks configuration" )
58
+ error (" Cannot specify both elements and threads/blocks configuration" )
57
59
end
58
60
59
- if total_threads != = nothing
60
- @assert total_threads > 0
61
- heuristic = launch_heuristic (backend (target), kernel, args... )
62
- config = launch_configuration (backend (target), heuristic, total_threads)
61
+ # a kernel that processes multiple elements per thread needs to be told how many, and
62
+ # there's no easy way to do that without passing additional arguments or changing the
63
+ # context. both are expensive, so such kernels require manual use of `launch_heuristic`.
64
+ elements_per_thread = 1
65
+
66
+ if elements != = nothing
67
+ @assert elements > 0
68
+ heuristic = launch_heuristic (backend (target), kernel, args... ;
69
+ elements, elements_per_thread)
70
+ config = launch_configuration (backend (target), heuristic;
71
+ elements, elements_per_thread)
63
72
gpu_call (backend (target), kernel, args, config. threads, config. blocks; name= name)
64
73
else
65
74
@assert threads > 0
@@ -68,29 +77,29 @@ function gpu_call(kernel::F, args::Vararg{Any,N};
68
77
end
69
78
end
70
79
71
- # how many threads and blocks this kernel need to fully saturate the GPU.
72
- # this can be specialised if more sophisticated heuristics are available.
80
+ # how many threads and blocks `kernel` needs to be launched with, passing arguments `args`,
81
+ # to fully saturate the GPU. `elements` indicates the number of elements that needs to be
82
+ # processed, while `elements_per_thread` indicates the number of elements this kernel can
83
+ # process per thread (i.e. more than 1 if it's a grid-stride kernel, or 1 otherwise).
73
84
#
74
- # the `maximize_blocksize` indicates whether the kernel benifits from a large block size
85
+ # this heuristic should be specialized for the back-end, ideally using an API for maximizing
86
+ # the occupancy of the launch configuration (like CUDA's occupancy API).
75
87
function launch_heuristic (backend:: AbstractGPUBackend , kernel, args... ;
76
- maximize_blocksize = false )
88
+ elements :: Int , elements_per_thread :: Int )
77
89
return (threads= 256 , blocks= 32 )
78
90
end
79
91
80
92
# determine how many threads and blocks to actually launch given upper limits.
81
93
# returns a named tuple of threads, blocks, and elements_per_thread (which is always 1
82
94
# unless specified that the kernel can handle a number of elements per thread)
83
- function launch_configuration (backend:: AbstractGPUBackend , heuristic,
84
- elements:: Int , elements_per_thread:: Int = 1 )
95
+ function launch_configuration (backend:: AbstractGPUBackend , heuristic;
96
+ elements:: Int , elements_per_thread:: Int )
85
97
threads = clamp (elements, 1 , heuristic. threads)
86
98
blocks = max (cld (elements, threads), 1 )
87
99
88
- # FIXME : use grid-stride loop when we can't launch the number of blocks we need
89
-
90
- if false && elements_per_thread > 1 && blocks > heuristic. blocks
100
+ if elements_per_thread > 1 && blocks > heuristic. blocks
91
101
# we want to launch more blocks than required, so prefer a grid-stride loop instead
92
- # NOTE: this does not seem to improve performance
93
- nelem = clamp (cld (blocks, heuristic. blocks), 1 , elements_per_thread)
102
+ nelem = clamp (fld (blocks, heuristic. blocks), 1 , elements_per_thread)
94
103
blocks = cld (blocks, nelem)
95
104
(threads= threads, blocks= blocks, elements_per_thread= nelem)
96
105
else
0 commit comments