Skip to content

Commit 5d9a108

Browse files
authored
Merge pull request #372 from JuliaGPU/tb/block_heuristic
Try to use the heuristic's block configuration when using grid-stride kernels.
2 parents 363b991 + 20f1a38 commit 5d9a108

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

src/device/execution.jl

+10-4
Original file line number | Diff line number | Diff line change
@@ -99,11 +99,17 @@ function launch_configuration(backend::AbstractGPUBackend, heuristic;
9999

100100
if elements_per_thread > 1 && blocks > heuristic.blocks
101101
# we want to launch more blocks than required, so prefer a grid-stride loop instead
102-
nelem = clamp(fld(blocks, heuristic.blocks), 1, elements_per_thread)
103-
blocks = cld(blocks, nelem)
104-
(threads=threads, blocks=blocks, elements_per_thread=nelem)
102+
## try to stick to the number of blocks that the heuristic suggested
103+
blocks = heuristic.blocks
104+
nelem = cld(elements, blocks*threads)
105+
## only bump the number of blocks if we really need to
106+
if nelem > elements_per_thread
107+
nelem = elements_per_thread
108+
blocks = cld(elements, nelem*threads)
109+
end
110+
(; threads, blocks, elements_per_thread=nelem)
105111
else
106-
(threads=threads, blocks=blocks, elements_per_thread=1)
112+
(; threads, blocks, elements_per_thread=1)
107113
end
108114
end
109115

0 commit comments

Comments
 (0)