[SYCL] Pass handler & instead of queue across ABI for reduction utils #18834

Merged Jun 6, 2025 (2 commits)
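In short: the exported reduction helpers (reduGetMaxWGSize, reduGetPreferredWGSize, reduGetMaxNumConcurrentWorkGroups) now take a `handler &` instead of a `std::shared_ptr<queue_impl>`, resolve the device through the handler, and delegate to device_impl-based helpers; the queue-based entry points are retained under `#ifndef __INTEL_PREVIEW_BREAKING_CHANGES` so existing binaries keep linking. A condensed sketch of the pattern for one of the helpers (same names as the diff, internal headers omitted, so not compilable on its own):

```cpp
// Sketch of the new shape of reduGetMaxWGSize; see sycl/source/detail/reduction.cpp
// in the diff below for the real implementation.
namespace sycl::detail {

// Device-based helper: does the actual work and is no longer exported.
size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem);

// New ABI entry point: callers pass the handler they already have.
__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
                                      size_t LocalMemBytesPerWorkItem) {
  return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(),
                          LocalMemBytesPerWorkItem);
}

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
// Legacy ABI entry point, retained only for binary compatibility.
__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                                      size_t LocalMemBytesPerWorkItem) {
  return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem);
}
#endif

} // namespace sycl::detail
```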
80 changes: 24 additions & 56 deletions sycl/include/sycl/reduction.hpp
@@ -140,17 +140,10 @@ template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
return sycl::detail::make_tuple(Elements...);
}

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t reduGetMaxWGSize(const std::shared_ptr<queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduGetPreferredWGSize(
const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem);
#else
__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
size_t LocalMemBytesPerWorkItem);
#endif
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
size_t &NWorkGroups);

@@ -1245,15 +1238,12 @@ template <>
struct NDRangeReduction<reduction::strategy::local_atomic_and_atomic_cross_wg> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
"Identityless reductions are not supported by the "
"local_atomic_and_atomic_cross_wg strategy.");

std::ignore = Queue;
using Name = __sycl_reduction_kernel<
reduction::MainKrn, KernelName,
reduction::strategy::local_atomic_and_atomic_cross_wg>;
@@ -1297,15 +1287,12 @@ struct NDRangeReduction<
reduction::strategy::group_reduce_and_last_wg_detection> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
"Identityless reductions are not supported by the "
"group_reduce_and_last_wg_detection strategy.");

std::ignore = Queue;
size_t NElements = Reduction::num_elements;
size_t WGSize = NDRange.get_local_range().size();
size_t NWorkGroups = NDRange.get_group_range().size();
@@ -1497,9 +1484,7 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID,
template <> struct NDRangeReduction<reduction::strategy::range_basic> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
using reducer_type = typename Reduction::reducer_type;
using element_type = typename ReducerTraits<reducer_type>::element_type;
@@ -1511,7 +1496,6 @@ template <> struct NDRangeReduction<reduction::strategy::range_basic> {
constexpr bool UsePartialSumForOutput =
!Reduction::is_usm && Reduction::has_identity;

std::ignore = Queue;
size_t NElements = Reduction::num_elements;
size_t WGSize = NDRange.get_local_range().size();
size_t NWorkGroups = NDRange.get_group_range().size();
@@ -1609,15 +1593,12 @@ template <>
struct NDRangeReduction<reduction::strategy::group_reduce_and_atomic_cross_wg> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
"Identityless reductions are not supported by the "
"group_reduce_and_atomic_cross_wg strategy.");

std::ignore = Queue;
using Name = __sycl_reduction_kernel<
reduction::MainKrn, KernelName,
reduction::strategy::group_reduce_and_atomic_cross_wg>;
@@ -1646,14 +1627,11 @@ struct NDRangeReduction<
reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
using reducer_type = typename Reduction::reducer_type;
using element_type = typename ReducerTraits<reducer_type>::element_type;

std::ignore = Queue;
using Name = __sycl_reduction_kernel<
reduction::MainKrn, KernelName,
reduction::strategy::local_mem_tree_and_atomic_cross_wg>;
@@ -1708,9 +1686,7 @@ struct NDRangeReduction<
reduction::strategy::group_reduce_and_multiple_kernels> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
"Identityless reductions are not supported by the "
@@ -1729,7 +1705,7 @@ struct NDRangeReduction<
// TODO: currently the maximal work group size is determined for the given
// queue/device, while it may be safer to use queries to the kernel compiled
// for the device.
size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
if (NDRange.get_local_range().size() > MaxWGSize)
throw sycl::exception(make_error_code(errc::nd_range),
"The implementation handling parallel_for with"
@@ -1847,9 +1823,7 @@ struct NDRangeReduction<
template <> struct NDRangeReduction<reduction::strategy::basic> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
using element_type = typename Reduction::reducer_element_type;

@@ -1858,7 +1832,7 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
// TODO: currently the maximal work group size is determined for the given
// queue/device, while it may be safer to use queries to the kernel
// compiled for the device.
size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
if (NDRange.get_local_range().size() > MaxWGSize)
throw sycl::exception(make_error_code(errc::nd_range),
"The implementation handling parallel_for with"
@@ -2623,9 +2597,8 @@ tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
template <> struct NDRangeReduction<reduction::strategy::multi> {
template <typename KernelName, int Dims, typename PropertiesT,
typename... RestT>
static void
run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
RestT... Rest) {
std::tuple<RestT...> ArgsTuple(Rest...);
constexpr size_t NumArgs = sizeof...(RestT);
auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
@@ -2636,7 +2609,7 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
// TODO: currently the maximal work group size is determined for the given
// queue/device, while it is safer to use queries to the kernel compiled
// for the device.
size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
size_t MaxWGSize = reduGetMaxWGSize(CGH, LocalMemPerWorkItem);
if (NDRange.get_local_range().size() > MaxWGSize)
throw sycl::exception(make_error_code(errc::nd_range),
"The implementation handling parallel_for with"
@@ -2667,13 +2640,10 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {

template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
auto Delegate = [&](auto Impl) {
Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
KernelFunc);
Impl.template run<KernelName>(CGH, NDRange, Properties, Redu, KernelFunc);
};

if constexpr (Reduction::has_float64_atomics) {
@@ -2715,10 +2685,9 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
}
template <typename KernelName, int Dims, typename PropertiesT,
typename... RestT>
static void
run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
RestT... Rest) {
return Impl<Strat::multi>::run<KernelName>(CGH, NDRange, Properties,
Rest...);
}
};
@@ -2727,12 +2696,11 @@ template <typename KernelName, reduction::strategy Strategy, int Dims,
typename PropertiesT, typename... RestT>
void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
PropertiesT Properties, RestT... Rest) {
NDRangeReduction<Strategy>::template run<KernelName>(CGH, CGH.MQueue, NDRange,
Properties, Rest...);
NDRangeReduction<Strategy>::template run<KernelName>(CGH, NDRange, Properties,
Rest...);
}

__SYCL_EXPORT uint32_t
reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh);

template <typename KernelName, reduction::strategy Strategy, int Dims,
typename PropertiesT, typename... RestT>
@@ -2763,13 +2731,13 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range,
#ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
__SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
#else
reduGetMaxNumConcurrentWorkGroups(CGH.MQueue);
reduGetMaxNumConcurrentWorkGroups(CGH);
#endif

// TODO: currently the preferred work group size is determined for the given
// queue/device, while it is safer to use queries to the kernel pre-compiled
// for the device.
size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
size_t PrefWGSize = reduGetPreferredWGSize(CGH, OneElemSize);

size_t NWorkItems = Range.size();
size_t WGSize = std::min(NWorkItems, PrefWGSize);
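For context (not part of this diff), the helpers above back the standard reduction path: a parallel_for that carries a sycl::reduction object is lowered through detail::reduction_parallel_for, which is where the work-group-size queries touched by this PR are made. A minimal, self-contained sketch of that public API:

```cpp
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue Q;
  constexpr size_t N = 1024;
  int *Data = sycl::malloc_shared<int>(N, Q);
  int *Sum = sycl::malloc_shared<int>(1, Q);
  for (size_t I = 0; I < N; ++I)
    Data[I] = 1;
  *Sum = 0;

  // parallel_for with a sycl::reduction object; internally this routes
  // through detail::reduction_parallel_for and the helpers in this diff.
  Q.parallel_for(sycl::range<1>{N}, sycl::reduction(Sum, sycl::plus<int>()),
                 [=](sycl::id<1> Idx, auto &Red) { Red += Data[Idx[0]]; })
      .wait();

  std::cout << "Sum = " << *Sum << "\n"; // expected: 1024
  sycl::free(Data, Q);
  sycl::free(Sum, Q);
}
```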
92 changes: 59 additions & 33 deletions sycl/source/detail/reduction.cpp
@@ -49,8 +49,24 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
return WGSize;
}

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
// Inline this helper:
#endif
uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) {
uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
// TODO: The heuristics here require additional tuning for various devices
// and vendors. Also, it would be better to check vendor/generation/etc.
if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
NumThreads *= 8;
return NumThreads;
}
// Returns the estimated number of physical threads on the device associated
// with the given queue.
__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) {
return reduGetMaxNumConcurrentWorkGroups(getSyclObjImpl(cgh)->get_device());
}

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
std::shared_ptr<sycl::detail::queue_impl> Queue) {
// TODO: Graphs extension explicit API uses a handler with no queue attached,
@@ -63,25 +79,14 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
if (Queue == nullptr) {
return 8;
}
device Dev = Queue->get_device();
uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
// TODO: The heuristics here require additional tuning for various devices
// and vendors. Also, it would be better to check vendor/generation/etc.
if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
NumThreads *= 8;
return NumThreads;
return reduGetMaxNumConcurrentWorkGroups(Queue->getDeviceImpl());
}
#endif

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t
reduGetMaxWGSize(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem) {
#else
__SYCL_EXPORT size_t
reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
size_t LocalMemBytesPerWorkItem) {
// Inline this helper:
#endif
device Dev = Queue->get_device();
size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) {
size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();

size_t WGSizePerMem = MaxWGSize * 2;
@@ -118,26 +123,24 @@ reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,

return WGSize;
}
__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
size_t LocalMemBytesPerWorkItem) {
return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(),
LocalMemBytesPerWorkItem);
}
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT
size_t reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
size_t LocalMemBytesPerWorkItem) {
return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem);
}
#endif

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t reduGetPreferredWGSize(
const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem) {
#else
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem) {
// Inline this helper:
#endif
// TODO: Graphs extension explicit API uses a handler with a null queue to
// process CGFs, in future we should have access to the device so we can
// correctly calculate this.
//
// The 32 value was chosen as the hardcoded value as it is the returned
// value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
// Intel HD Graphics devices used as a L0 backend during development.
if (Queue == nullptr) {
return 32;
}
device Dev = Queue->get_device();

size_t reduGetPreferredWGSize(device_impl &Dev,
size_t LocalMemBytesPerWorkItem) {
// The maximum WGSize returned by CPU devices is very large and does not
// help the reduction implementation: since all work associated with a
// work-group is typically assigned to one CPU thread, selecting a large
@@ -174,8 +177,31 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
}

// Use the maximum work-group size otherwise.
return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
return reduGetMaxWGSize(Dev, LocalMemBytesPerWorkItem);
}
__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
size_t LocalMemBytesPerWorkItem) {
return reduGetPreferredWGSize(getSyclObjImpl(cgh)->get_device(),
LocalMemBytesPerWorkItem);
}
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem) {
// TODO: Graphs extension explicit API uses a handler with a null queue to
// process CGFs, in future we should have access to the device so we can
// correctly calculate this.
//
// The 32 value was chosen as the hardcoded value as it is the returned
// value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
// Intel HD Graphics devices used as a L0 backend during development.
if (Queue == nullptr) {
return 32;
}
device_impl &Dev = Queue->getDeviceImpl();

return reduGetPreferredWGSize(Dev, LocalMemBytesPerWorkItem);
}
#endif

__SYCL_EXPORT void
addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
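The concurrency heuristic moved into the device_impl-based helper can be approximated with public device queries; a small sketch, assuming the standard SYCL 2020 info descriptors (the internal device_impl forwards to the same queries). The function name estimateMaxConcurrentWorkGroups is made up for illustration:

```cpp
#include <sycl/sycl.hpp>
#include <cstdint>
#include <iostream>

// Rough reproduction of the reduGetMaxNumConcurrentWorkGroups heuristic using
// only public queries; the runtime version operates on the internal device_impl.
std::uint32_t estimateMaxConcurrentWorkGroups(const sycl::device &Dev) {
  std::uint32_t NumThreads =
      Dev.get_info<sycl::info::device::max_compute_units>();
  // Integrated GPUs (host-unified memory) get an 8x oversubscription factor,
  // matching the heuristic in reduction.cpp.
  if (Dev.is_gpu() &&
      Dev.get_info<sycl::info::device::host_unified_memory>())
    NumThreads *= 8;
  return NumThreads;
}

int main() {
  sycl::device Dev{sycl::default_selector_v};
  std::cout << estimateMaxConcurrentWorkGroups(Dev)
            << " concurrent work-groups (estimated)\n";
}
```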