From f59a17bd178b8343042c7508b073ae05683a6b1c Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Thu, 5 Jun 2025 09:11:19 -0700 Subject: [PATCH 1/2] [SYCL] Pass `handler &` instead of queue across ABI for reduction utils Queue might be `nullptr` in case of graph, but the information these utils query is device-specific. By passing entire `handler &` and having access to graph information we'd be able to return more precise results. Another positive side-effect is that we eliminate explicit `std::shared_ptr` which is a small step forward in the ongoing refactoring efforts to prefer passing `*_impl` by raw ptr/ref with explicit `shared_from_this` whenever lifetimes need to be extended. --- sycl/include/sycl/reduction.hpp | 36 ++++------ sycl/source/detail/reduction.cpp | 92 ++++++++++++++++--------- sycl/test/abi/sycl_symbols_linux.dump | 3 + sycl/test/abi/sycl_symbols_windows.dump | 3 + 4 files changed, 78 insertions(+), 56 deletions(-) diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 6c03c80eab0f0..36448e719baf3 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -140,17 +140,10 @@ template ReduTupleT makeReduTupleT(Ts... 
Elements) { return sycl::detail::make_tuple(Elements...); } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT size_t reduGetMaxWGSize(const std::shared_ptr &Queue, - size_t LocalMemBytesPerWorkItem); -__SYCL_EXPORT size_t reduGetPreferredWGSize( - const std::shared_ptr &Queue, size_t LocalMemBytesPerWorkItem); -#else -__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr Queue, +__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh, size_t LocalMemBytesPerWorkItem); -__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, +__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh, size_t LocalMemBytesPerWorkItem); -#endif __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups); @@ -1708,8 +1701,7 @@ struct NDRangeReduction< reduction::strategy::group_reduce_and_multiple_kernels> { template - static void run(handler &CGH, - const std::shared_ptr &Queue, + static void run(handler &CGH, const std::shared_ptr &, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, @@ -1729,7 +1721,7 @@ struct NDRangeReduction< // TODO: currently the maximal work group size is determined for the given // queue/device, while it may be safer to use queries to the kernel compiled // for the device. 
- size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize); + size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize); if (NDRange.get_local_range().size() > MaxWGSize) throw sycl::exception(make_error_code(errc::nd_range), "The implementation handling parallel_for with" @@ -1847,8 +1839,7 @@ struct NDRangeReduction< template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, + static void run(handler &CGH, const std::shared_ptr &, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { using element_type = typename Reduction::reducer_element_type; @@ -1858,7 +1849,7 @@ template <> struct NDRangeReduction { // TODO: currently the maximal work group size is determined for the given // queue/device, while it may be safer to use queries to the kernel // compiled for the device. - size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize); + size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize); if (NDRange.get_local_range().size() > MaxWGSize) throw sycl::exception(make_error_code(errc::nd_range), "The implementation handling parallel_for with" @@ -2623,9 +2614,9 @@ tuple_select_elements(TupleT Tuple, std::index_sequence) { template <> struct NDRangeReduction { template - static void - run(handler &CGH, const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, RestT... Rest) { + static void run(handler &CGH, const std::shared_ptr &, + nd_range NDRange, PropertiesT &Properties, + RestT... Rest) { std::tuple ArgsTuple(Rest...); constexpr size_t NumArgs = sizeof...(RestT); auto KernelFunc = std::get(ArgsTuple); @@ -2636,7 +2627,7 @@ template <> struct NDRangeReduction { // TODO: currently the maximal work group size is determined for the given // queue/device, while it is safer to use queries to the kernel compiled // for the device. 
- size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem); + size_t MaxWGSize = reduGetMaxWGSize(CGH, LocalMemPerWorkItem); if (NDRange.get_local_range().size() > MaxWGSize) throw sycl::exception(make_error_code(errc::nd_range), "The implementation handling parallel_for with" @@ -2731,8 +2722,7 @@ void reduction_parallel_for(handler &CGH, nd_range NDRange, Properties, Rest...); } -__SYCL_EXPORT uint32_t -reduGetMaxNumConcurrentWorkGroups(std::shared_ptr Queue); +__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh); template @@ -2763,13 +2753,13 @@ void reduction_parallel_for(handler &CGH, range Range, #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS; #else - reduGetMaxNumConcurrentWorkGroups(CGH.MQueue); + reduGetMaxNumConcurrentWorkGroups(CGH); #endif // TODO: currently the preferred work group size is determined for the given // queue/device, while it is safer to use queries to the kernel pre-compiled // for the device. - size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize); + size_t PrefWGSize = reduGetPreferredWGSize(CGH, OneElemSize); size_t NWorkItems = Range.size(); size_t WGSize = std::min(NWorkItems, PrefWGSize); diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp index 702f6692825b4..fb68ee368fcd6 100644 --- a/sycl/source/detail/reduction.cpp +++ b/sycl/source/detail/reduction.cpp @@ -49,8 +49,24 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, return WGSize; } +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +// Inline this helper: +#endif +uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) { + uint32_t NumThreads = Dev.get_info(); + // TODO: The heuristics here require additional tuning for various devices + // and vendors. Also, it would be better to check vendor/generation/etc. 
+ if (Dev.is_gpu() && Dev.get_info()) + NumThreads *= 8; + return NumThreads; +} // Returns the estimated number of physical threads on the device associated // with the given queue. +__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) { + return reduGetMaxNumConcurrentWorkGroups(getSyclObjImpl(cgh)->get_device()); +} + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups( std::shared_ptr Queue) { // TODO: Graphs extension explicit API uses a handler with no queue attached, @@ -63,25 +79,14 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups( if (Queue == nullptr) { return 8; } - device Dev = Queue->get_device(); - uint32_t NumThreads = Dev.get_info(); - // TODO: The heuristics here require additional tuning for various devices - // and vendors. Also, it would be better to check vendor/generation/etc. - if (Dev.is_gpu() && Dev.get_info()) - NumThreads *= 8; - return NumThreads; + return reduGetMaxNumConcurrentWorkGroups(Queue->getDeviceImpl()); } +#endif #ifdef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT size_t -reduGetMaxWGSize(const std::shared_ptr &Queue, - size_t LocalMemBytesPerWorkItem) { -#else -__SYCL_EXPORT size_t -reduGetMaxWGSize(std::shared_ptr Queue, - size_t LocalMemBytesPerWorkItem) { +// Inline this helper: #endif - device Dev = Queue->get_device(); +size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) { size_t MaxWGSize = Dev.get_info(); size_t WGSizePerMem = MaxWGSize * 2; @@ -118,26 +123,24 @@ reduGetMaxWGSize(std::shared_ptr Queue, return WGSize; } +__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh, + size_t LocalMemBytesPerWorkItem) { + return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(), + LocalMemBytesPerWorkItem); +} +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +__SYCL_EXPORT +size_t reduGetMaxWGSize(std::shared_ptr Queue, + size_t LocalMemBytesPerWorkItem) { + return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem); +} 
+#endif #ifdef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT size_t reduGetPreferredWGSize( - const std::shared_ptr &Queue, size_t LocalMemBytesPerWorkItem) { -#else -__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, - size_t LocalMemBytesPerWorkItem) { +// Inline this helper: #endif - // TODO: Graphs extension explicit API uses a handler with a null queue to - // process CGFs, in future we should have access to the device so we can - // correctly calculate this. - // - // The 32 value was chosen as the hardcoded value as it is the returned - // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on - // Intel HD Graphics devices used as a L0 backend during development. - if (Queue == nullptr) { - return 32; - } - device Dev = Queue->get_device(); - +size_t reduGetPreferredWGSize(device_impl &Dev, + size_t LocalMemBytesPerWorkItem) { // The maximum WGSize returned by CPU devices is very large and does not // help the reduction implementation: since all work associated with a // work-group is typically assigned to one CPU thread, selecting a large @@ -174,8 +177,31 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, } // Use the maximum work-group size otherwise. - return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem); + return reduGetMaxWGSize(Dev, LocalMemBytesPerWorkItem); +} +__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh, + size_t LocalMemBytesPerWorkItem) { + return reduGetPreferredWGSize(getSyclObjImpl(cgh)->get_device(), + LocalMemBytesPerWorkItem); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, + size_t LocalMemBytesPerWorkItem) { + // TODO: Graphs extension explicit API uses a handler with a null queue to + // process CGFs, in future we should have access to the device so we can + // correctly calculate this. 
+ // + // The 32 value was chosen as the hardcoded value as it is the returned + // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on + // Intel HD Graphics devices used as a L0 backend during development. + if (Queue == nullptr) { + return 32; + } + device_impl &Dev = Queue->getDeviceImpl(); + + return reduGetPreferredWGSize(Dev, LocalMemBytesPerWorkItem); +} +#endif __SYCL_EXPORT void addCounterInit(handler &CGH, std::shared_ptr &Queue, diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index c822e0f4ec78d..67b58c8095c2a 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3290,6 +3290,7 @@ _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6a _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviibmbRKNS0_13property_listE _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE +_ZN4sycl3_V16detail16reduGetMaxWGSizeERNS0_7handlerEm _ZN4sycl3_V16detail16reduGetMaxWGSizeESt10shared_ptrINS1_10queue_implEEm _ZN4sycl3_V16detail17HostProfilingInfo3endEv _ZN4sycl3_V16detail17HostProfilingInfo5startEv @@ -3324,6 +3325,7 @@ _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6devi _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE +_ZN4sycl3_V16detail22reduGetPreferredWGSizeERNS0_7handlerEm _ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm 
_ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE _ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE @@ -3343,6 +3345,7 @@ _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHost6getPtrEv _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC1ENS0_5rangeILi3EEENS0_6access4modeEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderERKNS0_13property_listE _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC2ENS0_5rangeILi3EEENS0_6access4modeEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderERKNS0_13property_listE _ZN4sycl3_V16detail33enable_ext_oneapi_default_contextEb +_ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsERNS0_7handlerE _ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsESt10shared_ptrINS1_10queue_implEE _ZN4sycl3_V16detail34addHostSampledImageAccessorAndWaitEPNS1_28SampledImageAccessorImplHostE _ZN4sycl3_V16detail35sampledImageConstructorNotificationEPvS2_RKSt8optionalINS0_12image_targetEEPKvjRKNS1_13code_locationE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index ed718ca0da207..32ebec506c12f 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4342,9 +4342,12 @@ ?processArg@handler@_V1@sycl@@AEAAXPEAXAEBW4kernel_param_kind_t@detail@23@H_KAEA_K_N4@Z ?query@tls_code_loc_t@detail@_V1@sycl@@QEAAAEBUcode_location@234@XZ ?reduComputeWGSize@detail@_V1@sycl@@YA_K_K0AEA_K@Z +?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIAEAVhandler@23@@Z ?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@@Z +?reduGetMaxWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z ?reduGetMaxWGSize@detail@_V1@sycl@@YA_KV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z ?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z 
+?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z ?registerDynamicParameter@handler@_V1@sycl@@AEAAXAEAVdynamic_parameter_base@detail@experimental@oneapi@ext@23@H@Z ?registerDynamicParameter@handler@_V1@sycl@@AEAAXPEAVdynamic_parameter_impl@detail@experimental@oneapi@ext@23@H@Z ?release_external_memory@experimental@oneapi@ext@_V1@sycl@@YAXUexternal_mem@12345@AEBVdevice@45@AEBVcontext@45@@Z From a58070233f71cf04978c51995d7dcf6098bbe7d7 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Thu, 5 Jun 2025 15:14:36 -0700 Subject: [PATCH 2/2] Stop passing Queue around in reductions --- sycl/include/sycl/reduction.hpp | 52 ++++++++++----------------------- 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 36448e719baf3..79d1dbde93c29 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -1238,15 +1238,12 @@ template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " "local_atomic_and_atomic_cross_wg strategy."); - std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, reduction::strategy::local_atomic_and_atomic_cross_wg>; @@ -1290,15 +1287,12 @@ struct NDRangeReduction< reduction::strategy::group_reduce_and_last_wg_detection> { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " "group_reduce_and_last_wg_detection strategy."); - 
std::ignore = Queue; size_t NElements = Reduction::num_elements; size_t WGSize = NDRange.get_local_range().size(); size_t NWorkGroups = NDRange.get_group_range().size(); @@ -1490,9 +1484,7 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID, template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { using reducer_type = typename Reduction::reducer_type; using element_type = typename ReducerTraits::element_type; @@ -1504,7 +1496,6 @@ template <> struct NDRangeReduction { constexpr bool UsePartialSumForOutput = !Reduction::is_usm && Reduction::has_identity; - std::ignore = Queue; size_t NElements = Reduction::num_elements; size_t WGSize = NDRange.get_local_range().size(); size_t NWorkGroups = NDRange.get_group_range().size(); @@ -1602,15 +1593,12 @@ template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " "group_reduce_and_atomic_cross_wg strategy."); - std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, reduction::strategy::group_reduce_and_atomic_cross_wg>; @@ -1639,14 +1627,11 @@ struct NDRangeReduction< reduction::strategy::local_mem_tree_and_atomic_cross_wg> { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { using reducer_type = typename Reduction::reducer_type; using element_type = typename ReducerTraits::element_type; 
- std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, reduction::strategy::local_mem_tree_and_atomic_cross_wg>; @@ -1701,8 +1686,7 @@ struct NDRangeReduction< reduction::strategy::group_reduce_and_multiple_kernels> { template - static void run(handler &CGH, const std::shared_ptr &, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " @@ -1839,8 +1823,7 @@ struct NDRangeReduction< template <> struct NDRangeReduction { template - static void run(handler &CGH, const std::shared_ptr &, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { using element_type = typename Reduction::reducer_element_type; @@ -2614,8 +2597,7 @@ tuple_select_elements(TupleT Tuple, std::index_sequence) { template <> struct NDRangeReduction { template - static void run(handler &CGH, const std::shared_ptr &, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, RestT... 
Rest) { std::tuple ArgsTuple(Rest...); constexpr size_t NumArgs = sizeof...(RestT); @@ -2658,13 +2640,10 @@ template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { auto Delegate = [&](auto Impl) { - Impl.template run(CGH, Queue, NDRange, Properties, Redu, - KernelFunc); + Impl.template run(CGH, NDRange, Properties, Redu, KernelFunc); }; if constexpr (Reduction::has_float64_atomics) { @@ -2706,10 +2685,9 @@ template <> struct NDRangeReduction { } template - static void - run(handler &CGH, const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, RestT... Rest) { - return Impl::run(CGH, Queue, NDRange, Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, + RestT... Rest) { + return Impl::run(CGH, NDRange, Properties, Rest...); } }; @@ -2718,8 +2696,8 @@ template void reduction_parallel_for(handler &CGH, nd_range NDRange, PropertiesT Properties, RestT... Rest) { - NDRangeReduction::template run(CGH, CGH.MQueue, NDRange, - Properties, Rest...); + NDRangeReduction::template run(CGH, NDRange, Properties, + Rest...); } __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh);