From f59a17bd178b8343042c7508b073ae05683a6b1c Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Thu, 5 Jun 2025 09:11:19 -0700 Subject: [PATCH 1/2] [SYCL] Pass `handler &` instead of queue across ABI for reduction utils Queue might be `nullptr` in case of graph, but the information these utils query is device-specific. By passing entire `handler &` and having access to graph information we'd be able to return more precise results. Another positive side-effect is that we eliminate explicit `std::shared_ptr` which is a small step forward in the ongoing refactoring efforts to prefer passing `*_impl` by raw ptr/ref with explicit `shared_from_this` whenever lifetimes need to be extended. --- sycl/include/sycl/reduction.hpp | 36 ++++------ sycl/source/detail/reduction.cpp | 92 ++++++++++++++++--------- sycl/test/abi/sycl_symbols_linux.dump | 3 + sycl/test/abi/sycl_symbols_windows.dump | 3 + 4 files changed, 78 insertions(+), 56 deletions(-) diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 6c03c80eab0f0..36448e719baf3 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -140,17 +140,10 @@ template ReduTupleT makeReduTupleT(Ts... 
Elements) { return sycl::detail::make_tuple(Elements...); } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT size_t reduGetMaxWGSize(const std::shared_ptr &Queue, - size_t LocalMemBytesPerWorkItem); -__SYCL_EXPORT size_t reduGetPreferredWGSize( - const std::shared_ptr &Queue, size_t LocalMemBytesPerWorkItem); -#else -__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr Queue, +__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh, size_t LocalMemBytesPerWorkItem); -__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, +__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh, size_t LocalMemBytesPerWorkItem); -#endif __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups); @@ -1708,8 +1701,7 @@ struct NDRangeReduction< reduction::strategy::group_reduce_and_multiple_kernels> { template - static void run(handler &CGH, - const std::shared_ptr &Queue, + static void run(handler &CGH, const std::shared_ptr &, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, @@ -1729,7 +1721,7 @@ struct NDRangeReduction< // TODO: currently the maximal work group size is determined for the given // queue/device, while it may be safer to use queries to the kernel compiled // for the device. 
- size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize); + size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize); if (NDRange.get_local_range().size() > MaxWGSize) throw sycl::exception(make_error_code(errc::nd_range), "The implementation handling parallel_for with" @@ -1847,8 +1839,7 @@ struct NDRangeReduction< template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, + static void run(handler &CGH, const std::shared_ptr &, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { using element_type = typename Reduction::reducer_element_type; @@ -1858,7 +1849,7 @@ template <> struct NDRangeReduction { // TODO: currently the maximal work group size is determined for the given // queue/device, while it may be safer to use queries to the kernel // compiled for the device. - size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize); + size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize); if (NDRange.get_local_range().size() > MaxWGSize) throw sycl::exception(make_error_code(errc::nd_range), "The implementation handling parallel_for with" @@ -2623,9 +2614,9 @@ tuple_select_elements(TupleT Tuple, std::index_sequence) { template <> struct NDRangeReduction { template - static void - run(handler &CGH, const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, RestT... Rest) { + static void run(handler &CGH, const std::shared_ptr &, + nd_range NDRange, PropertiesT &Properties, + RestT... Rest) { std::tuple ArgsTuple(Rest...); constexpr size_t NumArgs = sizeof...(RestT); auto KernelFunc = std::get(ArgsTuple); @@ -2636,7 +2627,7 @@ template <> struct NDRangeReduction { // TODO: currently the maximal work group size is determined for the given // queue/device, while it is safer to use queries to the kernel compiled // for the device. 
- size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem); + size_t MaxWGSize = reduGetMaxWGSize(CGH, LocalMemPerWorkItem); if (NDRange.get_local_range().size() > MaxWGSize) throw sycl::exception(make_error_code(errc::nd_range), "The implementation handling parallel_for with" @@ -2731,8 +2722,7 @@ void reduction_parallel_for(handler &CGH, nd_range NDRange, Properties, Rest...); } -__SYCL_EXPORT uint32_t -reduGetMaxNumConcurrentWorkGroups(std::shared_ptr Queue); +__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh); template @@ -2763,13 +2753,13 @@ void reduction_parallel_for(handler &CGH, range Range, #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS; #else - reduGetMaxNumConcurrentWorkGroups(CGH.MQueue); + reduGetMaxNumConcurrentWorkGroups(CGH); #endif // TODO: currently the preferred work group size is determined for the given // queue/device, while it is safer to use queries to the kernel pre-compiled // for the device. - size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize); + size_t PrefWGSize = reduGetPreferredWGSize(CGH, OneElemSize); size_t NWorkItems = Range.size(); size_t WGSize = std::min(NWorkItems, PrefWGSize); diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp index 702f6692825b4..fb68ee368fcd6 100644 --- a/sycl/source/detail/reduction.cpp +++ b/sycl/source/detail/reduction.cpp @@ -49,8 +49,24 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, return WGSize; } +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +// Inline this helper: +#endif +uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) { + uint32_t NumThreads = Dev.get_info(); + // TODO: The heuristics here require additional tuning for various devices + // and vendors. Also, it would be better to check vendor/generation/etc. 
+ if (Dev.is_gpu() && Dev.get_info()) + NumThreads *= 8; + return NumThreads; +} // Returns the estimated number of physical threads on the device associated // with the given queue. +__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) { + return reduGetMaxNumConcurrentWorkGroups(getSyclObjImpl(cgh)->get_device()); +} + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups( std::shared_ptr Queue) { // TODO: Graphs extension explicit API uses a handler with no queue attached, @@ -63,25 +79,14 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups( if (Queue == nullptr) { return 8; } - device Dev = Queue->get_device(); - uint32_t NumThreads = Dev.get_info(); - // TODO: The heuristics here require additional tuning for various devices - // and vendors. Also, it would be better to check vendor/generation/etc. - if (Dev.is_gpu() && Dev.get_info()) - NumThreads *= 8; - return NumThreads; + return reduGetMaxNumConcurrentWorkGroups(Queue->getDeviceImpl()); } +#endif #ifdef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT size_t -reduGetMaxWGSize(const std::shared_ptr &Queue, - size_t LocalMemBytesPerWorkItem) { -#else -__SYCL_EXPORT size_t -reduGetMaxWGSize(std::shared_ptr Queue, - size_t LocalMemBytesPerWorkItem) { +// Inline this helper: #endif - device Dev = Queue->get_device(); +size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) { size_t MaxWGSize = Dev.get_info(); size_t WGSizePerMem = MaxWGSize * 2; @@ -118,26 +123,24 @@ reduGetMaxWGSize(std::shared_ptr Queue, return WGSize; } +__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh, + size_t LocalMemBytesPerWorkItem) { + return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(), + LocalMemBytesPerWorkItem); +} +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +__SYCL_EXPORT +size_t reduGetMaxWGSize(std::shared_ptr Queue, + size_t LocalMemBytesPerWorkItem) { + return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem); +} 
+#endif #ifdef __INTEL_PREVIEW_BREAKING_CHANGES -__SYCL_EXPORT size_t reduGetPreferredWGSize( - const std::shared_ptr &Queue, size_t LocalMemBytesPerWorkItem) { -#else -__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, - size_t LocalMemBytesPerWorkItem) { +// Inline this helper: #endif - // TODO: Graphs extension explicit API uses a handler with a null queue to - // process CGFs, in future we should have access to the device so we can - // correctly calculate this. - // - // The 32 value was chosen as the hardcoded value as it is the returned - // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on - // Intel HD Graphics devices used as a L0 backend during development. - if (Queue == nullptr) { - return 32; - } - device Dev = Queue->get_device(); - +size_t reduGetPreferredWGSize(device_impl &Dev, + size_t LocalMemBytesPerWorkItem) { // The maximum WGSize returned by CPU devices is very large and does not // help the reduction implementation: since all work associated with a // work-group is typically assigned to one CPU thread, selecting a large @@ -174,8 +177,31 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, } // Use the maximum work-group size otherwise. - return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem); + return reduGetMaxWGSize(Dev, LocalMemBytesPerWorkItem); +} +__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh, + size_t LocalMemBytesPerWorkItem) { + return reduGetPreferredWGSize(getSyclObjImpl(cgh)->get_device(), + LocalMemBytesPerWorkItem); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, + size_t LocalMemBytesPerWorkItem) { + // TODO: Graphs extension explicit API uses a handler with a null queue to + // process CGFs, in future we should have access to the device so we can + // correctly calculate this. 
+ // + // The 32 value was chosen as the hardcoded value as it is the returned + // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on + // Intel HD Graphics devices used as a L0 backend during development. + if (Queue == nullptr) { + return 32; + } + device_impl &Dev = Queue->getDeviceImpl(); + + return reduGetPreferredWGSize(Dev, LocalMemBytesPerWorkItem); +} +#endif __SYCL_EXPORT void addCounterInit(handler &CGH, std::shared_ptr &Queue, diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index c822e0f4ec78d..67b58c8095c2a 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3290,6 +3290,7 @@ _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6a _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviibmbRKNS0_13property_listE _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE +_ZN4sycl3_V16detail16reduGetMaxWGSizeERNS0_7handlerEm _ZN4sycl3_V16detail16reduGetMaxWGSizeESt10shared_ptrINS1_10queue_implEEm _ZN4sycl3_V16detail17HostProfilingInfo3endEv _ZN4sycl3_V16detail17HostProfilingInfo5startEv @@ -3324,6 +3325,7 @@ _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6devi _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE +_ZN4sycl3_V16detail22reduGetPreferredWGSizeERNS0_7handlerEm _ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm 
_ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE _ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE @@ -3343,6 +3345,7 @@ _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHost6getPtrEv _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC1ENS0_5rangeILi3EEENS0_6access4modeEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderERKNS0_13property_listE _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC2ENS0_5rangeILi3EEENS0_6access4modeEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderERKNS0_13property_listE _ZN4sycl3_V16detail33enable_ext_oneapi_default_contextEb +_ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsERNS0_7handlerE _ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsESt10shared_ptrINS1_10queue_implEE _ZN4sycl3_V16detail34addHostSampledImageAccessorAndWaitEPNS1_28SampledImageAccessorImplHostE _ZN4sycl3_V16detail35sampledImageConstructorNotificationEPvS2_RKSt8optionalINS0_12image_targetEEPKvjRKNS1_13code_locationE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index ed718ca0da207..32ebec506c12f 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4342,9 +4342,12 @@ ?processArg@handler@_V1@sycl@@AEAAXPEAXAEBW4kernel_param_kind_t@detail@23@H_KAEA_K_N4@Z ?query@tls_code_loc_t@detail@_V1@sycl@@QEAAAEBUcode_location@234@XZ ?reduComputeWGSize@detail@_V1@sycl@@YA_K_K0AEA_K@Z +?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIAEAVhandler@23@@Z ?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@@Z +?reduGetMaxWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z ?reduGetMaxWGSize@detail@_V1@sycl@@YA_KV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z ?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z 
+?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z ?registerDynamicParameter@handler@_V1@sycl@@AEAAXAEAVdynamic_parameter_base@detail@experimental@oneapi@ext@23@H@Z ?registerDynamicParameter@handler@_V1@sycl@@AEAAXPEAVdynamic_parameter_impl@detail@experimental@oneapi@ext@23@H@Z ?release_external_memory@experimental@oneapi@ext@_V1@sycl@@YAXUexternal_mem@12345@AEBVdevice@45@AEBVcontext@45@@Z From a58070233f71cf04978c51995d7dcf6098bbe7d7 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Thu, 5 Jun 2025 15:14:36 -0700 Subject: [PATCH 2/2] Stop passing Queue around in reductions --- sycl/include/sycl/reduction.hpp | 52 ++++++++++----------------------- 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 36448e719baf3..79d1dbde93c29 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -1238,15 +1238,12 @@ template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " "local_atomic_and_atomic_cross_wg strategy."); - std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, reduction::strategy::local_atomic_and_atomic_cross_wg>; @@ -1290,15 +1287,12 @@ struct NDRangeReduction< reduction::strategy::group_reduce_and_last_wg_detection> { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " "group_reduce_and_last_wg_detection strategy."); - 
std::ignore = Queue; size_t NElements = Reduction::num_elements; size_t WGSize = NDRange.get_local_range().size(); size_t NWorkGroups = NDRange.get_group_range().size(); @@ -1490,9 +1484,7 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID, template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { using reducer_type = typename Reduction::reducer_type; using element_type = typename ReducerTraits::element_type; @@ -1504,7 +1496,6 @@ template <> struct NDRangeReduction { constexpr bool UsePartialSumForOutput = !Reduction::is_usm && Reduction::has_identity; - std::ignore = Queue; size_t NElements = Reduction::num_elements; size_t WGSize = NDRange.get_local_range().size(); size_t NWorkGroups = NDRange.get_group_range().size(); @@ -1602,15 +1593,12 @@ template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " "group_reduce_and_atomic_cross_wg strategy."); - std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, reduction::strategy::group_reduce_and_atomic_cross_wg>; @@ -1639,14 +1627,11 @@ struct NDRangeReduction< reduction::strategy::local_mem_tree_and_atomic_cross_wg> { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { using reducer_type = typename Reduction::reducer_type; using element_type = typename ReducerTraits::element_type; 
- std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, reduction::strategy::local_mem_tree_and_atomic_cross_wg>; @@ -1701,8 +1686,7 @@ struct NDRangeReduction< reduction::strategy::group_reduce_and_multiple_kernels> { template - static void run(handler &CGH, const std::shared_ptr &, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " @@ -1839,8 +1823,7 @@ struct NDRangeReduction< template <> struct NDRangeReduction { template - static void run(handler &CGH, const std::shared_ptr &, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { using element_type = typename Reduction::reducer_element_type; @@ -2614,8 +2597,7 @@ tuple_select_elements(TupleT Tuple, std::index_sequence) { template <> struct NDRangeReduction { template - static void run(handler &CGH, const std::shared_ptr &, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, RestT... 
Rest) { std::tuple ArgsTuple(Rest...); constexpr size_t NumArgs = sizeof...(RestT); @@ -2658,13 +2640,10 @@ template <> struct NDRangeReduction { template - static void run(handler &CGH, - const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { auto Delegate = [&](auto Impl) { - Impl.template run(CGH, Queue, NDRange, Properties, Redu, - KernelFunc); + Impl.template run(CGH, NDRange, Properties, Redu, KernelFunc); }; if constexpr (Reduction::has_float64_atomics) { @@ -2706,10 +2685,9 @@ template <> struct NDRangeReduction { } template - static void - run(handler &CGH, const std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, RestT... Rest) { - return Impl::run(CGH, Queue, NDRange, Properties, + static void run(handler &CGH, nd_range NDRange, PropertiesT &Properties, + RestT... Rest) { + return Impl::run(CGH, NDRange, Properties, Rest...); } }; @@ -2718,8 +2696,8 @@ template void reduction_parallel_for(handler &CGH, nd_range NDRange, PropertiesT Properties, RestT... Rest) { - NDRangeReduction::template run(CGH, CGH.MQueue, NDRange, - Properties, Rest...); + NDRangeReduction::template run(CGH, NDRange, Properties, + Rest...); } __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh);