[SYCL] Pass handler & instead of queue across ABI for reduction utils #18834

Merged Jun 6, 2025 (2 commits)
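In short: the exported reduction helpers (reduGetMaxWGSize, reduGetPreferredWGSize, reduGetMaxNumConcurrentWorkGroups) now take a `handler &` instead of a `std::shared_ptr<queue_impl>`, resolve the device through the handler, and delegate to device_impl-based helpers; the queue-based entry points are retained under `#ifndef __INTEL_PREVIEW_BREAKING_CHANGES` so existing binaries keep linking. A condensed sketch of the pattern for one of the helpers (same names as the diff, internal headers omitted, so not compilable on its own):

```cpp
// Sketch of the new shape of reduGetMaxWGSize; see sycl/source/detail/reduction.cpp
// in the diff below for the real implementation.
namespace sycl::detail {

// Device-based helper: does the actual work and is no longer exported.
size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem);

// New ABI entry point: callers pass the handler they already have.
__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
                                      size_t LocalMemBytesPerWorkItem) {
  return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(),
                          LocalMemBytesPerWorkItem);
}

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
// Legacy ABI entry point, retained only for binary compatibility.
__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                                      size_t LocalMemBytesPerWorkItem) {
  return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem);
}
#endif

} // namespace sycl::detail
```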
80 changes: 24 additions & 56 deletions sycl/include/sycl/reduction.hpp
@@ -140,17 +140,10 @@ template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
return sycl::detail::make_tuple(Elements...);
}

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t reduGetMaxWGSize(const std::shared_ptr<queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduGetPreferredWGSize(
const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem);
#else
__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
size_t LocalMemBytesPerWorkItem);
#endif
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
size_t &NWorkGroups);

@@ -1245,15 +1238,12 @@ template <>
struct NDRangeReduction<reduction::strategy::local_atomic_and_atomic_cross_wg> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
"Identityless reductions are not supported by the "
"local_atomic_and_atomic_cross_wg strategy.");

std::ignore = Queue;
using Name = __sycl_reduction_kernel<
reduction::MainKrn, KernelName,
reduction::strategy::local_atomic_and_atomic_cross_wg>;
@@ -1297,15 +1287,12 @@ struct NDRangeReduction<
reduction::strategy::group_reduce_and_last_wg_detection> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
"Identityless reductions are not supported by the "
"group_reduce_and_last_wg_detection strategy.");

std::ignore = Queue;
size_t NElements = Reduction::num_elements;
size_t WGSize = NDRange.get_local_range().size();
size_t NWorkGroups = NDRange.get_group_range().size();
@@ -1497,9 +1484,7 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID,
template <> struct NDRangeReduction<reduction::strategy::range_basic> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
using reducer_type = typename Reduction::reducer_type;
using element_type = typename ReducerTraits<reducer_type>::element_type;
@@ -1511,7 +1496,6 @@ template <> struct NDRangeReduction<reduction::strategy::range_basic> {
constexpr bool UsePartialSumForOutput =
!Reduction::is_usm && Reduction::has_identity;

std::ignore = Queue;
size_t NElements = Reduction::num_elements;
size_t WGSize = NDRange.get_local_range().size();
size_t NWorkGroups = NDRange.get_group_range().size();
@@ -1609,15 +1593,12 @@ template <>
struct NDRangeReduction<reduction::strategy::group_reduce_and_atomic_cross_wg> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
"Identityless reductions are not supported by the "
"group_reduce_and_atomic_cross_wg strategy.");

std::ignore = Queue;
using Name = __sycl_reduction_kernel<
reduction::MainKrn, KernelName,
reduction::strategy::group_reduce_and_atomic_cross_wg>;
@@ -1646,14 +1627,11 @@ struct NDRangeReduction<
reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
using reducer_type = typename Reduction::reducer_type;
using element_type = typename ReducerTraits<reducer_type>::element_type;

std::ignore = Queue;
using Name = __sycl_reduction_kernel<
reduction::MainKrn, KernelName,
reduction::strategy::local_mem_tree_and_atomic_cross_wg>;
@@ -1708,9 +1686,7 @@ struct NDRangeReduction<
reduction::strategy::group_reduce_and_multiple_kernels> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
"Identityless reductions are not supported by the "
@@ -1729,7 +1705,7 @@ struct NDRangeReduction<
// TODO: currently the maximal work group size is determined for the given
// queue/device, while it may be safer to use queries to the kernel compiled
// for the device.
size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
if (NDRange.get_local_range().size() > MaxWGSize)
throw sycl::exception(make_error_code(errc::nd_range),
"The implementation handling parallel_for with"
@@ -1847,9 +1823,7 @@ struct NDRangeReduction<
template <> struct NDRangeReduction<reduction::strategy::basic> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
using element_type = typename Reduction::reducer_element_type;

@@ -1858,7 +1832,7 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
// TODO: currently the maximal work group size is determined for the given
// queue/device, while it may be safer to use queries to the kernel
// compiled for the device.
size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
if (NDRange.get_local_range().size() > MaxWGSize)
throw sycl::exception(make_error_code(errc::nd_range),
"The implementation handling parallel_for with"
@@ -2623,9 +2597,8 @@ tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
template <> struct NDRangeReduction<reduction::strategy::multi> {
template <typename KernelName, int Dims, typename PropertiesT,
typename... RestT>
static void
run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
RestT... Rest) {
std::tuple<RestT...> ArgsTuple(Rest...);
constexpr size_t NumArgs = sizeof...(RestT);
auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
@@ -2636,7 +2609,7 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
// TODO: currently the maximal work group size is determined for the given
// queue/device, while it is safer to use queries to the kernel compiled
// for the device.
size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
size_t MaxWGSize = reduGetMaxWGSize(CGH, LocalMemPerWorkItem);
if (NDRange.get_local_range().size() > MaxWGSize)
throw sycl::exception(make_error_code(errc::nd_range),
"The implementation handling parallel_for with"
@@ -2667,13 +2640,10 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {

template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
auto Delegate = [&](auto Impl) {
Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
KernelFunc);
Impl.template run<KernelName>(CGH, NDRange, Properties, Redu, KernelFunc);
};

if constexpr (Reduction::has_float64_atomics) {
@@ -2715,10 +2685,9 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
}
template <typename KernelName, int Dims, typename PropertiesT,
typename... RestT>
static void
run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
RestT... Rest) {
return Impl<Strat::multi>::run<KernelName>(CGH, NDRange, Properties,
Rest...);
}
};
@@ -2727,12 +2696,11 @@ template <typename KernelName, reduction::strategy Strategy, int Dims,
typename PropertiesT, typename... RestT>
void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
PropertiesT Properties, RestT... Rest) {
NDRangeReduction<Strategy>::template run<KernelName>(CGH, CGH.MQueue, NDRange,
Properties, Rest...);
NDRangeReduction<Strategy>::template run<KernelName>(CGH, NDRange, Properties,
Rest...);
}

__SYCL_EXPORT uint32_t
reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh);

template <typename KernelName, reduction::strategy Strategy, int Dims,
typename PropertiesT, typename... RestT>
@@ -2763,13 +2731,13 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range,
#ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
__SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
#else
reduGetMaxNumConcurrentWorkGroups(CGH.MQueue);
reduGetMaxNumConcurrentWorkGroups(CGH);
#endif

// TODO: currently the preferred work group size is determined for the given
// queue/device, while it is safer to use queries to the kernel pre-compiled
// for the device.
size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
size_t PrefWGSize = reduGetPreferredWGSize(CGH, OneElemSize);

size_t NWorkItems = Range.size();
size_t WGSize = std::min(NWorkItems, PrefWGSize);
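For context (not part of this diff), the helpers above back the standard reduction path: a parallel_for that carries a sycl::reduction object is lowered through detail::reduction_parallel_for, which is where the work-group-size queries touched by this PR are made. A minimal, self-contained sketch of that public API:

```cpp
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue Q;
  constexpr size_t N = 1024;
  int *Data = sycl::malloc_shared<int>(N, Q);
  int *Sum = sycl::malloc_shared<int>(1, Q);
  for (size_t I = 0; I < N; ++I)
    Data[I] = 1;
  *Sum = 0;

  // parallel_for with a sycl::reduction object; internally this routes
  // through detail::reduction_parallel_for and the helpers in this diff.
  Q.parallel_for(sycl::range<1>{N}, sycl::reduction(Sum, sycl::plus<int>()),
                 [=](sycl::id<1> Idx, auto &Red) { Red += Data[Idx[0]]; })
      .wait();

  std::cout << "Sum = " << *Sum << "\n"; // expected: 1024
  sycl::free(Data, Q);
  sycl::free(Sum, Q);
}
```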
92 changes: 59 additions & 33 deletions sycl/source/detail/reduction.cpp
@@ -49,8 +49,24 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
return WGSize;
}

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
// Inline this helper:
#endif
uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) {
uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
// TODO: The heuristics here require additional tuning for various devices
// and vendors. Also, it would be better to check vendor/generation/etc.
if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
NumThreads *= 8;
return NumThreads;
}
// Returns the estimated number of physical threads on the device associated
// with the given queue.
__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) {
return reduGetMaxNumConcurrentWorkGroups(getSyclObjImpl(cgh)->get_device());
}

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
std::shared_ptr<sycl::detail::queue_impl> Queue) {
// TODO: Graphs extension explicit API uses a handler with no queue attached,
@@ -63,25 +79,14 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
if (Queue == nullptr) {
return 8;
}
device Dev = Queue->get_device();
uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
// TODO: The heuristics here require additional tuning for various devices
// and vendors. Also, it would be better to check vendor/generation/etc.
if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
NumThreads *= 8;
return NumThreads;
return reduGetMaxNumConcurrentWorkGroups(Queue->getDeviceImpl());
}
#endif

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t
reduGetMaxWGSize(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem) {
#else
__SYCL_EXPORT size_t
reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
size_t LocalMemBytesPerWorkItem) {
// Inline this helper:
#endif
device Dev = Queue->get_device();
size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) {
size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();

size_t WGSizePerMem = MaxWGSize * 2;
@@ -118,26 +123,24 @@ reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,

return WGSize;
}
__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
size_t LocalMemBytesPerWorkItem) {
return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(),
LocalMemBytesPerWorkItem);
}
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT
size_t reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
size_t LocalMemBytesPerWorkItem) {
return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem);
}
#endif

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t reduGetPreferredWGSize(
const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem) {
#else
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem) {
// Inline this helper:
#endif
// TODO: Graphs extension explicit API uses a handler with a null queue to
// process CGFs, in future we should have access to the device so we can
// correctly calculate this.
//
// The 32 value was chosen as the hardcoded value as it is the returned
// value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
// Intel HD Graphics devices used as a L0 backend during development.
if (Queue == nullptr) {
return 32;
}
device Dev = Queue->get_device();

size_t reduGetPreferredWGSize(device_impl &Dev,
size_t LocalMemBytesPerWorkItem) {
// The maximum WGSize returned by CPU devices is very large and does not
// help the reduction implementation: since all work associated with a
// work-group is typically assigned to one CPU thread, selecting a large
@@ -174,8 +177,31 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
}

// Use the maximum work-group size otherwise.
return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
return reduGetMaxWGSize(Dev, LocalMemBytesPerWorkItem);
}
__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
size_t LocalMemBytesPerWorkItem) {
return reduGetPreferredWGSize(getSyclObjImpl(cgh)->get_device(),
LocalMemBytesPerWorkItem);
}
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem) {
// TODO: Graphs extension explicit API uses a handler with a null queue to
// process CGFs, in future we should have access to the device so we can
// correctly calculate this.
//
// The 32 value was chosen as the hardcoded value as it is the returned
// value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
// Intel HD Graphics devices used as a L0 backend during development.
if (Queue == nullptr) {
return 32;
}
device_impl &Dev = Queue->getDeviceImpl();

return reduGetPreferredWGSize(Dev, LocalMemBytesPerWorkItem);
}
#endif

__SYCL_EXPORT void
addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
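The concurrency heuristic moved into the device_impl-based helper can be approximated with public device queries; a small sketch, assuming the standard SYCL 2020 info descriptors (the internal device_impl forwards to the same queries). The function name estimateMaxConcurrentWorkGroups is made up for illustration:

```cpp
#include <sycl/sycl.hpp>
#include <cstdint>
#include <iostream>

// Rough reproduction of the reduGetMaxNumConcurrentWorkGroups heuristic using
// only public queries; the runtime version operates on the internal device_impl.
std::uint32_t estimateMaxConcurrentWorkGroups(const sycl::device &Dev) {
  std::uint32_t NumThreads =
      Dev.get_info<sycl::info::device::max_compute_units>();
  // Integrated GPUs (host-unified memory) get an 8x oversubscription factor,
  // matching the heuristic in reduction.cpp.
  if (Dev.is_gpu() &&
      Dev.get_info<sycl::info::device::host_unified_memory>())
    NumThreads *= 8;
  return NumThreads;
}

int main() {
  sycl::device Dev{sycl::default_selector_v};
  std::cout << estimateMaxConcurrentWorkGroups(Dev)
            << " concurrent work-groups (estimated)\n";
}
```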