Skip to content

[SYCL][CUDA] atomic_ref.fetch_add used for fp64 reduction if device.has(atomic64) #3950

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 30, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 68 additions & 1 deletion sycl/include/CL/sycl/ONEAPI/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,22 @@ using IsReduOptForFastAtomicFetch =
sycl::detail::IsBitAND<T, BinaryOperation>::value)>;
#endif

// Detects whether the reduction is a floating point sum (operation Plus over
// a 32- or 64-bit sgenfloat type), in which case the reduction result can be
// finalized with a device-scope atomic add. Note that this type trait is a
// subset of IsReduOptForFastReduce. Defining SYCL_REDUCTION_DETERMINISTIC
// disables this path so that results stay reproducible across devices of the
// same type.
template <typename T, class BinaryOperation>
using IsReduOptForFastFloatAtomicAdd =
#ifdef SYCL_REDUCTION_DETERMINISTIC
    bool_constant<false>;
#else
    bool_constant<(sizeof(T) == 4 || sizeof(T) == 8) &&
                  sycl::detail::is_sgenfloat<T>::value &&
                  sycl::detail::IsPlus<T, BinaryOperation>::value>;
#endif

// This type trait is used to detect if the group algorithm reduce() used with
// operands of the type T and the operation BinaryOperation is available
// for using in reduction.
Expand Down Expand Up @@ -288,6 +304,18 @@ class reducer<T, BinaryOperation,
.fetch_max(MValue);
}

/// Atomic ADD operation: merges MValue into *ReduVarPtr for floating point
/// types using a relaxed, device-scope atomic_ref addition. Enabled only
/// when the float atomic-add fast path applies (see
/// IsReduOptForFastFloatAtomicAdd).
template <typename _T = T, class _BinaryOperation = BinaryOperation>
enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
            IsReduOptForFastFloatAtomicAdd<T, _BinaryOperation>::value>
atomic_combine(_T *ReduVarPtr) const {
  using ReduAtomicRef =
      atomic_ref<T, sycl::ONEAPI::memory_order::relaxed,
                 sycl::ONEAPI::memory_scope::device,
                 access::address_space::global_space>;
  ReduAtomicRef(*global_ptr<T>(ReduVarPtr)).fetch_add(MValue);
}

T MValue;
};

Expand Down Expand Up @@ -330,6 +358,8 @@ class reduction_impl : private reduction_impl_base {
using local_accessor_type =
accessor<T, buffer_dim, access::mode::read_write, access::target::local>;

static constexpr bool has_atomic_add_float =
IsReduOptForFastFloatAtomicAdd<T, BinaryOperation>::value;
static constexpr bool has_fast_atomics =
IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
static constexpr bool has_fast_reduce =
Expand Down Expand Up @@ -636,8 +666,9 @@ class reduction_impl : private reduction_impl_base {
/// require initialization with identity value, then return user's read-write
/// accessor. Otherwise, create 1-element global buffer initialized with
/// identity value and return an accessor to that buffer.

template <bool HasFastAtomics = has_fast_atomics>
std::enable_if_t<HasFastAtomics, rw_accessor_type>
std::enable_if_t<HasFastAtomics || has_atomic_add_float, rw_accessor_type>
getReadWriteAccessorToInitializedMem(handler &CGH) {
if (!is_usm && !initializeToIdentity())
return *MRWAcc;
Expand Down Expand Up @@ -1467,6 +1498,42 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
}
}

// Reduction kernel for devices with the atomic64 aspect, which guarantees
// atomic add support for 64 (and 32) bit floating point values. Each
// work-group combines its partial values with reduce_over_group(); one
// work-item per group then atomically adds the group's sum into Out.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
std::enable_if_t<Reduction::has_atomic_add_float>
reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc,
                       const nd_range<Dims> &Range, Reduction &,
                       typename Reduction::rw_accessor_type Out) {
  using Name = typename get_reduction_main_kernel_name_t<
      KernelName, KernelType, Reduction::is_usm,
      Reduction::has_atomic_add_float,
      typename Reduction::rw_accessor_type>::name;
  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
    // Run the user's lambda; it initializes Reducer.MValue.
    typename Reduction::reducer_type Reducer;
    KernelFunc(NDIt, Reducer);

    // Combine the partial values of all work-items in this work-group.
    typename Reduction::binary_operation BOp;
    Reducer.MValue = reduce_over_group(NDIt.get_group(), Reducer.MValue, BOp);

    // Exactly one work-item per group publishes the group's result.
    const bool IsGroupLeader = NDIt.get_local_linear_id() == 0;
    if (IsGroupLeader)
      Reducer.atomic_combine(Reduction::getOutPointer(Out));
  });
}

// Entry point for the atomic64 reduction path: materializes the read-write
// accessor holding the reduction variable (re-using the user's accessor when
// no identity initialization is needed) and launches the kernel defined in
// reduCGFuncImplAtomic64().
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                   const nd_range<Dims> &Range, Reduction &Redu) {
  reduCGFuncImplAtomic64<KernelName, KernelType, Dims, Reduction>(
      CGH, KernelFunc, Range, Redu,
      Redu.getReadWriteAccessorToInitializedMem(CGH));
}

// Base case of the variadic recursion: no reduction accessors remain to be
// associated with the handler.
inline void associateReduAccsWithHandlerHelper(handler &) {}

template <typename ReductionT>
Expand Down
3 changes: 2 additions & 1 deletion sycl/include/CL/sycl/aspects.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ enum class aspect {
ext_intel_gpu_subslices_per_slice = 22,
ext_intel_gpu_eu_count_per_subslice = 23,
ext_intel_max_mem_bandwidth = 24,
ext_intel_mem_channel = 25
ext_intel_mem_channel = 25,
atomic64 = 26
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, there are int64_base_atomics and int64_extended_atomics that overlap with atomic64. Shouldn't we deprecate int64_*_atomics aspects?

};

} // namespace sycl
Expand Down
4 changes: 3 additions & 1 deletion sycl/include/CL/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,9 @@ typedef enum {
PI_DEVICE_INFO_GPU_SLICES = 0x10023,
PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024,
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025,
PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026
PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
// These are extensions that are currently only implemented for nvidia.
PI_DEVICE_INFO_ATOMIC_64 = 0x10110
} _pi_device_info;

typedef enum {
Expand Down
60 changes: 59 additions & 1 deletion sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@ class reduction_impl;
using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::queue_impl;

// Forward declaration of the command-group function for the atomic64
// reduction path; its definition lives in CL/sycl/ONEAPI/reduction.hpp.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                   const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Expand Down Expand Up @@ -1373,6 +1378,49 @@ class __SYCL_EXPORT handler {
}
}

/// Implements parallel_for() accepting nd_range \p Range and one reduction
/// object. This version is a specialization for the add operator.
/// It performs a runtime check for the device aspect "atomic64"; if found,
/// fast sycl::atomic_ref operations are used to update the reduction at the
/// end of each work-group's work. Otherwise the default (tree-reduction)
/// implementation is used.
//
// If the reduction variable must be initialized with the identity value
// before the kernel run, then an additional working accessor is created,
// initialized with the identity value and used in the kernel. That working
// accessor is then copied to user's accessor or USM pointer after
// the kernel run.
// For USM pointers without initialize_to_identity properties the same scheme
// with working accessor is used as re-using user's USM pointer in the kernel
// would require creation of another variant of user's kernel, which does not
// seem efficient.
template <typename KernelName = detail::auto_name, typename KernelType,
          int Dims, typename Reduction>
detail::enable_if_t<Reduction::has_atomic_add_float>
parallel_for(nd_range<Dims> Range, Reduction Redu,
             _KERNELFUNCPARAM(KernelFunc)) {

  // Capture the queue before any finalize() below so that a second handler
  // for the result copy-back can be constructed on the same queue.
  shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
  device D = detail::getDeviceFromHandler(*this);

  if (D.has(aspect::atomic64)) {

    // Device supports 64-bit fp atomic add: use the atomic_ref-based kernel.
    ONEAPI::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
                                                   Redu);

    if (Reduction::is_usm || Redu.initializeToIdentity()) {
      // The kernel wrote into a working accessor: submit this command group
      // now, then enqueue a copy of the final value to the user's USM
      // pointer/accessor through a fresh handler on the same queue.
      this->finalize();
      handler CopyHandler(QueueCopy, MIsHost);
      CopyHandler.saveCodeLoc(MCodeLoc);
      ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
                                                               Redu);
      MLastEvent = CopyHandler.finalize();
    }
  } else {
    // No atomic64 support on this device: fall back to the default
    // multi-kernel reduction implementation.
    parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
  }
}

/// Defines and invokes a SYCL kernel function for the specified nd_range.
/// Performs reduction operation specified in \p Redu.
///
Expand All @@ -1389,9 +1437,19 @@ class __SYCL_EXPORT handler {
/// optimized implementations waiting for their turn of code-review.
/// Defines and invokes a SYCL kernel function for the specified nd_range and
/// performs the reduction specified in \p Redu. This overload handles
/// reductions that qualify neither for the fast-atomics path nor for the
/// floating point atomic-add path; the actual multi-kernel tree reduction is
/// implemented in parallel_for_Impl().
template <typename KernelName = detail::auto_name, typename KernelType,
          int Dims, typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics &&
                    !Reduction::has_atomic_add_float>
parallel_for(nd_range<Dims> Range, Reduction Redu,
             _KERNELFUNCPARAM(KernelFunc)) {

  parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
}

template <typename KernelName, typename KernelType, int Dims,
typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics>
parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
KernelType KernelFunc) {
// This parallel_for() is lowered to the following sequence:
// 1) Call a kernel that a) call user's lambda function and b) performs
// one iteration of reduction, storing the partial reductions/sums
Expand Down
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/info/device_traits.def
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, max_clock_frequency, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, address_bits, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, max_mem_alloc_size, pi_uint64)
__SYCL_PARAM_TRAITS_SPEC(device, image_support, bool)
__SYCL_PARAM_TRAITS_SPEC(device, atomic64, bool)
__SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t)
Expand Down
4 changes: 3 additions & 1 deletion sycl/include/CL/sycl/info/info_desc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,9 @@ enum class device : cl_device_info {
ext_intel_gpu_eu_count_per_subslice =
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE,
ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH,
ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL
ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL,
// currently only implemented for nvidia
atomic64 = PI_DEVICE_INFO_ATOMIC_64
};

enum class device_type : pi_uint64 {
Expand Down
13 changes: 13 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -981,6 +981,19 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
bool ifp = (major >= 7);
return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
}

case PI_DEVICE_INFO_ATOMIC_64: {
  // Native double-precision atomicAdd requires compute capability 6.0
  // (Pascal) or newer, so the atomic64 query is keyed off the major CC.
  int major = 0;
  cl::sycl::detail::pi::assertion(
      cuDeviceGetAttribute(&major,
                           CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                           device->get()) == CUDA_SUCCESS);

  // The comparison already yields bool; the `? true : false` ternary was
  // redundant.
  bool atomic64 = major >= 6;
  return getInfo(param_value_size, param_value, param_value_size_ret,
                 atomic64);
}

case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
// NVIDIA devices only support one sub-group size (the warp size)
int warpSize = 0;
Expand Down
1 change: 1 addition & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
case PI_DEVICE_INFO_ATOMIC_64:
return PI_INVALID_VALUE;

default:
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/device_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ bool device_impl::has(aspect Aspect) const {
return has_extension("cl_khr_int64_base_atomics");
case aspect::int64_extended_atomics:
return has_extension("cl_khr_int64_extended_atomics");
case aspect::atomic64:
return get_info<info::device::atomic64>();
case aspect::image:
return get_info<info::device::image_support>();
case aspect::online_compiler:
Expand Down
26 changes: 26 additions & 0 deletions sycl/source/detail/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,28 @@ template <> struct get_device_info<bool, info::device::queue_profiling> {
}
};

// Specialization for atomic64 that is necessary because
// PI_DEVICE_INFO_ATOMIC_64 isn't implemented for backend other than cuda.
// TODO the if-statement can be removed when the other backends support
// PI_DEVICE_INFO_ATOMIC_64.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see comment below, just allow backend to "not know" the value and gracefully react to an error
we don't need to have backend specific paths in SYCL RT for this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK thanks. Done.

template <> struct get_device_info<bool, info::device::atomic64> {
static bool get(RT::PiDevice dev, const plugin &Plugin) {

bool result = false;

platform plt =
get_device_info<platform, info::device::platform>::get(dev, Plugin);

if (plt.get_backend() == backend::cuda) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of checking for "cuda" here you should run for any backend, but allow errors (from backends that don't support it), and gracefully return false.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion. I've implemented this as suggested.

Plugin.call<PiApiKind::piDeviceGetInfo>(
dev, pi::cast<RT::PiDeviceInfo>(info::device::atomic64),
sizeof(result), &result, nullptr);
}

return (result);
}
};

// Specialization for exec_capabilities, OpenCL returns a bitfield
template <>
struct get_device_info<vector_class<info::execution_capability>,
Expand Down Expand Up @@ -613,6 +635,10 @@ template <> inline bool get_device_info_host<info::device::image_support>() {
return true;
}

// The host device does not support the atomic64 aspect, so the query always
// reports false on host.
template <> inline bool get_device_info_host<info::device::atomic64>() {
  return false;
}

template <>
inline cl_uint get_device_info_host<info::device::max_read_image_args>() {
// current value is the required minimum
Expand Down
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -4103,6 +4103,7 @@ _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65571EEENS3_12param_traitsIS4_XT_
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65572EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65573EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65574EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65808EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device9getNativeEv
_ZNK2cl4sycl6kernel11get_contextEv
_ZNK2cl4sycl6kernel11get_programEv
Expand Down
3 changes: 3 additions & 0 deletions sycl/test/on-device/basic_tests/aspects.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ int main() {
if (plt.has(aspect::int64_extended_atomics)) {
std::cout << " extended atomic operations" << std::endl;
}
if (plt.has(aspect::atomic64)) {
std::cout << " atomic64" << std::endl;
}
if (plt.has(aspect::image)) {
std::cout << " images" << std::endl;
}
Expand Down