Skip to content

[SYCL][CUDA] atomic_ref.fetch_add used for fp64 reduction if device.has(atomic64) #3950

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 30, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 68 additions & 1 deletion sycl/include/CL/sycl/ONEAPI/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,22 @@ using IsReduOptForFastAtomicFetch =
sycl::detail::IsBitAND<T, BinaryOperation>::value)>;
#endif

// Detects whether the reduction is a floating point sum (operation Plus over
// a 32- or 64-bit sgenfloat type), in which case the reduction result can be
// finalized with a device-scope atomic add. Note that this type trait is a
// subset of IsReduOptForFastReduce. Defining SYCL_REDUCTION_DETERMINISTIC
// disables this path so that results stay reproducible across devices of the
// same type.
template <typename T, class BinaryOperation>
using IsReduOptForFastFloatAtomicAdd =
#ifdef SYCL_REDUCTION_DETERMINISTIC
    bool_constant<false>;
#else
    bool_constant<(sizeof(T) == 4 || sizeof(T) == 8) &&
                  sycl::detail::is_sgenfloat<T>::value &&
                  sycl::detail::IsPlus<T, BinaryOperation>::value>;
#endif

// This type trait is used to detect if the group algorithm reduce() used with
// operands of the type T and the operation BinaryOperation is available
// for using in reduction.
Expand Down Expand Up @@ -288,6 +304,18 @@ class reducer<T, BinaryOperation,
.fetch_max(MValue);
}

/// Atomic ADD operation: merges MValue into *ReduVarPtr for floating point
/// types using a relaxed, device-scope atomic_ref addition. Enabled only
/// when the float atomic-add fast path applies (see
/// IsReduOptForFastFloatAtomicAdd).
template <typename _T = T, class _BinaryOperation = BinaryOperation>
enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
            IsReduOptForFastFloatAtomicAdd<T, _BinaryOperation>::value>
atomic_combine(_T *ReduVarPtr) const {
  using ReduAtomicRef =
      atomic_ref<T, sycl::ONEAPI::memory_order::relaxed,
                 sycl::ONEAPI::memory_scope::device,
                 access::address_space::global_space>;
  ReduAtomicRef(*global_ptr<T>(ReduVarPtr)).fetch_add(MValue);
}

T MValue;
};

Expand Down Expand Up @@ -330,6 +358,8 @@ class reduction_impl : private reduction_impl_base {
using local_accessor_type =
accessor<T, buffer_dim, access::mode::read_write, access::target::local>;

static constexpr bool has_atomic_add_float =
IsReduOptForFastFloatAtomicAdd<T, BinaryOperation>::value;
static constexpr bool has_fast_atomics =
IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
static constexpr bool has_fast_reduce =
Expand Down Expand Up @@ -636,8 +666,9 @@ class reduction_impl : private reduction_impl_base {
/// require initialization with identity value, then return user's read-write
/// accessor. Otherwise, create 1-element global buffer initialized with
/// identity value and return an accessor to that buffer.

template <bool HasFastAtomics = has_fast_atomics>
std::enable_if_t<HasFastAtomics, rw_accessor_type>
std::enable_if_t<HasFastAtomics || has_atomic_add_float, rw_accessor_type>
getReadWriteAccessorToInitializedMem(handler &CGH) {
if (!is_usm && !initializeToIdentity())
return *MRWAcc;
Expand Down Expand Up @@ -1467,6 +1498,42 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
}
}

// Reduction kernel for devices with the atomic64 aspect, which guarantees
// atomic add support for 64 (and 32) bit floating point values. Each
// work-group combines its partial values with reduce_over_group(); one
// work-item per group then atomically adds the group's sum into Out.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
std::enable_if_t<Reduction::has_atomic_add_float>
reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc,
                       const nd_range<Dims> &Range, Reduction &,
                       typename Reduction::rw_accessor_type Out) {
  using Name = typename get_reduction_main_kernel_name_t<
      KernelName, KernelType, Reduction::is_usm,
      Reduction::has_atomic_add_float,
      typename Reduction::rw_accessor_type>::name;
  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
    // Run the user's lambda; it initializes Reducer.MValue.
    typename Reduction::reducer_type Reducer;
    KernelFunc(NDIt, Reducer);

    // Combine the partial values of all work-items in this work-group.
    typename Reduction::binary_operation BOp;
    Reducer.MValue = reduce_over_group(NDIt.get_group(), Reducer.MValue, BOp);

    // Exactly one work-item per group publishes the group's result.
    const bool IsGroupLeader = NDIt.get_local_linear_id() == 0;
    if (IsGroupLeader)
      Reducer.atomic_combine(Reduction::getOutPointer(Out));
  });
}

// Entry point for the atomic64 reduction path: materializes the read-write
// accessor holding the reduction variable (re-using the user's accessor when
// no identity initialization is needed) and launches the kernel defined in
// reduCGFuncImplAtomic64().
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                   const nd_range<Dims> &Range, Reduction &Redu) {
  reduCGFuncImplAtomic64<KernelName, KernelType, Dims, Reduction>(
      CGH, KernelFunc, Range, Redu,
      Redu.getReadWriteAccessorToInitializedMem(CGH));
}

// Base case of the variadic recursion: no reduction accessors remain to be
// associated with the handler.
inline void associateReduAccsWithHandlerHelper(handler &) {}

template <typename ReductionT>
Expand Down
3 changes: 2 additions & 1 deletion sycl/include/CL/sycl/aspects.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ enum class aspect {
ext_intel_gpu_subslices_per_slice = 22,
ext_intel_gpu_eu_count_per_subslice = 23,
ext_intel_max_mem_bandwidth = 24,
ext_intel_mem_channel = 25
ext_intel_mem_channel = 25,
atomic64 = 26
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, there are int64_base_atomics and int64_extended_atomics that overlap with atomic64. Shouldn't we deprecate int64_*_atomics aspects?

};

} // namespace sycl
Expand Down
4 changes: 3 additions & 1 deletion sycl/include/CL/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,9 @@ typedef enum {
PI_DEVICE_INFO_GPU_SLICES = 0x10023,
PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024,
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025,
PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026
PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
// These are extensions that are currently only implemented for nvidia.
PI_DEVICE_INFO_ATOMIC_64 = 0x10110
} _pi_device_info;

typedef enum {
Expand Down
60 changes: 59 additions & 1 deletion sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@ class reduction_impl;
using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::queue_impl;

// Forward declaration of the command-group function for the atomic64
// reduction path; its definition lives in CL/sycl/ONEAPI/reduction.hpp.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                   const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Expand Down Expand Up @@ -1373,6 +1378,49 @@ class __SYCL_EXPORT handler {
}
}

/// Implements parallel_for() accepting nd_range \p Range and one reduction
/// object. This version is a specialization for the add operator.
/// It performs a runtime check for the device aspect "atomic64"; if found,
/// fast sycl::atomic_ref operations are used to update the reduction at the
/// end of each work-group's work. Otherwise the default (tree-reduction)
/// implementation is used.
//
// If the reduction variable must be initialized with the identity value
// before the kernel run, then an additional working accessor is created,
// initialized with the identity value and used in the kernel. That working
// accessor is then copied to user's accessor or USM pointer after
// the kernel run.
// For USM pointers without initialize_to_identity properties the same scheme
// with working accessor is used as re-using user's USM pointer in the kernel
// would require creation of another variant of user's kernel, which does not
// seem efficient.
template <typename KernelName = detail::auto_name, typename KernelType,
          int Dims, typename Reduction>
detail::enable_if_t<Reduction::has_atomic_add_float>
parallel_for(nd_range<Dims> Range, Reduction Redu,
             _KERNELFUNCPARAM(KernelFunc)) {

  // Capture the queue before any finalize() below so that a second handler
  // for the result copy-back can be constructed on the same queue.
  shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
  device D = detail::getDeviceFromHandler(*this);

  if (D.has(aspect::atomic64)) {

    // Device supports 64-bit fp atomic add: use the atomic_ref-based kernel.
    ONEAPI::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
                                                   Redu);

    if (Reduction::is_usm || Redu.initializeToIdentity()) {
      // The kernel wrote into a working accessor: submit this command group
      // now, then enqueue a copy of the final value to the user's USM
      // pointer/accessor through a fresh handler on the same queue.
      this->finalize();
      handler CopyHandler(QueueCopy, MIsHost);
      CopyHandler.saveCodeLoc(MCodeLoc);
      ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
                                                               Redu);
      MLastEvent = CopyHandler.finalize();
    }
  } else {
    // No atomic64 support on this device: fall back to the default
    // multi-kernel reduction implementation.
    parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
  }
}

/// Defines and invokes a SYCL kernel function for the specified nd_range.
/// Performs reduction operation specified in \p Redu.
///
Expand All @@ -1389,9 +1437,19 @@ class __SYCL_EXPORT handler {
/// optimized implementations waiting for their turn of code-review.
/// Defines and invokes a SYCL kernel function for the specified nd_range and
/// performs the reduction specified in \p Redu. This overload handles
/// reductions that qualify neither for the fast-atomics path nor for the
/// floating point atomic-add path; the actual multi-kernel tree reduction is
/// implemented in parallel_for_Impl().
template <typename KernelName = detail::auto_name, typename KernelType,
          int Dims, typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics &&
                    !Reduction::has_atomic_add_float>
parallel_for(nd_range<Dims> Range, Reduction Redu,
             _KERNELFUNCPARAM(KernelFunc)) {

  parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
}

template <typename KernelName, typename KernelType, int Dims,
typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics>
parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
KernelType KernelFunc) {
// This parallel_for() is lowered to the following sequence:
// 1) Call a kernel that a) call user's lambda function and b) performs
// one iteration of reduction, storing the partial reductions/sums
Expand Down
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/info/device_traits.def
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, max_clock_frequency, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, address_bits, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, max_mem_alloc_size, pi_uint64)
__SYCL_PARAM_TRAITS_SPEC(device, image_support, bool)
__SYCL_PARAM_TRAITS_SPEC(device, atomic64, bool)
__SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t)
Expand Down
4 changes: 3 additions & 1 deletion sycl/include/CL/sycl/info/info_desc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,9 @@ enum class device : cl_device_info {
ext_intel_gpu_eu_count_per_subslice =
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE,
ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH,
ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL
ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL,
// currently only implemented for nvidia
atomic64 = PI_DEVICE_INFO_ATOMIC_64
};

enum class device_type : pi_uint64 {
Expand Down
13 changes: 13 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -981,6 +981,19 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
bool ifp = (major >= 7);
return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
}

case PI_DEVICE_INFO_ATOMIC_64: {
  // Native double-precision atomicAdd requires compute capability 6.0
  // (Pascal) or newer, so the atomic64 query is keyed off the major CC.
  int major = 0;
  cl::sycl::detail::pi::assertion(
      cuDeviceGetAttribute(&major,
                           CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                           device->get()) == CUDA_SUCCESS);

  // The comparison already yields bool; the `? true : false` ternary was
  // redundant.
  bool atomic64 = major >= 6;
  return getInfo(param_value_size, param_value, param_value_size_ret,
                 atomic64);
}

case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
// NVIDIA devices only support one sub-group size (the warp size)
int warpSize = 0;
Expand Down
1 change: 1 addition & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
case PI_DEVICE_INFO_ATOMIC_64:
return PI_INVALID_VALUE;

default:
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/device_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ bool device_impl::has(aspect Aspect) const {
return has_extension("cl_khr_int64_base_atomics");
case aspect::int64_extended_atomics:
return has_extension("cl_khr_int64_extended_atomics");
case aspect::atomic64:
return get_info<info::device::atomic64>();
case aspect::image:
return get_info<info::device::image_support>();
case aspect::online_compiler:
Expand Down
26 changes: 26 additions & 0 deletions sycl/source/detail/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,28 @@ template <> struct get_device_info<bool, info::device::queue_profiling> {
}
};

// Specialization for atomic64 that is necessary because
// PI_DEVICE_INFO_ATOMIC_64 isn't implemented for backend other than cuda.
// TODO the if-statement can be removed when the other backends support
// PI_DEVICE_INFO_ATOMIC_64.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see comment below, just allow backend to "not know" the value and gracefully react to an error
we don't need to have backend specific paths in SYCL RT for this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK thanks. Done.

template <> struct get_device_info<bool, info::device::atomic64> {
static bool get(RT::PiDevice dev, const plugin &Plugin) {

bool result = false;

platform plt =
get_device_info<platform, info::device::platform>::get(dev, Plugin);

if (plt.get_backend() == backend::cuda) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of checking for "cuda" here you should run for any backend, but allow errors (from backends that don't support it), and gracefully return false.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion. I've implemented this as suggested.

Plugin.call<PiApiKind::piDeviceGetInfo>(
dev, pi::cast<RT::PiDeviceInfo>(info::device::atomic64),
sizeof(result), &result, nullptr);
}

return (result);
}
};

// Specialization for exec_capabilities, OpenCL returns a bitfield
template <>
struct get_device_info<vector_class<info::execution_capability>,
Expand Down Expand Up @@ -613,6 +635,10 @@ template <> inline bool get_device_info_host<info::device::image_support>() {
return true;
}

// The host device does not support the atomic64 aspect, so the query always
// reports false on host.
template <> inline bool get_device_info_host<info::device::atomic64>() {
  return false;
}

template <>
inline cl_uint get_device_info_host<info::device::max_read_image_args>() {
// current value is the required minimum
Expand Down
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -4103,6 +4103,7 @@ _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65571EEENS3_12param_traitsIS4_XT_
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65572EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65573EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65574EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65808EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device9getNativeEv
_ZNK2cl4sycl6kernel11get_contextEv
_ZNK2cl4sycl6kernel11get_programEv
Expand Down
3 changes: 3 additions & 0 deletions sycl/test/on-device/basic_tests/aspects.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ int main() {
if (plt.has(aspect::int64_extended_atomics)) {
std::cout << " extended atomic operations" << std::endl;
}
if (plt.has(aspect::atomic64)) {
std::cout << " atomic64" << std::endl;
}
if (plt.has(aspect::image)) {
std::cout << " images" << std::endl;
}
Expand Down