Skip to content

Commit a657117

Browse files
authored
[SYCL][Reduction] Remove atomic64 check for float reductions (#6434)
1 parent b2c192b commit a657117

File tree

2 files changed

+22
-25
lines changed

2 files changed

+22
-25
lines changed

sycl/include/sycl/ext/oneapi/reduction.hpp

Lines changed: 20 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ using IsReduOptForFastAtomicFetch =
8686
#ifdef SYCL_REDUCTION_DETERMINISTIC
8787
bool_constant<false>;
8888
#else
89-
bool_constant<sycl::detail::is_sgeninteger<T>::value &&
89+
bool_constant<((sycl::detail::is_sgenfloat<T>::value && sizeof(T) == 4) ||
90+
sycl::detail::is_sgeninteger<T>::value) &&
9091
sycl::detail::IsValidAtomicType<T>::value &&
9192
(sycl::detail::IsPlus<T, BinaryOperation>::value ||
9293
sycl::detail::IsMinimum<T, BinaryOperation>::value ||
@@ -104,18 +105,15 @@ using IsReduOptForFastAtomicFetch =
104105
// IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
105106
// using the reduce_over_group() algorithm to produce stable results across same
106107
// type devices.
107-
// TODO 32 bit floating point atomics are eventually expected to be supported by
108-
// the has_fast_atomics specialization. Once the reducer class is updated to
109-
// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
110-
// case should be removed here and replaced in IsReduOptForFastAtomicFetch.
111108
template <typename T, class BinaryOperation>
112-
using IsReduOptForAtomic64Add =
109+
using IsReduOptForAtomic64Op =
113110
#ifdef SYCL_REDUCTION_DETERMINISTIC
114111
bool_constant<false>;
115112
#else
116-
bool_constant<sycl::detail::IsPlus<T, BinaryOperation>::value &&
117-
sycl::detail::is_sgenfloat<T>::value &&
118-
(sizeof(T) == 4 || sizeof(T) == 8)>;
113+
bool_constant<(sycl::detail::IsPlus<T, BinaryOperation>::value ||
114+
sycl::detail::IsMinimum<T, BinaryOperation>::value ||
115+
sycl::detail::IsMaximum<T, BinaryOperation>::value) &&
116+
sycl::detail::is_sgenfloat<T>::value && sizeof(T) == 8>;
119117
#endif
120118

121119
// This type trait is used to detect if the group algorithm reduce() used with
@@ -278,7 +276,7 @@ template <class Reducer> class combiner {
278276
typename _T = T, class _BinaryOperation = BinaryOperation>
279277
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
280278
(IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
281-
IsReduOptForAtomic64Add<T, _BinaryOperation>::value) &&
279+
IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
282280
sycl::detail::IsPlus<T, _BinaryOperation>::value>
283281
atomic_combine(_T *ReduVarPtr) const {
284282
atomic_combine_impl<Space>(
@@ -324,7 +322,8 @@ template <class Reducer> class combiner {
324322
template <access::address_space Space = access::address_space::global_space,
325323
typename _T = T, class _BinaryOperation = BinaryOperation>
326324
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
327-
IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
325+
(IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
326+
IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
328327
sycl::detail::IsMinimum<T, _BinaryOperation>::value>
329328
atomic_combine(_T *ReduVarPtr) const {
330329
atomic_combine_impl<Space>(
@@ -335,7 +334,8 @@ template <class Reducer> class combiner {
335334
template <access::address_space Space = access::address_space::global_space,
336335
typename _T = T, class _BinaryOperation = BinaryOperation>
337336
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
338-
IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
337+
(IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
338+
IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
339339
sycl::detail::IsMaximum<T, _BinaryOperation>::value>
340340
atomic_combine(_T *ReduVarPtr) const {
341341
atomic_combine_impl<Space>(
@@ -591,8 +591,8 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
591591
using rw_accessor_type = accessor<T, accessor_dim, access::mode::read_write,
592592
access::target::device, is_placeholder,
593593
ext::oneapi::accessor_property_list<>>;
594-
static constexpr bool has_atomic_add_float64 =
595-
IsReduOptForAtomic64Add<T, BinaryOperation>::value;
594+
static constexpr bool has_float64_atomics =
595+
IsReduOptForAtomic64Op<T, BinaryOperation>::value;
596596
static constexpr bool has_fast_atomics =
597597
IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
598598
static constexpr bool has_fast_reduce =
@@ -678,7 +678,7 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
678678
/// require initialization with identity value, then return user's read-write
679679
/// accessor. Otherwise, create global buffer with 'num_elements' initialized
680680
/// with identity value and return an accessor to that buffer.
681-
template <bool HasFastAtomics = (has_fast_atomics || has_atomic_add_float64)>
681+
template <bool HasFastAtomics = (has_fast_atomics || has_float64_atomics)>
682682
std::enable_if_t<HasFastAtomics, rw_accessor_type>
683683
getReadWriteAccessorToInitializedMem(handler &CGH) {
684684
if constexpr (is_rw_acc) {
@@ -2093,18 +2093,15 @@ template <class KernelName> struct NDRangeAtomic64;
20932093
} // namespace main_krn
20942094
} // namespace reduction
20952095

2096-
// Specialization for devices with the atomic64 aspect, which guarantees 64 (and
2097-
// temporarily 32) bit floating point support for atomic add.
2098-
// TODO 32 bit floating point atomics are eventually expected to be supported by
2099-
// the has_fast_atomics specialization. Corresponding changes to
2100-
// IsReduOptForAtomic64Add, as prescribed in its documentation, should then also
2101-
// be made.
2096+
// Specialization for devices with the atomic64 aspect, which guarantees 64 bit
2097+
// floating point support for atomic reduction operation.
21022098
template <typename KernelName, typename KernelType, int Dims, class Reduction>
21032099
void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
21042100
const nd_range<Dims> &Range, Reduction &Redu) {
21052101
auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
2106-
static_assert(Reduction::has_atomic_add_float64,
2107-
"Only suitable for reductions that have FP64 atomic add.");
2102+
static_assert(
2103+
Reduction::has_float64_atomics,
2104+
"Only suitable for reductions that have FP64 atomic operations.");
21082105
constexpr size_t NElements = Reduction::num_elements;
21092106
using Name =
21102107
__sycl_reduction_kernel<reduction::main_krn::NDRangeAtomic64, KernelName>;

sycl/include/sycl/handler.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1639,13 +1639,13 @@ class __SYCL_EXPORT handler {
16391639
void parallel_for(nd_range<Dims> Range, Reduction Redu,
16401640
_KERNELFUNCPARAM(KernelFunc)) {
16411641
if constexpr (!Reduction::has_fast_atomics &&
1642-
!Reduction::has_atomic_add_float64) {
1642+
!Reduction::has_float64_atomics) {
16431643
// The most basic implementation.
16441644
parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
16451645
return;
16461646
} else { // Can't "early" return for "if constexpr".
16471647
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1648-
if constexpr (Reduction::has_atomic_add_float64) {
1648+
if constexpr (Reduction::has_float64_atomics) {
16491649
/// This version is a specialization for the add
16501650
/// operator. It performs runtime checks for device aspect "atomic64";
16511651
/// if found, fast sycl::atomic_ref operations are used to update the

0 commit comments

Comments
 (0)