@@ -1577,18 +1577,6 @@ reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
1577
1577
});
1578
1578
}
1579
1579
1580
- // / For the given 'Reductions' types pack and indices enumerating only
1581
- // / the reductions for which a local accessors are needed, this function creates
1582
- // / those local accessors and returns a tuple consisting of them.
1583
- template <typename ... Reductions, size_t ... Is>
1584
- auto createReduLocalAccs (size_t Size, handler &CGH,
1585
- std::index_sequence<Is...>) {
1586
- return makeReduTupleT (
1587
- local_accessor<typename std::tuple_element_t <
1588
- Is, std::tuple<Reductions...>>::result_type,
1589
- 1 >{Size, CGH}...);
1590
- }
1591
-
1592
1580
// / For the given 'Reductions' types pack and indices enumerating them this
1593
1581
// / function either creates new temporary accessors for partial sums (if IsOneWG
1594
1582
// / is false) or returns user's accessor/USM-pointer if (IsOneWG is true).
@@ -1603,95 +1591,6 @@ auto createReduOutAccs(size_t NWorkGroups, handler &CGH,
1603
1591
CGH)...);
1604
1592
}
1605
1593
1606
- // / For the given 'Reductions' types pack and indices enumerating them this
1607
- // / function returns accessors to buffers holding partial sums generated in the
1608
- // / previous kernel invocation.
1609
- template <typename ... Reductions, size_t ... Is>
1610
- auto getReadAccsToPreviousPartialReds (handler &CGH,
1611
- std::tuple<Reductions...> &ReduTuple,
1612
- std::index_sequence<Is...>) {
1613
- return makeReduTupleT (
1614
- std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds (CGH)...);
1615
- }
1616
-
1617
- template <typename ... Reductions, size_t ... Is>
1618
- ReduTupleT<typename Reductions::result_type...>
1619
- getReduIdentities (std::tuple<Reductions...> &ReduTuple,
1620
- std::index_sequence<Is...>) {
1621
- return {std::get<Is>(ReduTuple).getIdentity ()...};
1622
- }
1623
-
1624
- template <typename ... Reductions, size_t ... Is>
1625
- ReduTupleT<typename Reductions::binary_operation...>
1626
- getReduBOPs (std::tuple<Reductions...> &ReduTuple, std::index_sequence<Is...>) {
1627
- return {std::get<Is>(ReduTuple).getBinaryOperation ()...};
1628
- }
1629
-
1630
- template <typename ... Reductions, size_t ... Is>
1631
- std::array<bool , sizeof ...(Reductions)>
1632
- getInitToIdentityProperties (std::tuple<Reductions...> &ReduTuple,
1633
- std::index_sequence<Is...>) {
1634
- return {std::get<Is>(ReduTuple).initializeToIdentity ()...};
1635
- }
1636
-
1637
- template <typename ... Reductions, size_t ... Is>
1638
- std::tuple<typename Reductions::reducer_type...>
1639
- createReducers (ReduTupleT<typename Reductions::result_type...> Identities,
1640
- ReduTupleT<typename Reductions::binary_operation...> BOPsTuple,
1641
- std::index_sequence<Is...>) {
1642
- return {typename Reductions::reducer_type{std::get<Is>(Identities),
1643
- std::get<Is>(BOPsTuple)}...};
1644
- }
1645
-
1646
- template <typename KernelType, int Dims, typename ... ReducerT, size_t ... Is>
1647
- void callReduUserKernelFunc (KernelType KernelFunc, nd_item<Dims> NDIt,
1648
- std::tuple<ReducerT...> &Reducers,
1649
- std::index_sequence<Is...>) {
1650
- KernelFunc (NDIt, std::get<Is>(Reducers)...);
1651
- }
1652
-
1653
- template <typename ... LocalAccT, typename ... ReducerT, typename ... ResultT,
1654
- size_t ... Is>
1655
- void initReduLocalAccs (bool Pow2WG, size_t LID, size_t WGSize,
1656
- ReduTupleT<LocalAccT...> LocalAccs,
1657
- const std::tuple<ReducerT...> &Reducers,
1658
- ReduTupleT<ResultT...> Identities,
1659
- std::index_sequence<Is...>) {
1660
- ((std::get<Is>(LocalAccs)[LID] = std::get<Is>(Reducers).MValue ), ...);
1661
-
1662
- // For work-groups, which size is not power of two, local accessors have
1663
- // an additional element with index WGSize that is used by the tree-reduction
1664
- // algorithm. Initialize those additional elements with identity values here.
1665
- if (!Pow2WG)
1666
- ((std::get<Is>(LocalAccs)[WGSize] = std::get<Is>(Identities)), ...);
1667
- }
1668
-
1669
- template <typename ... LocalAccT, typename ... InputAccT, typename ... ResultT,
1670
- size_t ... Is>
1671
- void initReduLocalAccs (bool UniformPow2WG, size_t LID, size_t GID,
1672
- size_t NWorkItems, size_t WGSize,
1673
- ReduTupleT<InputAccT...> LocalAccs,
1674
- ReduTupleT<LocalAccT...> InputAccs,
1675
- ReduTupleT<ResultT...> Identities,
1676
- std::index_sequence<Is...>) {
1677
- // Normally, the local accessors are initialized with elements from the input
1678
- // accessors. The exception is the case when (GID >= NWorkItems), which
1679
- // possible only when UniformPow2WG is false. For that case the elements of
1680
- // local accessors are initialized with identity value, so they would not
1681
- // give any impact into the final partial sums during the tree-reduction
1682
- // algorithm work.
1683
- if (UniformPow2WG || GID < NWorkItems)
1684
- ((std::get<Is>(LocalAccs)[LID] = std::get<Is>(InputAccs)[GID]), ...);
1685
- else
1686
- ((std::get<Is>(LocalAccs)[LID] = std::get<Is>(Identities)), ...);
1687
-
1688
- // For work-groups, which size is not power of two, local accessors have
1689
- // an additional element with index WGSize that is used by the tree-reduction
1690
- // algorithm. Initialize those additional elements with identity values here.
1691
- if (!UniformPow2WG)
1692
- ((std::get<Is>(LocalAccs)[WGSize] = std::get<Is>(Identities)), ...);
1693
- }
1694
-
1695
1594
template <typename ... LocalAccT, typename ... BOPsT, size_t ... Is>
1696
1595
void reduceReduLocalAccs (size_t IndexA, size_t IndexB,
1697
1596
ReduTupleT<LocalAccT...> LocalAccs,
@@ -1830,8 +1729,16 @@ void reduCGFuncImplScalar(
1830
1729
std::index_sequence<Is...> ReduIndices) {
1831
1730
size_t WGSize = NDIt.get_local_range ().size ();
1832
1731
size_t LID = NDIt.get_local_linear_id ();
1833
- initReduLocalAccs (Pow2WG, LID, WGSize, LocalAccsTuple, ReducersTuple,
1834
- IdentitiesTuple, ReduIndices);
1732
+
1733
+ ((std::get<Is>(LocalAccsTuple)[LID] = std::get<Is>(ReducersTuple).MValue ),
1734
+ ...);
1735
+
1736
+ // For work-groups, which size is not power of two, local accessors have
1737
+ // an additional element with index WGSize that is used by the tree-reduction
1738
+ // algorithm. Initialize those additional elements with identity values here.
1739
+ if (!Pow2WG)
1740
+ ((std::get<Is>(LocalAccsTuple)[WGSize] = std::get<Is>(IdentitiesTuple)),
1741
+ ...);
1835
1742
NDIt.barrier ();
1836
1743
1837
1744
size_t PrevStep = WGSize;
@@ -1967,8 +1874,10 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
1967
1874
1968
1875
// Create inputs using the global order of all reductions
1969
1876
size_t LocalAccSize = WGSize + (Pow2WG ? 0 : 1 );
1877
+
1970
1878
auto LocalAccsTuple =
1971
- createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
1879
+ makeReduTupleT (local_accessor<typename Reductions::result_type, 1 >{
1880
+ LocalAccSize, CGH}...);
1972
1881
1973
1882
size_t NWorkGroups = Range.get_group_range ().size ();
1974
1883
bool IsOneWG = NWorkGroups == 1 ;
@@ -1977,20 +1886,23 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
1977
1886
// one WorkGroup and when there are multiple. Use this lambda to write the
1978
1887
// code just once.
1979
1888
auto Rest = [&](auto OutAccsTuple) {
1980
- auto IdentitiesTuple = getReduIdentities (ReduTuple, ReduIndices);
1981
- auto BOPsTuple = getReduBOPs (ReduTuple, ReduIndices);
1982
- auto InitToIdentityProps =
1983
- getInitToIdentityProperties (ReduTuple, ReduIndices);
1889
+ auto IdentitiesTuple =
1890
+ makeReduTupleT (std::get<Is>(ReduTuple).getIdentity ()...);
1891
+ auto BOPsTuple =
1892
+ makeReduTupleT (std::get<Is>(ReduTuple).getBinaryOperation ()...);
1893
+ std::array InitToIdentityProps{
1894
+ std::get<Is>(ReduTuple).initializeToIdentity ()...};
1984
1895
1985
1896
using Name = __sycl_reduction_kernel<reduction::main_krn::NDRangeMulti,
1986
1897
KernelName, decltype (OutAccsTuple)>;
1987
1898
CGH.parallel_for <Name>(Range, Properties, [=](nd_item<Dims> NDIt) {
1988
1899
// Pass all reductions to user's lambda in the same order as supplied
1989
1900
// Each reducer initializes its own storage
1990
1901
auto ReduIndices = std::index_sequence_for<Reductions...>();
1991
- auto ReducersTuple = createReducers<Reductions...>(
1992
- IdentitiesTuple, BOPsTuple, ReduIndices);
1993
- callReduUserKernelFunc (KernelFunc, NDIt, ReducersTuple, ReduIndices);
1902
+ auto ReducersTuple = std::tuple{typename Reductions::reducer_type{
1903
+ std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
1904
+ std::apply ([&](auto &...Reducers ) { KernelFunc (NDIt, Reducers...); },
1905
+ ReducersTuple);
1994
1906
1995
1907
// Combine and write-back the results of any scalar reductions
1996
1908
// reduCGFuncImplScalar<Reductions...>(NDIt, LocalAccsTuple, OutAccsTuple,
@@ -2080,8 +1992,24 @@ void reduAuxCGFuncImplScalar(
2080
1992
ReduTupleT<BOPsT...> BOPsTuple,
2081
1993
std::array<bool , sizeof ...(Reductions)> InitToIdentityProps,
2082
1994
std::index_sequence<Is...> ReduIndices) {
2083
- initReduLocalAccs (UniformPow2WG, LID, GID, NWorkItems, WGSize, LocalAccsTuple,
2084
- InAccsTuple, IdentitiesTuple, ReduIndices);
1995
+ // Normally, the local accessors are initialized with elements from the input
1996
+ // accessors. The exception is the case when (GID >= NWorkItems), which
1997
+ // possible only when UniformPow2WG is false. For that case the elements of
1998
+ // local accessors are initialized with identity value, so they would not
1999
+ // give any impact into the final partial sums during the tree-reduction
2000
+ // algorithm work.
2001
+ ((std::get<Is>(LocalAccsTuple)[LID] = UniformPow2WG || GID < NWorkItems
2002
+ ? std::get<Is>(InAccsTuple)[GID]
2003
+ : std::get<Is>(IdentitiesTuple)),
2004
+ ...);
2005
+
2006
+ // For work-groups, which size is not power of two, local accessors have
2007
+ // an additional element with index WGSize that is used by the tree-reduction
2008
+ // algorithm. Initialize those additional elements with identity values here.
2009
+ if (!UniformPow2WG)
2010
+ ((std::get<Is>(LocalAccsTuple)[WGSize] = std::get<Is>(IdentitiesTuple)),
2011
+ ...);
2012
+
2085
2013
NDIt.barrier ();
2086
2014
2087
2015
size_t PrevStep = WGSize;
@@ -2253,14 +2181,17 @@ size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
2253
2181
2254
2182
size_t LocalAccSize = WGSize + (HasUniformWG ? 0 : 1 );
2255
2183
auto LocalAccsTuple =
2256
- createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
2257
- auto InAccsTuple =
2258
- getReadAccsToPreviousPartialReds (CGH, ReduTuple, ReduIndices);
2259
-
2260
- auto IdentitiesTuple = getReduIdentities (ReduTuple, ReduIndices);
2261
- auto BOPsTuple = getReduBOPs (ReduTuple, ReduIndices);
2262
- auto InitToIdentityProps =
2263
- getInitToIdentityProperties (ReduTuple, ReduIndices);
2184
+ makeReduTupleT (local_accessor<typename Reductions::result_type, 1 >{
2185
+ LocalAccSize, CGH}...);
2186
+ auto InAccsTuple = makeReduTupleT (
2187
+ std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds (CGH)...);
2188
+
2189
+ auto IdentitiesTuple =
2190
+ makeReduTupleT (std::get<Is>(ReduTuple).getIdentity ()...);
2191
+ auto BOPsTuple =
2192
+ makeReduTupleT (std::get<Is>(ReduTuple).getBinaryOperation ()...);
2193
+ std::array InitToIdentityProps{
2194
+ std::get<Is>(ReduTuple).initializeToIdentity ()...};
2264
2195
2265
2196
// Predicate/OutAccsTuple below have different type depending on us having
2266
2197
// just a single WG or multiple WGs. Use this lambda to avoid code
0 commit comments