Skip to content

Commit 70e23bc

Browse files
[NFC][SYCL][Reduction] Inline some helpers to improve code locality (#7209)
It seems that those were mostly to process variadic templates but that acn be done without extra functions (at least in C++17). I've inlined those that either had just one callsite or were short enough.
1 parent bc1b306 commit 70e23bc

File tree

1 file changed

+52
-121
lines changed

1 file changed

+52
-121
lines changed

sycl/include/sycl/reduction.hpp

Lines changed: 52 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -1577,18 +1577,6 @@ reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
15771577
});
15781578
}
15791579

1580-
/// For the given 'Reductions' types pack and indices enumerating only
1581-
/// the reductions for which a local accessors are needed, this function creates
1582-
/// those local accessors and returns a tuple consisting of them.
1583-
template <typename... Reductions, size_t... Is>
1584-
auto createReduLocalAccs(size_t Size, handler &CGH,
1585-
std::index_sequence<Is...>) {
1586-
return makeReduTupleT(
1587-
local_accessor<typename std::tuple_element_t<
1588-
Is, std::tuple<Reductions...>>::result_type,
1589-
1>{Size, CGH}...);
1590-
}
1591-
15921580
/// For the given 'Reductions' types pack and indices enumerating them this
15931581
/// function either creates new temporary accessors for partial sums (if IsOneWG
15941582
/// is false) or returns user's accessor/USM-pointer if (IsOneWG is true).
@@ -1603,95 +1591,6 @@ auto createReduOutAccs(size_t NWorkGroups, handler &CGH,
16031591
CGH)...);
16041592
}
16051593

1606-
/// For the given 'Reductions' types pack and indices enumerating them this
1607-
/// function returns accessors to buffers holding partial sums generated in the
1608-
/// previous kernel invocation.
1609-
template <typename... Reductions, size_t... Is>
1610-
auto getReadAccsToPreviousPartialReds(handler &CGH,
1611-
std::tuple<Reductions...> &ReduTuple,
1612-
std::index_sequence<Is...>) {
1613-
return makeReduTupleT(
1614-
std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
1615-
}
1616-
1617-
template <typename... Reductions, size_t... Is>
1618-
ReduTupleT<typename Reductions::result_type...>
1619-
getReduIdentities(std::tuple<Reductions...> &ReduTuple,
1620-
std::index_sequence<Is...>) {
1621-
return {std::get<Is>(ReduTuple).getIdentity()...};
1622-
}
1623-
1624-
template <typename... Reductions, size_t... Is>
1625-
ReduTupleT<typename Reductions::binary_operation...>
1626-
getReduBOPs(std::tuple<Reductions...> &ReduTuple, std::index_sequence<Is...>) {
1627-
return {std::get<Is>(ReduTuple).getBinaryOperation()...};
1628-
}
1629-
1630-
template <typename... Reductions, size_t... Is>
1631-
std::array<bool, sizeof...(Reductions)>
1632-
getInitToIdentityProperties(std::tuple<Reductions...> &ReduTuple,
1633-
std::index_sequence<Is...>) {
1634-
return {std::get<Is>(ReduTuple).initializeToIdentity()...};
1635-
}
1636-
1637-
template <typename... Reductions, size_t... Is>
1638-
std::tuple<typename Reductions::reducer_type...>
1639-
createReducers(ReduTupleT<typename Reductions::result_type...> Identities,
1640-
ReduTupleT<typename Reductions::binary_operation...> BOPsTuple,
1641-
std::index_sequence<Is...>) {
1642-
return {typename Reductions::reducer_type{std::get<Is>(Identities),
1643-
std::get<Is>(BOPsTuple)}...};
1644-
}
1645-
1646-
template <typename KernelType, int Dims, typename... ReducerT, size_t... Is>
1647-
void callReduUserKernelFunc(KernelType KernelFunc, nd_item<Dims> NDIt,
1648-
std::tuple<ReducerT...> &Reducers,
1649-
std::index_sequence<Is...>) {
1650-
KernelFunc(NDIt, std::get<Is>(Reducers)...);
1651-
}
1652-
1653-
template <typename... LocalAccT, typename... ReducerT, typename... ResultT,
1654-
size_t... Is>
1655-
void initReduLocalAccs(bool Pow2WG, size_t LID, size_t WGSize,
1656-
ReduTupleT<LocalAccT...> LocalAccs,
1657-
const std::tuple<ReducerT...> &Reducers,
1658-
ReduTupleT<ResultT...> Identities,
1659-
std::index_sequence<Is...>) {
1660-
((std::get<Is>(LocalAccs)[LID] = std::get<Is>(Reducers).MValue), ...);
1661-
1662-
// For work-groups, which size is not power of two, local accessors have
1663-
// an additional element with index WGSize that is used by the tree-reduction
1664-
// algorithm. Initialize those additional elements with identity values here.
1665-
if (!Pow2WG)
1666-
((std::get<Is>(LocalAccs)[WGSize] = std::get<Is>(Identities)), ...);
1667-
}
1668-
1669-
template <typename... LocalAccT, typename... InputAccT, typename... ResultT,
1670-
size_t... Is>
1671-
void initReduLocalAccs(bool UniformPow2WG, size_t LID, size_t GID,
1672-
size_t NWorkItems, size_t WGSize,
1673-
ReduTupleT<InputAccT...> LocalAccs,
1674-
ReduTupleT<LocalAccT...> InputAccs,
1675-
ReduTupleT<ResultT...> Identities,
1676-
std::index_sequence<Is...>) {
1677-
// Normally, the local accessors are initialized with elements from the input
1678-
// accessors. The exception is the case when (GID >= NWorkItems), which
1679-
// possible only when UniformPow2WG is false. For that case the elements of
1680-
// local accessors are initialized with identity value, so they would not
1681-
// give any impact into the final partial sums during the tree-reduction
1682-
// algorithm work.
1683-
if (UniformPow2WG || GID < NWorkItems)
1684-
((std::get<Is>(LocalAccs)[LID] = std::get<Is>(InputAccs)[GID]), ...);
1685-
else
1686-
((std::get<Is>(LocalAccs)[LID] = std::get<Is>(Identities)), ...);
1687-
1688-
// For work-groups, which size is not power of two, local accessors have
1689-
// an additional element with index WGSize that is used by the tree-reduction
1690-
// algorithm. Initialize those additional elements with identity values here.
1691-
if (!UniformPow2WG)
1692-
((std::get<Is>(LocalAccs)[WGSize] = std::get<Is>(Identities)), ...);
1693-
}
1694-
16951594
template <typename... LocalAccT, typename... BOPsT, size_t... Is>
16961595
void reduceReduLocalAccs(size_t IndexA, size_t IndexB,
16971596
ReduTupleT<LocalAccT...> LocalAccs,
@@ -1830,8 +1729,16 @@ void reduCGFuncImplScalar(
18301729
std::index_sequence<Is...> ReduIndices) {
18311730
size_t WGSize = NDIt.get_local_range().size();
18321731
size_t LID = NDIt.get_local_linear_id();
1833-
initReduLocalAccs(Pow2WG, LID, WGSize, LocalAccsTuple, ReducersTuple,
1834-
IdentitiesTuple, ReduIndices);
1732+
1733+
((std::get<Is>(LocalAccsTuple)[LID] = std::get<Is>(ReducersTuple).MValue),
1734+
...);
1735+
1736+
// For work-groups, which size is not power of two, local accessors have
1737+
// an additional element with index WGSize that is used by the tree-reduction
1738+
// algorithm. Initialize those additional elements with identity values here.
1739+
if (!Pow2WG)
1740+
((std::get<Is>(LocalAccsTuple)[WGSize] = std::get<Is>(IdentitiesTuple)),
1741+
...);
18351742
NDIt.barrier();
18361743

18371744
size_t PrevStep = WGSize;
@@ -1967,8 +1874,10 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
19671874

19681875
// Create inputs using the global order of all reductions
19691876
size_t LocalAccSize = WGSize + (Pow2WG ? 0 : 1);
1877+
19701878
auto LocalAccsTuple =
1971-
createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
1879+
makeReduTupleT(local_accessor<typename Reductions::result_type, 1>{
1880+
LocalAccSize, CGH}...);
19721881

19731882
size_t NWorkGroups = Range.get_group_range().size();
19741883
bool IsOneWG = NWorkGroups == 1;
@@ -1977,20 +1886,23 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
19771886
// one WorkGroup and when there are multiple. Use this lambda to write the
19781887
// code just once.
19791888
auto Rest = [&](auto OutAccsTuple) {
1980-
auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
1981-
auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
1982-
auto InitToIdentityProps =
1983-
getInitToIdentityProperties(ReduTuple, ReduIndices);
1889+
auto IdentitiesTuple =
1890+
makeReduTupleT(std::get<Is>(ReduTuple).getIdentity()...);
1891+
auto BOPsTuple =
1892+
makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
1893+
std::array InitToIdentityProps{
1894+
std::get<Is>(ReduTuple).initializeToIdentity()...};
19841895

19851896
using Name = __sycl_reduction_kernel<reduction::main_krn::NDRangeMulti,
19861897
KernelName, decltype(OutAccsTuple)>;
19871898
CGH.parallel_for<Name>(Range, Properties, [=](nd_item<Dims> NDIt) {
19881899
// Pass all reductions to user's lambda in the same order as supplied
19891900
// Each reducer initializes its own storage
19901901
auto ReduIndices = std::index_sequence_for<Reductions...>();
1991-
auto ReducersTuple = createReducers<Reductions...>(
1992-
IdentitiesTuple, BOPsTuple, ReduIndices);
1993-
callReduUserKernelFunc(KernelFunc, NDIt, ReducersTuple, ReduIndices);
1902+
auto ReducersTuple = std::tuple{typename Reductions::reducer_type{
1903+
std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
1904+
std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); },
1905+
ReducersTuple);
19941906

19951907
// Combine and write-back the results of any scalar reductions
19961908
// reduCGFuncImplScalar<Reductions...>(NDIt, LocalAccsTuple, OutAccsTuple,
@@ -2080,8 +1992,24 @@ void reduAuxCGFuncImplScalar(
20801992
ReduTupleT<BOPsT...> BOPsTuple,
20811993
std::array<bool, sizeof...(Reductions)> InitToIdentityProps,
20821994
std::index_sequence<Is...> ReduIndices) {
2083-
initReduLocalAccs(UniformPow2WG, LID, GID, NWorkItems, WGSize, LocalAccsTuple,
2084-
InAccsTuple, IdentitiesTuple, ReduIndices);
1995+
// Normally, the local accessors are initialized with elements from the input
1996+
// accessors. The exception is the case when (GID >= NWorkItems), which
1997+
// possible only when UniformPow2WG is false. For that case the elements of
1998+
// local accessors are initialized with identity value, so they would not
1999+
// give any impact into the final partial sums during the tree-reduction
2000+
// algorithm work.
2001+
((std::get<Is>(LocalAccsTuple)[LID] = UniformPow2WG || GID < NWorkItems
2002+
? std::get<Is>(InAccsTuple)[GID]
2003+
: std::get<Is>(IdentitiesTuple)),
2004+
...);
2005+
2006+
// For work-groups, which size is not power of two, local accessors have
2007+
// an additional element with index WGSize that is used by the tree-reduction
2008+
// algorithm. Initialize those additional elements with identity values here.
2009+
if (!UniformPow2WG)
2010+
((std::get<Is>(LocalAccsTuple)[WGSize] = std::get<Is>(IdentitiesTuple)),
2011+
...);
2012+
20852013
NDIt.barrier();
20862014

20872015
size_t PrevStep = WGSize;
@@ -2253,14 +2181,17 @@ size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
22532181

22542182
size_t LocalAccSize = WGSize + (HasUniformWG ? 0 : 1);
22552183
auto LocalAccsTuple =
2256-
createReduLocalAccs<Reductions...>(LocalAccSize, CGH, ReduIndices);
2257-
auto InAccsTuple =
2258-
getReadAccsToPreviousPartialReds(CGH, ReduTuple, ReduIndices);
2259-
2260-
auto IdentitiesTuple = getReduIdentities(ReduTuple, ReduIndices);
2261-
auto BOPsTuple = getReduBOPs(ReduTuple, ReduIndices);
2262-
auto InitToIdentityProps =
2263-
getInitToIdentityProperties(ReduTuple, ReduIndices);
2184+
makeReduTupleT(local_accessor<typename Reductions::result_type, 1>{
2185+
LocalAccSize, CGH}...);
2186+
auto InAccsTuple = makeReduTupleT(
2187+
std::get<Is>(ReduTuple).getReadAccToPreviousPartialReds(CGH)...);
2188+
2189+
auto IdentitiesTuple =
2190+
makeReduTupleT(std::get<Is>(ReduTuple).getIdentity()...);
2191+
auto BOPsTuple =
2192+
makeReduTupleT(std::get<Is>(ReduTuple).getBinaryOperation()...);
2193+
std::array InitToIdentityProps{
2194+
std::get<Is>(ReduTuple).initializeToIdentity()...};
22642195

22652196
// Predicate/OutAccsTuple below have different type depending on us having
22662197
// just a single WG or multiple WGs. Use this lambda to avoid code

0 commit comments

Comments
 (0)