Skip to content

Commit 9827440

Browse files
authored
[libc++] Optimize ranges::{for_each, for_each_n} for segmented iterators (#132896)
Previously, the segmented iterator optimization was limited to `std::{for_each, for_each_n}`. This patch extends the optimization to `std::ranges::for_each` and `std::ranges::for_each_n`, ensuring consistent optimizations across these algorithms. This patch first generalizes the `std` algorithms by introducing a `Projection` parameter, which is set to `__identity` for the `std` algorithms. Then we let the `ranges` algorithms directly call their `std` counterparts with a general `__proj` argument. Benchmarks demonstrate performance improvements of up to 21.4x for ``std::deque::iterator`` and 22.3x for ``join_view`` of ``vector<vector<char>>``. Addresses a subtask of #102817.
1 parent dd40c46 commit 9827440

File tree

12 files changed

+197
-57
lines changed

12 files changed

+197
-57
lines changed

libcxx/docs/ReleaseNotes/21.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,9 @@ Improvements and New Features
7070
- The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
7171
in C++23 and later.
7272

73-
- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
74-
up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
73+
- The ``std::for_each_n``, ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for
74+
segmented iterators, resulting in a performance improvement of up to 17.7x for ``std::deque<short>`` iterators, and up
75+
to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
7576

7677
- The ``bitset::to_string`` function has been optimized, resulting in a performance improvement of up to 8.3x for bitsets
7778
with uniformly distributed zeros and ones, and up to 13.5x and 16.1x for sparse and dense bitsets, respectively.

libcxx/include/__algorithm/for_each.h

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,41 +12,54 @@
1212

1313
#include <__algorithm/for_each_segment.h>
1414
#include <__config>
15+
#include <__functional/identity.h>
1516
#include <__iterator/segmented_iterator.h>
1617
#include <__type_traits/enable_if.h>
18+
#include <__type_traits/invoke.h>
19+
#include <__utility/move.h>
1720

1821
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
1922
# pragma GCC system_header
2023
#endif
2124

25+
_LIBCPP_PUSH_MACROS
26+
#include <__undef_macros>
27+
2228
_LIBCPP_BEGIN_NAMESPACE_STD
2329

24-
template <class _InputIterator, class _Sent, class _Func>
25-
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __for_each(_InputIterator __first, _Sent __last, _Func& __f) {
30+
template <class _InputIterator, class _Sent, class _Func, class _Proj>
31+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
32+
__for_each(_InputIterator __first, _Sent __last, _Func& __f, _Proj& __proj) {
2633
for (; __first != __last; ++__first)
27-
__f(*__first);
34+
std::__invoke(__f, std::__invoke(__proj, *__first));
35+
return __first;
2836
}
2937

3038
#ifndef _LIBCPP_CXX03_LANG
3139
template <class _SegmentedIterator,
32-
class _Function,
40+
class _Func,
41+
class _Proj,
3342
__enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
34-
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
35-
__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
43+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
44+
__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Func& __func, _Proj& __proj) {
3645
using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
3746
std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
38-
std::__for_each(__lfirst, __llast, __func);
47+
std::__for_each(__lfirst, __llast, __func, __proj);
3948
});
49+
return __last;
4050
}
4151
#endif // !_LIBCPP_CXX03_LANG
4252

43-
template <class _InputIterator, class _Function>
44-
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
45-
for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
46-
std::__for_each(__first, __last, __f);
53+
template <class _InputIterator, class _Func>
54+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Func
55+
for_each(_InputIterator __first, _InputIterator __last, _Func __f) {
56+
__identity __proj;
57+
std::__for_each(__first, __last, __f, __proj);
4758
return __f;
4859
}
4960

5061
_LIBCPP_END_NAMESPACE_STD
5162

63+
_LIBCPP_POP_MACROS
64+
5265
#endif // _LIBCPP___ALGORITHM_FOR_EACH_H

libcxx/include/__algorithm/for_each_n.h

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@
1313
#include <__algorithm/for_each.h>
1414
#include <__algorithm/for_each_n_segment.h>
1515
#include <__config>
16+
#include <__functional/identity.h>
1617
#include <__iterator/iterator_traits.h>
1718
#include <__iterator/segmented_iterator.h>
1819
#include <__type_traits/disjunction.h>
1920
#include <__type_traits/enable_if.h>
21+
#include <__type_traits/invoke.h>
2022
#include <__type_traits/negation.h>
2123
#include <__utility/convert_to_integral.h>
2224
#include <__utility/move.h>
@@ -33,16 +35,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
3335
template <class _InputIterator,
3436
class _Size,
3537
class _Func,
38+
class _Proj,
3639
__enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
3740
_Or< _Not<__is_segmented_iterator<_InputIterator> >,
3841
_Not<__has_random_access_local_iterator<_InputIterator> > >::value,
3942
int> = 0>
4043
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
41-
__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
44+
__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
4245
typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
4346
_IntegralSize __n = __orig_n;
4447
while (__n > 0) {
45-
__f(*__first);
48+
std::__invoke(__f, std::__invoke(__proj, *__first));
4649
++__first;
4750
--__n;
4851
}
@@ -52,39 +55,42 @@ __for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
5255
template <class _RandIter,
5356
class _Size,
5457
class _Func,
58+
class _Proj,
5559
__enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
5660
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
57-
__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
61+
__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
5862
typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
5963
auto __last = __first + __n;
60-
std::__for_each(__first, __last, __f);
61-
return std::move(__last);
64+
std::__for_each(__first, __last, __f, __proj);
65+
return __last;
6266
}
6367

6468
#ifndef _LIBCPP_CXX03_LANG
6569
template <class _SegmentedIterator,
6670
class _Size,
6771
class _Func,
72+
class _Proj,
6873
__enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
6974
__is_segmented_iterator<_SegmentedIterator>::value &&
7075
__has_random_access_iterator_category<
7176
typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
7277
int> = 0>
7378
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
74-
__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
79+
__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
7580
using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
7681
return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
77-
std::__for_each(__lfirst, __llast, __f);
82+
std::__for_each(__lfirst, __llast, __f, __proj);
7883
});
7984
}
8085
#endif // !_LIBCPP_CXX03_LANG
8186

8287
#if _LIBCPP_STD_VER >= 17
8388

84-
template <class _InputIterator, class _Size, class _Function>
89+
template <class _InputIterator, class _Size, class _Func>
8590
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
86-
for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
87-
return std::__for_each_n(__first, __orig_n, __f);
91+
for_each_n(_InputIterator __first, _Size __orig_n, _Func __f) {
92+
__identity __proj;
93+
return std::__for_each_n(__first, __orig_n, __f, __proj);
8894
}
8995

9096
#endif // _LIBCPP_STD_VER >= 17

libcxx/include/__algorithm/ranges_for_each.h

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@
99
#ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
1010
#define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
1111

12+
#include <__algorithm/for_each.h>
13+
#include <__algorithm/for_each_n.h>
1214
#include <__algorithm/in_fun_result.h>
15+
#include <__concepts/assignable.h>
1316
#include <__config>
1417
#include <__functional/identity.h>
15-
#include <__functional/invoke.h>
1618
#include <__iterator/concepts.h>
1719
#include <__iterator/projected.h>
1820
#include <__ranges/access.h>
@@ -41,9 +43,17 @@ struct __for_each {
4143
template <class _Iter, class _Sent, class _Proj, class _Func>
4244
_LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
4345
__for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
44-
for (; __first != __last; ++__first)
45-
std::invoke(__func, std::invoke(__proj, *__first));
46-
return {std::move(__first), std::move(__func)};
46+
// In the case where we have different iterator and sentinel types, the segmented iterator optimization
47+
// in std::for_each will not kick in. Therefore, we prefer std::for_each_n in that case (whenever we can
48+
// obtain the `n`).
49+
if constexpr (!std::assignable_from<_Iter&, _Sent> && std::sized_sentinel_for<_Sent, _Iter>) {
50+
auto __n = __last - __first;
51+
auto __end = std::__for_each_n(std::move(__first), __n, __func, __proj);
52+
return {std::move(__end), std::move(__func)};
53+
} else {
54+
auto __end = std::__for_each(std::move(__first), std::move(__last), __func, __proj);
55+
return {std::move(__end), std::move(__func)};
56+
}
4757
}
4858

4959
public:

libcxx/include/__algorithm/ranges_for_each_n.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99
#ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
1010
#define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
1111

12+
#include <__algorithm/for_each_n.h>
1213
#include <__algorithm/in_fun_result.h>
1314
#include <__config>
1415
#include <__functional/identity.h>
15-
#include <__functional/invoke.h>
1616
#include <__iterator/concepts.h>
1717
#include <__iterator/incrementable_traits.h>
1818
#include <__iterator/iterator_traits.h>
@@ -40,11 +40,8 @@ struct __for_each_n {
4040
template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
4141
_LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
4242
operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
43-
while (__count-- > 0) {
44-
std::invoke(__func, std::invoke(__proj, *__first));
45-
++__first;
46-
}
47-
return {std::move(__first), std::move(__func)};
43+
auto __last = std::__for_each_n(std::move(__first), __count, __func, __proj);
44+
return {std::move(__last), std::move(__func)};
4845
}
4946
};
5047

libcxx/include/experimental/iterator

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ _LIBCPP_POP_MACROS
127127
# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
128128
# include <cstddef>
129129
# include <iosfwd>
130+
# include <optional>
130131
# include <type_traits>
131132
# endif
132133
#endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)

libcxx/include/mutex

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,7 @@ _LIBCPP_POP_MACROS
504504
# include <initializer_list>
505505
# include <iosfwd>
506506
# include <new>
507+
# include <optional>
507508
# include <stdexcept>
508509
# include <system_error>
509510
# include <type_traits>

libcxx/include/shared_mutex

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ _LIBCPP_POP_MACROS
457457
# endif // _LIBCPP_HAS_THREADS
458458

459459
# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
460+
# include <optional>
460461
# include <system_error>
461462
# endif
462463
#endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)

libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <cstddef>
1313
#include <deque>
1414
#include <list>
15+
#include <ranges>
1516
#include <string>
1617
#include <vector>
1718

@@ -23,6 +24,7 @@ int main(int argc, char** argv) {
2324
// {std,ranges}::for_each
2425
{
2526
auto bm = []<class Container>(std::string name, auto for_each) {
27+
using ElemType = typename Container::value_type;
2628
benchmark::RegisterBenchmark(
2729
name,
2830
[for_each](auto& st) {
@@ -33,15 +35,14 @@ int main(int argc, char** argv) {
3335

3436
for ([[maybe_unused]] auto _ : st) {
3537
benchmark::DoNotOptimize(c);
36-
auto result = for_each(first, last, [](int& x) { x = std::clamp(x, 10, 100); });
38+
auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
3739
benchmark::DoNotOptimize(result);
3840
}
3941
})
4042
->Arg(8)
4143
->Arg(32)
4244
->Arg(50) // non power-of-two
43-
->Arg(8192)
44-
->Arg(1 << 20);
45+
->Arg(8192);
4546
};
4647
bm.operator()<std::vector<int>>("std::for_each(vector<int>)", std_for_each);
4748
bm.operator()<std::deque<int>>("std::for_each(deque<int>)", std_for_each);
@@ -51,6 +52,42 @@ int main(int argc, char** argv) {
5152
bm.operator()<std::list<int>>("rng::for_each(list<int>)", std::ranges::for_each);
5253
}
5354

55+
// {std,ranges}::for_each for join_view
56+
{
57+
auto bm = []<class Container>(std::string name, auto for_each) {
58+
using C1 = typename Container::value_type;
59+
using ElemType = typename C1::value_type;
60+
61+
benchmark::RegisterBenchmark(
62+
name,
63+
[for_each](auto& st) {
64+
std::size_t const size = st.range(0);
65+
std::size_t const seg_size = 256;
66+
std::size_t const segments = (size + seg_size - 1) / seg_size;
67+
Container c(segments);
68+
for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
69+
c[i].resize(std::min(seg_size, n), ElemType(1));
70+
}
71+
72+
auto view = c | std::views::join;
73+
auto first = view.begin();
74+
auto last = view.end();
75+
76+
for ([[maybe_unused]] auto _ : st) {
77+
benchmark::DoNotOptimize(c);
78+
auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
79+
benchmark::DoNotOptimize(result);
80+
}
81+
})
82+
->Arg(8)
83+
->Arg(32)
84+
->Arg(50) // non power-of-two
85+
->Arg(8192);
86+
};
87+
bm.operator()<std::vector<std::vector<int>>>("std::for_each(join_view(vector<vector<int>>))", std_for_each);
88+
bm.operator()<std::vector<std::vector<int>>>("rng::for_each(join_view(vector<vector<int>>)", std::ranges::for_each);
89+
}
90+
5491
benchmark::Initialize(&argc, argv);
5592
benchmark::RunSpecifiedBenchmarks();
5693
benchmark::Shutdown();

libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
int main(int argc, char** argv) {
2222
auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
2323

24-
// std::for_each_n
24+
// {std,ranges}::for_each_n
2525
{
2626
auto bm = []<class Container>(std::string name, auto for_each_n) {
2727
using ElemType = typename Container::value_type;
@@ -41,19 +41,17 @@ int main(int argc, char** argv) {
4141
->Arg(8)
4242
->Arg(32)
4343
->Arg(50) // non power-of-two
44-
->Arg(1024)
45-
->Arg(4096)
46-
->Arg(8192)
47-
->Arg(1 << 14)
48-
->Arg(1 << 16)
49-
->Arg(1 << 18);
44+
->Arg(8192);
5045
};
5146
bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
5247
bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
5348
bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
49+
bm.operator()<std::vector<int>>("rng::for_each_n(vector<int>)", std::ranges::for_each_n);
50+
bm.operator()<std::deque<int>>("rng::for_each_n(deque<int>)", std::ranges::for_each_n);
51+
bm.operator()<std::list<int>>("rng::for_each_n(list<int>)", std::ranges::for_each_n);
5452
}
5553

56-
// std::for_each_n for join_view
54+
// {std,ranges}::for_each_n for join_view
5755
{
5856
auto bm = []<class Container>(std::string name, auto for_each_n) {
5957
using C1 = typename Container::value_type;
@@ -81,14 +79,11 @@ int main(int argc, char** argv) {
8179
->Arg(8)
8280
->Arg(32)
8381
->Arg(50) // non power-of-two
84-
->Arg(1024)
85-
->Arg(4096)
86-
->Arg(8192)
87-
->Arg(1 << 14)
88-
->Arg(1 << 16)
89-
->Arg(1 << 18);
82+
->Arg(8192);
9083
};
9184
bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
85+
bm.operator()<std::vector<std::vector<int>>>(
86+
"rng::for_each_n(join_view(vector<vector<int>>)", std::ranges::for_each_n);
9287
}
9388

9489
benchmark::Initialize(&argc, argv);

0 commit comments

Comments
 (0)