Skip to content

Commit 4950d65

Browse files
committed
simple-stream: avoid memcpy calls in fragmented streams for constant sizes
In 3a24d6d ("simple_stream: add [[gnu::always_inline]]"), we sprinkled [[gnu::always_inline]] to encourage constant propagation of the size parameter in non-fragmented streams. When the size is a constant (which it is when reading serialized integrals), the memcpy() call can be optimized into a single memory-read instruction.

Here, we do the same for fragmented streams. Because the code must be prepared for an integral to span two (or more) fragments, it issues tiny memcpy() calls, which spend a large number of instructions figuring out the right path, only to copy memory with a single instruction and return.

To fix this, we do the following:

1. Split for_each_fragment() into a fast path and a slow path.
2. Select the fast path when the size is a constant and it happens to fit into the current fragment (which is very likely, as constant sizes are usually those of the various integral types).
3. Encourage inlining (without which we don't get constant propagation) with [[gnu::always_inline]].

The fast path is guarded with __builtin_constant_p. For non-constant sizes we'll call memcpy() anyway, so we don't gain much from splitting the paths.

A demonstration of the optimization is available in [1].

[1] https://godbolt.org/z/rWdMa7bfK
1 parent 0a90f79 commit 4950d65

File tree

1 file changed

+30
-0
lines changed

1 file changed

+30
-0
lines changed

include/seastar/core/simple-stream.hh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,11 +129,22 @@ class fragmented_memory_output_stream {
129129
private:
130130
template<typename Func>
131131
//requires requires(Func f, view bv) { { f(bv) } -> void; }
132+
[[gnu::always_inline]]
132133
void for_each_fragment(size_t size, Func&& func) {
133134
if (size > _size) {
134135
throw std::out_of_range("serialization buffer overflow");
135136
}
136137
_size -= size;
138+
// Fast path, avoids memcpy for constant sizes.
139+
if constexpr (__builtin_constant_p(size)) {
140+
if (size <= _current.size()) [[likely]] {
141+
func(_current.write_substream(size));
142+
return;
143+
}
144+
}
145+
for_each_fragment_slowpath(size, std::forward<Func>(func));
146+
}
147+
void for_each_fragment_slowpath(size_t size, std::invocable<simple_memory_output_stream> auto&& func) {
137148
while (size) {
138149
if (!_current.size()) {
139150
_current = simple(reinterpret_cast<char*>((*_it).get_write()), (*_it).size());
@@ -156,9 +167,11 @@ public:
156167
: _it(it), _size(size) {
157168
}
158169

170+
[[gnu::always_inline]]
159171
void skip(size_t size) {
160172
for_each_fragment(size, [] (auto) { });
161173
}
174+
[[gnu::always_inline]]
162175
memory_output_stream<Iterator> write_substream(size_t size) {
163176
if (size > _size) {
164177
throw std::out_of_range("serialization buffer overflow");
@@ -171,12 +184,14 @@ public:
171184
skip(size);
172185
return substream;
173186
}
187+
[[gnu::always_inline]]
174188
void write(const char* p, size_t size) {
175189
for_each_fragment(size, [&p] (auto bv) {
176190
std::copy_n(p, bv.size(), bv.begin());
177191
p += bv.size();
178192
});
179193
}
194+
[[gnu::always_inline]]
180195
void fill(char c, size_t size) {
181196
for_each_fragment(size, [c] (simple fragment) {
182197
std::fill_n(fragment.begin(), fragment.size(), c);
@@ -391,11 +406,22 @@ class fragmented_memory_input_stream {
391406
private:
392407
template<typename Func>
393408
//requires requires(Func f, view bv) { { f(bv) } -> void; }
409+
[[gnu::always_inline]]
394410
void for_each_fragment(size_t size, Func&& func) {
395411
if (size > _size) {
396412
throw std::out_of_range("deserialization buffer underflow");
397413
}
398414
_size -= size;
415+
// Fast path, avoids memcpy for constant sizes.
416+
if constexpr (__builtin_constant_p(size)) {
417+
if (size <= _current.size()) [[likely]] {
418+
func(_current.read_substream(size));
419+
return;
420+
}
421+
}
422+
for_each_fragment_slowpath(size, std::forward<Func>(func));
423+
}
424+
void for_each_fragment_slowpath(size_t size, std::invocable<simple_memory_input_stream> auto&& func) {
399425
while (size) {
400426
if (!_current.size()) {
401427
_current = simple(reinterpret_cast<const char*>((*_it).begin()), (*_it).size());
@@ -416,9 +442,11 @@ public:
416442
: _it(it), _size(size) {
417443
}
418444

445+
[[gnu::always_inline]]
419446
void skip(size_t size) {
420447
for_each_fragment(size, [] (auto) { });
421448
}
449+
[[gnu::always_inline]]
422450
fragmented read_substream(size_t size) {
423451
if (size > _size) {
424452
throw std::out_of_range("deserialization buffer underflow");
@@ -427,12 +455,14 @@ public:
427455
skip(size);
428456
return substream;
429457
}
458+
[[gnu::always_inline]]
430459
void read(char* p, size_t size) {
431460
for_each_fragment(size, [&p] (auto bv) {
432461
p = std::copy_n(bv.begin(), bv.size(), p);
433462
});
434463
}
435464
template<typename Output>
465+
[[gnu::always_inline]]
436466
void copy_to(Output& out) {
437467
for_each_fragment(_size, [&out] (auto bv) {
438468
bv.copy_to(out);

0 commit comments

Comments
 (0)