// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+ // The functions defined in this file give approximate code sizes. These sizes
+ // assume the following configuration options:
+ // - LIBC_CONF_KEEP_FRAME_POINTER = false
+ // - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false
+ // - LIBC_ADD_NULL_CHECKS = false
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H

#include "src/__support/macros/attributes.h"   // LIBC_INLINE
#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL
+ #include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY
#include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align

#include <stddef.h> // size_t

- // https://libc.llvm.org/compiler_support.html
- // Support for [[likely]] / [[unlikely]]
- //   [X] GCC 12.2
- //   [X] Clang 12
- //   [ ] Clang 11
- #define LIBC_ATTR_LIKELY [[likely]]
- #define LIBC_ATTR_UNLIKELY [[unlikely]]
-
- #if defined(LIBC_COMPILER_IS_CLANG)
- #if LIBC_COMPILER_CLANG_VER < 1200
- #undef LIBC_ATTR_LIKELY
- #undef LIBC_ATTR_UNLIKELY
- #define LIBC_ATTR_LIKELY
- #define LIBC_ATTR_UNLIKELY
- #endif
- #endif
-
namespace LIBC_NAMESPACE_DECL {

namespace {

- LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
-
- enum Strategy {
-   ForceWordLdStChain,
-   AssumeWordAligned,
-   AssumeUnaligned,
- };
+ // Performs a copy of `bytes` bytes from `src` to `dst`. This function has the
+ // semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is
+ // free to use whatever instruction is best for the size and assumed access.
+ template <size_t bytes, AssumeAccess access>
+ LIBC_INLINE void copy(void *dst, const void *src) {
+   if constexpr (access == AssumeAccess::kAligned) {
+     constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes;
+     memcpy_inline<bytes>(assume_aligned<alignment>(dst),
+                          assume_aligned<alignment>(src));
+   } else if constexpr (access == AssumeAccess::kUnknown) {
+     memcpy_inline<bytes>(dst, src);
+   } else {
+     static_assert(false);
+   }
+ }

- template <size_t bytes, Strategy strategy = AssumeUnaligned>
- LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
-   if constexpr (strategy == AssumeUnaligned) {
-     memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
-   } else if constexpr (strategy == AssumeWordAligned) {
-     static_assert(bytes >= kWordSize);
-     memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
-                          assume_aligned<kWordSize>(src));
-   } else if constexpr (strategy == ForceWordLdStChain) {
+ template <size_t bytes, BlockOp block_op = BlockOp::kFull,
+           AssumeAccess access = AssumeAccess::kUnknown>
+ LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
+   if constexpr (block_op == BlockOp::kFull) {
+     copy<bytes, access>(dst, src);
+   } else if constexpr (block_op == BlockOp::kByWord) {
    // We restrict loads/stores to 4 bytes to prevent the use of load/store
-     // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
-     // fault (see notes below) and second, they use more registers which in turn
-     // adds push/pop instructions in the hot path.
+     // multiple (LDM, STM) and load/store double (LDRD, STRD).
    static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
    LIBC_LOOP_UNROLL
-     for (size_t i = 0; i < bytes / kWordSize; ++i) {
-       const size_t offset = i * kWordSize;
-       memcpy_inline<kWordSize>(dst + offset, src + offset);
+     for (size_t offset = 0; offset < bytes; offset += kWordSize) {
+       copy<kWordSize, access>(dst + offset, src + offset);
    }
+   } else {
+     static_assert(false, "Invalid BlockOp");
  }
  // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
  // into the load/store instructions.
@@ -72,39 +65,27 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
  src += bytes;
}

- LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
-                                               const size_t size) {
+ template <size_t bytes, BlockOp block_op, AssumeAccess access>
+ LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) {
  LIBC_LOOP_NOUNROLL
-   for (size_t i = 0; i < size; ++i)
-     *dst++ = *src++;
+   for (size_t i = 0; i < size / bytes; ++i)
+     copy_block_and_bump_pointers<bytes, block_op, access>(dst, src);
+   size %= bytes;
}

- template <size_t block_size, Strategy strategy>
- LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
-                                              size_t &size) {
+ [[maybe_unused]] LIBC_INLINE void
+ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
  LIBC_LOOP_NOUNROLL
-   for (size_t i = 0; i < size / block_size; ++i)
-     copy_and_bump_pointers<block_size, strategy>(dst, src);
-   // Update `size` once at the end instead of once per iteration.
-   size %= block_size;
- }
-
- LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
-   return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
-                              cpp::bit_cast<uintptr_t>(b));
- }
-
- LIBC_INLINE auto misaligned(CPtr a) {
-   return distance_to_align_down<kWordSize>(a);
+   for (size_t i = 0; i < size; ++i)
+     *dst++ = *src++;
}

} // namespace

- // Implementation for Cortex-M0, M0+, M1.
- // Notes:
- //   - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
- //     that also needs to return the `dst` ptr.
- //   - These cores do not allow for unaligned loads/stores.
+ // Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned
+ // loads/stores. It compiles down to 208 bytes when used through `memcpy` that
+ // also needs to return the `dst` ptr.
+ // Note:
// - When `src` and `dst` are coaligned, we start by aligning them and perform
//   bulk copies. We let the compiler know the pointers are aligned so it can
//   use load/store multiple (LDM, STM). This significantly increases throughput
@@ -125,9 +106,18 @@ LIBC_INLINE auto misaligned(CPtr a) {
    if (src_alignment == 0)
      LIBC_ATTR_LIKELY {
        // Both `src` and `dst` are now word-aligned.
-         copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
-         copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
-         copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
+         // We first copy by blocks of 64 bytes; the compiler will use 4
+         // load/store multiple (LDM, STM), each of 4 words. This requires more
+         // registers, so additional push/pop instructions are needed, but the
+         // speedup is worth it.
+         consume_by_block<64, BlockOp::kFull, AssumeAccess::kAligned>(dst, src,
+                                                                      size);
+         // Then we use blocks of 4-word load/store.
+         consume_by_block<16, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
+                                                                        size);
+         // Then we use word-by-word copy.
+         consume_by_block<4, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
+                                                                       size);
      }
    else {
      // `dst` is aligned but `src` is not.
@@ -138,7 +128,7 @@ LIBC_INLINE auto misaligned(CPtr a) {
            src_alignment == 2
                ? load_aligned<uint32_t, uint16_t, uint16_t>(src)
                : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
-         memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
+         copy<kWordSize, AssumeAccess::kAligned>(dst, &value);
        dst += kWordSize;
        src += kWordSize;
        size -= kWordSize;
@@ -151,56 +141,69 @@ LIBC_INLINE auto misaligned(CPtr a) {
}

// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
- // support for unaligned loads and stores.
- // Notes:
- //   - It compiles down to 266 bytes.
- //   - `dst` and `src` are not `__restrict` to prevent the compiler from
- //     reordering loads/stores.
- //   - We keep state variables to a strict minimum to keep everything in the free
- //     registers and prevent costly push / pop.
- //   - If unaligned single loads/stores to normal memory are supported, unaligned
- //     accesses for load/store multiple (LDM, STM) and load/store double (LDRD,
- //     STRD) instructions are generally not supported and will still fault so we
- //     make sure to restrict unrolling to word loads/stores.
+ // support for unaligned loads and stores. It compiles down to 272 bytes when
+ // used through `memcpy` that also needs to return the `dst` ptr.
[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
                                                            size_t size) {
  if (misaligned(bitwise_or(src, dst)))
    LIBC_ATTR_UNLIKELY {
      if (size < 8)
        LIBC_ATTR_UNLIKELY {
          if (size & 1)
-             copy_and_bump_pointers<1>(dst, src);
+             copy_block_and_bump_pointers<1>(dst, src);
          if (size & 2)
-             copy_and_bump_pointers<2>(dst, src);
+             copy_block_and_bump_pointers<2>(dst, src);
          if (size & 4)
-             copy_and_bump_pointers<4>(dst, src);
+             copy_block_and_bump_pointers<4>(dst, src);
          return;
        }
      if (misaligned(src))
        LIBC_ATTR_UNLIKELY {
          const size_t offset = distance_to_align_up<kWordSize>(dst);
          if (offset & 1)
-             copy_and_bump_pointers<1>(dst, src);
+             copy_block_and_bump_pointers<1>(dst, src);
          if (offset & 2)
-             copy_and_bump_pointers<2>(dst, src);
+             copy_block_and_bump_pointers<2>(dst, src);
          size -= offset;
        }
    }
-   copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
-   copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
-   copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
+   // `dst` and `src` are not necessarily both aligned at that point but this
+   // implementation assumes hardware support for unaligned loads and stores so
+   // it is still fast to perform an unrolled word-by-word copy. Note that wider
+   // accesses through the use of load/store multiple (LDM, STM) and load/store
+   // double (LDRD, STRD) instructions are generally not supported and can fault.
+   // By forcing decomposition of the 64-byte copy into word-by-word copies, the
+   // compiler can use the first load to prefetch memory:
+   //   ldr r3, [r1, #64]! <- prefetch next cache line
+   //   str r3, [r0]
+   //   ldr r3, [r1, #0x4]
+   //   str r3, [r0, #0x4]
+   //   ...
+   //   ldr r3, [r1, #0x3c]
+   //   str r3, [r0, #0x3c]
+   // This is a bit detrimental for sizes between 64 and 256 (less than 10%
+   // penalty) but the prefetch yields better throughput for larger copies.
+   consume_by_block<64, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
+                                                                  size);
+   consume_by_block<16, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
+                                                                  size);
+   consume_by_block<4, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, size);
  if (size & 1)
-     copy_and_bump_pointers<1>(dst, src);
+     copy_block_and_bump_pointers<1>(dst, src);
  if (size & 2)
-     LIBC_ATTR_UNLIKELY
-       copy_and_bump_pointers<2>(dst, src);
+     copy_block_and_bump_pointers<2>(dst, src);
}

- [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
-                                                     const void *__restrict src_,
+ [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src,
                                                     size_t size) {
-   Ptr dst = cpp::bit_cast<Ptr>(dst_);
-   CPtr src = cpp::bit_cast<CPtr>(src_);
+   // The compiler performs alias analysis and is able to prove that `dst` and
+   // `src` do not alias by propagating the `__restrict` keyword from the
+   // `memcpy` prototype. This allows the compiler to merge consecutive
+   // load/store (LDR, STR) instructions generated in
+   // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store
+   // double (LDRD, STRD) instructions, which is undesirable, so we prevent the
+   // compiler from inferring `__restrict` with the following line.
+   asm volatile("" : "+r"(dst), "+r"(src));
#ifdef __ARM_FEATURE_UNALIGNED
  return inline_memcpy_arm_mid_end(dst, src, size);
#else
@@ -210,8 +213,4 @@ LIBC_INLINE auto misaligned(CPtr a) {

} // namespace LIBC_NAMESPACE_DECL

- // Cleanup local macros
- #undef LIBC_ATTR_LIKELY
- #undef LIBC_ATTR_UNLIKELY
-
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
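
For reference, the optimization barrier used in `inline_memcpy_arm` can be reproduced in isolation. The sketch below is illustrative only: it uses hypothetical names (`copy_word`, `copy_by_word`) and plain `memcpy` in place of the patch's `memcpy_inline`, and shows how an empty `asm volatile` with "+r" constraints keeps the compiler from carrying `__restrict`-based aliasing facts across the barrier.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Copies one 32-bit word; a fixed-size memcpy lowers to a single LDR/STR pair
// on ARM.
static inline void copy_word(char *dst, const char *src) {
  uint32_t tmp;
  memcpy(&tmp, src, sizeof(tmp));
  memcpy(dst, &tmp, sizeof(tmp));
}

// Word-by-word copy of `size` bytes (`size` assumed to be a multiple of 4).
void copy_by_word(char *__restrict dst, const char *__restrict src,
                  size_t size) {
  // Empty inline assembly with "+r" constraints: the compiler must assume
  // `dst` and `src` may have been changed by the asm so, per the reasoning in
  // the patch comment, it can no longer prove non-aliasing and does not merge
  // the adjacent LDR/STR pairs below into LDRD/STRD.
  asm volatile("" : "+r"(dst), "+r"(src));
  for (size_t i = 0; i < size; i += sizeof(uint32_t))
    copy_word(dst + i, src + i);
}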