@@ -7,6 +7,7 @@ const USIZE_BITS: u32 = usize::BITS;
7
7
const HAS_CTZ : bool = if cfg ! ( target_arch = "riscv32" ) || cfg ! ( target_arch = "riscv64" ) {
8
8
cfg ! ( target_feature = "b" ) || cfg ! ( target_feature = "experimental-b" )
9
9
} else if cfg ! ( target_arch = "arm" ) {
10
+ // (It's actually CLZ + RBIT)
10
11
// Thumb-2
11
12
cfg ! ( target_feature = "v6t2" )
12
13
// Armv5T and later, only in Arm mode
@@ -42,6 +43,34 @@ const HAS_SHIFTER: bool = if cfg!(target_arch = "msp430") {
42
43
true
43
44
} ;
44
45
46
/// Tells whether an array-based look-up table (LUT) is expected to outperform
/// the other techniques on the compilation target.
///
/// The flag is `true` for the `arm`, `msp430`, and `avr` architectures and
/// `false` everywhere else. The rationale is as follows:
///
///  - Some targets place constants in constant pools regardless. On those
///    targets a bit manipulation trick built around an instruction-embedded
///    LUT ends up reading from a data bus anyway, so it can never be faster
///    than an array-based LUT.
///
///  - Small microcontrollers usually pair a low-latency memory system with a
///    single-issue in-order pipeline. Bit manipulation tricks often need many
///    instructions just to move bits into the right place, and that cost
///    sometimes outweighs loading the LUT's address and then loading one of
///    its entries. Examples: <https://rust.godbolt.org/z/961Pej> (Armv6-M
///    and Armv7-M), <https://cpp.godbolt.org/z/WPnxon> (MSP430 and AVR)
///
///  - Extreme cases must be taken into account too. For example, SiFive E31
///    (used in SiFive Freedom E310) has no data cache for XiP from an
///    external SPI flash. An array-based LUT on such a system would cause a
///    catastrophic performance degradation and must be avoided at any cost,
///    which is why the flag is opt-in per architecture.
const HAS_FAST_LOAD: bool = cfg!(any(
    target_arch = "arm",
    target_arch = "msp430",
    target_arch = "avr"
));
73
+
45
74
/// Return the number of trailing zeros in `x` (`< 1 << BITS`). Returns
46
75
/// `usize::BITS` if `x` is zero.
47
76
#[ inline]
@@ -56,6 +85,12 @@ pub fn trailing_zeros<const BITS: usize>(x: usize) -> u32 {
56
85
}
57
86
} else if HAS_CTZ {
58
87
x. trailing_zeros ( )
88
+ } else if BITS == 2 && HAS_FAST_LOAD {
89
+ ctz_array_lut :: < 4 > ( x)
90
+ } else if BITS == 3 && HAS_FAST_LOAD {
91
+ ctz_array_lut :: < 8 > ( x)
92
+ } else if BITS == 4 && HAS_FAST_LOAD {
93
+ ctz_array_lut :: < 16 > ( x)
59
94
} else if BITS <= 2 {
60
95
ctz2 ( x)
61
96
} else if BITS <= 3 && HAS_SHIFTER {
@@ -164,6 +199,30 @@ fn ctz2(x: usize) -> u32 {
164
199
}
165
200
}
166
201
202
/// Implements [`trailing_zeros`] by indexing a table of precomputed answers.
///
/// The table holds `LEN` entries, where entry `i` is `i.trailing_zeros()`
/// narrowed to a byte (so entry `0` is `usize::BITS`). The input is masked
/// with `LEN - 1` before indexing, which means only the low bits of `x` take
/// part in the lookup. That mask selects a contiguous run of low bits only
/// when `LEN` is a power of two.
#[inline]
fn ctz_array_lut<const LEN: usize>(x: usize) -> u32 {
    // Zero-sized carrier that lets each `LEN` own its own associated constant.
    struct Table<const LEN: usize>;

    trait HasEntries {
        const ENTRIES: &'static [u8];
    }

    impl<const LEN: usize> HasEntries for Table<LEN> {
        const ENTRIES: &'static [u8] = &{
            let mut entries = [0u8; LEN];
            // `for` loops are unavailable in constant evaluation, so walk the
            // indices by hand, from the last slot down to the first.
            let mut index = LEN;
            while index > 0 {
                index -= 1;
                entries[index] = index.trailing_zeros() as u8;
            }
            entries
        };
    }

    let entries = Table::<LEN>::ENTRIES;
    let mask = entries.len() - 1;
    u32::from(entries[x & mask])
}
225
+
167
226
/// Implements [`trailing_zeros`] using linear search.
168
227
#[ inline]
169
228
fn ctz_linear < const BITS : usize > ( mut x : usize ) -> u32 {
@@ -211,7 +270,11 @@ fn ctz_bsearch32<const BITS: usize>(x: usize) -> u32 {
211
270
x &= 0xf ;
212
271
}
213
272
214
- i += ctz4_lut_nonzero ( x as usize ) ;
273
+ if HAS_FAST_LOAD {
274
+ i += ctz_array_lut :: < 16 > ( x as usize ) ;
275
+ } else {
276
+ i += ctz4_lut_nonzero ( x as usize ) ;
277
+ }
215
278
216
279
i
217
280
}
@@ -282,6 +345,11 @@ mod tests {
282
345
gen_test ! ( ctz4_lut, super :: ctz4_lut, 4 ) ;
283
346
gen_test ! ( ctz3_lut, super :: ctz3_lut, 3 ) ;
284
347
gen_test ! ( ctz2, super :: ctz2, 2 ) ;
348
+ gen_test ! ( ctz_array_lut_1, super :: ctz_array_lut:: <2 >, 1 ) ;
349
+ gen_test ! ( ctz_array_lut_2, super :: ctz_array_lut:: <4 >, 2 ) ;
350
+ gen_test ! ( ctz_array_lut_3, super :: ctz_array_lut:: <8 >, 3 ) ;
351
+ gen_test ! ( ctz_array_lut_4, super :: ctz_array_lut:: <16 >, 4 ) ;
352
+ gen_test ! ( ctz_array_lut_8, super :: ctz_array_lut:: <256 >, 8 ) ;
285
353
gen_test ! ( ctz_linear_0, super :: ctz_linear:: <0 >, 0 ) ;
286
354
gen_test ! ( ctz_linear_1, super :: ctz_linear:: <1 >, 1 ) ;
287
355
gen_test ! ( ctz_linear_2, super :: ctz_linear:: <2 >, 2 ) ;
0 commit comments