Skip to content

Commit 92c3b06

Browse files
committed
perf(utils): use an array-based CTZ look-up table if load operations are fast
1 parent b02c5b2 commit 92c3b06

File tree

1 file changed

+69
-1
lines changed

1 file changed

+69
-1
lines changed

src/r3/src/utils/ctz.rs

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ const USIZE_BITS: u32 = usize::BITS;
77
const HAS_CTZ: bool = if cfg!(target_arch = "riscv32") || cfg!(target_arch = "riscv64") {
88
cfg!(target_feature = "b") || cfg!(target_feature = "experimental-b")
99
} else if cfg!(target_arch = "arm") {
10+
// (It's actually CLZ + RBIT)
1011
// Thumb-2
1112
cfg!(target_feature = "v6t2")
1213
// Armv5T and later, only in Arm mode
@@ -42,6 +43,34 @@ const HAS_SHIFTER: bool = if cfg!(target_arch = "msp430") {
4243
true
4344
};
4445

46+
/// Indicates whether an array-based look-up table would be faster than other
47+
/// techniques.
48+
///
49+
/// Some targets would use constant pools anyway. On such targets, bit
50+
/// manipulation tricks relying on an instruction-embedded LUT would actually
51+
/// read from a data bus anyway and therefore would never be faster than an
52+
/// array-based LUT.
53+
///
54+
/// Small microcontrollers usually have a low-latency memory system and a
55+
/// single-issue in-order pipeline. Bit manipulation tricks often require many
56+
/// bit manipulation instructions to move bits into a correct place, which
57+
/// sometimes outweighs the cost of loading an LUT address and then loading
58+
/// one of its entries. Examples: <https://rust.godbolt.org/z/961Pej> (Armv6-M
59+
/// and Armv7-M), <https://cpp.godbolt.org/z/WPnxon> (MSP430 and AVR)
60+
///
61+
/// There are extreme cases that should be taken into consideration as well.
62+
/// For example, SiFive E31 (used in SiFive Freedom E310) does not have a data
63+
/// cache for XiP from an external SPI flash. Therefore, using an array-based
64+
/// LUT on such systems would lead to a catastrophic performance degradation and
65+
/// must be avoided at any cost.
66+
// Evaluated at compile time: `true` only for the architectures listed below,
// `false` for every other target.
const HAS_FAST_LOAD: bool =
    cfg!(any(target_arch = "arm", target_arch = "msp430", target_arch = "avr"));
73+
4574
/// Return the number of trailing zeros in `x` (`< 1 << BITS`). Returns
4675
/// `usize::BITS` if `x` is zero.
4776
#[inline]
@@ -56,6 +85,12 @@ pub fn trailing_zeros<const BITS: usize>(x: usize) -> u32 {
5685
}
5786
} else if HAS_CTZ {
5887
x.trailing_zeros()
88+
} else if BITS == 2 && HAS_FAST_LOAD {
89+
ctz_array_lut::<4>(x)
90+
} else if BITS == 3 && HAS_FAST_LOAD {
91+
ctz_array_lut::<8>(x)
92+
} else if BITS == 4 && HAS_FAST_LOAD {
93+
ctz_array_lut::<16>(x)
5994
} else if BITS <= 2 {
6095
ctz2(x)
6196
} else if BITS <= 3 && HAS_SHIFTER {
@@ -164,6 +199,30 @@ fn ctz2(x: usize) -> u32 {
164199
}
165200
}
166201

202+
/// Implements [`trailing_zeros`] using an array-based look-up table.
///
/// `LEN` is the number of table entries and must be a power of two; this is
/// checked at compile time. Only the low `log2(LEN)` bits of `x` are
/// inspected. If all of them are zero, the result is `usize::BITS` (i.e.,
/// the value of `0usize.trailing_zeros()`).
#[inline]
fn ctz_array_lut<const LEN: usize>(x: usize) -> u32 {
    struct Lut<const LEN: usize>;
    trait LutTrait {
        const LUT: &'static [u8];
    }
    impl<const LEN: usize> LutTrait for Lut<LEN> {
        const LUT: &'static [u8] = &{
            // The table index is derived with `x & (LEN - 1)`, which is a
            // valid reduction only for a power-of-two `LEN` (and underflows
            // for `LEN == 0`). Reject anything else when the table is built.
            assert!(LEN.is_power_of_two(), "`LEN` must be a power of two");

            let mut array = [0u8; LEN];
            // FIXME: Work-around for `for` being unsupported in `const fn`
            let mut i = 0;
            while i < array.len() {
                // The value never exceeds `usize::BITS`, so it fits in `u8`
                array[i] = i.trailing_zeros() as u8;
                i += 1;
            }
            array
        };
    }

    let lut = Lut::<LEN>::LUT;
    // Mask off the bits that are not covered by the table
    lut[x & (lut.len() - 1)] as u32
}
225+
167226
/// Implements [`trailing_zeros`] using linear search.
168227
#[inline]
169228
fn ctz_linear<const BITS: usize>(mut x: usize) -> u32 {
@@ -211,7 +270,11 @@ fn ctz_bsearch32<const BITS: usize>(x: usize) -> u32 {
211270
x &= 0xf;
212271
}
213272

214-
i += ctz4_lut_nonzero(x as usize);
273+
if HAS_FAST_LOAD {
274+
i += ctz_array_lut::<16>(x as usize);
275+
} else {
276+
i += ctz4_lut_nonzero(x as usize);
277+
}
215278

216279
i
217280
}
@@ -282,6 +345,11 @@ mod tests {
282345
gen_test!(ctz4_lut, super::ctz4_lut, 4);
283346
gen_test!(ctz3_lut, super::ctz3_lut, 3);
284347
gen_test!(ctz2, super::ctz2, 2);
348+
gen_test!(ctz_array_lut_1, super::ctz_array_lut::<2>, 1);
349+
gen_test!(ctz_array_lut_2, super::ctz_array_lut::<4>, 2);
350+
gen_test!(ctz_array_lut_3, super::ctz_array_lut::<8>, 3);
351+
gen_test!(ctz_array_lut_4, super::ctz_array_lut::<16>, 4);
352+
gen_test!(ctz_array_lut_8, super::ctz_array_lut::<256>, 8);
285353
gen_test!(ctz_linear_0, super::ctz_linear::<0>, 0);
286354
gen_test!(ctz_linear_1, super::ctz_linear::<1>, 1);
287355
gen_test!(ctz_linear_2, super::ctz_linear::<2>, 2);

0 commit comments

Comments
 (0)