Skip to content
This repository was archived by the owner on May 28, 2025. It is now read-only.

Commit 4ddfd2f

Browse files
miguelraz authored and
workingjubilee committed
non allocating fold simd
allocating fold with std::ops::Add::add
1 parent 4615805 commit 4ddfd2f

File tree

1 file changed

+31
-0
lines changed

1 file changed

+31
-0
lines changed

crates/core_simd/examples/dot_product.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,37 @@ pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
108108

109109
sums.reduce_sum()
110110
}
111+
112+
// Finally, we present an iterator version for handling remainders in a scalar fashion at the end of the loop.
113+
// Unfortunately, this is allocating 1 `XMM` register on the order of `~len(a)` - we'll see how we can get around it in the
114+
// next example.
115+
pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 {
116+
let mut sum = a
117+
.array_chunks::<4>()
118+
.map(|&a| f32x4::from_array(a))
119+
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
120+
.map(|(a, b)| a * b)
121+
.fold(f32x4::splat(0.0), std::ops::Add::add)
122+
.reduce_sum();
123+
let remain = a.len() - (a.len() % 4);
124+
sum += a[remain..]
125+
.iter()
126+
.zip(&b[remain..])
127+
.map(|(a, b)| a * b)
128+
.sum::<f32>();
129+
sum
130+
}
131+
132+
// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that.
133+
// Notice the the use of `mul_add`, which can do a multiply and an add operation ber iteration.
134+
pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
135+
a.array_chunks::<4>()
136+
.map(|&a| f32x4::from_array(a))
137+
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
138+
.fold(f32x4::splat(0.), |acc, (a, b)| acc.mul_add(a, b))
139+
.reduce_sum()
140+
}
141+
111142
fn main() {
112143
// Intentionally empty: this example file exists for the functions above, and `main` is only
// defined so that Cargo can build the file as an example.
113144
}

0 commit comments

Comments
 (0)