Skip to content

Commit 6d72edc

Browse files
committed
Fewer copies in generic Montgomery reduction
1 parent 4ac8b20 commit 6d72edc

File tree

2 files changed

+65
-35
lines changed

2 files changed

+65
-35
lines changed

ff/src/fields/models/fp/montgomery_backend.rs

Lines changed: 11 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1274,34 +1274,31 @@ impl<T: MontConfig<N>, const N: usize> Fp<MontBackend<T, N>, N> {
12741274
/// Treats `limbs` as `[lo[0..N), hi[0..N), extra...]`; the reduced value is left in the high half, and the low half is used as scratch space and may be overwritten.
12751275
/// Returns the final carry-out (0 or 1) from the top of the reduction.
12761276
#[inline(always)]
1277+
#[unroll_for_loops(12)]
12771278
pub fn montgomery_reduce_in_place<const L: usize>(limbs: &mut BigInt<L>) -> u64 {
12781279
debug_assert!(L >= 2 * N, "montgomery_reduce_in_place requires L >= 2N");
12791280

1280-
// Copy the leading 2N limbs into local halves to mirror the canonical subroutine.
1281-
let mut lo = [0u64; N];
1282-
let mut hi = [0u64; N];
1283-
lo.copy_from_slice(&limbs.0[0..N]);
1284-
hi.copy_from_slice(&limbs.0[N..(N + N)]);
1281+
// Work directly on the buffer to avoid copies: split into lo and hi views.
1282+
let (lo, rest) = limbs.0.split_at_mut(N);
1283+
let hi = &mut rest[..N];
12851284

12861285
// Montgomery reduction (canonical form)
12871286
let mut carry2 = 0u64;
1288-
crate::const_for!((i in 0..N) {
1287+
for i in 0..N {
12891288
let tmp = lo[i].wrapping_mul(T::INV);
12901289
let mut carry;
12911290
mac!(lo[i], tmp, T::MODULUS.0[0], &mut carry);
1292-
crate::const_for!((j in 1..N) {
1291+
for j in 1..N {
12931292
let k = i + j;
12941293
if k >= N {
1295-
hi[k - N] = mac_with_carry!(hi[k - N], tmp, T::MODULUS.0[j], &mut carry);
1296-
} else {
1294+
let idx = k - N;
1295+
hi[idx] = mac_with_carry!(hi[idx], tmp, T::MODULUS.0[j], &mut carry);
1296+
} else {
12971297
lo[k] = mac_with_carry!(lo[k], tmp, T::MODULUS.0[j], &mut carry);
12981298
}
1299-
});
1299+
}
13001300
hi[i] = adc!(hi[i], carry, &mut carry2);
1301-
});
1302-
1303-
// Write the reduced high half back into the buffer; low half is discarded by callers.
1304-
limbs.0[N..(N + N)].copy_from_slice(&hi);
1301+
}
13051302

13061303
carry2
13071304
}

test-curves/benches/small_mul.rs

Lines changed: 54 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -227,6 +227,15 @@ fn mul_small_bench(c: &mut Criterion) {
227227
});
228228

229229
// Reduction benchmarks
230+
group.bench_function("montgomery_reduce_in_place core (L=8)", |bench| {
231+
let mut i = 0;
232+
bench.iter(|| {
233+
i = (i + 1) % SAMPLES;
234+
let mut x = bigint_2n_s[i];
235+
criterion::black_box(Fr::montgomery_reduce_in_place::<8>(&mut x))
236+
})
237+
});
238+
230239
group.bench_function("from_montgomery_reduce (L=2N)", |bench| {
231240
let mut i = 0;
232241
bench.iter(|| {
@@ -235,29 +244,53 @@ fn mul_small_bench(c: &mut Criterion) {
235244
})
236245
});
237246

238-
// group.bench_function("from_unchecked_nplus1 (Barrett N+1)", |bench| {
239-
// let mut i = 0;
240-
// bench.iter(|| {
241-
// i = (i + 1) % SAMPLES;
242-
// criterion::black_box(Fr::from_unchecked_nplus1::<5>(bigint_nplus1_s[i]))
243-
// })
244-
// });
247+
// L=9 inputs: derive by zero-extending L=8 inputs
248+
let bigint_9_s = bigint_2n_s
249+
.iter()
250+
.map(|b8| ark_ff::BigInt::<9>::zero_extend_from::<8>(b8))
251+
.collect::<Vec<_>>();
245252

246-
// group.bench_function("from_unchecked_nplus2 (Barrett N+2)", |bench| {
247-
// let mut i = 0;
248-
// bench.iter(|| {
249-
// i = (i + 1) % SAMPLES;
250-
// criterion::black_box(Fr::from_unchecked_nplus2::<5, 6>(bigint_nplus2_s[i]))
251-
// })
252-
// });
253+
group.bench_function("montgomery_reduce_in_place core (L=9)", |bench| {
254+
let mut i = 0;
255+
bench.iter(|| {
256+
i = (i + 1) % SAMPLES;
257+
let mut x = bigint_9_s[i];
258+
criterion::black_box(Fr::montgomery_reduce_in_place::<9>(&mut x))
259+
})
260+
});
253261

254-
// group.bench_function("from_unchecked_nplus3 (Barrett N+3)", |bench| {
255-
// let mut i = 0;
256-
// bench.iter(|| {
257-
// i = (i + 1) % SAMPLES;
258-
// criterion::black_box(Fr::from_unchecked_nplus3::<5, 6, 7>(bigint_nplus3_s[i]))
259-
// })
260-
// });
262+
group.bench_function("from_montgomery_reduce (L=9)", |bench| {
263+
let mut i = 0;
264+
bench.iter(|| {
265+
i = (i + 1) % SAMPLES;
266+
criterion::black_box(Fr::from_montgomery_reduce::<9, 5>(bigint_9_s[i]))
267+
})
268+
});
269+
270+
// Barrett reductions
271+
group.bench_function("from_barrett_reduce (L=5)", |bench| {
272+
let mut i = 0;
273+
bench.iter(|| {
274+
i = (i + 1) % SAMPLES;
275+
criterion::black_box(Fr::from_barrett_reduce::<5, 5>(bigint_nplus1_s[i]))
276+
})
277+
});
278+
279+
group.bench_function("from_barrett_reduce (L=6)", |bench| {
280+
let mut i = 0;
281+
bench.iter(|| {
282+
i = (i + 1) % SAMPLES;
283+
criterion::black_box(Fr::from_barrett_reduce::<6, 5>(bigint_nplus2_s[i]))
284+
})
285+
});
286+
287+
group.bench_function("from_barrett_reduce (L=7)", |bench| {
288+
let mut i = 0;
289+
bench.iter(|| {
290+
i = (i + 1) % SAMPLES;
291+
criterion::black_box(Fr::from_barrett_reduce::<7, 5>(bigint_nplus3_s[i]))
292+
})
293+
});
261294

262295
// Linear combination benchmarks
263296
group.bench_function("linear_combination_u64 (2 terms)", |bench| {

0 commit comments

Comments
 (0)