Fewer copies in generic Montgomery reduction

quangvdao · quangvdao · commit 6d72edc18b4c · 2025-09-24T15:17:12.000-04:00
diff --git a/ff/src/fields/models/fp/montgomery_backend.rs b/ff/src/fields/models/fp/montgomery_backend.rs
@@ -1274,34 +1274,31 @@ impl<T: MontConfig<N>, const N: usize> Fp<MontBackend<T, N>, N> {
     /// Treats `limbs` as `[lo[0..N), hi[0..N), extra...]` and updates only the high half.
     /// Returns the final carry-out (0 or 1) from the top of the reduction.
     #[inline(always)]
+    #[unroll_for_loops(12)]
     pub fn montgomery_reduce_in_place<const L: usize>(limbs: &mut BigInt<L>) -> u64 {
         debug_assert!(L >= 2 * N, "montgomery_reduce_in_place requires L >= 2N");
 
-        // Copy the leading 2N limbs into local halves to mirror the canonical subroutine.
-        let mut lo = [0u64; N];
-        let mut hi = [0u64; N];
-        lo.copy_from_slice(&limbs.0[0..N]);
-        hi.copy_from_slice(&limbs.0[N..(N + N)]);
+        // Work directly on the buffer to avoid copies: split into lo and hi views (lo is overwritten as scratch).
+        let (lo, rest) = limbs.0.split_at_mut(N);
+        let hi = &mut rest[..N];
 
         // Montgomery reduction (canonical form)
         let mut carry2 = 0u64;
-        crate::const_for!((i in 0..N) {
+        for i in 0..N {
             let tmp = lo[i].wrapping_mul(T::INV);
             let mut carry;
             mac!(lo[i], tmp, T::MODULUS.0[0], &mut carry);
-            crate::const_for!((j in 1..N) {
+            for j in 1..N {
                 let k = i + j;
                 if k >= N {
-                    hi[k - N] = mac_with_carry!(hi[k - N], tmp, T::MODULUS.0[j], &mut carry);
-                }  else {
+                    let idx = k - N;
+                    hi[idx] = mac_with_carry!(hi[idx], tmp, T::MODULUS.0[j], &mut carry);
+                } else {
                     lo[k] = mac_with_carry!(lo[k], tmp, T::MODULUS.0[j], &mut carry);
                 }
-            });
+            }
             hi[i] = adc!(hi[i], carry, &mut carry2);
-        });
-
-        // Write the reduced high half back into the buffer; low half is discarded by callers.
-        limbs.0[N..(N + N)].copy_from_slice(&hi);
+        }
 
         carry2
     }
diff --git a/test-curves/benches/small_mul.rs b/test-curves/benches/small_mul.rs
@@ -227,6 +227,15 @@ fn mul_small_bench(c: &mut Criterion) {
     });
 
     // Reduction benchmarks
+    group.bench_function("montgomery_reduce_in_place core (L=8)", |bench| {
+        let mut i = 0;
+        bench.iter(|| {
+            i = (i + 1) % SAMPLES;
+            let mut x = bigint_2n_s[i];
+            criterion::black_box(Fr::montgomery_reduce_in_place::<8>(&mut x))
+        })
+    });
+
     group.bench_function("from_montgomery_reduce (L=2N)", |bench| {
         let mut i = 0;
         bench.iter(|| {
@@ -235,29 +244,53 @@ fn mul_small_bench(c: &mut Criterion) {
         })
     });
 
-    // group.bench_function("from_unchecked_nplus1 (Barrett N+1)", |bench| {
-    //     let mut i = 0;
-    //     bench.iter(|| {
-    //         i = (i + 1) % SAMPLES;
-    //         criterion::black_box(Fr::from_unchecked_nplus1::<5>(bigint_nplus1_s[i]))
-    //     })
-    // });
+    // L=9 inputs: derive by zero-extending L=8 inputs
+    let bigint_9_s = bigint_2n_s
+        .iter()
+        .map(|b8| ark_ff::BigInt::<9>::zero_extend_from::<8>(b8))
+        .collect::<Vec<_>>();
 
-    // group.bench_function("from_unchecked_nplus2 (Barrett N+2)", |bench| {
-    //     let mut i = 0;
-    //     bench.iter(|| {
-    //         i = (i + 1) % SAMPLES;
-    //         criterion::black_box(Fr::from_unchecked_nplus2::<5, 6>(bigint_nplus2_s[i]))
-    //     })
-    // });
+    group.bench_function("montgomery_reduce_in_place core (L=9)", |bench| {
+        let mut i = 0;
+        bench.iter(|| {
+            i = (i + 1) % SAMPLES;
+            let mut x = bigint_9_s[i];
+            criterion::black_box(Fr::montgomery_reduce_in_place::<9>(&mut x))
+        })
+    });
 
-    // group.bench_function("from_unchecked_nplus3 (Barrett N+3)", |bench| {
-    //     let mut i = 0;
-    //     bench.iter(|| {
-    //         i = (i + 1) % SAMPLES;
-    //         criterion::black_box(Fr::from_unchecked_nplus3::<5, 6, 7>(bigint_nplus3_s[i]))
-    //     })
-    // });
+    group.bench_function("from_montgomery_reduce (L=9)", |bench| {
+        let mut i = 0;
+        bench.iter(|| {
+            i = (i + 1) % SAMPLES;
+            criterion::black_box(Fr::from_montgomery_reduce::<9, 5>(bigint_9_s[i]))
+        })
+    });
+
+    // Barrett reductions
+    group.bench_function("from_barrett_reduce (L=5)", |bench| {
+        let mut i = 0;
+        bench.iter(|| {
+            i = (i + 1) % SAMPLES;
+            criterion::black_box(Fr::from_barrett_reduce::<5, 5>(bigint_nplus1_s[i]))
+        })
+    });
+
+    group.bench_function("from_barrett_reduce (L=6)", |bench| {
+        let mut i = 0;
+        bench.iter(|| {
+            i = (i + 1) % SAMPLES;
+            criterion::black_box(Fr::from_barrett_reduce::<6, 5>(bigint_nplus2_s[i]))
+        })
+    });
+
+    group.bench_function("from_barrett_reduce (L=7)", |bench| {
+        let mut i = 0;
+        bench.iter(|| {
+            i = (i + 1) % SAMPLES;
+            criterion::black_box(Fr::from_barrett_reduce::<7, 5>(bigint_nplus3_s[i]))
+        })
+    });
 
     // Linear combination benchmarks
     group.bench_function("linear_combination_u64 (2 terms)", |bench| {