Add audio normalizer (#642)

yujonglee · web-flow · commit bdbf36ed827e · 2025-05-11T20:51:15.000-07:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/audio/Cargo.toml b/crates/audio/Cargo.toml
@@ -20,6 +20,7 @@ tokio-stream = { workspace = true }
 cpal = { workspace = true }
 rodio = { workspace = true, features = ["vorbis"] }
 
+ebur128 = "0.1.10"
 kalosm-sound = { workspace = true, default-features = false }
 ringbuf = "0.4.8"
 
@@ -33,5 +34,7 @@ wasapi = { git = "https://github.com/HEnquist/wasapi-rs", rev = "24ae99c0134f7e1
 alsa = "0.9.1"
 
 [dev-dependencies]
-rodio = "*"
+hound = { workspace = true }
+hypr-data = { workspace = true }
+rodio = { workspace = true }
 serial_test = { workspace = true }
diff --git a/crates/audio/src/lib.rs b/crates/audio/src/lib.rs
@@ -1,10 +1,12 @@
 mod errors;
 mod mic;
+mod norm;
 mod speaker;
 mod stream;
 
 pub use errors::*;
 pub use mic::*;
+pub use norm::*;
 pub use speaker::*;
 pub use stream::*;
 
diff --git a/crates/audio/src/norm.rs b/crates/audio/src/norm.rs
@@ -0,0 +1,160 @@
+use std::pin::Pin;
+use std::task::{Context, Poll};
+
+use ebur128::{EbuR128, Mode};
+use futures_util::Stream;
+
+const CHANNELS: u32 = 1;
+const TARGET_LUFS: f64 = -23.0;
+const TRUE_PEAK_LIMIT: f64 = -1.0;
+const LIMITER_LOOKAHEAD_MS: usize = 10;
+const ANALYZE_CHUNK_SIZE: usize = 512;
+
+pub struct NormalizedSource<S: kalosm_sound::AsyncSource> {
+    source: S,
+    gain_linear: f32,
+    ebur128: EbuR128,
+    loudness_buffer: Vec<f32>,
+    limiter: TruePeakLimiter,
+    true_peak_limit: f32,
+}
+
+struct TruePeakLimiter {
+    lookahead_samples: usize,
+    buffer: Vec<f32>,
+    gain_reduction: Vec<f32>,
+    current_position: usize,
+}
+
+impl TruePeakLimiter {
+    fn new(sample_rate: u32) -> Self {
+        let lookahead_samples = ((sample_rate as usize * LIMITER_LOOKAHEAD_MS) / 1000).max(1);
+
+        Self {
+            lookahead_samples,
+            buffer: vec![0.0; lookahead_samples],
+            gain_reduction: vec![1.0; lookahead_samples],
+            current_position: 0,
+        }
+    }
+
+    fn process(&mut self, sample: f32, true_peak_limit: f32) -> f32 {
+        self.buffer[self.current_position] = sample;
+
+        let sample_abs = sample.abs();
+        if sample_abs > true_peak_limit {
+            let reduction = true_peak_limit / sample_abs;
+            self.gain_reduction[self.current_position] = reduction;
+        } else {
+            self.gain_reduction[self.current_position] = 1.0;
+        }
+
+        let output_position = (self.current_position + 1) % self.lookahead_samples;
+        let output_sample = self.buffer[output_position] * self.gain_reduction[output_position];
+
+        self.current_position = output_position;
+        output_sample
+    }
+}
+
+pub trait NormalizeExt<S: kalosm_sound::AsyncSource> {
+    fn normalize(self) -> NormalizedSource<S>;
+}
+
+impl<S: kalosm_sound::AsyncSource> NormalizeExt<S> for S {
+    fn normalize(self) -> NormalizedSource<S> {
+        let sample_rate = self.sample_rate();
+        let ebur128 = EbuR128::new(CHANNELS, sample_rate, Mode::I | Mode::TRUE_PEAK)
+            .expect("Failed to create EBU R128 analyzer");
+
+        let true_peak_limit = 10_f32.powf(TRUE_PEAK_LIMIT as f32 / 20.0);
+
+        NormalizedSource {
+            source: self,
+            gain_linear: 1.0,
+            ebur128,
+            loudness_buffer: Vec::with_capacity(ANALYZE_CHUNK_SIZE),
+            limiter: TruePeakLimiter::new(sample_rate),
+            true_peak_limit,
+        }
+    }
+}
+
+impl<S: kalosm_sound::AsyncSource + Unpin> Stream for NormalizedSource<S> {
+    type Item = f32;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.get_mut();
+        let mut inner = std::pin::pin!(this.source.as_stream());
+
+        match inner.as_mut().poll_next(cx) {
+            Poll::Ready(Some(sample)) => {
+                this.loudness_buffer.push(sample);
+
+                if this.loudness_buffer.len() >= ANALYZE_CHUNK_SIZE {
+                    let _ = this.ebur128.add_frames_f32(&this.loudness_buffer);
+                    this.loudness_buffer.clear();
+
+                    if let Ok(current_lufs) = this.ebur128.loudness_global() {
+                        if current_lufs.is_finite() && current_lufs < 0.0 {
+                            let gain_db = TARGET_LUFS - current_lufs;
+                            this.gain_linear = 10_f32.powf(gain_db as f32 / 20.0);
+                        }
+                    }
+                }
+
+                let amplified = sample * this.gain_linear;
+                let limited = this.limiter.process(amplified, this.true_peak_limit);
+
+                Poll::Ready(Some(limited))
+            }
+            Poll::Pending => Poll::Pending,
+            Poll::Ready(None) => Poll::Ready(None),
+        }
+    }
+}
+
+impl<S: kalosm_sound::AsyncSource + Unpin> kalosm_sound::AsyncSource for NormalizedSource<S> {
+    fn sample_rate(&self) -> u32 {
+        self.source.sample_rate()
+    }
+
+    fn as_stream(&mut self) -> impl Stream<Item = f32> + '_ {
+        Box::pin(self)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use futures_util::StreamExt;
+    use kalosm_sound::AsyncSource;
+
+    #[tokio::test]
+    async fn test_normalize() {
+        let audio = rodio::Decoder::new_wav(std::io::BufReader::new(
+            std::fs::File::open(hypr_data::english_1::AUDIO_PATH).unwrap(),
+        ))
+        .unwrap();
+
+        let sample_rate = audio.sample_rate();
+        let mut normalized = audio.normalize();
+
+        let mut writer = {
+            let spec = hound::WavSpec {
+                channels: 1,
+                sample_rate,
+                bits_per_sample: 32,
+                sample_format: hound::SampleFormat::Float,
+            };
+            let output_path = std::path::Path::new("./normalized_output.wav");
+            hound::WavWriter::create(output_path, spec).unwrap()
+        };
+
+        let mut stream = normalized.as_stream();
+        while let Some(sample) = stream.next().await {
+            writer.write_sample(sample).unwrap();
+        }
+        writer.finalize().unwrap();
+    }
+}
diff --git a/crates/audio/src/speaker/macos.rs b/crates/audio/src/speaker/macos.rs
@@ -1,11 +1,12 @@
+use std::sync::{Arc, Mutex};
+use std::task::{Poll, Waker};
+
 use anyhow::Result;
 use futures_util::Stream;
 use ringbuf::{
     traits::{Consumer, Producer, Split},
-    HeapRb,
+    HeapCons, HeapProd, HeapRb,
 };
-use std::sync::{Arc, Mutex};
-use std::task::{Poll, Waker};
 
 use ca::aggregate_device_keys as agg_keys;
 use cidre::{arc, av, cat, cf, core_audio as ca, ns, os};
@@ -24,7 +25,7 @@ struct WakerState {
 }
 
 pub struct SpeakerStream {
-    consumer: ringbuf::HeapCons<f32>,
+    consumer: HeapCons<f32>,
     stream_desc: cat::AudioBasicStreamDesc,
     sample_rate_override: Option<u32>,
     _device: ca::hardware::StartedDevice<ca::AggregateDevice>,
@@ -48,7 +49,7 @@ impl SpeakerStream {
 
 struct Ctx {
     format: arc::R<av::AudioFormat>,
-    producer: ringbuf::HeapProd<f32>,
+    producer: HeapProd<f32>,
     waker_state: Arc<Mutex<WakerState>>,
 }