Add audio processing module #99
@@ -0,0 +1,63 @@
using System.Threading;
using LiveKit.Internal;
using UnityEngine;

namespace LiveKit
{
    /// <summary>
    /// Captures and processes the reverse audio stream using an <see cref="AudioProcessingModule"/>.
    /// </summary>
    /// <remarks>
    /// The reverse stream is captured from the scene's audio listener.
    /// </remarks>
    internal class ApmReverseStream
    {
        private readonly AudioBuffer _captureBuffer = new AudioBuffer();
        private readonly AudioProcessingModule _apm; // APM is thread safe
        private Thread _thread;
        private AudioFilter _audioFilter;

        internal ApmReverseStream(AudioProcessingModule apm)
        {
            _apm = apm;
        }

        internal void Start()
        {
            var audioListener = GameObject.FindObjectOfType<AudioListener>();
            if (audioListener == null)
            {
                Utils.Error("AudioListener not found in scene");
                return;
            }
            // Tap the listener's output so the far-end (playback) audio
            // can be fed to the APM as the reverse stream.
            _audioFilter = audioListener.gameObject.AddComponent<AudioFilter>();
            _audioFilter.AudioRead += OnAudioRead;

            _thread = new Thread(ProcessReverseStream);
            _thread.Start();
        }

        internal void Stop()
        {
            _thread?.Abort();
            if (_audioFilter != null)
                Object.Destroy(_audioFilter);
        }

        private void ProcessReverseStream()
        {
            while (true)
            {
                Thread.Sleep(Constants.TASK_DELAY);
                // Drain the capture buffer in fixed 10 ms frames, as required by the APM.
                using var frame = _captureBuffer.ReadDuration(AudioProcessingModule.FRAME_DURATION_MS);
                if (frame == null) continue;
                _apm.ProcessReverseStream(frame);
            }
        }

        private void OnAudioRead(float[] data, int channels, int sampleRate)
        {
            _captureBuffer.Write(data, (uint)channels, (uint)sampleRate);
        }
    }
}
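For orientation, here is a minimal sketch of how this class might pair with the forward (near-end) capture path. The loop condition and the helpers `GetNextMicFrame` and `Publish` are hypothetical placeholders for illustration, not APIs in this diff:

    // Illustrative sketch only: reverse stream beside a hypothetical mic loop.
    var apm = new AudioProcessingModule(
        echoCancellationEnabled: true,
        noiseSuppressionEnabled: true,
        highPassFilterEnabled: true,
        gainControllerEnabled: true);

    var reverseStream = new ApmReverseStream(apm);
    reverseStream.Start(); // taps the scene's AudioListener for far-end audio

    // Meanwhile, the capture path feeds near-end (microphone) frames:
    while (capturing) // hypothetical loop condition
    {
        using var frame = GetNextMicFrame(); // hypothetical 10 ms AudioFrame source
        apm.ProcessStream(frame);            // echo cancelled against the reverse stream
        Publish(frame);                      // hypothetical send to the room
    }

    reverseStream.Stop();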
@@ -0,0 +1,135 @@
using LiveKit.Proto;
using LiveKit.Internal.FFIClients.Requests;
using LiveKit.Internal;
using System;
using System.Runtime.CompilerServices;
[assembly: InternalsVisibleTo("Tests")]

namespace LiveKit
{
    /// <summary>
    /// Provides WebRTC audio processing capabilities including echo cancellation, noise suppression,
    /// high-pass filtering, and gain control.
    /// </summary>
    public sealed class AudioProcessingModule
    {
        internal readonly FfiHandle Handle;

        /// <summary>
        /// Initializes an <see cref="AudioProcessingModule" /> instance with the specified audio processing features.
        /// </summary>
        /// <param name="echoCancellationEnabled">Whether to enable echo cancellation.</param>
        /// <param name="noiseSuppressionEnabled">Whether to enable noise suppression.</param>
        /// <param name="highPassFilterEnabled">Whether to enable high-pass filtering.</param>
        /// <param name="gainControllerEnabled">Whether to enable gain control.</param>
        public AudioProcessingModule(
            bool echoCancellationEnabled,
            bool noiseSuppressionEnabled,
            bool highPassFilterEnabled,
            bool gainControllerEnabled)
        {
            using var request = FFIBridge.Instance.NewRequest<NewApmRequest>();
            var newApm = request.request;
            newApm.EchoCancellerEnabled = echoCancellationEnabled;
            newApm.NoiseSuppressionEnabled = noiseSuppressionEnabled;
            newApm.HighPassFilterEnabled = highPassFilterEnabled;
            newApm.GainControllerEnabled = gainControllerEnabled;

            using var response = request.Send();
            FfiResponse res = response;
            Handle = FfiHandle.FromOwnedHandle(res.NewApm.Apm.Handle);
        }

        /// <summary>
        /// Processes the provided audio frame using the configured audio processing features.
        /// </summary>
        /// <param name="data">The audio frame to process.</param>
        /// <remarks>
        /// Important: audio frames must be exactly 10 ms in duration.
        ///
        /// The input audio frame is modified in-place (if applicable) by the underlying audio
        /// processing module (e.g., echo cancellation, noise suppression, etc.).
        /// </remarks>
        public void ProcessStream(AudioFrame data)
        {
            using var request = FFIBridge.Instance.NewRequest<ApmProcessStreamRequest>();
            var processStream = request.request;
            processStream.ApmHandle = (ulong)Handle.DangerousGetHandle();
            processStream.DataPtr = (ulong)data.Data;
            processStream.Size = (uint)data.Length;
            processStream.SampleRate = data.SampleRate;
            processStream.NumChannels = data.NumChannels;

            using var response = request.Send();
            FfiResponse res = response;
            if (res.ApmProcessStream.HasError)
            {
                throw new Exception(res.ApmProcessStream.Error);
            }
        }

        /// <summary>
        /// Processes the reverse audio frame (typically used for echo cancellation in a full-duplex setup).
        /// </summary>
        /// <param name="data">The audio frame to process.</param>
        /// <remarks>
        /// Important: audio frames must be exactly 10 ms in duration.
        ///
        /// In an echo cancellation scenario, this method is used to process the "far-end" audio
        /// prior to mixing or feeding it into the echo canceller. Like <see cref="ProcessStream"/>, the
        /// input audio frame is modified in-place by the underlying processing module.
        /// </remarks>
        public void ProcessReverseStream(AudioFrame data)
        {
            using var request = FFIBridge.Instance.NewRequest<ApmProcessReverseStreamRequest>();
            var processReverseStream = request.request;
            processReverseStream.ApmHandle = (ulong)Handle.DangerousGetHandle();
            processReverseStream.DataPtr = (ulong)data.Data;
            processReverseStream.Size = (uint)data.Length;
            processReverseStream.SampleRate = data.SampleRate;
            processReverseStream.NumChannels = data.NumChannels;

            using var response = request.Send();
            FfiResponse res = response;
            if (res.ApmProcessReverseStream.HasError)
            {
                throw new Exception(res.ApmProcessReverseStream.Error);
            }
        }

        /// <summary>
        /// Sets the delay in milliseconds between receiving a far-end frame in
        /// <see cref="ProcessReverseStream"/> and receiving the corresponding echo in a near-end
        /// frame in <see cref="ProcessStream"/>. Must be called if and only if echo cancellation
        /// is enabled.
        /// </summary>
        /// <param name="delayMs">The stream delay in milliseconds.</param>
        /// <remarks>
        /// The delay can be calculated as: delay = (t_render - t_analyze) + (t_process - t_capture)
        ///
        /// Where:
        /// - t_analyze: time when a frame is passed to <see cref="ProcessReverseStream"/>
        /// - t_render: time when the first sample of that frame is rendered by the audio hardware
        /// - t_capture: time when the first sample of a frame is captured by the audio hardware
        /// - t_process: time when that frame is passed to <see cref="ProcessStream"/>
        /// </remarks>
        public void SetStreamDelayMs(int delayMs)
        {
            using var request = FFIBridge.Instance.NewRequest<ApmSetStreamDelayRequest>();
            var setStreamDelay = request.request;
            setStreamDelay.ApmHandle = (ulong)Handle.DangerousGetHandle();
            setStreamDelay.DelayMs = delayMs;

            using var response = request.Send();
            FfiResponse res = response;
            if (res.ApmSetStreamDelay.HasError)
            {
                throw new Exception(res.ApmSetStreamDelay.Error);
            }
        }
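        // Worked example with illustrative numbers: if a far-end frame is handed to
        // ProcessReverseStream 30 ms before the hardware renders it
        // (t_render - t_analyze = 30 ms), and a near-end frame reaches ProcessStream
        // 20 ms after it was captured (t_process - t_capture = 20 ms), then the
        // delay to report is 30 + 20 = 50 ms, i.e. SetStreamDelayMs(50).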

        /// <summary>
        /// The required duration for audio frames being processed.
        /// </summary>
        public const uint FRAME_DURATION_MS = 10;
    }
}
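As a usage sketch (not part of this diff; the sample rate, delay value, and frame contents are illustrative assumptions, and the AudioFrame constructor shape is taken from its use in AudioBuffer.ReadDuration below), constructing the module and pushing one 10 ms near-end frame might look like:

    // Illustrative only: sizing and processing a single 10 ms frame.
    var apm = new AudioProcessingModule(
        echoCancellationEnabled: true,
        noiseSuppressionEnabled: true,
        highPassFilterEnabled: true,
        gainControllerEnabled: true);

    const uint sampleRate = 48000;
    const uint channels = 1;
    // 10 ms at 48 kHz = 48000 * 10 / 1000 = 480 samples per channel.
    uint samplesPerFrame = sampleRate * AudioProcessingModule.FRAME_DURATION_MS / 1000;

    using var frame = new AudioFrame(sampleRate, channels, samplesPerFrame);
    // ... fill frame.Data with captured PCM here ...
    apm.SetStreamDelayMs(50); // measured render + capture latency (illustrative)
    apm.ProcessStream(frame); // the frame is modified in-place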
@@ -0,0 +1,101 @@
using System;
using LiveKit.Internal;

namespace LiveKit
{
    /// <summary>
    /// A thread-safe buffer for audio samples.
    /// </summary>
    internal class AudioBuffer
    {
        private readonly uint _bufferDurationMs;
        private RingBuffer _buffer;
        private uint _channels;
        private uint _sampleRate;
        private readonly object _lock = new object();

        /// <summary>
        /// Initializes a new audio sample buffer that holds samples for the given duration.
        /// </summary>
        internal AudioBuffer(uint bufferDurationMs = 200)
        {
            _bufferDurationMs = bufferDurationMs;
        }

        /// <summary>
        /// Writes audio samples.
        /// </summary>
        /// <remarks>
        /// The float data is converted to short format before being written to the buffer.
        /// If the number of channels or the sample rate changes, the buffer is recreated.
        /// </remarks>
        /// <param name="data">The audio samples to write.</param>
        /// <param name="channels">The number of channels in the audio data.</param>
        /// <param name="sampleRate">The sample rate of the audio data in Hz.</param>
        internal void Write(float[] data, uint channels, uint sampleRate)
        {
            // Scale [-1, 1] floats to the 16-bit range, clamp, and round away from zero.
            static short FloatToS16(float v)
            {
                v *= 32768f;
                v = Math.Min(v, 32767f);
                v = Math.Max(v, -32768f);
                return (short)(v + Math.Sign(v) * 0.5f);
            }

            var s16Data = new short[data.Length];
            for (int i = 0; i < data.Length; i++)
            {
                s16Data[i] = FloatToS16(data[i]);
            }
            Capture(s16Data, channels, sampleRate);
        }

        private void Capture(short[] data, uint channels, uint sampleRate)
        {
            lock (_lock)
            {
                if (_buffer == null || channels != _channels || sampleRate != _sampleRate)
                {
                    // (Re)create the ring buffer sized for _bufferDurationMs of audio.
                    var size = (int)(channels * sampleRate * (_bufferDurationMs / 1000f));
                    _buffer?.Dispose();
                    _buffer = new RingBuffer(size * sizeof(short));
                    _channels = channels;
                    _sampleRate = sampleRate;
                }
                unsafe
                {
                    fixed (short* pData = data)
                    {
                        var byteData = new ReadOnlySpan<byte>(pData, data.Length * sizeof(short));
                        _buffer.Write(byteData);
                    }
                }
            }
        }

        /// <summary>
        /// Reads a frame with the length of the given duration.
        /// </summary>
        /// <param name="durationMs">The duration of the audio samples to read in milliseconds.</param>
        /// <returns>An AudioFrame containing the read audio samples, or null if there are not enough samples.</returns>
        internal AudioFrame ReadDuration(uint durationMs)
        {
            lock (_lock)
            {
                if (_buffer == null) return null;

                var samplesForDuration = (uint)(_sampleRate * (durationMs / 1000f));
                var requiredLength = samplesForDuration * _channels * sizeof(short);
                if (_buffer.AvailableRead() < requiredLength) return null;

                var frame = new AudioFrame(_sampleRate, _channels, samplesForDuration);
                unsafe
                {
                    var frameData = new Span<byte>(frame.Data.ToPointer(), frame.Length);
                    _buffer.Read(frameData);
                }
                return frame;
            }
        }
    }
}
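A brief usage sketch (illustrative buffer sizes and audio parameters; the callback shape mirrors AudioFilter.AudioRead used above):

    // Illustrative only: accumulate Unity float samples, then drain 10 ms frames.
    var buffer = new AudioBuffer(bufferDurationMs: 200);

    // e.g. a Unity audio callback delivering interleaved stereo floats at 48 kHz:
    float[] data = new float[2048];
    buffer.Write(data, channels: 2, sampleRate: 48000);

    // A 10 ms read at 48 kHz needs 48000 * 0.010 = 480 samples per channel,
    // i.e. 480 * 2 channels * 2 bytes = 1920 bytes of buffered audio.
    using var frame = buffer.ReadDuration(AudioProcessingModule.FRAME_DURATION_MS);
    if (frame != null)
    {
        // hand the frame to the APM, etc.
    }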
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we're likely going to have a skew here (this will significantly impact the AEC over long room durations).
Is there a way to process the frames directly as we receive them?
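To illustrate one reading of that suggestion (a hedged sketch, not the author's design): drain the buffer synchronously inside the audio callback rather than from a polling thread, which removes the Thread.Sleep pacing as a source of skew. It still relies on AudioBuffer to align arbitrary callback sizes to exact 10 ms frames:

    // Hedged sketch of "process as we receive"; not code from this PR.
    private void OnAudioRead(float[] data, int channels, int sampleRate)
    {
        _captureBuffer.Write(data, (uint)channels, (uint)sampleRate);
        while (true)
        {
            using var frame = _captureBuffer.ReadDuration(AudioProcessingModule.FRAME_DURATION_MS);
            if (frame == null) break;
            _apm.ProcessReverseStream(frame); // APM is thread safe per the field comment
        }
    }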