Skip to content

Add CXX API for VAD #2077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/cxx-api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,45 @@ jobs:
otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib
fi

# Compile cxx-api-examples/vad-cxx-api.cc against the libraries installed under
# ./build/install, run it on the downloaded test wave and VAD model, and copy
# every lei-jun-test*.wav file into ./vad-test for the upload step below.
- name: Test VAD
shell: bash
run: |
name=vad-cxx-api
g++ -std=c++17 -o $name ./cxx-api-examples/$name.cc \
-I ./build/install/include \
-L ./build/install/lib/ \
-l sherpa-onnx-cxx-api \
-l sherpa-onnx-c-api \
-l onnxruntime

ls -lh $name

export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH

if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
ldd ./$name
echo "----"
readelf -d ./$name
fi

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx

./$name

mkdir vad-test
cp -v lei-jun-test*.wav vad-test

ls -lh vad-test

rm $name

# Publish the wave files collected in ./vad-test as a per-OS build artifact.
- uses: actions/upload-artifact@v4
with:
name: vad-test-wavs-cxx-${{ matrix.os }}
path: ./vad-test/*.wav

- name: Test Speech Enhancement (GTCRN)
shell: bash
run: |
Expand Down
2 changes: 2 additions & 0 deletions cmake/cmake_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def get_binaries():
"sherpa-onnx-microphone-offline-speaker-identification",
"sherpa-onnx-offline",
"sherpa-onnx-offline-audio-tagging",
"sherpa-onnx-offline-denoiser",
"sherpa-onnx-offline-language-identification",
"sherpa-onnx-offline-punctuation",
"sherpa-onnx-offline-speaker-diarization",
Expand All @@ -62,6 +63,7 @@ def get_binaries():
"sherpa-onnx-online-punctuation",
"sherpa-onnx-online-websocket-client",
"sherpa-onnx-online-websocket-server",
"sherpa-onnx-vad",
"sherpa-onnx-vad-microphone",
"sherpa-onnx-vad-microphone-offline-asr",
"sherpa-onnx-vad-with-offline-asr",
Expand Down
3 changes: 3 additions & 0 deletions cxx-api-examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api)
add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc)
target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api)

# VAD example for the C++ API; links only against the C++ API library target.
add_executable(vad-cxx-api ./vad-cxx-api.cc)
target_link_libraries(vad-cxx-api sherpa-onnx-cxx-api)

if(SHERPA_ONNX_ENABLE_TTS)
add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc)
target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api)
Expand Down
85 changes: 85 additions & 0 deletions cxx-api-examples/vad-cxx-api.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// cxx-api-examples/vad-cxx-api.cc
//
// Copyright (c) 2025 Xiaomi Corporation

//
// This file demonstrates how to use VAD to remove silences from a file
// clang-format off
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
//
// clang-format on
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>

#include "sherpa-onnx/c-api/cxx-api.h"

// Removes the non-speech parts from ./lei-jun-test.wav.
//
// The input is fed to a silero VAD in windows of
// config.silero_vad.window_size samples. Each detected segment is printed as
// "start -- end" (in seconds) and appended to an output buffer, which is
// finally written to ./lei-jun-test-no-silence.wav.
//
// Returns 0 on success and -1 if the detector cannot be created, the input
// cannot be read, or the output cannot be written.
int32_t main() {
  using namespace sherpa_onnx::cxx;  // NOLINT

  std::string wave_filename = "./lei-jun-test.wav";
  std::string vad_filename = "./silero_vad.onnx";

  VadModelConfig config;
  config.silero_vad.model = vad_filename;
  config.silero_vad.threshold = 0.1;
  config.silero_vad.min_silence_duration = 0.5;
  config.silero_vad.min_speech_duration = 0.25;
  config.silero_vad.max_speech_duration = 20;
  config.sample_rate = 16000;
  config.debug = true;

  VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
  if (!vad.Get()) {
    std::cerr << "Failed to create VAD. Please check your config\n";
    return -1;
  }

  Wave wave = ReadWave(wave_filename);
  if (wave.samples.empty()) {
    std::cerr << "Failed to read: '" << wave_filename << "'\n";
    return -1;
  }

  const int32_t window_size = config.silero_vad.window_size;
  const int32_t sample_rate = config.sample_rate;

  // Convert once so the loop condition compares signed with signed.
  const int32_t num_samples = static_cast<int32_t>(wave.samples.size());

  std::vector<float> samples_without_silence;

  bool is_eof = false;
  int32_t i = 0;

  while (!is_eof) {
    // ">=" (not ">") so that a last window ending exactly at the end of the
    // input is still fed to the detector instead of being dropped.
    if (num_samples - i >= window_size) {
      vad.AcceptWaveform(wave.samples.data() + i, window_size);
      i += window_size;
    } else {
      is_eof = true;
      vad.Flush();
    }

    // Drain every segment that has become available so far.
    while (!vad.IsEmpty()) {
      auto segment = vad.Front();
      float start_time = segment.start / static_cast<float>(sample_rate);
      float end_time =
          start_time + segment.samples.size() / static_cast<float>(sample_rate);
      printf("%.3f -- %.3f\n", start_time, end_time);

      samples_without_silence.insert(samples_without_silence.end(),
                                     segment.samples.begin(),
                                     segment.samples.end());

      vad.Pop();
    }
  }

  bool ok = WriteWave("./lei-jun-test-no-silence.wav",
                      {samples_without_silence, sample_rate});
  if (!ok) {
    std::cerr << "Failed to write ./lei-jun-test-no-silence.wav\n";
    // Report the failure to the caller (e.g. the CI step) via the exit code.
    return -1;
  }

  std::cout << "Saved to ./lei-jun-test-no-silence.wav\n";

  return 0;
}
2 changes: 1 addition & 1 deletion sherpa-onnx/c-api/c-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
// in seconds
float min_speech_duration;

int window_size;
int32_t window_size;

// If a speech segment is longer than this value, then we increase
// the threshold to 0.9. After finishing detecting the segment,
Expand Down
110 changes: 110 additions & 0 deletions sherpa-onnx/c-api/cxx-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -558,4 +558,114 @@ int32_t OfflineSpeechDenoiser::GetSampleRate() const {
return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(p_);
}

// Creates a circular buffer through the C API and wraps the returned handle.
// `capacity` is passed to SherpaOnnxCreateCircularBuffer() unchanged.
CircularBuffer CircularBuffer::Create(int32_t capacity) {
  return CircularBuffer(SherpaOnnxCreateCircularBuffer(capacity));
}

// Wraps the raw C handle `p` by handing it to the MoveOnly base.
// Declared private in the header; instances are obtained via Create().
CircularBuffer::CircularBuffer(const SherpaOnnxCircularBuffer *p)
: MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer>(p) {}

// Releases the C handle `p` with SherpaOnnxDestroyCircularBuffer().
// NOTE(review): presumably invoked by the MoveOnly base when the wrapper is
// destroyed -- confirm against MoveOnly.
void CircularBuffer::Destroy(const SherpaOnnxCircularBuffer *p) const {
SherpaOnnxDestroyCircularBuffer(p);
}

// Forwards the pointer `samples` and the count `n` to
// SherpaOnnxCircularBufferPush() for the wrapped buffer.
void CircularBuffer::Push(const float *samples, int32_t n) const {
SherpaOnnxCircularBufferPush(p_, samples, n);
}

// Returns a copy of `n` samples obtained from SherpaOnnxCircularBufferGet()
// for the given `start_index`.
//
// Returns an empty vector if `n` is not positive or if the C API returns a
// null pointer. The temporary array handed out by the C API is always
// released with SherpaOnnxCircularBufferFree() before returning.
std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const {
  // A negative n would otherwise be converted to a huge unsigned size.
  if (n <= 0) {
    return {};
  }

  // Allocate the result first: if this throws, no C-side array has been
  // obtained yet, so nothing can leak.
  std::vector<float> ans(n);

  const float *samples = SherpaOnnxCircularBufferGet(p_, start_index, n);
  if (samples == nullptr) {
    return {};
  }

  std::copy(samples, samples + n, ans.begin());

  SherpaOnnxCircularBufferFree(samples);
  return ans;
}

// Forwards `n` to SherpaOnnxCircularBufferPop() for the wrapped buffer.
void CircularBuffer::Pop(int32_t n) const {
SherpaOnnxCircularBufferPop(p_, n);
}

// Returns the value reported by SherpaOnnxCircularBufferSize().
int32_t CircularBuffer::Size() const {
return SherpaOnnxCircularBufferSize(p_);
}

// Returns the value reported by SherpaOnnxCircularBufferHead().
int32_t CircularBuffer::Head() const {
return SherpaOnnxCircularBufferHead(p_);
}

// Forwards to SherpaOnnxCircularBufferReset() for the wrapped buffer.
void CircularBuffer::Reset() const { SherpaOnnxCircularBufferReset(p_); }

// Translates the C++ `config` into the C API's SherpaOnnxVadModelConfig and
// creates a detector from it.
//
// The C struct is zeroed first, so any field not set below stays 0. The
// string fields point into `config` (via c_str()), which outlives the call
// to SherpaOnnxCreateVoiceActivityDetector() made here.
// `buffer_size_in_seconds` is forwarded unchanged.
VoiceActivityDetector VoiceActivityDetector::Create(
    const VadModelConfig &config, float buffer_size_in_seconds) {
  SherpaOnnxVadModelConfig c_config;
  memset(&c_config, 0, sizeof(c_config));

  // silero VAD options
  const SileroVadModelConfig &src = config.silero_vad;
  auto &dst = c_config.silero_vad;
  dst.model = src.model.c_str();
  dst.threshold = src.threshold;
  dst.min_silence_duration = src.min_silence_duration;
  dst.min_speech_duration = src.min_speech_duration;
  dst.window_size = src.window_size;
  dst.max_speech_duration = src.max_speech_duration;

  // options shared by all VAD models
  c_config.sample_rate = config.sample_rate;
  c_config.num_threads = config.num_threads;
  c_config.provider = config.provider.c_str();
  c_config.debug = config.debug;

  return VoiceActivityDetector(
      SherpaOnnxCreateVoiceActivityDetector(&c_config, buffer_size_in_seconds));
}

// Wraps the raw C handle `p` by handing it to the MoveOnly base.
// Declared private in the header; instances are obtained via Create().
VoiceActivityDetector::VoiceActivityDetector(
const SherpaOnnxVoiceActivityDetector *p)
: MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector>(p) {}

// Releases the C handle `p` with SherpaOnnxDestroyVoiceActivityDetector().
// NOTE(review): presumably invoked by the MoveOnly base when the wrapper is
// destroyed -- confirm against MoveOnly.
void VoiceActivityDetector::Destroy(
const SherpaOnnxVoiceActivityDetector *p) const {
SherpaOnnxDestroyVoiceActivityDetector(p);
}

// Forwards the pointer `samples` and the count `n` to
// SherpaOnnxVoiceActivityDetectorAcceptWaveform() for the wrapped detector.
void VoiceActivityDetector::AcceptWaveform(const float *samples,
int32_t n) const {
SherpaOnnxVoiceActivityDetectorAcceptWaveform(p_, samples, n);
}

// Returns the result of SherpaOnnxVoiceActivityDetectorEmpty() converted to
// bool.
bool VoiceActivityDetector::IsEmpty() const {
return SherpaOnnxVoiceActivityDetectorEmpty(p_);
}

// Returns true when SherpaOnnxVoiceActivityDetectorDetected() reports a
// non-zero value for the wrapped detector.
bool VoiceActivityDetector::IsDetected() const {
  return SherpaOnnxVoiceActivityDetectorDetected(p_) != 0;
}

// Forwards to SherpaOnnxVoiceActivityDetectorPop() for the wrapped detector.
void VoiceActivityDetector::Pop() const {
SherpaOnnxVoiceActivityDetectorPop(p_);
}

// Forwards to SherpaOnnxVoiceActivityDetectorClear() for the wrapped detector.
void VoiceActivityDetector::Clear() const {
SherpaOnnxVoiceActivityDetectorClear(p_);
}

// Returns a copy of the segment handed out by
// SherpaOnnxVoiceActivityDetectorFront().
//
// `start` and the `n` samples of the C segment are copied into the returned
// SpeechSegment; the C segment is then released with
// SherpaOnnxDestroySpeechSegment(), so the caller owns the result outright.
SpeechSegment VoiceActivityDetector::Front() const {
  auto front = SherpaOnnxVoiceActivityDetectorFront(p_);

  SpeechSegment result;
  result.start = front->start;
  result.samples.assign(front->samples, front->samples + front->n);

  SherpaOnnxDestroySpeechSegment(front);

  return result;
}

// Forwards to SherpaOnnxVoiceActivityDetectorReset() for the wrapped detector.
void VoiceActivityDetector::Reset() const {
SherpaOnnxVoiceActivityDetectorReset(p_);
}

// Forwards to SherpaOnnxVoiceActivityDetectorFlush() for the wrapped detector.
void VoiceActivityDetector::Flush() const {
SherpaOnnxVoiceActivityDetectorFlush(p_);
}

} // namespace sherpa_onnx::cxx
78 changes: 78 additions & 0 deletions sherpa-onnx/c-api/cxx-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,84 @@ class SHERPA_ONNX_API OfflineSpeechDenoiser
explicit OfflineSpeechDenoiser(const SherpaOnnxOfflineSpeechDenoiser *p);
};

// ==============================
// VAD
// ==============================

// Options for the silero VAD model. VoiceActivityDetector::Create() copies
// each field into the `silero_vad` member of the C API config.
struct SileroVadModelConfig {
  // Path to the model file; empty by default.
  std::string model;

  float threshold{0.5f};
  float min_silence_duration{0.5f};
  float min_speech_duration{0.25f};

  // The example feeds the detector this many samples per AcceptWaveform() call.
  int32_t window_size{512};

  float max_speech_duration{20.0f};
};

// Top-level VAD configuration consumed by VoiceActivityDetector::Create(),
// which copies every field into the C API's SherpaOnnxVadModelConfig.
struct VadModelConfig {
  // Model-specific options.
  SileroVadModelConfig silero_vad;

  int32_t sample_rate{16000};
  int32_t num_threads{1};
  std::string provider{"cpu"};
  bool debug{false};
};

// One speech segment as returned by VoiceActivityDetector::Front().
struct SpeechSegment {
  // Copied from the `start` field of the C API segment. The example divides
  // it by the sample rate to obtain seconds, i.e. it is used as a sample
  // offset. Initialized to 0 so that a default-constructed segment never
  // holds an indeterminate value.
  int32_t start = 0;

  // Owned copy of the segment's samples.
  std::vector<float> samples;
};

// Move-only C++ handle around the C API's SherpaOnnxCircularBuffer.
// Every member function forwards to the corresponding
// SherpaOnnxCircularBuffer* C function.
class SHERPA_ONNX_API CircularBuffer
    : public MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer> {
 public:
  // Creates a buffer via SherpaOnnxCreateCircularBuffer(capacity).
  static CircularBuffer Create(int32_t capacity);

  // Releases the C handle `p`.
  // NOTE(review): presumably called by the MoveOnly base -- confirm.
  void Destroy(const SherpaOnnxCircularBuffer *p) const;

  // Forwards `samples` and the count `n` to the C API's push function.
  void Push(const float *samples, int32_t n) const;

  // Returns an owned copy of `n` samples for the given `start_index`.
  std::vector<float> Get(int32_t start_index, int32_t n) const;

  // Forwards `n` to the C API's pop function.
  void Pop(int32_t n) const;

  // Value reported by SherpaOnnxCircularBufferSize().
  int32_t Size() const;

  // Value reported by SherpaOnnxCircularBufferHead().
  int32_t Head() const;

  // Forwards to SherpaOnnxCircularBufferReset().
  void Reset() const;

 private:
  // Wraps an existing C handle; use Create() to obtain an instance.
  explicit CircularBuffer(const SherpaOnnxCircularBuffer *p);
};

// Move-only C++ handle around the C API's SherpaOnnxVoiceActivityDetector.
// Every member function forwards to the corresponding
// SherpaOnnxVoiceActivityDetector* C function.
class SHERPA_ONNX_API VoiceActivityDetector
    : public MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector> {
 public:
  // Converts `config` to the C config struct and creates the detector;
  // `buffer_size_in_seconds` is forwarded to the C API unchanged.
  static VoiceActivityDetector Create(const VadModelConfig &config,
                                      float buffer_size_in_seconds);

  // ---- feeding audio ----

  // Forwards `samples` and the count `n` to the C API.
  void AcceptWaveform(const float *samples, int32_t n) const;

  // Forwards to the C API's flush function; the example calls it once after
  // the last window of the input has been fed.
  void Flush() const;

  // ---- querying and consuming segments ----

  bool IsEmpty() const;

  bool IsDetected() const;

  // Returns an owned copy of the segment provided by the C API's front
  // function.
  SpeechSegment Front() const;

  void Pop() const;

  // ---- state management ----

  void Clear() const;

  void Reset() const;

  // Releases the C handle `p`.
  // NOTE(review): presumably called by the MoveOnly base -- confirm.
  void Destroy(const SherpaOnnxVoiceActivityDetector *p) const;

 private:
  // Wraps an existing C handle; use Create() to obtain an instance.
  explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p);
};

} // namespace sherpa_onnx::cxx

#endif // SHERPA_ONNX_C_API_CXX_API_H_
6 changes: 4 additions & 2 deletions sherpa-onnx/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,12 @@ if(SHERPA_ONNX_ENABLE_BINARY)
add_executable(sherpa-onnx-keyword-spotter sherpa-onnx-keyword-spotter.cc)
add_executable(sherpa-onnx-offline sherpa-onnx-offline.cc)
add_executable(sherpa-onnx-offline-audio-tagging sherpa-onnx-offline-audio-tagging.cc)
add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc)
add_executable(sherpa-onnx-offline-language-identification sherpa-onnx-offline-language-identification.cc)
add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc)
add_executable(sherpa-onnx-offline-punctuation sherpa-onnx-offline-punctuation.cc)
add_executable(sherpa-onnx-online-punctuation sherpa-onnx-online-punctuation.cc)
add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc)
add_executable(sherpa-onnx-vad sherpa-onnx-vad.cc)

if(SHERPA_ONNX_ENABLE_TTS)
add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc)
Expand All @@ -336,11 +337,12 @@ if(SHERPA_ONNX_ENABLE_BINARY)
sherpa-onnx-keyword-spotter
sherpa-onnx-offline
sherpa-onnx-offline-audio-tagging
sherpa-onnx-offline-denoiser
sherpa-onnx-offline-language-identification
sherpa-onnx-offline-parallel
sherpa-onnx-offline-punctuation
sherpa-onnx-offline-denoiser
sherpa-onnx-online-punctuation
sherpa-onnx-vad
)
if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND main_exes
Expand Down
Loading
Loading