diff --git a/examples/addon.node/README.md b/examples/addon.node/README.md
index 16df7d95870..ffd7720f9e5 100644
--- a/examples/addon.node/README.md
+++ b/examples/addon.node/README.md
@@ -1,8 +1,10 @@
-# addon
+# whisper.cpp Node.js addon
 
 This is an addon demo that can **perform whisper model reasoning in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
 It can be used as a reference for using the whisper.cpp project in other node projects.
 
+This addon now supports **Voice Activity Detection (VAD)** for improved transcription performance.
+
 ## Install
 
 ```shell
@@ -26,12 +28,125 @@ For Electron addon and cmake-js options, you can see [cmake-js](https://github.com/cmake-js/cmake-js).
 
 ## Run
 
+### Basic Usage
+
 ```shell
 cd examples/addon.node
 
 node index.js --language='language' --model='model-path' --fname_inp='file-path'
 ```
 
-Because this is a simple Demo, only the above parameters are set in the node environment.
+### VAD (Voice Activity Detection) Usage
+
+Run the VAD example, which includes a performance comparison:
+
+```shell
+node vad-example.js
+```
+
+## Voice Activity Detection (VAD) Support
+
+VAD can significantly improve transcription performance by processing only the detected speech segments, which is especially beneficial for audio files with long periods of silence.
+
+### VAD Model Setup
+
+Before using VAD, download a VAD model:
+
+```shell
+# From the whisper.cpp root directory
+./models/download-vad-model.sh silero-v5.1.2
+```
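+
+The bundled `vad-example.js` only enables VAD after checking that this model file exists (`fs.existsSync`), and otherwise falls back to plain transcription; the same guard is easy to reuse. A minimal sketch, where `baseParams` is a placeholder for your usual parameters:
+
+```javascript
+const fs = require("fs");
+const path = require("path");
+
+// Enable VAD only when the downloaded model is actually present, so the
+// call degrades to plain transcription instead of failing.
+// "baseParams" stands in for the parameters shown in the examples below.
+const vadModel = path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin");
+const params = { ...baseParams, vad: fs.existsSync(vadModel), vad_model: vadModel };
+```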
+
+### VAD Parameters
+
+All VAD parameters are optional and have sensible defaults:
+
+- `vad`: Enable VAD (default: false)
+- `vad_model`: Path to VAD model file (required when VAD enabled)
+- `vad_threshold`: Speech detection threshold 0.0-1.0 (default: 0.5)
+- `vad_min_speech_duration_ms`: Min speech duration in ms (default: 250)
+- `vad_min_silence_duration_ms`: Min silence duration in ms (default: 100)
+- `vad_max_speech_duration_s`: Max speech duration in seconds (default: FLT_MAX)
+- `vad_speech_pad_ms`: Speech padding in ms (default: 30)
+- `vad_samples_overlap`: Sample overlap 0.0-1.0 (default: 0.1)
+
+### JavaScript API Example
+
+```javascript
+const path = require("path");
+const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);
+
+// With VAD enabled
+const vadParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+  vad: true,
+  vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"),
+  vad_threshold: 0.5,
+  progress_callback: (progress) => console.log(`Progress: ${progress}%`)
+};
+
+whisperAsync(vadParams).then(result => console.log(result));
+```
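+
+The object the promise resolves with contains a `transcription` array, plus a `language` field that the test suite reads back when `detect_language` is enabled. The tests index each segment as `[start, end, text]`; a minimal sketch of consuming the result:
+
+```javascript
+whisperAsync(vadParams).then((result) => {
+  // Each segment is [startTimestamp, endTimestamp, text];
+  // index 2 holds the transcribed text of the segment.
+  const text = result.transcription.map((segment) => segment[2]).join(" ");
+  console.log(text);
+});
+```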
+
+## Supported Parameters
+
+Both traditional whisper.cpp parameters and new VAD parameters are supported; a minimal call relying on the defaults follows the list:
 
-Other parameters can also be specified in the node environment.
+- `language`: Language code (e.g., "en", "es", "fr")
+- `model`: Path to whisper model file
+- `fname_inp`: Path to input audio file
+- `use_gpu`: Enable GPU acceleration (default: true)
+- `flash_attn`: Enable flash attention (default: false)
+- `no_prints`: Disable console output (default: false)
+- `no_timestamps`: Disable timestamps (default: false)
+- `detect_language`: Auto-detect language (default: false)
+- `audio_ctx`: Audio context size (default: 0)
+- `max_len`: Maximum segment length (default: 0)
+- `max_context`: Maximum context size (default: -1)
+- `prompt`: Initial prompt for decoder
+- `comma_in_time`: Use comma in timestamps (default: true)
+- `print_progress`: Print progress info (default: false)
+- `progress_callback`: Progress callback function
+- VAD parameters (see above section)
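+
+Apart from `language`, `model`, and `fname_inp`, which the addon reads unconditionally, most options listed above are guarded by `Has()`/type checks in `addon.cpp` and fall back to their defaults when omitted. A minimal sketch relying on those defaults (reusing `path` and `whisperAsync` from the example above):
+
+```javascript
+// All omitted options take the defaults listed above.
+const minimalParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav")
+};
+
+whisperAsync(minimalParams).then((result) => console.log(result.transcription));
+```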
diff --git a/examples/addon.node/__test__/whisper.spec.js b/examples/addon.node/__test__/whisper.spec.js
index ce063211efe..b07430aa249 100644
--- a/examples/addon.node/__test__/whisper.spec.js
+++ b/examples/addon.node/__test__/whisper.spec.js
@@ -1,39 +1,133 @@
-const path = require("path");
-const { whisper } = require(path.join(
-  __dirname,
-  "../../../build/Release/addon.node"
-));
-const { promisify } = require("util");
+const { join } = require('path');
+const { whisper } = require('../../../build/Release/addon.node');
+const { promisify } = require('util');
 
 const whisperAsync = promisify(whisper);
 
-const whisperParamsMock = {
-  language: "en",
-  model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
-  fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
+const commonParams = {
+  language: 'en',
+  model: join(__dirname, '../../../models/ggml-base.en.bin'),
+  fname_inp: join(__dirname, '../../../samples/jfk.wav'),
   use_gpu: true,
   flash_attn: false,
   no_prints: true,
-  comma_in_time: false,
-  translate: true,
   no_timestamps: false,
   detect_language: false,
   audio_ctx: 0,
-  max_len: 0,
-  prompt: "",
-  print_progress: false,
-  progress_callback: (progress) => {
-    console.log(`Progress: ${progress}`);
-  },
-  max_context: -1
+  max_len: 0
 };
 
-describe("Run whisper.node", () => {
-  test("it should receive a non-empty value", async () => {
-    let result = await whisperAsync(whisperParamsMock);
-    console.log(result);
+describe('Whisper.cpp Node.js addon with VAD support', () => {
+  test('Basic whisper transcription without VAD', async () => {
+    const params = {
+      ...commonParams,
+      vad: false
+    };
 
-    expect(result['transcription'].length).toBeGreaterThan(0);
-  }, 10000);
+    const result = await whisperAsync(params);
+
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+    expect(result.transcription.length).toBeGreaterThan(0);
+
+    // Check that we got some transcription text
+    const text = result.transcription.map(segment => segment[2]).join(' ');
+    expect(text.length).toBeGreaterThan(0);
+    expect(text.toLowerCase()).toContain('ask not');
+  }, 30000);
+
+  test('VAD parameters validation', async () => {
+    // Test with invalid VAD model - should return empty transcription
+    const invalidParams = {
+      ...commonParams,
+      vad: true,
+      vad_model: 'non-existent-model.bin',
+      vad_threshold: 0.5
+    };
+
+    // This should handle the error gracefully and return empty transcription
+    const result = await whisperAsync(invalidParams);
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+    // When VAD model doesn't exist, it should return empty transcription
+    expect(result.transcription.length).toBe(0);
+  }, 10000);
+
+  test('VAD parameter parsing', async () => {
+    // Test that VAD parameters are properly parsed (even if VAD model doesn't exist)
+    const vadParams = {
+      ...commonParams,
+      vad: false, // Disabled so no model required
+      vad_threshold: 0.7,
+      vad_min_speech_duration_ms: 300,
+      vad_min_silence_duration_ms: 150,
+      vad_max_speech_duration_s: 45.0,
+      vad_speech_pad_ms: 50,
+      vad_samples_overlap: 0.15
+    };
+
+    const result = await whisperAsync(vadParams);
+
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+  }, 30000);
+
+  test('Progress callback with VAD disabled', async () => {
+    let progressCalled = false;
+    let lastProgress = 0;
+
+    const params = {
+      ...commonParams,
+      vad: false,
+      progress_callback: (progress) => {
+        progressCalled = true;
+        lastProgress = progress;
+        expect(progress).toBeGreaterThanOrEqual(0);
+        expect(progress).toBeLessThanOrEqual(100);
+      }
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(progressCalled).toBe(true);
+    expect(lastProgress).toBe(100);
+    expect(typeof result).toBe('object');
+  }, 30000);
+
+  test('Language detection without VAD', async () => {
+    const params = {
+      ...commonParams,
+      vad: false,
+      detect_language: true,
+      language: 'auto'
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(typeof result).toBe('object');
+    expect(typeof result.language).toBe('string');
+    expect(result.language.length).toBeGreaterThan(0);
+  }, 30000);
+
+  test('Basic transcription with all VAD parameters set', async () => {
+    // Test with VAD disabled but all parameters set to ensure no crashes
+    const params = {
+      ...commonParams,
+      vad: false, // Disabled so it works without VAD model
+      vad_model: '', // Empty model path
+      vad_threshold: 0.6,
+      vad_min_speech_duration_ms: 200,
+      vad_min_silence_duration_ms: 80,
+      vad_max_speech_duration_s: 25.0,
+      vad_speech_pad_ms: 40,
+      vad_samples_overlap: 0.08
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+    expect(result.transcription.length).toBeGreaterThan(0);
+  }, 30000);
 });
diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp
index 67b1ec92d7b..952e44e3ce7 100644
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@@ -9,6 +9,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <cfloat>
 
 struct whisper_params {
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -51,6 +52,16 @@ struct whisper_params {
 
     std::vector<std::string> fname_out = {};
     std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
+
+    // Voice Activity Detection (VAD) parameters
+    bool vad = false;
+    std::string vad_model = "";
+    float vad_threshold = 0.5f;
+    int vad_min_speech_duration_ms = 250;
+    int vad_min_silence_duration_ms = 100;
+    float vad_max_speech_duration_s = FLT_MAX;
+    int vad_speech_pad_ms = 30;
+    float vad_samples_overlap = 0.1f;
 };
 
 struct whisper_print_user_data {
@@ -333,16 +344,19 @@ class ProgressWorker : public Napi::AsyncWorker {
         };
         wparams.progress_callback_user_data = this;
 
-        // Abort mechanism example
-        {
-            static bool is_aborted = false; // Note: this should be atomic to avoid data races
+        // Set VAD parameters
+        wparams.vad = params.vad;
+        wparams.vad_model_path = params.vad_model.c_str();
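+        // NOTE: vad_model_path borrows the buffer owned by params.vad_model;
+        // params is a member of this worker and outlives the
+        // whisper_full_parallel() call below, so the pointer stays valid.
 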
-            wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-                bool is_aborted = *(bool*)user_data;
-                return !is_aborted;
-            };
-            wparams.encoder_begin_callback_user_data = &is_aborted;
-        }
+        wparams.vad_params.threshold = params.vad_threshold;
+        wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
+        wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+        wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
+        wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
+        wparams.vad_params.samples_overlap = params.vad_samples_overlap;
 
         if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
             fprintf(stderr, "failed to process audio\n");
@@ -385,14 +396,46 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
     std::string language = whisper_params.Get("language").As<Napi::String>();
     std::string model = whisper_params.Get("model").As<Napi::String>();
     std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
-    bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
-    bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
-    bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
-    bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
-    bool detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
-    int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
-    bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
-    int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
+
+    bool use_gpu = true;
+    if (whisper_params.Has("use_gpu") && whisper_params.Get("use_gpu").IsBoolean()) {
+        use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
+    }
+
+    bool flash_attn = false;
+    if (whisper_params.Has("flash_attn") && whisper_params.Get("flash_attn").IsBoolean()) {
+        flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
+    }
+
+    bool no_prints = false;
+    if (whisper_params.Has("no_prints") && whisper_params.Get("no_prints").IsBoolean()) {
+        no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
+    }
+
+    bool no_timestamps = false;
+    if (whisper_params.Has("no_timestamps") && whisper_params.Get("no_timestamps").IsBoolean()) {
+        no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
+    }
+
+    bool detect_language = false;
+    if (whisper_params.Has("detect_language") && whisper_params.Get("detect_language").IsBoolean()) {
+        detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
+    }
+
+    int32_t audio_ctx = 0;
+    if (whisper_params.Has("audio_ctx") && whisper_params.Get("audio_ctx").IsNumber()) {
+        audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
+    }
+
+    bool comma_in_time = true;
+    if (whisper_params.Has("comma_in_time") && whisper_params.Get("comma_in_time").IsBoolean()) {
+        comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
+    }
+
+    int32_t max_len = 0;
+    if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) {
+        max_len = whisper_params.Get("max_len").As<Napi::Number>();
+    }
 
     // Add support for max_context
     int32_t max_context = -1;
@@ -408,7 +451,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
 
     // Add support for print_progress
     bool print_progress = false;
-    if (whisper_params.Has("print_progress")) {
+    if (whisper_params.Has("print_progress") && whisper_params.Get("print_progress").IsBoolean()) {
         print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
     }
     // Add support for progress_callback
@@ -417,6 +460,47 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
         progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
     }
 
+    // Add support for VAD parameters
+    bool vad = false;
+    if (whisper_params.Has("vad") && whisper_params.Get("vad").IsBoolean()) {
+        vad = whisper_params.Get("vad").As<Napi::Boolean>();
+    }
+
+    std::string vad_model = "";
+    if (whisper_params.Has("vad_model") && whisper_params.Get("vad_model").IsString()) {
+        vad_model = whisper_params.Get("vad_model").As<Napi::String>();
+    }
+
+    float vad_threshold = 0.5f;
+    if (whisper_params.Has("vad_threshold") && whisper_params.Get("vad_threshold").IsNumber()) {
+        vad_threshold = whisper_params.Get("vad_threshold").As<Napi::Number>();
+    }
+
+    int vad_min_speech_duration_ms = 250;
+    if (whisper_params.Has("vad_min_speech_duration_ms") && whisper_params.Get("vad_min_speech_duration_ms").IsNumber()) {
+        vad_min_speech_duration_ms = whisper_params.Get("vad_min_speech_duration_ms").As<Napi::Number>();
+    }
+
+    int vad_min_silence_duration_ms = 100;
+    if (whisper_params.Has("vad_min_silence_duration_ms") && whisper_params.Get("vad_min_silence_duration_ms").IsNumber()) {
+        vad_min_silence_duration_ms = whisper_params.Get("vad_min_silence_duration_ms").As<Napi::Number>();
+    }
+
+    float vad_max_speech_duration_s = FLT_MAX;
+    if (whisper_params.Has("vad_max_speech_duration_s") && whisper_params.Get("vad_max_speech_duration_s").IsNumber()) {
+        vad_max_speech_duration_s = whisper_params.Get("vad_max_speech_duration_s").As<Napi::Number>();
+    }
+
+    int vad_speech_pad_ms = 30;
+    if (whisper_params.Has("vad_speech_pad_ms") && whisper_params.Get("vad_speech_pad_ms").IsNumber()) {
+        vad_speech_pad_ms = whisper_params.Get("vad_speech_pad_ms").As<Napi::Number>();
+    }
+
+    float vad_samples_overlap = 0.1f;
+    if (whisper_params.Has("vad_samples_overlap") && whisper_params.Get("vad_samples_overlap").IsNumber()) {
+        vad_samples_overlap = whisper_params.Get("vad_samples_overlap").As<Napi::Number>();
+    }
+
     Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
     std::vector<float> pcmf32_vec;
     if (pcmf32Value.IsTypedArray()) {
@@ -444,6 +528,16 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
     params.prompt = prompt;
     params.detect_language = detect_language;
 
+    // Set VAD parameters
+    params.vad = vad;
+    params.vad_model = vad_model;
+    params.vad_threshold = vad_threshold;
+    params.vad_min_speech_duration_ms = vad_min_speech_duration_ms;
+    params.vad_min_silence_duration_ms = vad_min_silence_duration_ms;
+    params.vad_max_speech_duration_s = vad_max_speech_duration_s;
+    params.vad_speech_pad_ms = vad_speech_pad_ms;
+    params.vad_samples_overlap = vad_samples_overlap;
+
     Napi::Function callback = info[1].As<Napi::Function>();
     // Create a new Worker class with progress callback support
     ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
diff --git a/examples/addon.node/vad-example.js b/examples/addon.node/vad-example.js
new file mode 100644
index 00000000000..a9e0dae7adf
--- /dev/null
+++ b/examples/addon.node/vad-example.js
@@ -0,0 +1,134 @@
+const path = require("path");
+const { whisper } = require(path.join(
+  __dirname,
+  "../../build/Release/addon.node"
+));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);
+
+// Example with VAD enabled
+const vadParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+  use_gpu: true,
+  flash_attn: false,
+  no_prints: false,
+  comma_in_time: true,
+  translate: false,
+  no_timestamps: false,
+  detect_language: false,
+  audio_ctx: 0,
+  max_len: 0,
+  // VAD parameters
+  vad: true,
+  vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"), // You need to download this model
+  vad_threshold: 0.5,
+  vad_min_speech_duration_ms: 250,
+  vad_min_silence_duration_ms: 100,
+  vad_max_speech_duration_s: 30.0,
+  vad_speech_pad_ms: 30,
+  vad_samples_overlap: 0.1,
+  progress_callback: (progress) => {
+    console.log(`VAD Transcription progress: ${progress}%`);
+  }
+};
+
+// Example without VAD (traditional approach)
+const traditionalParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+  use_gpu: true,
+  flash_attn: false,
+  no_prints: false,
+  comma_in_time: true,
+  translate: false,
+  no_timestamps: false,
+  detect_language: false,
+  audio_ctx: 0,
+  max_len: 0,
+  vad: false, // Explicitly disable VAD
+  progress_callback: (progress) => {
+    console.log(`Traditional transcription progress: ${progress}%`);
+  }
+};
+
+async function runVADExample() {
+  try {
+    console.log("=== Whisper.cpp Node.js VAD Example ===\n");
+
+    // Check if VAD model exists
+    const fs = require('fs');
+    if (!fs.existsSync(vadParams.vad_model)) {
+      console.log("⚠️  VAD model not found. Please download the VAD model first:");
+      console.log("   ./models/download-vad-model.sh silero-v5.1.2");
+      console.log("   Or run: python models/convert-silero-vad-to-ggml.py");
+      console.log("\n   Falling back to traditional transcription without VAD...\n");
+
+      // Run without VAD
+      console.log("🎵 Running traditional transcription...");
+      const traditionalResult = await whisperAsync(traditionalParams);
+      console.log("\n📝 Traditional transcription result:");
+      console.log(traditionalResult);
+      return;
+    }
+
+    console.log("🎵 Running transcription with VAD enabled...");
+    console.log("VAD Parameters:");
+    console.log(`  - Threshold: ${vadParams.vad_threshold}`);
+    console.log(`  - Min speech duration: ${vadParams.vad_min_speech_duration_ms}ms`);
+    console.log(`  - Min silence duration: ${vadParams.vad_min_silence_duration_ms}ms`);
+    console.log(`  - Max speech duration: ${vadParams.vad_max_speech_duration_s}s`);
+    console.log(`  - Speech padding: ${vadParams.vad_speech_pad_ms}ms`);
+    console.log(`  - Samples overlap: ${vadParams.vad_samples_overlap}\n`);
+
+    const startTime = Date.now();
+    const vadResult = await whisperAsync(vadParams);
+    const vadDuration = Date.now() - startTime;
+
+    console.log("\n✅ VAD transcription completed!");
+    console.log(`⏱️  Processing time: ${vadDuration}ms`);
+    console.log("\n📝 VAD transcription result:");
+    console.log(vadResult);
+
+    // Compare with traditional approach
+    console.log("\n🔄 Running traditional transcription for comparison...");
+    const traditionalStartTime = Date.now();
+    const traditionalResult = await whisperAsync(traditionalParams);
+    const traditionalDuration = Date.now() - traditionalStartTime;
+
+    console.log("\n✅ Traditional transcription completed!");
+    console.log(`⏱️  Processing time: ${traditionalDuration}ms`);
+    console.log("\n📝 Traditional transcription result:");
+    console.log(traditionalResult);
+
+    // Performance comparison
+    console.log("\n📊 Performance Comparison:");
+    console.log(`VAD: ${vadDuration}ms`);
+    console.log(`Traditional: ${traditionalDuration}ms`);
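+    // A ratio above 1 means the VAD run finished sooner. Gains depend on how
+    // much silence the input contains; jfk.wav is mostly continuous speech.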
+    const speedup = traditionalDuration / vadDuration;
+    if (speedup > 1) {
+      console.log(`🚀 VAD is ${speedup.toFixed(2)}x faster!`);
+    } else {
+      console.log(`ℹ️  Traditional approach was ${(1/speedup).toFixed(2)}x faster in this case.`);
+    }
+
+  } catch (error) {
+    console.error("❌ Error during transcription:", error);
+  }
+}
+
+// Run the example
+if (require.main === module) {
+  runVADExample();
+}
+
+module.exports = {
+  runVADExample,
+  vadParams,
+  traditionalParams
+};
\ No newline at end of file