Skip to content

Commit d9999d5

Browse files
buxukulinxiaodong
andauthored
feat: support vad for addon.node (#3301)
Co-authored-by: linxiaodong <calm.lin@wukongsch.com>
1 parent bca021c commit d9999d5

File tree

4 files changed

+444
-46
lines changed

4 files changed

+444
-46
lines changed

examples/addon.node/README.md

Lines changed: 81 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
# addon
1+
# whisper.cpp Node.js addon
22

33
This is an addon demo that can **run whisper model inference in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
44
It can be used as a reference for using the whisper.cpp project in other Node.js projects.
55

6+
This addon now supports **Voice Activity Detection (VAD)** for improved transcription performance.
7+
68
## Install
79

810
```shell
@@ -26,12 +28,88 @@ For Electron addon and cmake-js options, you can see [cmake-js](https://github.c
2628
2729
## Run
2830
31+
### Basic Usage
32+
2933
```shell
3034
cd examples/addon.node
3135
3236
node index.js --language='language' --model='model-path' --fname_inp='file-path'
3337
```
3438
35-
Because this is a simple Demo, only the above parameters are set in the node environment.
39+
### VAD (Voice Activity Detection) Usage
40+
41+
Run the VAD example with performance comparison:
42+
43+
```shell
44+
node vad-example.js
45+
```
46+
47+
## Voice Activity Detection (VAD) Support
48+
49+
VAD can significantly improve transcription performance by only processing speech segments, which is especially beneficial for audio files with long periods of silence.
50+
51+
### VAD Model Setup
52+
53+
Before using VAD, download a VAD model:
54+
55+
```shell
56+
# From the whisper.cpp root directory
57+
./models/download-vad-model.sh silero-v5.1.2
58+
```
59+
60+
### VAD Parameters
61+
62+
All VAD parameters are optional and have sensible defaults, except `vad_model`, which must be set whenever `vad` is enabled:
63+
64+
- `vad`: Enable VAD (default: false)
65+
- `vad_model`: Path to VAD model file (required when VAD enabled)
66+
- `vad_threshold`: Speech detection threshold 0.0-1.0 (default: 0.5)
67+
- `vad_min_speech_duration_ms`: Min speech duration in ms (default: 250)
68+
- `vad_min_silence_duration_ms`: Min silence duration in ms (default: 100)
69+
- `vad_max_speech_duration_s`: Max speech duration in seconds (default: FLT_MAX, the largest finite C `float`, i.e. effectively unlimited)
70+
- `vad_speech_pad_ms`: Speech padding in ms (default: 30)
71+
- `vad_samples_overlap`: Sample overlap 0.0-1.0 (default: 0.1)
72+
73+
### JavaScript API Example
74+
75+
```javascript
76+
const path = require("path");
77+
const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));
78+
const { promisify } = require("util");
79+
80+
const whisperAsync = promisify(whisper);
81+
82+
// With VAD enabled
83+
const vadParams = {
84+
language: "en",
85+
model: path.join(__dirname, "../../models/ggml-base.en.bin"),
86+
fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
87+
vad: true,
88+
vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"),
89+
vad_threshold: 0.5,
90+
progress_callback: (progress) => console.log(`Progress: ${progress}%`)
91+
};
92+
93+
whisperAsync(vadParams).then(result => console.log(result));
94+
```
95+
96+
## Supported Parameters
97+
98+
Both traditional whisper.cpp parameters and new VAD parameters are supported:
3699

37-
Other parameters can also be specified in the node environment.
100+
- `language`: Language code (e.g., "en", "es", "fr")
101+
- `model`: Path to whisper model file
102+
- `fname_inp`: Path to input audio file
103+
- `use_gpu`: Enable GPU acceleration (default: true)
104+
- `flash_attn`: Enable flash attention (default: false)
105+
- `no_prints`: Disable console output (default: false)
106+
- `no_timestamps`: Disable timestamps (default: false)
107+
- `detect_language`: Auto-detect language (default: false)
108+
- `audio_ctx`: Audio context size (default: 0)
109+
- `max_len`: Maximum segment length (default: 0)
110+
- `max_context`: Maximum context size (default: -1)
111+
- `prompt`: Initial prompt for decoder
112+
- `comma_in_time`: Use comma in timestamps (default: true)
113+
- `print_progress`: Print progress info (default: false)
114+
- `progress_callback`: Progress callback function
115+
- VAD parameters (see above section)
Lines changed: 119 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,133 @@
1-
const path = require("path");
2-
const { whisper } = require(path.join(
3-
__dirname,
4-
"../../../build/Release/addon.node"
5-
));
6-
const { promisify } = require("util");
1+
const { join } = require('path');
2+
const { whisper } = require('../../../build/Release/addon.node');
3+
const { promisify } = require('util');
74

85
const whisperAsync = promisify(whisper);
96

10-
const whisperParamsMock = {
11-
language: "en",
12-
model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
13-
fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
7+
// Baseline whisper parameters shared by the tests in this file; each test
// spreads this object and then adds or overrides individual fields.
// NOTE(review): this span is a rendered diff. Entries that follow an `N-`
// marker line appear to be the removed (old) side and entries that follow an
// `N+` marker the added side — confirm against the actual file before editing.
const commonParams = {
8+
language: 'en',
9+
model: join(__dirname, '../../../models/ggml-base.en.bin'),
10+
fname_inp: join(__dirname, '../../../samples/jfk.wav'),
1411
use_gpu: true,
1512
flash_attn: false,
1613
no_prints: true,
17-
comma_in_time: false,
18-
translate: true,
1914
no_timestamps: false,
2015
detect_language: false,
2116
audio_ctx: 0,
22-
max_len: 0,
23-
prompt: "",
24-
print_progress: false,
25-
progress_callback: (progress) => {
26-
console.log(`Progress: ${progress}`);
27-
},
28-
max_context: -1
17+
max_len: 0
2918
};
3019

31-
describe("Run whisper.node", () => {
32-
test("it should receive a non-empty value", async () => {
33-
let result = await whisperAsync(whisperParamsMock);
34-
console.log(result);
20+
describe('Whisper.cpp Node.js addon with VAD support', () => {
21+
// Baseline run: transcribes the input named in commonParams with `vad: false`
// and checks that a non-empty transcription array containing the phrase
// "ask not" comes back within the 30 s timeout.
test('Basic whisper transcription without VAD', async () => {
22+
const params = {
23+
...commonParams,
24+
vad: false
25+
};
3526

// NOTE(review): the two statements after the `36-` / `37-` markers appear to
// be the removed side of the rendered diff, not part of this test — confirm.
36-
expect(result['transcription'].length).toBeGreaterThan(0);
37-
}, 10000);
27+
const result = await whisperAsync(params);
28+
29+
expect(typeof result).toBe('object');
30+
expect(Array.isArray(result.transcription)).toBe(true);
31+
expect(result.transcription.length).toBeGreaterThan(0);
32+
33+
// Check that we got some transcription text
34+
// Index 2 of every entry is joined into one string — presumably entries are
// [start, end, text]; confirm against the addon's output format.
const text = result.transcription.map(segment => segment[2]).join(' ');
35+
expect(text.length).toBeGreaterThan(0);
36+
expect(text.toLowerCase()).toContain('ask not');
37+
}, 30000);
38+
39+
test('VAD parameters validation', async () => {
  // Enable VAD but point `vad_model` at a file that does not exist. The call
  // is expected to resolve (not reject) with an empty transcription array.
  const outcome = await whisperAsync({
    ...commonParams,
    vad: true,
    vad_model: 'non-existent-model.bin',
    vad_threshold: 0.5
  });

  expect(typeof outcome).toBe('object');

  const segments = outcome.transcription;
  expect(Array.isArray(segments)).toBe(true);
  // With the VAD model missing, no segments may be produced at all.
  expect(segments.length).toBe(0);
}, 10000);
55+
56+
test('VAD parameter parsing', async () => {
  // Every tunable VAD field is supplied while `vad` itself stays false, so no
  // VAD model is required; the run only has to accept the fields and finish.
  const vadTuning = {
    vad_threshold: 0.7,
    vad_min_speech_duration_ms: 300,
    vad_min_silence_duration_ms: 150,
    vad_max_speech_duration_s: 45.0,
    vad_speech_pad_ms: 50,
    vad_samples_overlap: 0.15
  };

  const outcome = await whisperAsync({
    ...commonParams,
    vad: false,
    ...vadTuning
  });

  expect(typeof outcome).toBe('object');
  expect(Array.isArray(outcome.transcription)).toBe(true);
}, 30000);
74+
75+
test('Progress callback with VAD disabled', async () => {
  // The callback only records what it receives. An `expect` that fails inside
  // the callback would throw from wherever the addon invokes it rather than
  // from the awaited test body, so Jest may not attribute the failure to this
  // test. All checks therefore run after the transcription has resolved.
  const progressValues = [];

  const params = {
    ...commonParams,
    vad: false,
    progress_callback: (progress) => {
      progressValues.push(progress);
    }
  };

  const result = await whisperAsync(params);

  // The callback must have fired at least once.
  expect(progressValues.length).toBeGreaterThan(0);

  // Every reported value must be a percentage in the range 0..100.
  for (const progress of progressValues) {
    expect(progress).toBeGreaterThanOrEqual(0);
    expect(progress).toBeLessThanOrEqual(100);
  }

  // The final report must signal completion.
  expect(progressValues[progressValues.length - 1]).toBe(100);
  expect(typeof result).toBe('object');
}, 30000);
96+
97+
test('Language detection without VAD', async () => {
  // Override the shared defaults so the language is auto-detected instead of
  // being fixed, with VAD switched off.
  const detectionParams = Object.assign({}, commonParams, {
    vad: false,
    detect_language: true,
    language: 'auto'
  });

  const outcome = await whisperAsync(detectionParams);

  expect(typeof outcome).toBe('object');

  // The result must carry a non-empty language string.
  const detected = outcome.language;
  expect(typeof detected).toBe('string');
  expect(detected.length).toBeGreaterThan(0);
}, 30000);
111+
112+
test('Basic transcription with all VAD parameters set', async () => {
  // VAD stays disabled (so no VAD model file is needed) while every VAD field,
  // including an empty model path, is populated. The transcription must still
  // complete and return segments.
  const vadFields = {
    vad: false,
    vad_model: '',
    vad_threshold: 0.6,
    vad_min_speech_duration_ms: 200,
    vad_min_silence_duration_ms: 80,
    vad_max_speech_duration_s: 25.0,
    vad_speech_pad_ms: 40,
    vad_samples_overlap: 0.08
  };

  const outcome = await whisperAsync({ ...commonParams, ...vadFields });

  expect(typeof outcome).toBe('object');

  const segments = outcome.transcription;
  expect(Array.isArray(segments)).toBe(true);
  expect(segments.length).toBeGreaterThan(0);
}, 30000);
38132
});
39133

0 commit comments

Comments
 (0)