Skip to content

Use miniaudio for direct decoding flac, mp3, ogg and wav #2759

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 1 addition & 12 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,6 @@ samples:
@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
@wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
@wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
@echo "Converting to 16-bit WAV ..."
@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
@ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
@rm samples/*.ogg
@ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
@rm samples/mm1.wav
@ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
@rm samples/a13.mp3
@ffmpeg -loglevel -0 -y -i samples/diffusion2023-07-03.flac -ar 16000 -ac 1 -c:a pcm_s16le samples/diffusion2023-07-03.wav
@rm samples/diffusion2023-07-03.flac

#
# Models
Expand Down Expand Up @@ -59,7 +48,7 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 larg
@echo "Running $@ on all samples in ./samples ..."
@echo "==============================================="
@echo ""
@for f in samples/*.wav; do \
@for f in samples/*$(.flac .mp3 .ogg .wav); do \
echo "----------------------------------------------" ; \
echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
echo "----------------------------------------------" ; \
Expand Down
4 changes: 2 additions & 2 deletions examples/addon.node/addon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {

// read the input audio file if params.pcmf32 is not provided
if (params.pcmf32.empty()) {
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
continue;
}
} else {
Expand Down
9 changes: 5 additions & 4 deletions examples/cli/cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,8 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params

static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
fprintf(stderr, "usage: %s [options] file0 file1 ...\n", argv[0]);
fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
Expand Down Expand Up @@ -244,7 +245,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt (max n_text_ctx/2 tokens)\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input audio file path\n", "");
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
Expand Down Expand Up @@ -1069,8 +1070,8 @@ int main(int argc, char ** argv) {
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
continue;
}

Expand Down
164 changes: 80 additions & 84 deletions examples/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,17 @@

// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
#define STB_VORBIS_HEADER_ONLY
#include "stb_vorbis.c" /* Enables Vorbis decoding. */

#define MA_NO_DEVICE_IO
#define MA_NO_THREADING
#define MA_NO_ENCODING
#define MA_NO_GENERATION
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MINIAUDIO_IMPLEMENTATION
#include "miniaudio.h"

#include <cmath>
#include <cstring>
Expand Down Expand Up @@ -639,111 +648,95 @@ bool is_wav_buffer(const std::string buf) {
return true;
}

bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output
bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output

ma_result result;
ma_decoder_config decoder_config;
ma_decoder decoder;

decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, COMMON_SAMPLE_RATE);

if (fname == "-") {
{
#ifdef _WIN32
_setmode(_fileno(stdin), _O_BINARY);
#endif

uint8_t buf[1024];
while (true)
{
const size_t n = fread(buf, 1, sizeof(buf), stdin);
if (n == 0) {
break;
}
wav_data.insert(wav_data.end(), buf, buf + n);
}
}
#ifdef _WIN32
_setmode(_fileno(stdin), _O_BINARY);
#endif

if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from stdin\n");
return false;
}
uint8_t buf[1024];
while (true)
{
const size_t n = fread(buf, 1, sizeof(buf), stdin);
if (n == 0) {
break;
}
audio_data.insert(audio_data.end(), buf, buf + n);
}

if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {

fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result));

fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
return false;
}

fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size());
}
else if (is_wav_buffer(fname)) {
if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
return false;
}
if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
fprintf(stderr, "Error: failed to open audio data from fname buffer (%s)\n", ma_result_description(result));

return false;
}
}
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
else if ((result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder)) != MA_SUCCESS) {
#if defined(WHISPER_FFMPEG)
if (ffmpeg_decode_audio(fname, wav_data) != 0) {
fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str());
return false;
}
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
fprintf(stderr, "error: failed to read wav data as wav \n");
return false;
}
if (ffmpeg_decode_audio(fname, audio_data) != 0) {
fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str());

return false;
}

if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result));

return false;
}
#else
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false;
#endif
}
fprintf(stderr, "error: failed to open '%s' file (%s)\n", fname.c_str(), ma_result_description(result));

if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
drwav_uninit(&wav);
return false;
return false;
#endif
}

if (stereo && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
drwav_uninit(&wav);
return false;
}
ma_uint64 frame_count;
ma_uint64 frames_read;

if (wav.sampleRate != COMMON_SAMPLE_RATE) {
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
drwav_uninit(&wav);
return false;
}
if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));

if (wav.bitsPerSample != 16) {
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
drwav_uninit(&wav);
return false;
return false;
}

const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
pcmf32.resize(stereo ? frame_count*2 : frame_count);

std::vector<int16_t> pcm16;
pcm16.resize(n*wav.channels);
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
drwav_uninit(&wav);
if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));

// convert to mono, float
pcmf32.resize(n);
if (wav.channels == 1) {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[i])/32768.0f;
}
} else {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
return false;
}

if (stereo) {
// convert to stereo, float
pcmf32s.resize(2);

pcmf32s[0].resize(n);
pcmf32s[1].resize(n);
for (uint64_t i = 0; i < n; i++) {
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
}
pcmf32s.resize(2);
pcmf32s[0].resize(frame_count);
pcmf32s[1].resize(frame_count);
for (uint64_t i = 0; i < frame_count; i++) {
pcmf32s[0][i] = pcmf32[2*i];
pcmf32s[1][i] = pcmf32[2*i + 1];
}
}

ma_decoder_uninit(&decoder);

return true;
}

Expand Down Expand Up @@ -909,3 +902,6 @@ bool speak_with_file(const std::string & command, const std::string & text, cons
}
return true;
}

#undef STB_VORBIS_HEADER_ONLY
#include "stb_vorbis.c"
2 changes: 1 addition & 1 deletion examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ bool is_wav_buffer(const std::string buf);
// fname can be a buffer of WAV data instead of a filename
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
bool read_wav(
bool read_audio_data(
const std::string & fname,
std::vector<float> & pcmf32,
std::vector<std::vector<float>> & pcmf32s,
Expand Down
Loading