4
4
5
5
// third-party utilities
6
6
// use your favorite implementations
7
- #define DR_WAV_IMPLEMENTATION
8
- #include " dr_wav.h"
7
+ #define STB_VORBIS_HEADER_ONLY
8
+ #include " stb_vorbis.c" /* Enables Vorbis decoding. */
9
+
10
+ #define MA_NO_DEVICE_IO
11
+ #define MA_NO_THREADING
12
+ #define MA_NO_ENCODING
13
+ #define MA_NO_GENERATION
14
+ #define MA_NO_RESOURCE_MANAGER
15
+ #define MA_NO_NODE_GRAPH
16
+ #define MINIAUDIO_IMPLEMENTATION
17
+ #include " miniaudio.h"
9
18
10
19
#include < cmath>
11
20
#include < cstring>
@@ -639,9 +648,14 @@ bool is_wav_buffer(const std::string buf) {
639
648
return true ;
640
649
}
641
650
642
- bool read_wav (const std::string & fname, std::vector<float >& pcmf32, std::vector<std::vector<float >>& pcmf32s, bool stereo) {
643
- drwav wav;
644
- std::vector<uint8_t > wav_data; // used for pipe input from stdin or ffmpeg decoding output
651
+ bool read_audio_data (const std::string & fname, std::vector<float >& pcmf32, std::vector<std::vector<float >>& pcmf32s, bool stereo) {
652
+ std::vector<uint8_t > audio_data; // used for pipe input from stdin or ffmpeg decoding output
653
+
654
+ ma_result result;
655
+ ma_decoder_config decoder_config;
656
+ ma_decoder decoder;
657
+
658
+ decoder_config = ma_decoder_config_init (ma_format_f32, stereo ? 2 : 1 , COMMON_SAMPLE_RATE);
645
659
646
660
if (fname == " -" ) {
647
661
{
@@ -656,94 +670,78 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
656
670
if (n == 0 ) {
657
671
break ;
658
672
}
659
- wav_data .insert (wav_data .end (), buf, buf + n);
673
+ audio_data .insert (audio_data .end (), buf, buf + n);
660
674
}
661
675
}
662
676
663
- if (drwav_init_memory (&wav, wav_data.data (), wav_data.size (), nullptr ) == false ) {
664
- fprintf (stderr, " error: failed to open WAV file from stdin\n " );
665
- return false ;
666
- }
677
+ if ((result = ma_decoder_init_memory (audio_data.data (), audio_data.size (), &decoder_config, &decoder)) != MA_SUCCESS) {
678
+
679
+ fprintf (stderr, " Error: failed to open audio data from stdin (%s)\n " , ma_result_description (result));
667
680
668
- fprintf (stderr, " %s: read %zu bytes from stdin\n " , __func__, wav_data.size ());
681
+ return false ;
682
+ }
683
+
684
+ fprintf (stderr, " %s: read %zu bytes from stdin\n " , __func__, audio_data.size ());
669
685
}
670
686
else if (is_wav_buffer (fname)) {
671
- if (drwav_init_memory (&wav, fname.c_str (), fname.size (), nullptr ) == false ) {
672
- fprintf (stderr, " error: failed to open WAV file from fname buffer\n " );
673
- return false ;
674
- }
687
+ // In this branch the encoded audio bytes are carried inside `fname` itself; `audio_data` is only filled by the stdin / ffmpeg paths and is still empty here.
+ if ((result = ma_decoder_init_memory (fname.c_str (), fname.size (), &decoder_config, &decoder)) != MA_SUCCESS) {
688
+
689
+ fprintf (stderr, " Error: failed to open audio data from fname buffer (%s)\n " , ma_result_description (result));
690
+
691
+ return false ;
692
+ }
675
693
}
676
- else if (drwav_init_file (&wav, fname.c_str (), nullptr ) == false ) {
694
+ else if ((result = ma_decoder_init_file ( fname.c_str (), &decoder_config, &decoder)) != MA_SUCCESS ) {
677
695
#if defined(WHISPER_FFMPEG)
678
- if (ffmpeg_decode_audio (fname, wav_data) != 0 ) {
679
- fprintf (stderr, " error: failed to ffmpeg decode '%s' \n " , fname.c_str ());
680
- return false ;
681
- }
682
- if (drwav_init_memory (&wav, wav_data.data (), wav_data.size (), nullptr ) == false ) {
683
- fprintf (stderr, " error: failed to read wav data as wav \n " );
684
- return false ;
685
- }
696
+ if (ffmpeg_decode_audio (fname, audio_data) != 0 ) {
697
+
698
+ fprintf (stderr, " error: failed to ffmpeg decode '%s'\n " , fname.c_str ());
699
+
700
+ return false ;
701
+ }
702
+ if ((result = ma_decoder_init_memory (audio_data.data (), audio_data.size (), &decoder_config, &decoder)) != MA_SUCCESS) {
703
+
704
+ fprintf (stderr, " error: failed to read audio data as wav (%s)\n " , ma_result_description (result));
705
+
706
+ return false ;
707
+ }
686
708
#else
687
- fprintf (stderr, " error: failed to open '%s' as WAV file\n " , fname.c_str ());
688
- return false ;
709
+ fprintf (stderr, " error: failed to open '%s' file (%s)\n " , fname.c_str (), ma_result_description (result));
710
+
711
+ return false ;
689
712
#endif
690
713
}
691
714
692
- if (wav.channels != 1 && wav.channels != 2 ) {
693
- fprintf (stderr, " %s: WAV file '%s' must be mono or stereo\n " , __func__, fname.c_str ());
694
- drwav_uninit (&wav);
695
- return false ;
696
- }
715
+ ma_uint64 frame_count;
716
+ ma_uint64 frames_read;
697
717
698
- if (stereo && wav.channels != 2 ) {
699
- fprintf (stderr, " %s: WAV file '%s' must be stereo for diarization\n " , __func__, fname.c_str ());
700
- drwav_uninit (&wav);
701
- return false ;
702
- }
718
+ if ((result = ma_decoder_get_length_in_pcm_frames (&decoder, &frame_count)) != MA_SUCCESS) {
703
719
704
- if (wav.sampleRate != COMMON_SAMPLE_RATE) {
705
- fprintf (stderr, " %s: WAV file '%s' must be %i kHz\n " , __func__, fname.c_str (), COMMON_SAMPLE_RATE/1000 );
706
- drwav_uninit (&wav);
707
- return false ;
708
- }
720
+ fprintf (stderr, " error: failed to retrieve the length of the audio data (%s)\n " , ma_result_description (result));
709
721
710
- if (wav.bitsPerSample != 16 ) {
711
- fprintf (stderr, " %s: WAV file '%s' must be 16-bit\n " , __func__, fname.c_str ());
712
- drwav_uninit (&wav);
713
- return false ;
722
+ ma_decoder_uninit (&decoder); // the decoder was initialised successfully above: release it before bailing out
+ return false ;
714
723
}
715
724
716
- const uint64_t n = wav_data. empty () ? wav. totalPCMFrameCount : wav_data. size ()/(wav. channels *wav. bitsPerSample / 8 );
725
+ pcmf32. resize (stereo ? frame_count* 2 : frame_count );
717
726
718
- std::vector<int16_t > pcm16;
719
- pcm16.resize (n*wav.channels );
720
- drwav_read_pcm_frames_s16 (&wav, n, pcm16.data ());
721
- drwav_uninit (&wav);
727
+ if ((result = ma_decoder_read_pcm_frames (&decoder, pcmf32.data (), frame_count, &frames_read)) != MA_SUCCESS) {
722
728
723
- // convert to mono, float
724
- pcmf32.resize (n);
725
- if (wav.channels == 1 ) {
726
- for (uint64_t i = 0 ; i < n; i++) {
727
- pcmf32[i] = float (pcm16[i])/32768 .0f ;
728
- }
729
- } else {
730
- for (uint64_t i = 0 ; i < n; i++) {
731
- pcmf32[i] = float (pcm16[2 *i] + pcm16[2 *i + 1 ])/65536 .0f ;
732
- }
729
+ fprintf (stderr, " error: failed to read the frames of the audio data (%s)\n " , ma_result_description (result));
730
+
731
+ ma_decoder_uninit (&decoder); // the decoder was initialised successfully above: release it before bailing out
+ return false ;
733
732
}
734
733
735
734
if (stereo) {
736
- // convert to stereo, float
737
- pcmf32s.resize (2 );
738
-
739
- pcmf32s[0 ].resize (n);
740
- pcmf32s[1 ].resize (n);
741
- for (uint64_t i = 0 ; i < n; i++) {
742
- pcmf32s[0 ][i] = float (pcm16[2 *i])/32768 .0f ;
743
- pcmf32s[1 ][i] = float (pcm16[2 *i + 1 ])/32768 .0f ;
744
- }
735
+ // The output container may arrive empty: create both channel slots before indexing them.
+ pcmf32s.resize (2);
+ pcmf32s[0 ].resize (frame_count);
736
+ pcmf32s[1 ].resize (frame_count);
737
+ for (uint64_t i = 0 ; i < frame_count; i++) {
738
+ pcmf32s[0 ][i] = pcmf32[2 *i];
739
+ pcmf32s[1 ][i] = pcmf32[2 *i + 1 ];
740
+ }
745
741
}
746
742
743
+ ma_decoder_uninit (&decoder);
744
+
747
745
return true ;
748
746
}
749
747
@@ -909,3 +907,6 @@ bool speak_with_file(const std::string & command, const std::string & text, cons
909
907
}
910
908
return true ;
911
909
}
910
+
911
+ #undef STB_VORBIS_HEADER_ONLY
912
+ #include " stb_vorbis.c"
0 commit comments