Skip to content

Commit 76df083

Browse files
author
litongmacos
committed
add m4a
1 parent 4e8e1dd commit 76df083

File tree

7 files changed

+166
-110
lines changed

7 files changed

+166
-110
lines changed

CMakeLists.txt

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ message(STATUS "SDL2 libraries: ${SDL2_LIBRARIES}")
2020
include_directories(${SDL2_INCLUDE_DIRS})
2121

2222
find_package(SampleRate CONFIG REQUIRED)
23-
23+
find_package(FFMPEG REQUIRED)
2424
# webrtc
2525
include_directories(webrtc)
2626
include_directories(.)
@@ -62,10 +62,15 @@ add_executable(stream_local examples/stream_local.cpp common/common.cpp common/c
6262
target_link_libraries(stream_local whisper ${SDL2_LIBRARIES} SampleRate::samplerate)
6363

6464
add_executable(whisper_http_server_base_httplib whisper_http_server_base_httplib.cpp
65-
common/common.cpp common/utils.cpp handler/inference_handler.cpp params/whisper_params.cpp)
66-
target_link_libraries(whisper_http_server_base_httplib whisper SampleRate::samplerate)
65+
common/common.cpp common/utils.cpp handler/inference_handler.cpp params/whisper_params.cpp
66+
common/common-m4a.cpp)
67+
target_include_directories(whisper_http_server_base_httplib PRIVATE ${FFMPEG_INCLUDE_DIRS})
68+
target_link_directories(whisper_http_server_base_httplib PRIVATE ${FFMPEG_LIBRARY_DIRS})
69+
target_link_libraries(whisper_http_server_base_httplib PRIVATE whisper SampleRate::samplerate ${FFMPEG_LIBRARIES})
70+
6771

68-
add_executable(whisper_server_base_on_uwebsockets whisper_server_base_on_uwebsockets.cpp common/common.cpp stream/stream_components_service.cpp common/utils.cpp)
72+
add_executable(whisper_server_base_on_uwebsockets whisper_server_base_on_uwebsockets.cpp common/common.cpp
73+
stream/stream_components_service.cpp common/utils.cpp)
6974
# add uWebSockets header files
7075
target_include_directories(whisper_server_base_on_uwebsockets PRIVATE ${UWEBSOCKETS_INCLUDE_DIRS})
7176
# link the uWebSockets, zlib, libuv and uSockets libs

common/common-m4a.cpp

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#include "common.h"
2+
#include "common-m4a.h"
3+
extern "C" {
4+
#include <libavformat/avformat.h>
5+
#include <libavcodec/avcodec.h>
6+
#include <libswresample/swresample.h>
7+
}
8+
9+
#include <vector>
10+
#include <string>
11+
12+
bool read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s,
13+
bool stereo) {
14+
avformat_network_init();
15+
16+
AVFormatContext *formatContext = avformat_alloc_context();
17+
if (avformat_open_input(&formatContext, fname.c_str(), nullptr, nullptr) != 0) {
18+
fprintf(stderr, "Could not open file %s\n", fname.c_str());
19+
return false;
20+
}
21+
22+
if (avformat_find_stream_info(formatContext, nullptr) < 0) {
23+
fprintf(stderr, "Could not find stream information\n");
24+
avformat_close_input(&formatContext);
25+
return false;
26+
}
27+
28+
const AVCodec *codec = nullptr;
29+
int streamIndex = av_find_best_stream(formatContext, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0);
30+
if (streamIndex < 0) {
31+
fprintf(stderr, "Could not find any audio stream in the file\n");
32+
avformat_close_input(&formatContext);
33+
return false;
34+
}
35+
36+
AVCodecContext *codecContext = avcodec_alloc_context3(codec);
37+
avcodec_parameters_to_context(codecContext, formatContext->streams[streamIndex]->codecpar);
38+
39+
if (avcodec_open2(codecContext, codec, nullptr) < 0) {
40+
fprintf(stderr, "Could not open codec\n");
41+
avcodec_free_context(&codecContext);
42+
avformat_close_input(&formatContext);
43+
return false;
44+
}
45+
46+
//bool need_resample = (codecContext->sample_rate != COMMON_SAMPLE_RATE);
47+
SwrContext *swrCtx = nullptr;
48+
swrCtx = swr_alloc_set_opts(nullptr,
49+
stereo ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO,
50+
AV_SAMPLE_FMT_FLT,
51+
COMMON_SAMPLE_RATE,
52+
codecContext->channel_layout,
53+
codecContext->sample_fmt,
54+
codecContext->sample_rate,
55+
0, nullptr);
56+
if (!swrCtx || swr_init(swrCtx) < 0) {
57+
fprintf(stderr, "Could not initialize the resampling context\n");
58+
swr_free(&swrCtx);
59+
avcodec_free_context(&codecContext);
60+
avformat_close_input(&formatContext);
61+
return false;
62+
}
63+
64+
65+
AVPacket packet;
66+
av_init_packet(&packet);
67+
packet.data = nullptr;
68+
packet.size = 0;
69+
70+
AVFrame *frame = av_frame_alloc();
71+
72+
while (av_read_frame(formatContext, &packet) >= 0) {
73+
if (packet.stream_index == streamIndex) {
74+
//decode
75+
int ret = avcodec_send_packet(codecContext, &packet);
76+
if (ret < 0) {
77+
fprintf(stderr, "Error sending packet for decoding\n");
78+
break;
79+
}
80+
81+
while (ret >= 0) {
82+
ret = avcodec_receive_frame(codecContext, frame);
83+
if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
84+
break;
85+
} else if (ret < 0) {
86+
fprintf(stderr, "Error during decoding\n");
87+
break;
88+
}
89+
90+
// Direct processing of decoded frames
91+
uint8_t *out_buf[2] = {nullptr, nullptr};
92+
int out_channels = stereo ? 2 : 1;
93+
int out_samples = av_rescale_rnd(swr_get_delay(swrCtx, codecContext->sample_rate) + frame->nb_samples,
94+
COMMON_SAMPLE_RATE, codecContext->sample_rate, AV_ROUND_UP);
95+
av_samples_alloc(out_buf, nullptr, out_channels, out_samples, AV_SAMPLE_FMT_FLT, 0);
96+
swr_convert(swrCtx, out_buf, out_samples, (const uint8_t **) frame->data, frame->nb_samples);
97+
98+
int data_size = av_samples_get_buffer_size(nullptr, out_channels, out_samples, AV_SAMPLE_FMT_FLT, 0);
99+
for (int i = 0; i < data_size / sizeof(float); ++i) {
100+
pcmf32.push_back(((float *) out_buf[0])[i]);
101+
if (stereo && out_buf[1] != nullptr) {
102+
pcmf32s[0].push_back(((float *) out_buf[0])[i]);
103+
pcmf32s[1].push_back(((float *) out_buf[1])[i]);
104+
}
105+
}
106+
107+
if (out_buf[0]) {
108+
av_freep(&out_buf[0]);
109+
}
110+
if (stereo && out_buf[1]) {
111+
av_freep(&out_buf[1]);
112+
}
113+
114+
av_frame_unref(frame);
115+
}
116+
av_packet_unref(&packet);
117+
}
118+
av_packet_unref(&packet);
119+
}
120+
121+
// Clean up
122+
av_frame_free(&frame);
123+
swr_free(&swrCtx);
124+
avcodec_free_context(&codecContext);
125+
avformat_close_input(&formatContext);
126+
avformat_network_deinit();
127+
128+
return true;
129+
}

common/common-m4a.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#ifndef WHISPER_CPP_SERVER_COMMON_M4A_H
2+
#define WHISPER_CPP_SERVER_COMMON_M4A_H
3+
#include <vector>
4+
#include <string>
5+
bool read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s,
6+
bool stereo);
7+
#endif //WHISPER_CPP_SERVER_COMMON_M4A_H

common/common.cpp

Lines changed: 0 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -792,89 +792,6 @@ bool read_mp3(const std::string &fname, std::vector<float> &pcmf32, bool stereo)
792792
return true;
793793
}
794794

795-
bool
796-
read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo) {
797-
drwav wav;
798-
std::vector<uint8_t> wav_data; // used for pipe input from stdin
799-
800-
if (fname == "-") {
801-
{
802-
uint8_t buf[1024];
803-
while (true) {
804-
const size_t n = fread(buf, 1, sizeof(buf), stdin);
805-
if (n == 0) {
806-
break;
807-
}
808-
wav_data.insert(wav_data.end(), buf, buf + n);
809-
}
810-
}
811-
812-
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
813-
fprintf(stderr, "error: failed to open WAV file from stdin\n");
814-
return false;
815-
}
816-
817-
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
818-
} else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
819-
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
820-
return false;
821-
}
822-
823-
if (wav.channels != 1 && wav.channels != 2) {
824-
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
825-
return false;
826-
}
827-
828-
if (stereo && wav.channels != 2) {
829-
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
830-
return false;
831-
}
832-
833-
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
834-
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE / 1000);
835-
return false;
836-
}
837-
838-
if (wav.bitsPerSample != 16) {
839-
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
840-
return false;
841-
}
842-
843-
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size() /
844-
(wav.channels * wav.bitsPerSample / 8);
845-
846-
std::vector<int16_t> pcm16;
847-
pcm16.resize(n * wav.channels);
848-
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
849-
drwav_uninit(&wav);
850-
851-
// convert to mono, float
852-
pcmf32.resize(n);
853-
if (wav.channels == 1) {
854-
for (uint64_t i = 0; i < n; i++) {
855-
pcmf32[i] = float(pcm16[i]) / 32768.0f;
856-
}
857-
} else {
858-
for (uint64_t i = 0; i < n; i++) {
859-
pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f;
860-
}
861-
}
862-
863-
if (stereo) {
864-
// convert to stereo, float
865-
pcmf32s.resize(2);
866-
867-
pcmf32s[0].resize(n);
868-
pcmf32s[1].resize(n);
869-
for (uint64_t i = 0; i < n; i++) {
870-
pcmf32s[0][i] = float(pcm16[2 * i]) / 32768.0f;
871-
pcmf32s[1][i] = float(pcm16[2 * i + 1]) / 32768.0f;
872-
}
873-
}
874-
875-
return true;
876-
}
877-
878795
void high_pass_filter(std::vector<float> &data, float cutoff, float sample_rate) {
879796
const float rc = 1.0f / (2.0f * M_PI * cutoff);
880797
const float dt = 1.0f / sample_rate;

common/common.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,6 @@ bool read_wav(
146146
std::vector<std::vector<float>> &pcmf32s,
147147
bool stereo);
148148
bool read_mp3(const std::string &fname, std::vector<float> &pcmf32, bool stereo);
149-
bool
150-
read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo);
151149
// Write PCM data into WAV audio file
152150
class wav_writer {
153151
private:

handler/inference_handler.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "../params/whisper_params.h"
55
#include "../nlohmann/json.hpp"
66
#include "../common/utils.h"
7+
#include "common/common-m4a.h"
78

89
using json = nlohmann::json;
910

@@ -203,7 +204,7 @@ bool read_audio_file(std::string audio_format, std::string filename, std::vector
203204
return false;
204205
}
205206
} else if (audio_format == "m4a") {
206-
if (!::read_m4a(filename, pcmf32, pcmf32s, diarize)) {
207+
if (!read_m4a(filename, pcmf32, pcmf32s, diarize)) {
207208
fprintf(stderr, "error: failed to read m4a file '%s'\n", filename.c_str());
208209
return false;
209210
}
@@ -372,7 +373,7 @@ void handleInference(const Request &request, Response &response, std::mutex &whi
372373
if (!isOK) {
373374
json json_obj = {
374375
{"code", -1},
375-
{"msg", "error: failed to read WAV file "}
376+
{"msg", "error: failed to read audio file "}
376377
};
377378
auto json_string = json_obj.dump(-1, ' ', false, json::error_handler_t::replace);
378379
response.set_content(json_string, "application/json");

vcpkg.json

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
11
{
2-
"name": "whisper-cpp-server",
3-
"version-string": "1.0.0",
4-
"builtin-baseline": "0c20b2a97c390e106150837042d921b0939e7ecb",
5-
"dependencies": [
6-
{
7-
"name": "sdl2",
8-
"version>=": "2.28.4#1",
9-
"$comment": " find_package(SDL2 CONFIG REQUIRED)\n\n target_link_libraries(main\n\n PRIVATE\n\n $<TARGET_NAME_IF_EXISTS:SDL2::SDL2main>\n\n $<IF:$<TARGET_EXISTS:SDL2::SDL2>,SDL2::SDL2,SDL2::SDL2-static>\n\n )\n"
10-
},
11-
{
12-
"name": "uwebsockets",
13-
"version>=": "20.47.0"
14-
},
15-
{
16-
"name": "libsamplerate",
17-
"version>=": "0.2.2#1",
18-
"$comment": " # this is heuristically generated, and may not be correct\n\n find_package(SampleRate CONFIG REQUIRED)\n\n target_link_libraries(main PRIVATE SampleRate::samplerate)\n"
19-
}
20-
]
2+
"name" : "whisper-cpp-server",
3+
"version-string" : "1.0.0",
4+
"builtin-baseline" : "0c20b2a97c390e106150837042d921b0939e7ecb",
5+
"dependencies" : [ {
6+
"name" : "sdl2",
7+
"version>=" : "2.28.4#1",
8+
"$comment" : " find_package(SDL2 CONFIG REQUIRED)\n\n target_link_libraries(main\n\n PRIVATE\n\n $<TARGET_NAME_IF_EXISTS:SDL2::SDL2main>\n\n $<IF:$<TARGET_EXISTS:SDL2::SDL2>,SDL2::SDL2,SDL2::SDL2-static>\n\n )\n"
9+
}, {
10+
"name" : "uwebsockets",
11+
"version>=" : "20.47.0"
12+
}, {
13+
"name" : "libsamplerate",
14+
"version>=" : "0.2.2#1",
15+
"$comment" : " # this is heuristically generated, and may not be correct\n\n find_package(SampleRate CONFIG REQUIRED)\n\n target_link_libraries(main PRIVATE SampleRate::samplerate)\n"
16+
}, {
17+
"name" : "ffmpeg",
18+
"version>=" : "6.1"
19+
} ]
2120
}

0 commit comments

Comments
 (0)