Add m4a audio decoding via FFmpeg

litongmacos · litongmacos · commit 76df0832ae17 · 2023-11-24T15:24:24.000-10:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -20,7 +20,7 @@ message(STATUS "SDL2 libraries: ${SDL2_LIBRARIES}")
 include_directories(${SDL2_INCLUDE_DIRS})
 
 find_package(SampleRate CONFIG REQUIRED)
-
+find_package(FFMPEG REQUIRED)
 # webrtc
 include_directories(webrtc)
 include_directories(.)
@@ -62,10 +62,15 @@ add_executable(stream_local examples/stream_local.cpp common/common.cpp common/c
 target_link_libraries(stream_local whisper ${SDL2_LIBRARIES} SampleRate::samplerate)
 
 add_executable(whisper_http_server_base_httplib whisper_http_server_base_httplib.cpp
-        common/common.cpp common/utils.cpp handler/inference_handler.cpp params/whisper_params.cpp)
-target_link_libraries(whisper_http_server_base_httplib whisper SampleRate::samplerate)
+        common/common.cpp common/utils.cpp handler/inference_handler.cpp params/whisper_params.cpp
+        common/common-m4a.cpp)
+target_include_directories(whisper_http_server_base_httplib PRIVATE ${FFMPEG_INCLUDE_DIRS})
+target_link_directories(whisper_http_server_base_httplib PRIVATE ${FFMPEG_LIBRARY_DIRS})
+target_link_libraries(whisper_http_server_base_httplib PRIVATE whisper SampleRate::samplerate ${FFMPEG_LIBRARIES})
+
 
-add_executable(whisper_server_base_on_uwebsockets whisper_server_base_on_uwebsockets.cpp common/common.cpp stream/stream_components_service.cpp common/utils.cpp)
+add_executable(whisper_server_base_on_uwebsockets whisper_server_base_on_uwebsockets.cpp common/common.cpp
+        stream/stream_components_service.cpp common/utils.cpp)
 #add uwebsockets head files
 target_include_directories(whisper_server_base_on_uwebsockets PRIVATE ${UWEBSOCKETS_INCLUDE_DIRS})
 # linked uWebSockets、zlib、libuv 和 uSockets libs
diff --git a/common/common-m4a.cpp b/common/common-m4a.cpp
@@ -0,0 +1,178 @@
+#include "common.h"
+#include "common-m4a.h"
+extern "C" {
+#include <libavformat/avformat.h>
+#include <libavcodec/avcodec.h>
+#include <libavutil/channel_layout.h>
+#include <libswresample/swresample.h>
+}
+
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace {
+
+// Deleters so that every FFmpeg object is released on all return paths.
+struct FormatCloser { void operator()(AVFormatContext *p) const { avformat_close_input(&p); } };
+struct CodecFreer { void operator()(AVCodecContext *p) const { avcodec_free_context(&p); } };
+struct SwrFreer { void operator()(SwrContext *p) const { swr_free(&p); } };
+struct FrameFreer { void operator()(AVFrame *p) const { av_frame_free(&p); } };
+struct PacketFreer { void operator()(AVPacket *p) const { av_packet_free(&p); } };
+
+}  // namespace
+
+// Decodes the best audio stream of `fname` with FFmpeg and resamples it to float PCM at
+// COMMON_SAMPLE_RATE.
+//
+//   fname   - input path handed to avformat_open_input
+//   pcmf32  - receives the mono signal; with `stereo` it is the average of left and right
+//   pcmf32s - with `stereo`, receives two vectors ([0] left, [1] right); untouched otherwise
+//   stereo  - also produce the per-channel output
+//
+// Both outputs are overwritten, not appended to. Returns false when the input, decoder or
+// resampler cannot be set up or when resampling fails. A packet-level decode error is logged and
+// does not change the return value, so the samples decoded so far are still returned.
+bool read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s,
+              bool stereo) {
+  pcmf32.clear();
+  if (stereo) {
+    pcmf32s.assign(2, std::vector<float>());
+  }
+
+  // avformat_open_input allocates the context and frees it again on failure.
+  AVFormatContext *rawFormat = nullptr;
+  if (avformat_open_input(&rawFormat, fname.c_str(), nullptr, nullptr) != 0) {
+    fprintf(stderr, "Could not open file %s\n", fname.c_str());
+    return false;
+  }
+  std::unique_ptr<AVFormatContext, FormatCloser> formatContext(rawFormat);
+
+  if (avformat_find_stream_info(formatContext.get(), nullptr) < 0) {
+    fprintf(stderr, "Could not find stream information\n");
+    return false;
+  }
+
+  const AVCodec *codec = nullptr;
+  const int streamIndex = av_find_best_stream(formatContext.get(), AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0);
+  if (streamIndex < 0) {
+    fprintf(stderr, "Could not find any audio stream in the file\n");
+    return false;
+  }
+
+  std::unique_ptr<AVCodecContext, CodecFreer> codecContext(avcodec_alloc_context3(codec));
+  if (!codecContext ||
+      avcodec_parameters_to_context(codecContext.get(), formatContext->streams[streamIndex]->codecpar) < 0 ||
+      avcodec_open2(codecContext.get(), codec, nullptr) < 0) {
+    fprintf(stderr, "Could not open codec\n");
+    return false;
+  }
+
+  const int inRate = codecContext->sample_rate;
+  const int outChannels = stereo ? 2 : 1;
+
+  // Some streams declare no channel layout; fall back to the default one for their channel count.
+  AVChannelLayout inLayout = {};
+  if (codecContext->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC) {
+    av_channel_layout_default(&inLayout, codecContext->ch_layout.nb_channels);
+  } else if (av_channel_layout_copy(&inLayout, &codecContext->ch_layout) < 0) {
+    fprintf(stderr, "Could not initialize the resampling context\n");
+    return false;
+  }
+  AVChannelLayout outLayout = {};
+  av_channel_layout_default(&outLayout, outChannels);
+
+  SwrContext *rawSwr = nullptr;
+  const int swrStatus = swr_alloc_set_opts2(&rawSwr, &outLayout, AV_SAMPLE_FMT_FLT, COMMON_SAMPLE_RATE,
+                                            &inLayout, codecContext->sample_fmt, inRate, 0, nullptr);
+  av_channel_layout_uninit(&inLayout);
+  av_channel_layout_uninit(&outLayout);
+  std::unique_ptr<SwrContext, SwrFreer> swrCtx(rawSwr);
+  if (swrStatus < 0 || !swrCtx || inRate <= 0 || swr_init(swrCtx.get()) < 0) {
+    fprintf(stderr, "Could not initialize the resampling context\n");
+    return false;
+  }
+
+  std::unique_ptr<AVPacket, PacketFreer> packet(av_packet_alloc());
+  std::unique_ptr<AVFrame, FrameFreer> frame(av_frame_alloc());
+  if (!packet || !frame) {
+    fprintf(stderr, "Could not allocate packet or frame\n");
+    return false;
+  }
+
+  std::vector<float> scratch;  // packed resampler output, reused for every frame
+
+  // Resamples `in` (nullptr drains what the resampler still buffers) and appends the result.
+  const auto resampleAndAppend = [&](const AVFrame *in) -> bool {
+    const int inSamples = in ? in->nb_samples : 0;
+    const int64_t capacity = av_rescale_rnd(swr_get_delay(swrCtx.get(), inRate) + inSamples,
+                                            COMMON_SAMPLE_RATE, inRate, AV_ROUND_UP);
+    if (capacity <= 0) {
+      return true;
+    }
+    scratch.resize(static_cast<size_t>(capacity) * outChannels);
+    uint8_t *outPlanes[1] = {reinterpret_cast<uint8_t *>(scratch.data())};
+    const uint8_t **inPlanes = in ? const_cast<const uint8_t **>(in->extended_data) : nullptr;
+    // Only the sample count swr_convert returns is valid; `capacity` is merely an upper bound.
+    const int produced = swr_convert(swrCtx.get(), outPlanes, static_cast<int>(capacity), inPlanes, inSamples);
+    if (produced < 0) {
+      fprintf(stderr, "Error while resampling\n");
+      return false;
+    }
+    if (!stereo) {
+      pcmf32.insert(pcmf32.end(), scratch.begin(), scratch.begin() + produced);
+      return true;
+    }
+    // AV_SAMPLE_FMT_FLT is packed (L0 R0 L1 R1 ...), so both channels live in plane 0.
+    for (int i = 0; i < produced; ++i) {
+      const float left = scratch[2 * i];
+      const float right = scratch[2 * i + 1];
+      pcmf32.push_back(0.5f * (left + right));
+      pcmf32s[0].push_back(left);
+      pcmf32s[1].push_back(right);
+    }
+    return true;
+  };
+
+  // Pulls every frame the decoder has ready; returns false only when resampling fails.
+  const auto drainDecoder = [&]() -> bool {
+    for (;;) {
+      const int ret = avcodec_receive_frame(codecContext.get(), frame.get());
+      if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
+        return true;
+      }
+      if (ret < 0) {
+        fprintf(stderr, "Error during decoding\n");
+        return true;
+      }
+      const bool converted = resampleAndAppend(frame.get());
+      av_frame_unref(frame.get());
+      if (!converted) {
+        return false;
+      }
+    }
+  };
+
+  while (av_read_frame(formatContext.get(), packet.get()) >= 0) {
+    bool sent = true;
+    if (packet->stream_index == streamIndex) {
+      sent = avcodec_send_packet(codecContext.get(), packet.get()) >= 0;
+      if (!sent) {
+        fprintf(stderr, "Error sending packet for decoding\n");
+      } else if (!drainDecoder()) {
+        return false;
+      }
+    }
+    av_packet_unref(packet.get());  // exactly once per packet
+    if (!sent) {
+      break;
+    }
+  }
+
+  // End of input: the decoder and the resampler both hold samples back until they are flushed.
+  if (avcodec_send_packet(codecContext.get(), nullptr) >= 0 && !drainDecoder()) {
+    return false;
+  }
+  return resampleAndAppend(nullptr);
+}
diff --git a/common/common-m4a.h b/common/common-m4a.h
@@ -0,0 +1,7 @@
+#ifndef WHISPER_CPP_SERVER_COMMON_M4A_H
+#define WHISPER_CPP_SERVER_COMMON_M4A_H
+#include <string>
+#include <vector>
+// Decodes the audio of `fname` with FFmpeg into float PCM at COMMON_SAMPLE_RATE; false if it cannot be opened.
+bool read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo);
+#endif //WHISPER_CPP_SERVER_COMMON_M4A_H
diff --git a/common/common.cpp b/common/common.cpp
@@ -792,89 +792,6 @@ bool read_mp3(const std::string &fname, std::vector<float> &pcmf32, bool stereo)
   return true;
 }
 
-bool
-read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo) {
-  drwav wav;
-  std::vector<uint8_t> wav_data; // used for pipe input from stdin
-
-  if (fname == "-") {
-    {
-      uint8_t buf[1024];
-      while (true) {
-        const size_t n = fread(buf, 1, sizeof(buf), stdin);
-        if (n == 0) {
-          break;
-        }
-        wav_data.insert(wav_data.end(), buf, buf + n);
-      }
-    }
-
-    if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-      fprintf(stderr, "error: failed to open WAV file from stdin\n");
-      return false;
-    }
-
-    fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-  } else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
-    fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
-    return false;
-  }
-
-  if (wav.channels != 1 && wav.channels != 2) {
-    fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
-    return false;
-  }
-
-  if (stereo && wav.channels != 2) {
-    fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
-    return false;
-  }
-
-  if (wav.sampleRate != COMMON_SAMPLE_RATE) {
-    fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE / 1000);
-    return false;
-  }
-
-  if (wav.bitsPerSample != 16) {
-    fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
-    return false;
-  }
-
-  const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size() /
-                                                                 (wav.channels * wav.bitsPerSample / 8);
-
-  std::vector<int16_t> pcm16;
-  pcm16.resize(n * wav.channels);
-  drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-  drwav_uninit(&wav);
-
-  // convert to mono, float
-  pcmf32.resize(n);
-  if (wav.channels == 1) {
-    for (uint64_t i = 0; i < n; i++) {
-      pcmf32[i] = float(pcm16[i]) / 32768.0f;
-    }
-  } else {
-    for (uint64_t i = 0; i < n; i++) {
-      pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f;
-    }
-  }
-
-  if (stereo) {
-    // convert to stereo, float
-    pcmf32s.resize(2);
-
-    pcmf32s[0].resize(n);
-    pcmf32s[1].resize(n);
-    for (uint64_t i = 0; i < n; i++) {
-      pcmf32s[0][i] = float(pcm16[2 * i]) / 32768.0f;
-      pcmf32s[1][i] = float(pcm16[2 * i + 1]) / 32768.0f;
-    }
-  }
-
-  return true;
-}
-
 void high_pass_filter(std::vector<float> &data, float cutoff, float sample_rate) {
   const float rc = 1.0f / (2.0f * M_PI * cutoff);
   const float dt = 1.0f / sample_rate;
diff --git a/common/common.h b/common/common.h
@@ -146,8 +146,6 @@ bool read_wav(
   std::vector<std::vector<float>> &pcmf32s,
   bool stereo);
 bool read_mp3(const std::string &fname, std::vector<float> &pcmf32, bool stereo);
-bool
-read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo);
 // Write PCM data into WAV audio file
 class wav_writer {
 private:
diff --git a/handler/inference_handler.cpp b/handler/inference_handler.cpp
@@ -4,6 +4,7 @@
 #include "../params/whisper_params.h"
 #include "../nlohmann/json.hpp"
 #include "../common/utils.h"
+#include "common/common-m4a.h"
 
 using json = nlohmann::json;
 
@@ -203,7 +204,7 @@ bool read_audio_file(std::string audio_format, std::string filename, std::vector
       return false;
     }
   } else if (audio_format == "m4a") {
-    if (!::read_m4a(filename, pcmf32, pcmf32s, diarize)) {
+    if (!read_m4a(filename, pcmf32, pcmf32s, diarize)) {
       fprintf(stderr, "error: failed to read m4a file '%s'\n", filename.c_str());
       return false;
     }
@@ -372,7 +373,7 @@ void handleInference(const Request &request, Response &response, std::mutex &whi
   if (!isOK) {
     json json_obj = {
       {"code", -1},
-      {"msg",  "error: failed to read WAV file "}
+      {"msg",  "error: failed to read audio file "}
     };
     auto json_string = json_obj.dump(-1, ' ', false, json::error_handler_t::replace);
     response.set_content(json_string, "application/json");
diff --git a/vcpkg.json b/vcpkg.json
@@ -1,21 +1,20 @@
 {
-  "name": "whisper-cpp-server",
-  "version-string": "1.0.0",
-  "builtin-baseline": "0c20b2a97c390e106150837042d921b0939e7ecb",
-  "dependencies": [
-    {
-      "name": "sdl2",
-      "version>=": "2.28.4#1",
-      "$comment": "    find_package(SDL2 CONFIG REQUIRED)\n\n    target_link_libraries(main\n\n        PRIVATE\n\n        $<TARGET_NAME_IF_EXISTS:SDL2::SDL2main>\n\n        $<IF:$<TARGET_EXISTS:SDL2::SDL2>,SDL2::SDL2,SDL2::SDL2-static>\n\n    )\n"
-    },
-    {
-      "name": "uwebsockets",
-      "version>=": "20.47.0"
-    },
-    {
-      "name": "libsamplerate",
-      "version>=": "0.2.2#1",
-      "$comment": "  # this is heuristically generated, and may not be correct\n\n  find_package(SampleRate CONFIG REQUIRED)\n\n  target_link_libraries(main PRIVATE SampleRate::samplerate)\n"
-    }
-  ]
+  "name" : "whisper-cpp-server",
+  "version-string" : "1.0.0",
+  "builtin-baseline" : "0c20b2a97c390e106150837042d921b0939e7ecb",
+  "dependencies" : [ {
+    "name" : "sdl2",
+    "version>=" : "2.28.4#1",
+    "$comment" : "    find_package(SDL2 CONFIG REQUIRED)\n\n    target_link_libraries(main\n\n        PRIVATE\n\n        $<TARGET_NAME_IF_EXISTS:SDL2::SDL2main>\n\n        $<IF:$<TARGET_EXISTS:SDL2::SDL2>,SDL2::SDL2,SDL2::SDL2-static>\n\n    )\n"
+  }, {
+    "name" : "uwebsockets",
+    "version>=" : "20.47.0"
+  }, {
+    "name" : "libsamplerate",
+    "version>=" : "0.2.2#1",
+    "$comment" : "  # this is heuristically generated, and may not be correct\n\n  find_package(SampleRate CONFIG REQUIRED)\n\n  target_link_libraries(main PRIVATE SampleRate::samplerate)\n"
+  }, {
+    "name" : "ffmpeg",
+    "version>=" : "6.1"
+  } ]
 }