add resample

litongmacos · litongmacos · commit 4e8e1dde9005 · 2023-11-24T12:12:51.000-10:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -19,6 +19,8 @@ message(STATUS "SDL2 libraries: ${SDL2_LIBRARIES}")
 
 include_directories(${SDL2_INCLUDE_DIRS})
 
+find_package(SampleRate CONFIG REQUIRED)
+
 # webrtc
 include_directories(webrtc)
 include_directories(.)
@@ -51,17 +53,17 @@ add_executable(sdl_version examples/sdl_version.cpp)
 target_link_libraries(sdl_version ${SDL2_LIBRARIES})
 
 add_executable(simplest examples/simplest.cpp common/common.cpp)
-target_link_libraries(simplest whisper)
+target_link_libraries(simplest whisper SampleRate::samplerate)
 
 add_executable(stream_local examples/stream_local.cpp common/common.cpp common/common-sdl.cpp
         stream/stream_components_service.cpp stream/stream_components_audio.cpp
         stream/stream_components_output.cpp
         )
-target_link_libraries(stream_local whisper ${SDL2_LIBRARIES})
+target_link_libraries(stream_local whisper ${SDL2_LIBRARIES} SampleRate::samplerate)
 
 add_executable(whisper_http_server_base_httplib whisper_http_server_base_httplib.cpp
         common/common.cpp common/utils.cpp handler/inference_handler.cpp params/whisper_params.cpp)
-target_link_libraries(whisper_http_server_base_httplib whisper)
+target_link_libraries(whisper_http_server_base_httplib whisper SampleRate::samplerate)
 
 add_executable(whisper_server_base_on_uwebsockets whisper_server_base_on_uwebsockets.cpp common/common.cpp stream/stream_components_service.cpp common/utils.cpp)
 #add uwebsockets head files
@@ -70,14 +72,16 @@ target_include_directories(whisper_server_base_on_uwebsockets PRIVATE ${UWEBSOCK
 # Detecting Operating Systems
 if (WIN32)
     # if Windows
-    target_link_libraries(whisper_server_base_on_uwebsockets PRIVATE whisper ZLIB::ZLIB libuv::uv ${USOCKETS_LIBRARY})
+    target_link_libraries(whisper_server_base_on_uwebsockets PRIVATE libuv::uv)
 elseif (APPLE)
     # if MacOS
-    target_link_libraries(whisper_server_base_on_uwebsockets PRIVATE whisper ZLIB::ZLIB libuv::uv_a ${USOCKETS_LIBRARY})
+    target_link_libraries(whisper_server_base_on_uwebsockets PRIVATE libuv::uv_a)
 else ()
     # if others eg. Linux
-    target_link_libraries(whisper_server_base_on_uwebsockets PRIVATE whisper ZLIB::ZLIB libuv::uv ${USOCKETS_LIBRARY})
+    target_link_libraries(whisper_server_base_on_uwebsockets PRIVATE libuv::uv)
 endif ()
 
+target_link_libraries(whisper_server_base_on_uwebsockets PRIVATE whisper ZLIB::ZLIB ${USOCKETS_LIBRARY} SampleRate::samplerate)
+
 
 
diff --git a/common/common.cpp b/common/common.cpp
@@ -8,15 +8,18 @@
 
 #include "../dr_libs/dr_wav.h"
 
+#define DR_MP3_IMPLEMENTATION
+
+#include "dr_libs/dr_mp3.h"
+#include <samplerate.h>
 #include <cmath>
 #include <cstring>
 #include <fstream>
 #include <regex>
 #include <locale>
 #include <codecvt>
 #include <sstream>
-#define DR_MP3_IMPLEMENTATION
-#include "dr_libs/dr_mp3.h"
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -631,6 +634,43 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
 
 }
 
+// Resample mono float PCM from inputSampleRate to outputSampleRate via libsamplerate.
+// input holds inputSize samples of a single channel. On success the converted samples
+// replace output and true is returned; on failure output is left untouched.
+bool resample(const float *input, size_t inputSampleRate, size_t inputSize,
+              std::vector<float> &output, size_t outputSampleRate) {
+  if (inputSampleRate == 0 || outputSampleRate == 0 || (input == nullptr && inputSize > 0)) {
+    fprintf(stderr, "%s: invalid input buffer or sample rate\n", __func__);
+    return false;
+  }
+  if (inputSize == 0) {
+    output.clear(); // nothing to convert
+    return true;
+  }
+
+  const double ratio = double(outputSampleRate) / double(inputSampleRate);
+  // Size the buffer from the ratio: upsampling produces more frames than it consumes,
+  // so an input-sized buffer would silently truncate the audio.
+  std::vector<float> converted(static_cast<size_t>(std::ceil(inputSize * ratio)) + 1);
+
+  SRC_DATA src_data = {}; // zero-initialised so no field is read uninitialised
+  src_data.data_in = input;
+  src_data.input_frames = static_cast<long>(inputSize);
+  src_data.data_out = converted.data();
+  src_data.output_frames = static_cast<long>(converted.size());
+  src_data.src_ratio = ratio;
+
+  // One call converts the whole buffer, so the one-shot API is enough (no converter state).
+  const int error = src_simple(&src_data, SRC_SINC_FASTEST, 1);
+  if (error != 0) {
+    fprintf(stderr, "Error converting sample rate: %s\n", src_strerror(error));
+    return false;
+  }
+
+  output.assign(converted.begin(), converted.begin() + src_data.output_frames_gen);
+  return true;
+}
+
 bool
 read_wav(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo) {
   drwav wav;
@@ -721,11 +761,6 @@ bool read_mp3(const std::string &fname, std::vector<float> &pcmf32, bool stereo)
     return false;
   }
 
-  if (mp3.sampleRate != COMMON_SAMPLE_RATE) {
-    fprintf(stderr, "%s: MP3 file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE / 1000);
-    return false;
-  }
-
   if (mp3.channels != 1 && mp3.channels != 2) {
     fprintf(stderr, "%s: MP3 file '%s' must be mono or stereo\n", __func__, fname.c_str());
     return false;
@@ -739,7 +774,19 @@ bool read_mp3(const std::string &fname, std::vector<float> &pcmf32, bool stereo)
   drmp3_uint64 frameCount;
   float *pSampleData = drmp3__full_read_and_close_f32(&mp3, nullptr, &frameCount);
 
-  pcmf32.assign(pSampleData, pSampleData + frameCount * mp3.channels);
+  if (mp3.sampleRate != COMMON_SAMPLE_RATE) {
+    std::vector<float> resampledData;
+    if (!resample(pSampleData, mp3.sampleRate, frameCount * mp3.channels, resampledData, COMMON_SAMPLE_RATE)) {
+      fprintf(stderr, "error: failed to resample MP3 data\n");
+      drmp3_free(pSampleData, nullptr);
+      return false;
+    }
+
+    pcmf32.swap(resampledData); // 使用转换后的数据
+
+  } else {
+    pcmf32.assign(pSampleData, pSampleData + frameCount * mp3.channels);
+  }
   drmp3_free(pSampleData, nullptr);
 
   return true;
@@ -827,6 +874,7 @@ read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::
 
   return true;
 }
+
 void high_pass_filter(std::vector<float> &data, float cutoff, float sample_rate) {
   const float rc = 1.0f / (2.0f * M_PI * cutoff);
   const float dt = 1.0f / sample_rate;
diff --git a/common/common.h b/common/common.h
@@ -42,6 +42,7 @@ struct gpt_params {
   int32_t interactive_port = -1;
 };
 
+
 bool gpt_params_parse(int argc, char **argv, gpt_params &params);
 
 void gpt_print_usage(int argc, char **argv, const gpt_params &params);
@@ -134,7 +135,8 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
 //
 // Audio utils
 //
-
+bool resample(const float *input, size_t inputSampleRate, size_t inputSize,
+              std::vector<float> &output, size_t outputSampleRate);
 // Read WAV audio file and store the PCM data into pcmf32
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
diff --git a/common/utils.cpp b/common/utils.cpp
@@ -28,6 +28,23 @@ long get_current_time_millis(){
   return std::chrono::duration_cast<std::chrono::milliseconds>(start.time_since_epoch()).count();
 }
 
+// Formats t (scaled by 10 to get milliseconds) as HH:MM:SS.mmm, or HH:MM:SS,mmm
+// when comma is true. E.g. 500 -> 00:00:05.000, 6000 -> 00:01:00.000
+std::string to_timestamp(int64_t t, bool comma) {
+  int64_t msec = t * 10; // total milliseconds; each step below peels off one unit
+  int64_t hr = msec / (1000 * 60 * 60);
+  msec = msec - hr * (1000 * 60 * 60);
+  int64_t min = msec / (1000 * 60);
+  msec = msec - min * (1000 * 60);
+  int64_t sec = msec / 1000;
+  msec = msec - sec * 1000; // what remains is the sub-second part
+
+  char buf[32];
+  snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
+
+  return std::string(buf);
+}
+
 nlohmann::json get_result(whisper_context *ctx) {
   nlohmann::json results = nlohmann::json(nlohmann::json::array());
   const int n_segments = whisper_full_n_segments(ctx);
@@ -37,12 +54,11 @@ nlohmann::json get_result(whisper_context *ctx) {
     int64_t t1 = whisper_full_get_segment_t1(ctx, i);
     const char *sentence = whisper_full_get_segment_text(ctx, i);
     auto result = std::to_string(t0) + "-->" + std::to_string(t1) + ":" + sentence + "\n";
-    printf("%s: result:%s\n", get_current_time().c_str(), result.c_str());
-    segment["t0"] = t0;
-    segment["t1"] = t1;
+    //printf("%s: result:%s\n", get_current_time().c_str(), result.c_str());
+    segment["t0"] = to_timestamp(t0);
+    segment["t1"] = to_timestamp(t1);
     segment["sentence"] = sentence;
     results.push_back(segment);
   }
   return results;
 }
-
diff --git a/common/utils.h b/common/utils.h
@@ -6,4 +6,5 @@
 
 std::string get_current_time();
 long get_current_time_millis();
+std::string to_timestamp(int64_t t, bool comma = false);
 nlohmann::json get_result(whisper_context *ctx);
diff --git a/examples/simplest.cpp b/examples/simplest.cpp
@@ -1,6 +1,7 @@
 #include "../common/common.h"
 
 #include "whisper.h"
+#include "common/utils.h"
 
 #include <cmath>
 #include <cstdio>
@@ -20,23 +21,6 @@ const std::vector<std::string> k_colors = {
   "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
 };
 
-//  500 -> 00:05.000
-// 6000 -> 01:00.000
-std::string to_timestamp(int64_t t, bool comma = false) {
-  int64_t msec = t * 10;
-  int64_t hr = msec / (1000 * 60 * 60);
-  msec = msec - hr * (1000 * 60 * 60);
-  int64_t min = msec / (1000 * 60);
-  msec = msec - min * (1000 * 60);
-  int64_t sec = msec / 1000;
-  msec = msec - sec * 1000;
-
-  char buf[32];
-  snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-
-  return std::string(buf);
-}
-
 int timestamp_to_sample(int64_t t, int n_samples) {
   return std::max(0, std::min((int) n_samples - 1, (int) ((t * WHISPER_SAMPLE_RATE) / 100)));
 }
diff --git a/handler/inference_handler.cpp b/handler/inference_handler.cpp
@@ -23,23 +23,6 @@ const std::vector<std::string> k_colors = {
 };
 
 
-//  500 -> 00:05.000
-// 6000 -> 01:00.000
-std::string to_timestamp(int64_t t, bool comma = false) {
-  int64_t msec = t * 10;
-  int64_t hr = msec / (1000 * 60 * 60);
-  msec = msec - hr * (1000 * 60 * 60);
-  int64_t min = msec / (1000 * 60);
-  msec = msec - min * (1000 * 60);
-  int64_t sec = msec / 1000;
-  msec = msec - sec * 1000;
-
-  char buf[32];
-  snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-
-  return std::string(buf);
-}
-
 int timestamp_to_sample(int64_t t, int n_samples) {
   return std::max(0, std::min((int) n_samples - 1, (int) ((t * WHISPER_SAMPLE_RATE) / 100)));
 }
@@ -202,20 +185,20 @@ void getReqParameters(const Request &req, whisper_params &params) {
   if (req.has_file("temerature")) {
     params.userdef_temp = std::stof(req.get_file_value("temperature").content);
   }
-  if(req.has_file("audio_format")){
-    params.audio_format=req.get_file_value("audio_format").content;
+  if (req.has_file("audio_format")) {
+    params.audio_format = req.get_file_value("audio_format").content;
   }
 }
 
 
 void getReqParameters(const Request &request, whisper_params &params);
 
-bool read_audio_file(std::string audio_format, std::string filename, std::vector<float> & pcmf32,
-                     std::vector<std::vector<float>> & pcmf32s, bool diarize) {
+bool read_audio_file(std::string audio_format, std::string filename, std::vector<float> &pcmf32,
+                     std::vector<std::vector<float>> &pcmf32s, bool diarize) {
 
   // read audio content into pcmf32
   if (audio_format == "mp3") {
-    if (!::read_mp3(filename, pcmf32,diarize)) {
+    if (!::read_mp3(filename, pcmf32, diarize)) {
       fprintf(stderr, "error: failed to read mp3 file '%s'\n", filename.c_str());
       return false;
     }
@@ -234,7 +217,7 @@ bool read_audio_file(std::string audio_format, std::string filename, std::vector
 }
 
 bool run(std::mutex &whisper_mutex, whisper_params &params, whisper_context *ctx, std::string filename,
-         const std::vector<std::vector<float>>& pcmf32s, std::vector<float> pcmf32) {
+         const std::vector<std::vector<float>> &pcmf32s, std::vector<float> pcmf32) {
   // print system information
   {
     fprintf(stderr, "\n");
@@ -363,19 +346,19 @@ void handleInference(const Request &request, Response &response, std::mutex &whi
   if (!request.has_file("file")) {
     fprintf(stderr, "error: no 'file' field in the request\n");
     json jres = json{
-      {"code",-1},
-      {"msg", "no 'file' field in the request"}
+      {"code", -1},
+      {"msg",  "no 'file' field in the request"}
     };
-    auto json_string  = jres.dump(-1, ' ', false,json::error_handler_t::replace);
-    response.set_content(json_string,"application/json");
+    auto json_string = jres.dump(-1, ' ', false, json::error_handler_t::replace);
+    response.set_content(json_string, "application/json");
     return;
   }
   auto audio_file = request.get_file_value("file");
   std::string filename{audio_file.filename};
-  printf("%s: Received filename: %s \n",get_current_time().c_str(),filename.c_str());
+  printf("%s: Received filename: %s \n", get_current_time().c_str(), filename.c_str());
   // check non-required fields
   getReqParameters(request, params);
-  printf("%s: audio_format:%s \n",get_current_time().c_str(),params.audio_format.c_str());
+  printf("%s: audio_format:%s \n", get_current_time().c_str(), params.audio_format.c_str());
 
   // audio arrays
   std::vector<float> pcmf32;               // mono-channel F32 PCM
@@ -385,13 +368,13 @@ void handleInference(const Request &request, Response &response, std::mutex &whi
   std::ofstream temp_file{filename, std::ios::binary};
   temp_file << audio_file.content;
 
-  bool isOK=read_audio_file(params.audio_format,filename,pcmf32,pcmf32s,params.diarize);
-  if(!isOK){
-    json json_obj={
-      {"code",-1},
-      {"msg","error: failed to read WAV file "}
+  bool isOK = read_audio_file(params.audio_format, filename, pcmf32, pcmf32s, params.diarize);
+  if (!isOK) {
+    json json_obj = {
+      {"code", -1},
+      {"msg",  "error: failed to read WAV file "}
     };
-    auto json_string  = json_obj.dump(-1, ' ', false,json::error_handler_t::replace);
+    auto json_string = json_obj.dump(-1, ' ', false, json::error_handler_t::replace);
     response.set_content(json_string, "application/json");
     return;
   }
@@ -401,8 +384,8 @@ void handleInference(const Request &request, Response &response, std::mutex &whi
 
   printf("Successfully loaded %s\n", filename.c_str());
 
-  bool isOk= run(whisper_mutex, params, ctx, filename, pcmf32s, pcmf32);
-  if(isOk){
+  bool isOk = run(whisper_mutex, params, ctx, filename, pcmf32s, pcmf32);
+  if (isOk) {
     // return results to user
     if (params.response_format == text_format) {
       std::string results = output_str(ctx, params, pcmf32s);
@@ -412,18 +395,18 @@ void handleInference(const Request &request, Response &response, std::mutex &whi
     else {
       auto results = get_result(ctx);
       json jres = json{
-        {"code",0},
+        {"code", 0},
         {"data", results}
       };
       response.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
                            "application/json");
     }
-  }else{
+  } else {
     json jres = json{
-      {"code",-1},
-      {"msg", "run error"}
+      {"code", -1},
+      {"msg",  "run error"}
     };
-    auto json_string  = jres.dump(-1, ' ', false,json::error_handler_t::replace);
-    response.set_content(json_string,"application/json");
+    auto json_string = jres.dump(-1, ' ', false, json::error_handler_t::replace);
+    response.set_content(json_string, "application/json");
   }
 }
diff --git a/vcpkg.json b/vcpkg.json
diff --git a/whisper_http_server_base_httplib.cpp b/whisper_http_server_base_httplib.cpp
diff --git a/whisper_server_base_on_uwebsockets.cpp b/whisper_server_base_on_uwebsockets.cpp