Add read_mp3/read_m4a readers and split handleInference into read_audio_file() and run()

litongmacos · litongmacos · commit 7a2daa10ff07 · 2023-11-24T10:37:27.000-10:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -59,7 +59,8 @@ add_executable(stream_local examples/stream_local.cpp common/common.cpp common/c
         )
 target_link_libraries(stream_local whisper ${SDL2_LIBRARIES})
 
-add_executable(whisper_http_server_base_httplib whisper_http_server_base_httplib.cpp common/common.cpp httplib/httplib.h nlohmann/json.hpp handler/inference_handler.cpp params/whisper_params.cpp)
+add_executable(whisper_http_server_base_httplib whisper_http_server_base_httplib.cpp
+        common/common.cpp common/utils.cpp handler/inference_handler.cpp params/whisper_params.cpp)
 target_link_libraries(whisper_http_server_base_httplib whisper)
 
 add_executable(whisper_server_base_on_uwebsockets whisper_server_base_on_uwebsockets.cpp common/common.cpp stream/stream_components_service.cpp common/utils.cpp)
diff --git a/common/common.cpp b/common/common.cpp
@@ -713,6 +713,93 @@ read_wav(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::
   return true;
 }
 
+bool
+read_mp3(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo) {
+  return false; // MP3 decoding is not implemented yet: always fail and leave the outputs untouched (the empty body flowed off the end of a non-void function, which is undefined behavior)
+}
+// Reads audio from `fname` (stdin when fname is "-") through dr_wav: fills pcmf32 with mono float samples and, when `stereo`, pcmf32s with one vector per channel; returns false when the input cannot be opened or is not mono/stereo, COMMON_SAMPLE_RATE, 16-bit.
+bool
+read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo) {
+  drwav wav;
+  std::vector<uint8_t> wav_data; // used for pipe input from stdin
+  // NOTE(review): despite the name, this body only calls drwav_* and reports "WAV" errors; no M4A decoding is visible here - presumably a placeholder copied from read_wav, confirm
+  if (fname == "-") {
+    {
+      uint8_t buf[1024];
+      while (true) {
+        const size_t n = fread(buf, 1, sizeof(buf), stdin);
+        if (n == 0) {
+          break;
+        }
+        wav_data.insert(wav_data.end(), buf, buf + n);
+      }
+    }
+    // parse the collected bytes from memory instead of from a file
+    if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
+      fprintf(stderr, "error: failed to open WAV file from stdin\n");
+      return false;
+    }
+
+    fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
+  } else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
+    fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
+    return false;
+  }
+  // NOTE(review): the validation failures below return without calling drwav_uninit(&wav) - presumably leaks the handle opened above; verify against dr_wav
+  if (wav.channels != 1 && wav.channels != 2) {
+    fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
+    return false;
+  }
+
+  if (stereo && wav.channels != 2) {
+    fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
+    return false;
+  }
+
+  if (wav.sampleRate != COMMON_SAMPLE_RATE) {
+    fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE / 1000);
+    return false;
+  }
+
+  if (wav.bitsPerSample != 16) {
+    fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
+    return false;
+  }
+  // frame count: header value for file input; for stdin it is derived from the total number of bytes read (NOTE(review): wav_data holds everything read from stdin, so this may overestimate - confirm)
+  const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size() /
+                                                                 (wav.channels * wav.bitsPerSample / 8);
+
+  std::vector<int16_t> pcm16;
+  pcm16.resize(n * wav.channels);
+  drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); // the return value is ignored
+  drwav_uninit(&wav);
+  // NOTE(review): wav.channels is still read below after drwav_uninit - presumably safe because it is a plain field, verify
+  // convert to mono, float
+  pcmf32.resize(n);
+  if (wav.channels == 1) {
+    for (uint64_t i = 0; i < n; i++) {
+      pcmf32[i] = float(pcm16[i]) / 32768.0f; // scale int16 into [-1, 1)
+    }
+  } else {
+    for (uint64_t i = 0; i < n; i++) {
+      pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; // average of the two channel samples: (a + b) / 2 / 32768
+    }
+  }
+
+  if (stereo) {
+    // convert to stereo, float: even indices of pcm16 go to pcmf32s[0], odd indices to pcmf32s[1]
+    pcmf32s.resize(2);
+
+    pcmf32s[0].resize(n);
+    pcmf32s[1].resize(n);
+    for (uint64_t i = 0; i < n; i++) {
+      pcmf32s[0][i] = float(pcm16[2 * i]) / 32768.0f;
+      pcmf32s[1][i] = float(pcm16[2 * i + 1]) / 32768.0f;
+    }
+  }
+
+  return true;
+}
 void high_pass_filter(std::vector<float> &data, float cutoff, float sample_rate) {
   const float rc = 1.0f / (2.0f * M_PI * cutoff);
   const float dt = 1.0f / sample_rate;
diff --git a/common/common.h b/common/common.h
@@ -143,7 +143,10 @@ bool read_wav(
   std::vector<float> &pcmf32,
   std::vector<std::vector<float>> &pcmf32s,
   bool stereo);
-
+bool
+read_mp3(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo);
+bool
+read_m4a(const std::string &fname, std::vector<float> &pcmf32, std::vector<std::vector<float>> &pcmf32s, bool stereo);
 // Write PCM data into WAV audio file
 class wav_writer {
 private:
diff --git a/handler/inference_handler.cpp b/handler/inference_handler.cpp
@@ -3,7 +3,7 @@
 #include "../common/common.h"
 #include "../params/whisper_params.h"
 #include "../nlohmann/json.hpp"
-#include "common/utils.h"
+#include "../common/utils.h"
 
 using json = nlohmann::json;
 
@@ -210,55 +210,31 @@ void getReqParameters(const Request &req, whisper_params &params) {
 
 void getReqParameters(const Request &request, whisper_params &params);
 
-void handleInference(const Request &req, Response &res, std::mutex &whisper_mutex, whisper_params &params,
-                     whisper_context *ctx, char *arg_audio_file) {
-// aquire whisper model mutex lock
-  whisper_mutex.lock();
-
-  // first check user requested fields of the request
-  if (!req.has_file("file")) {
-    fprintf(stderr, "error: no 'file' field in the request\n");
-    const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
-    res.set_content(error_resp, "application/json");
-    whisper_mutex.unlock();
-    return;
-  }
-  auto audio_file = req.get_file_value("file");
-
-  // check non-required fields
-  getReqParameters(req, params);
+bool read_audio_file(std::string audio_format, std::string filename, std::vector<float> & pcmf32,
+                     std::vector<std::vector<float>> & pcmf32s, bool diarize) { // NOTE(review): audio_format and filename are taken by value, so each call copies both strings
 
-  std::string filename{audio_file.filename};
-  printf("%s: Received filename: %s,audio_format\n",get_current_time().c_str(),filename.c_str(),params.audio_format.c_str());
-
-  // audio arrays
-  std::vector<float> pcmf32;               // mono-channel F32 PCM
-  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
-  // write file to temporary file
-  std::ofstream temp_file{filename, std::ios::binary};
-  temp_file << audio_file.content;
-
-  // read wav content into pcmf32
-  if(params.audio_format=="mp3"){
-
-  }else if(params.audio_format=="m4a"){
-
-  }else{
-    if (!::read_wav(filename, pcmf32, pcmf32s, params.diarize)) {
+  // Decode `filename` into pcmf32/pcmf32s with the reader selected by audio_format: "mp3" -> read_mp3, "m4a" -> read_m4a, anything else -> read_wav. A reader failure is logged to stderr and returned as false; otherwise the result is true.
+  if (audio_format == "mp3") {
+    if (!::read_mp3(filename, pcmf32, pcmf32s, diarize)) {
+      fprintf(stderr, "error: failed to read mp3 file '%s'\n", filename.c_str());
+      return false;
+    }
+  } else if (audio_format == "m4a") {
+    if (!::read_m4a(filename, pcmf32, pcmf32s, diarize)) {
+      fprintf(stderr, "error: failed to read m4a file '%s'\n", filename.c_str());
+      return false;
+    }
+  } else {
+    if (!::read_wav(filename, pcmf32, pcmf32s, diarize)) {
       fprintf(stderr, "error: failed to read WAV file '%s'\n", filename.c_str());
-      const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-      res.set_content(error_resp, "application/json");
-      whisper_mutex.unlock();
-      return;
+      return false;
     }
   }
+  return true;
+}
 
-  // remove temp file
-  std::remove(filename.c_str());
-
-  printf("Successfully loaded %s\n", filename.c_str());
-
+bool run(std::mutex &whisper_mutex, whisper_params &params, whisper_context *ctx, std::string filename,
+         const std::vector<std::vector<float>>& pcmf32s, std::vector<float> pcmf32) {
   // print system information
   {
     fprintf(stderr, "\n");
@@ -368,31 +344,87 @@ void handleInference(const Request &req, Response &res, std::mutex &whisper_mute
       wparams.abort_callback_user_data = &is_aborted;
     }
 
+    // acquire whisper model mutex lock
+    whisper_mutex.lock();
     if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-      fprintf(stderr, "%s: failed to process audio\n", arg_audio_file);
-      const std::string error_resp = "{\"error\":\"failed to process audio\"}";
-      res.set_content(error_resp, "application/json");
+      fprintf(stderr, "%s: failed to process audio\n", filename.c_str());
       whisper_mutex.unlock();
-      return;
+      return false;
     }
+    whisper_mutex.unlock();
+    return true;
   }
+}
 
-  // return results to user
-  if (params.response_format == text_format) {
-    std::string results = output_str(ctx, params, pcmf32s);
-    res.set_content(results.c_str(), "text/html");
-  }
-    // TODO add more output formats
-  else {
-    std::string results = output_str(ctx, params, pcmf32s);
+// Handles one inference request: checks for the uploaded "file" part, writes it to disk under its uploaded name, decodes it with read_audio_file(), runs run(), and sets a text or JSON body on `response`.
+void handleInference(const Request &request, Response &response, std::mutex &whisper_mutex, whisper_params &params,
+                     whisper_context *ctx, char *arg_audio_file) { // NOTE(review): arg_audio_file is not referenced anywhere in this body
+  // first check user requested fields of the request
+  if (!request.has_file("file")) {
+    fprintf(stderr, "error: no 'file' field in the request\n");
     json jres = json{
-      {"text", results}
+      {"code",-1},
+      {"msg", "no 'file' field in the request"}
     };
-    res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
-                    "application/json");
+    auto json_string  = jres.dump(-1, ' ', false,json::error_handler_t::replace);
+    response.set_content(json_string,"application/json");
+    return;
   }
+  auto audio_file = request.get_file_value("file");
 
-  // return whisper model mutex lock
-  whisper_mutex.unlock();
-}
+  // check non-required fields (NOTE(review): this writes into the caller's params before any lock is taken - confirm params is not shared between concurrent requests)
+  getReqParameters(request, params);
+  // NOTE(review): filename comes straight from the upload and is used as a filesystem path below - presumably untrusted input; verify or sanitize
+  std::string filename{audio_file.filename};
+  printf("%s: Received filename: %s,audio_format:%s \n",get_current_time().c_str(),filename.c_str(),params.audio_format.c_str());
+
+  // audio arrays
+  std::vector<float> pcmf32;               // mono-channel F32 PCM
+  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+
+  // write file to temporary file
+  std::ofstream temp_file{filename, std::ios::binary};
+  temp_file << audio_file.content; // NOTE(review): temp_file is neither flushed nor closed before the file is read back below - the reader may see incomplete data; presumably needs temp_file.close() first, verify
 
+  bool isOK=read_audio_file(params.audio_format,filename,pcmf32,pcmf32s,params.diarize);
+  if(!isOK){
+    json json_obj={
+      {"code",-1},
+      {"msg","error: failed to read WAV file "} // the message says WAV for every audio_format, including mp3 and m4a
+    };
+    auto json_string  = json_obj.dump(-1, ' ', false,json::error_handler_t::replace);
+    response.set_content(json_string, "application/json");
+    return; // NOTE(review): returns before the std::remove below, so the temporary file is left on disk when reading fails
+  }
+
+  // remove temp file
+  std::remove(filename.c_str());
+
+  printf("Successfully loaded %s\n", filename.c_str());
+
+  bool isOk= run(whisper_mutex, params, ctx, filename, pcmf32s, pcmf32); // NOTE(review): run() unlocks whisper_mutex before returning, so the output_str(ctx, ...) calls below read ctx without the lock - confirm this is safe with concurrent requests
+  if(isOk){
+    // return results to user
+    if (params.response_format == text_format) {
+      std::string results = output_str(ctx, params, pcmf32s);
+      response.set_content(results.c_str(), "text/html");
+    }
+      // TODO add more output formats
+    else {
+      std::string results = output_str(ctx, params, pcmf32s);
+      json jres = json{
+        {"code",0},
+        {"text", results}
+      };
+      response.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
+                           "application/json");
+    }
+  }else{
+    json jres = json{
+      {"code",-1},
+      {"msg", "run error"}
+    };
+    auto json_string  = jres.dump(-1, ' ', false,json::error_handler_t::replace);
+    response.set_content(json_string,"application/json");
+  }
+}
diff --git a/handler/inference_handler.h b/handler/inference_handler.h
@@ -5,5 +5,5 @@
 
 using namespace httplib;
 
-void handleInference(const Request &req, Response &res, std::mutex &whisper_mutex, whisper_params &params,
+void handleInference(const Request &request, Response &response, std::mutex &whisper_mutex, whisper_params &params,
                      whisper_context *ctx, char *arg_audio_file);
diff --git a/params/whisper_params.h b/params/whisper_params.h
@@ -50,7 +50,7 @@ struct whisper_params {
   std::string language = "en";
   std::string prompt = "";
   std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
-  std::string model = "models/ggml-base.en.bin";
+  std::string model = "../models/ggml-base.en.bin";
 
   std::string response_format = json_format;
 

Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,8 @@ add_executable(stream_local examples/stream_local.cpp common/common.cpp common/c`
`59`	`59`	`)`
`60`	`60`	`target_link_libraries(stream_local whisper ${SDL2_LIBRARIES})`
`61`	`61`
`62`		`-add_executable(whisper_http_server_base_httplib whisper_http_server_base_httplib.cpp common/common.cpp httplib/httplib.h nlohmann/json.hpp handler/inference_handler.cpp params/whisper_params.cpp)`
	`62`	`+add_executable(whisper_http_server_base_httplib whisper_http_server_base_httplib.cpp`
	`63`	`+ common/common.cpp common/utils.cpp handler/inference_handler.cpp params/whisper_params.cpp)`
`63`	`64`	`target_link_libraries(whisper_http_server_base_httplib whisper)`
`64`	`65`
`65`	`66`	`add_executable(whisper_server_base_on_uwebsockets whisper_server_base_on_uwebsockets.cpp common/common.cpp stream/stream_components_service.cpp common/utils.cpp)`