Encapsulate getResult method

litongmacos · litongmacos · commit 95c987d960c9 · 2023-11-22T21:44:38.000-10:00
diff --git a/examples/simplest.cpp b/examples/simplest.cpp
@@ -86,7 +86,7 @@ struct whisper_local_params {
   std::string language = "en";
   std::string prompt;
   std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
-  std::string model = "models/ggml-base.en.bin";
+  std::string model = "../models/ggml-base.en.bin";
 
   // [TDRZ] speaker turn string
   std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
diff --git a/stream/stream_components_params.h b/stream/stream_components_params.h
@@ -50,7 +50,7 @@ namespace stream_components {
     bool use_gpu = true;
 
     std::string language = "en";
-    std::string model = "models/ggml-base.en.bin";
+    std::string model = "../models/ggml-base.en.bin";
 
     void initialize() {}
   };
diff --git a/whisper_server_base_on_uwebsockets.cpp b/whisper_server_base_on_uwebsockets.cpp
@@ -9,21 +9,15 @@
 #include <whisper.h>
 #include <sstream>
 
-struct PerSocketData {
-  wav_writer wavWriter;
-};
-
-bool process_vad(float *pDouble, unsigned long size);
-
-std::vector<float> extract_first_voice_segment(std::vector<float> vector1);
-
 using namespace stream_components;
+nlohmann::json getResult(whisper_context *ctx);
+bool processAudio(WhisperService service, std::vector<float> pcm32, const whisper_local_stream_params& params);
 
 int main(int argc, char **argv) {
   // Read parameters...
   whisper_local_stream_params params;
 
-  if (whisper_params_parse(argc, argv, params) == false) {
+  if (!whisper_params_parse(argc, argv, params)) {
     return 1;
   }
 
@@ -40,7 +34,7 @@ int main(int argc, char **argv) {
   }
 
   // Instantiate the service
-  struct whisper_context_params cparams;
+  struct whisper_context_params cparams{};
   cparams.use_gpu = params.service.use_gpu;
   stream_components::WhisperService whisperService(params.service, params.audio, cparams);
 
@@ -66,7 +60,7 @@ int main(int argc, char **argv) {
   };
   //Save Audio
   auto ws_save_handler=[](auto *ws,std::string_view message,uWS::OpCode opCode){
-    std::string* userData = (std::string*)ws->getUserData();
+    auto* userData = (std::string*)ws->getUserData();
     printf("%s: User Data: %s\n", get_current_time().c_str(), userData->c_str());
     thread_local wav_writer wavWriter;
     thread_local std::string filename;
@@ -144,16 +138,36 @@ int main(int argc, char **argv) {
 //          nlohmann::json response = {{"name",filename},{"signal", signal}};
           response = {{"name",   filename},
                       {"signal", signal}};
-          response["result"] = final_results;
+
+          std::vector<float> pcm32(audioBuffer.size());
+          std::transform(audioBuffer.begin(), audioBuffer.end(), pcm32.begin(), [](int16_t sample) {
+            return static_cast<float>(sample) / 32768.0f;
+          });
+          audioBuffer.clear();
+          // 如果开启了VAD
+          bool isOk;
+          if (params.audio.use_vad) {
+            printf("%s: vad: %d \n", get_current_time().c_str(), params.audio.use_vad);
+            // TODO: 实现VAD处理，
+            //bool containsVoice = vad_simple(audioBuffer, WHISPER_SAMPLE_RATE, 1000, params.audio.vad_thold, params.audio.freq_thold, false);
+            isOk=whisperService.process(pcm32.data(), pcm32.size());
+          } else {
+            // asr
+            isOk= whisperService.process(pcm32.data(), pcm32.size());
+          }
+          if (isOk) {
+            final_results = getResult(whisperService.ctx);
+            response["result"] = final_results;
+          }
           ws->send(response.dump(), uWS::OpCode::TEXT);
         }
         // other process logic...
       } catch (const std::exception &e) {
         std::cerr << "JSON parse error: " << e.what() << std::endl;
+        auto size = message.size();
       }
     } else if (opCode == uWS::OpCode::BINARY) {
       // process binary message（PCM16 data）
-      auto size = message.size();
       std::basic_string_view<char, std::char_traits<char>>::const_pointer data = message.data();
       printf("%s: Received message size on /paddlespeech/asr/streaming: %zu\n", get_current_time().c_str(), size);
       // add received PCM16 to audio cache
@@ -172,32 +186,18 @@ int main(int argc, char **argv) {
         });
         audioBuffer.clear();
         // 如果开启了VAD
-        bool isOk = false;
+        bool isOk;
         if (params.audio.use_vad) {
-          printf("%s: vad: %n\n", get_current_time().c_str(), params.audio.use_vad);
+          printf("%s: vad: %d \n", get_current_time().c_str(), params.audio.use_vad);
           // TODO: 实现VAD处理，
           //bool containsVoice = vad_simple(audioBuffer, WHISPER_SAMPLE_RATE, 1000, params.audio.vad_thold, params.audio.freq_thold, false);
-          isOk = whisperService.process(pcm32.data(), pcm32.size());
+          isOk=whisperService.process(pcm32.data(), pcm32.size());
         } else {
           // asr
-          isOk = whisperService.process(pcm32.data(), pcm32.size());
+          isOk=whisperService.process(pcm32.data(), pcm32.size());
         }
         if (isOk) {
-          const int n_segments = whisper_full_n_segments(whisperService.ctx);
-          nlohmann::json results = nlohmann::json(nlohmann::json::array());
-          for (int i = 0; i < n_segments; ++i) {
-            nlohmann::json segment;
-            int64_t t0 = whisper_full_get_segment_t0(whisperService.ctx, i);
-            int64_t t1 = whisper_full_get_segment_t1(whisperService.ctx, i);
-            const char *sentence = whisper_full_get_segment_text(whisperService.ctx, i);
-            auto result = std::to_string(t0) + "-->" + std::to_string(t1) + ":" + sentence + "\n";
-            printf("%s: result:%s\n", get_current_time().c_str(), result.c_str());
-            segment["t0"] = t0;
-            segment["t1"] = t1;
-            segment["sentence"] = sentence;
-            results.push_back(segment);
-          }
-          final_results = results;
+          final_results = getResult(whisperService.ctx);
           response["result"] = final_results;
         }
       }
@@ -215,7 +215,7 @@ int main(int argc, char **argv) {
     //only_save_audio
     .ws<std::string>("/streaming/save", {.open=[](auto *ws){
       // 初始化用户数据
-      std::string* userData = (std::string*)ws->getUserData();
+      auto* userData = (std::string*)ws->getUserData();
       *userData = "Create User Id";  // 设置初始值
     },.message = ws_save_handler})
       //streaming asr
@@ -224,7 +224,34 @@ int main(int argc, char **argv) {
     .listen(port, started_handler).run();
 }
 
-std::vector<float> extract_first_voice_segment(std::vector<float> vector1) {
-  return std::vector<float>();
+// Transcribes pcm32 via whisperService.process() and returns that call's result.
+// VAD is still a TODO: with params.audio.use_vad set, the flag is only logged first.
+// NOTE(review): whisperService and pcm32 are passed by value (copied per call) - confirm intended.
+bool processAudio(WhisperService whisperService, std::vector<float> pcm32, const whisper_local_stream_params& params) {
+  if (params.audio.use_vad) {
+    // TODO: implement VAD processing, e.g.
+    //bool containsVoice = vad_simple(audioBuffer, WHISPER_SAMPLE_RATE, 1000, params.audio.vad_thold, params.audio.freq_thold, false);
+    printf("%s: vad: %d \n", get_current_time().c_str(), params.audio.use_vad);
+  }
+  return whisperService.process(pcm32.data(), pcm32.size());
 }
 
+// Returns a JSON array with one {"t0", "t1", "sentence"} object per segment that
+// whisper reports for ctx, logging each segment to stdout along the way.
+nlohmann::json getResult(whisper_context *ctx) {
+  auto segments = nlohmann::json::array();
+  const int segmentCount = whisper_full_n_segments(ctx);
+  for (int idx = 0; idx < segmentCount; ++idx) {
+    const int64_t begin = whisper_full_get_segment_t0(ctx, idx);
+    const int64_t end = whisper_full_get_segment_t1(ctx, idx);
+    const char *text = whisper_full_get_segment_text(ctx, idx);
+    // Logged as "<t0>--><t1>:<text>"; the trailing "\n" plus printf's own yields a blank line.
+    const std::string line = std::to_string(begin) + "-->" + std::to_string(end) + ":" + text + "\n";
+    printf("%s: result:%s\n", get_current_time().c_str(), line.c_str());
+    nlohmann::json entry = {{"t0", begin}, {"t1", end}, {"sentence", text}};
+    segments.push_back(entry);
+  }
+  return segments;
+}
+
+