change audio_vad

litongmacos · litongmacos · commit d0b59ef46619 · 2023-11-22T17:19:55.000-10:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -23,10 +23,10 @@ include_directories(${SDL2_INCLUDE_DIRS})
 include_directories(webrtc)
 include_directories(.)
 # find cpp files
-file(GLOB SRC_FILES simplevad/*.c simplevad/*.h
+file(GLOB VAD_FILES simplevad/*.c simplevad/*.h
         webrtc/common_audio/*/*.c webrtc/rtc_base/*.c*)
 
-add_executable(audio_vad examples/audio_vad.cpp ${SRC_FILES})
+add_executable(audio_vad examples/audio_vad.cpp ${VAD_FILES})
 target_link_libraries(audio_vad pthread)
 
 
diff --git a/common/common.cpp b/common/common.cpp
@@ -727,7 +727,8 @@ void high_pass_filter(std::vector<float> &data, float cutoff, float sample_rate)
 }
 
 bool
-vad_simple(std::vector<float> &pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+vad_simple(std::vector<float> &pcmf32, int sample_rate, int last_ms,
+           float vad_thold, float freq_thold, bool verbose) {
   const int n_samples = pcmf32.size();
   const int n_samples_last = (sample_rate * last_ms) / 1000;
 
diff --git a/examples/audio_vad.cpp b/examples/audio_vad.cpp
@@ -28,15 +28,17 @@ int main(int argc,char** argv) {
     return 3;
   }
 
-  std::unique_ptr<simple_vad, decltype(&simple_vad_free)> vad(simple_vad_create(), &simple_vad_free);
+  std::unique_ptr<simple_vad, decltype(&simple_vad_free)> vad(
+    simple_vad_create(), &simple_vad_free);
   if (!vad) {
     return 4;
   }
 
   std::unique_ptr<FILE, decltype(&fclose)> fp2(fopen(filename, "rb"), &fclose);
   std::unique_ptr<struct cut_info, decltype(&cut_info_free)> cut(cut_info_create(fp2.get()), &cut_info_free);
 
-  snprintf(cut->output_filename_prefix, sizeof(cut->output_filename_prefix), "%s", output_filename_prefix);
+  snprintf(cut->output_filename_prefix, sizeof(cut->output_filename_prefix), "%s",
+           output_filename_prefix);
   snprintf(cut->output_file_dir, sizeof(cut->output_file_dir), "%s", output_dir);
 
   int res = run(fp.get(), vad.get(), cut.get());
diff --git a/whisper_server_base_on_uwebsockets.cpp b/whisper_server_base_on_uwebsockets.cpp
@@ -9,6 +9,10 @@
 #include <whisper.h>
 #include <sstream>
 
+bool process_vad(float *pDouble, unsigned long size);
+
+std::vector<float> extract_first_voice_segment(std::vector<float> vector1);
+
 using namespace stream_components;
 
 int main(int argc, char **argv) {
@@ -22,7 +26,7 @@ int main(int argc, char **argv) {
   // Compute derived parameters
   params.initialize();
   //output params
-  printf("vad:%d\n", params.audio.use_vad);
+
 
   // Check parameters
   if (params.service.language != "auto" && whisper_lang_id(params.service.language.c_str()) == -1) {
@@ -58,7 +62,7 @@ int main(int argc, char **argv) {
   };
 
   // WebSocket /paddlespeech/asr/streaming handler
-  auto item = [&whisperService](auto *ws, std::string_view message, uWS::OpCode opCode) {
+  auto item = [&whisperService, &params](auto *ws, std::string_view message, uWS::OpCode opCode) {
     thread_local std::vector<float> audioBuffer; //thread-localized variable
     thread_local wav_writer wavWriter;
     thread_local std::string filename;
@@ -102,11 +106,11 @@ int main(int argc, char **argv) {
     } else if (opCode == uWS::OpCode::BINARY) {
       // process binary message（PCM16 data）
       auto size = message.size();
-
+      std::basic_string_view<char, std::char_traits<char>>::const_pointer data = message.data();
       printf("%s: Received message size on /paddlespeech/asr/streaming: %zu\n", get_current_time().c_str(), size);
       // add received PCM16 to audio cache
       std::vector<int16_t> pcm16(size / 2);
-      std::basic_string_view<char, std::char_traits<char>>::const_pointer data = message.data();
+
       std::memcpy(pcm16.data(), data, size);
 
       std::vector<float> temp(size / 2);
@@ -116,8 +120,24 @@ int main(int argc, char **argv) {
       //write to file
       wavWriter.write(temp.data(), size / 2);
       audioBuffer.insert(audioBuffer.end(), temp.begin(), temp.end());
-      // asr
-      bool isOk = whisperService.process(audioBuffer.data(), audioBuffer.size());
+      // When VAD is enabled, only run ASR if the buffered audio is judged to contain voice.
+      bool isOk = false;
+      if (params.audio.use_vad) {
+        printf("%s: vad: %d\n", get_current_time().c_str(), params.audio.use_vad);
+        // Voice detection uses vad_simple with last_ms = 1000 and verbose = false.
+        bool containsVoice = vad_simple(audioBuffer, WHISPER_SAMPLE_RATE, 1000, params.audio.vad_thold, params.audio.freq_thold, false);
+
+        if (containsVoice) {
+          // Extract the first voiced segment from the buffer.
+          // TODO: implement extract_first_voice_segment; the result is passed to ASR as-is.
+          std::vector<float> firstSegment = extract_first_voice_segment(audioBuffer);
+          // TODO: remove the consumed samples from audioBuffer (not done here yet).
+          isOk = whisperService.process(firstSegment.data(), firstSegment.size());
+        }
+      } else {
+        // No VAD: run ASR on the whole accumulated buffer.
+        isOk = whisperService.process(audioBuffer.data(), audioBuffer.size());
+      }
       if (isOk) {
         const int n_segments = whisper_full_n_segments(whisperService.ctx);
         nlohmann::json results = nlohmann::json(nlohmann::json::array());
@@ -137,6 +157,7 @@ int main(int argc, char **argv) {
         response["result"] = final_results;
       }
 
+
       ws->send(response.dump(), uWS::OpCode::TEXT);
     }
   };
@@ -153,3 +174,11 @@ int main(int argc, char **argv) {
       //listen
     .listen(port, started_handler).run();
 }
+
+std::vector<float> extract_first_voice_segment(std::vector<float> vector1) {  // Placeholder: vector1 is not read yet.
+  return std::vector<float>();  // Always returns an empty vector for now.
+}
+
+bool process_vad(float *pDouble, unsigned long size) {  // Placeholder: neither argument is read; not called in the hunks shown here.
+  return false;  // Always returns false for now.
+}

Original file line number	Diff line number	Diff line change
`@@ -727,7 +727,8 @@ void high_pass_filter(std::vector<float> &data, float cutoff, float sample_rate)`
`727`	`727`	`}`
`728`	`728`
`729`	`729`	`bool`
`730`		`-vad_simple(std::vector<float> &pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {`
	`730`	`+vad_simple(std::vector<float> &pcmf32, int sample_rate, int last_ms,`
	`731`	`+ float vad_thold, float freq_thold, bool verbose) {`
`731`	`732`	`const int n_samples = pcmf32.size();`
`732`	`733`	`const int n_samples_last = (sample_rate * last_ms) / 1000;`
`733`	`734`