9
9
#include < whisper.h>
10
10
#include < sstream>
11
11
12
+ bool process_vad (float *pDouble, unsigned long size);
13
+
14
+ std::vector<float > extract_first_voice_segment (std::vector<float > vector1);
15
+
12
16
using namespace stream_components ;
13
17
14
18
int main (int argc, char **argv) {
@@ -22,7 +26,7 @@ int main(int argc, char **argv) {
22
26
// Compute derived parameters
23
27
params.initialize ();
24
28
// output params
25
- printf ( " vad:%d \n " , params. audio . use_vad );
29
+
26
30
27
31
// Check parameters
28
32
if (params.service .language != " auto" && whisper_lang_id (params.service .language .c_str ()) == -1 ) {
@@ -58,7 +62,7 @@ int main(int argc, char **argv) {
58
62
};
59
63
60
64
// WebSocket /paddlespeech/asr/streaming handler
61
- auto item = [&whisperService](auto *ws, std::string_view message, uWS::OpCode opCode) {
65
+ auto item = [&whisperService, ¶ms ](auto *ws, std::string_view message, uWS::OpCode opCode) {
62
66
thread_local std::vector<float > audioBuffer; // thread-localized variable
63
67
thread_local wav_writer wavWriter;
64
68
thread_local std::string filename;
@@ -102,11 +106,11 @@ int main(int argc, char **argv) {
102
106
} else if (opCode == uWS::OpCode::BINARY) {
103
107
// process binary message(PCM16 data)
104
108
auto size = message.size ();
105
-
109
+ std::basic_string_view< char , std::char_traits< char >>::const_pointer data = message. data ();
106
110
printf (" %s: Received message size on /paddlespeech/asr/streaming: %zu\n " , get_current_time ().c_str (), size);
107
111
// add received PCM16 to audio cache
108
112
std::vector<int16_t > pcm16 (size / 2 );
109
- std::basic_string_view< char , std::char_traits< char >>::const_pointer data = message. data ();
113
+
110
114
std::memcpy (pcm16.data (), data, size);
111
115
112
116
std::vector<float > temp (size / 2 );
@@ -116,8 +120,24 @@ int main(int argc, char **argv) {
116
120
// write to file
117
121
wavWriter.write (temp.data (), size / 2 );
118
122
audioBuffer.insert (audioBuffer.end (), temp.begin (), temp.end ());
119
- // asr
120
- bool isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
123
+ // 如果开启了VAD
124
+ bool isOk = false ;
125
+ if (params.audio .use_vad ) {
126
+ printf (" %s: vad: %n\n " , get_current_time ().c_str (), params.audio .use_vad );
127
+ // TODO: 实现VAD处理,这里假设process_vad是一个可以处理音频并返回是否包含有效语音的函数
128
+ bool containsVoice = vad_simple (audioBuffer, WHISPER_SAMPLE_RATE, 1000 , params.audio .vad_thold , params.audio .freq_thold , false );
129
+
130
+ if (containsVoice) {
131
+ // 提取第一个有效音频段
132
+ // TODO: 实现提取第一个有效音频段的逻辑,这里假设extract_first_voice_segment是实现这一功能的函数
133
+ std::vector<float > firstSegment = extract_first_voice_segment (audioBuffer);
134
+ // 清除audioBuffer中对应的字节
135
+ isOk = whisperService.process (firstSegment.data (), firstSegment.size ());
136
+ }
137
+ } else {
138
+ // asr
139
+ isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
140
+ }
121
141
if (isOk) {
122
142
const int n_segments = whisper_full_n_segments (whisperService.ctx );
123
143
nlohmann::json results = nlohmann::json (nlohmann::json::array ());
@@ -137,6 +157,7 @@ int main(int argc, char **argv) {
137
157
response[" result" ] = final_results;
138
158
}
139
159
160
+
140
161
ws->send (response.dump (), uWS::OpCode::TEXT);
141
162
}
142
163
};
@@ -153,3 +174,11 @@ int main(int argc, char **argv) {
153
174
// listen
154
175
.listen (port, started_handler).run ();
155
176
}
177
+
178
/// Placeholder for extracting the first voiced segment from an audio buffer.
///
/// @param vector1 Audio samples to search (taken by value); currently unused.
/// @return Always an empty vector: the segmentation logic is not implemented yet.
std::vector<float> extract_first_voice_segment(std::vector<float> vector1) {
    // TODO: implement the real extraction. Until then the input is ignored.
    (void)vector1;
    return {};
}
181
+
182
/// Placeholder voice-activity-detection hook.
///
/// @param pDouble Pointer to float samples (the name notwithstanding); not read.
/// @param size    Presumably the number of samples at pDouble; not read.
/// @return Always false: no detection is implemented yet.
bool process_vad(float *pDouble, unsigned long size) {
    // TODO: implement the real detection. Until then both arguments are ignored.
    (void)pDouble;
    (void)size;
    return false;
}
0 commit comments