@@ -111,7 +111,7 @@ int main(int argc, char **argv) {
111
111
112
112
// WebSocket /paddlespeech/asr/streaming handler
113
113
auto ws_streaming_handler = [&whisperService, &params](auto *ws, std::string_view message, uWS::OpCode opCode) {
114
- thread_local std::vector<float > audioBuffer; // thread-localized variable
114
+ thread_local std::vector<int16_t > audioBuffer; // thread-localized variable
115
115
thread_local wav_writer wavWriter;
116
116
thread_local std::string filename;
117
117
// std::unique_ptr<nlohmann::json> results(new nlohmann::json(nlohmann::json::array()));
@@ -160,52 +160,47 @@ int main(int argc, char **argv) {
160
160
std::vector<int16_t > pcm16 (size / 2 );
161
161
162
162
std::memcpy (pcm16.data (), data, size);
163
-
164
- std::vector<float > temp (size / 2 );
165
- std::transform (pcm16.begin (), pcm16.end (), temp.begin (), [](int16_t sample) {
166
- return static_cast <float >(sample) / 32768 .0f ;
167
- });
168
163
// write to file
169
- // wavWriter.write(temp.data(), size / 2);
170
- audioBuffer.insert (audioBuffer.end (), temp.begin (), temp.end ());
171
- // 如果开启了VAD
172
- bool isOk = false ;
173
- if (params.audio .use_vad ) {
174
- printf (" %s: vad: %n\n " , get_current_time ().c_str (), params.audio .use_vad );
175
- // TODO: 实现VAD处理,这里假设process_vad是一个可以处理音频并返回是否包含有效语音的函数
176
- bool containsVoice = vad_simple (audioBuffer, WHISPER_SAMPLE_RATE, 1000 , params.audio .vad_thold , params.audio .freq_thold , false );
177
-
178
- if (containsVoice) {
179
- // 提取第一个有效音频段
180
- // TODO: 实现提取第一个有效音频段的逻辑,这里假设extract_first_voice_segment是实现这一功能的函数
181
- std::vector<float > firstSegment = extract_first_voice_segment (audioBuffer);
182
- // 清除audioBuffer中对应的字节
183
- isOk = whisperService.process (firstSegment.data (), firstSegment.size ());
164
+ wavWriter.write (pcm16.data (), size / 2 );
165
+
166
+ audioBuffer.insert (audioBuffer.end (), pcm16.begin (), pcm16.end ());
167
+ unsigned long bufferSize = audioBuffer.size ();
168
+ if (bufferSize>16000 *10 ){
169
+ std::vector<float > pcm32 (bufferSize);
170
+ std::transform (audioBuffer.begin (), audioBuffer.end (), pcm32.begin (), [](int16_t sample) {
171
+ return static_cast <float >(sample) / 32768 .0f ;
172
+ });
173
+ audioBuffer.clear ();
174
+ // 如果开启了VAD
175
+ bool isOk = false ;
176
+ if (params.audio .use_vad ) {
177
+ printf (" %s: vad: %n\n " , get_current_time ().c_str (), params.audio .use_vad );
178
+ // TODO: 实现VAD处理,
179
+ // bool containsVoice = vad_simple(audioBuffer, WHISPER_SAMPLE_RATE, 1000, params.audio.vad_thold, params.audio.freq_thold, false);
180
+ isOk = whisperService.process (pcm32.data (), pcm32.size ());
181
+ } else {
182
+ // asr
183
+ isOk = whisperService.process (pcm32.data (), pcm32.size ());
184
184
}
185
- } else {
186
- // asr
187
- isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
188
- }
189
- if (isOk) {
190
- const int n_segments = whisper_full_n_segments (whisperService.ctx );
191
- nlohmann::json results = nlohmann::json (nlohmann::json::array ());
192
- for (int i = 0 ; i < n_segments; ++i) {
193
- nlohmann::json segment;
194
- int64_t t0 = whisper_full_get_segment_t0 (whisperService.ctx , i);
195
- int64_t t1 = whisper_full_get_segment_t1 (whisperService.ctx , i);
196
- const char *sentence = whisper_full_get_segment_text (whisperService.ctx , i);
197
- auto result = std::to_string (t0) + " -->" + std::to_string (t1) + " :" + sentence + " \n " ;
198
- printf (" %s: result:%s\n " , get_current_time ().c_str (), result.c_str ());
199
- segment[" t0" ] = t0;
200
- segment[" t1" ] = t1;
201
- segment[" sentence" ] = sentence;
202
- results.push_back (segment);
185
+ if (isOk) {
186
+ const int n_segments = whisper_full_n_segments (whisperService.ctx );
187
+ nlohmann::json results = nlohmann::json (nlohmann::json::array ());
188
+ for (int i = 0 ; i < n_segments; ++i) {
189
+ nlohmann::json segment;
190
+ int64_t t0 = whisper_full_get_segment_t0 (whisperService.ctx , i);
191
+ int64_t t1 = whisper_full_get_segment_t1 (whisperService.ctx , i);
192
+ const char *sentence = whisper_full_get_segment_text (whisperService.ctx , i);
193
+ auto result = std::to_string (t0) + " -->" + std::to_string (t1) + " :" + sentence + " \n " ;
194
+ printf (" %s: result:%s\n " , get_current_time ().c_str (), result.c_str ());
195
+ segment[" t0" ] = t0;
196
+ segment[" t1" ] = t1;
197
+ segment[" sentence" ] = sentence;
198
+ results.push_back (segment);
199
+ }
200
+ final_results = results;
201
+ response[" result" ] = final_results;
203
202
}
204
- final_results = results;
205
- response[" result" ] = final_results;
206
203
}
207
-
208
-
209
204
ws->send (response.dump (), uWS::OpCode::TEXT);
210
205
}
211
206
};
0 commit comments