Skip to content

Commit 22c68af

Browse files
authored
Merge pull request #10 from speechmatics/v0.0.8
Smoother audio playback in poor networking conditions
2 parents 33fac22 + c73cf3e commit 22c68af

File tree

7 files changed

+189
-45
lines changed

7 files changed

+189
-45
lines changed

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66

7+
## [0.0.8] - 2024-11-29
8+
9+
### Added
10+
11+
- Introduced a new `PlaybackSettings` class to enable custom configuration of audio playback settings, offering greater flexibility for
12+
fine-tuning audio playback.
13+
- The client now buffers audio to ensure smoother playback, especially in challenging network conditions.
14+
15+
### Fixed
16+
17+
- Resolved an issue with reading piped audio from stdin.
18+
719
## [0.0.7] - 2024-11-25
820

921
### Added

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.0.7
1+
0.0.8

speechmatics_flow/cli.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
ServerMessageType,
2424
Interaction,
2525
ConnectionSettings,
26+
PlaybackSettings,
2627
)
2728
from speechmatics_flow.templates import TemplateOptions
2829

@@ -139,6 +140,25 @@ def get_audio_settings(args):
139140
return settings
140141

141142

143+
def get_playback_settings(args):
144+
"""
145+
Helper function which returns a PlaybackSettings object based on the command
146+
line options given to the program.
147+
148+
Args:
149+
args (dict): Keyword arguments, typically from the command line.
150+
151+
Returns:
152+
models.PlaybackSettings: Settings for the audio playback stream
153+
in the connection.
154+
"""
155+
return PlaybackSettings(
156+
buffering=args.get("playback_buffering"),
157+
sample_rate=args.get("playback_sample_rate"),
158+
chunk_size=args.get("playback_chunk_size"),
159+
)
160+
161+
142162
# pylint: disable=too-many-arguments,too-many-statements
143163
def add_printing_handlers(
144164
api,
@@ -248,7 +268,6 @@ def flow_main(args):
248268
:param args: arguments from parse_args()
249269
:type args: argparse.Namespace
250270
"""
251-
conversation_config = get_conversation_config(args)
252271
settings = get_connection_settings(args)
253272
api = WebsocketClient(settings)
254273
transcripts = Transcripts()
@@ -261,9 +280,10 @@ def flow_main(args):
261280
def run(stream):
262281
try:
263282
api.run_synchronously(
264-
[Interaction(stream)],
265-
get_audio_settings(args),
266-
conversation_config,
283+
interactions=[Interaction(stream)],
284+
audio_settings=get_audio_settings(args),
285+
conversation_config=get_conversation_config(args),
286+
playback_settings=get_playback_settings(args),
267287
from_cli=True,
268288
)
269289
except KeyboardInterrupt:

speechmatics_flow/cli_parser.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,31 @@ def get_arg_parser():
104104
"acknowledgements from the server."
105105
),
106106
)
107+
parser.add_argument(
108+
"--playback-buffering",
109+
type=int,
110+
default=10,
111+
help=(
112+
"Buffer (in milliseconds) for audio received from the server before playback. "
113+
"Increasing the buffer size can improve resilience to poor network conditions, "
114+
"at the cost of increased latency."
115+
),
116+
),
117+
parser.add_argument(
118+
"--playback-sample-rate",
119+
type=int,
120+
default=16_000,
121+
help="The sample rate in Hz of the output audio.",
122+
)
123+
parser.add_argument(
124+
"--playback-chunk-size",
125+
type=int,
126+
default=256,
127+
help=(
128+
"The size of each audio chunk, in bytes, to read from the audio buffer. "
129+
"Increasing the chunk size may improve playback smoothness."
130+
),
131+
)
107132
parser.add_argument(
108133
"--print-json",
109134
default=False,

speechmatics_flow/client.py

Lines changed: 89 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import json
1010
import logging
1111
import os
12+
import sys
1213
from concurrent.futures import ThreadPoolExecutor
1314
from typing import List, Optional
1415

@@ -22,12 +23,13 @@
2223
ConversationError,
2324
)
2425
from speechmatics_flow.models import (
25-
ClientMessageType,
26-
ServerMessageType,
2726
AudioSettings,
27+
ClientMessageType,
28+
ConnectionSettings,
2829
ConversationConfig,
2930
Interaction,
30-
ConnectionSettings,
31+
PlaybackSettings,
32+
ServerMessageType,
3133
)
3234
from speechmatics_flow.tool_function_param import ToolFunctionParam
3335
from speechmatics_flow.utils import read_in_chunks, json_utf8
@@ -63,6 +65,7 @@ def __init__(
6365
self.websocket = None
6466
self.conversation_config = None
6567
self.audio_settings = None
68+
self.playback_settings = None
6669
self.tools = None
6770

6871
self.event_handlers = {x: [] for x in ServerMessageType}
@@ -73,13 +76,15 @@ def __init__(
7376
self.session_running = False
7477
self.conversation_ended_wait_timeout = 5
7578
self._session_needs_closing = False
76-
self._audio_buffer = None
79+
self._audio_buffer = bytearray()
80+
self._audio_buffer_lock = asyncio.Lock()
7781
self._executor = ThreadPoolExecutor()
7882

7983
# The following asyncio fields are fully instantiated in
8084
# _init_synchronization_primitives
8185
self._conversation_started = asyncio.Event
8286
self._conversation_ended = asyncio.Event
87+
self._response_started = asyncio.Event
8388
# Semaphore used to ensure that we don't send too much audio data to
8489
# the server too quickly and burst any buffers downstream.
8590
self._buffer_semaphore = asyncio.BoundedSemaphore
@@ -91,24 +96,34 @@ async def _init_synchronization_primitives(self):
9196
"""
9297
self._conversation_started = asyncio.Event()
9398
self._conversation_ended = asyncio.Event()
99+
self._response_started = asyncio.Event()
94100
self._buffer_semaphore = asyncio.BoundedSemaphore(
95101
self.connection_settings.message_buffer_size
96102
)
97103

98104
def _flag_conversation_started(self):
99105
"""
100106
Handle a
101-
:py:attr:`models.ClientMessageType.ConversationStarted`
107+
:py:attr:`models.ServerMessageType.ConversationStarted`
102108
message from the server.
103109
This updates an internal flag to mark the session
104110
as started, meaning AddAudio is now allowed.
105111
"""
106112
self._conversation_started.set()
107113

114+
def _flag_response_started(self):
115+
"""
116+
Handle a
117+
:py:attr:`models.ServerMessageType.ResponseStarted`
118+
message from the server.
119+
This updates an internal flag to mark that the server started sending audio.
120+
"""
121+
self._response_started.set()
122+
108123
def _flag_conversation_ended(self):
109124
"""
110125
Handle a
111-
:py:attr:`models.ClientMessageType.ConversationEnded`
126+
:py:attr:`models.ServerMessageType.ConversationEnded`
112127
message from the server.
113128
This updates an internal flag to mark the session ended
114129
and server connection is closed
@@ -158,7 +173,7 @@ def _audio_received(self):
158173
msg = {
159174
"message": ClientMessageType.AudioReceived,
160175
"seq_no": self.server_seq_no,
161-
"buffering": 0.01, # 10ms
176+
"buffering": self.playback_settings.buffering / 1000,
162177
}
163178
self._call_middleware(ClientMessageType.AudioReceived, msg, False)
164179
LOGGER.debug(msg)
@@ -169,9 +184,12 @@ async def _wait_for_conversation_ended(self):
169184
Waits for :py:attr:`models.ServerMessageType.ConversationEnded`
170185
message from the server.
171186
"""
172-
await asyncio.wait_for(
173-
self._conversation_ended.wait(), self.conversation_ended_wait_timeout
174-
)
187+
try:
188+
await asyncio.wait_for(
189+
self._conversation_ended.wait(), self.conversation_ended_wait_timeout
190+
)
191+
except asyncio.TimeoutError:
192+
LOGGER.warning("Timeout waiting for ConversationEnded message.")
175193

176194
async def _consumer(self, message, from_cli=False):
177195
"""
@@ -192,7 +210,8 @@ async def _consumer(self, message, from_cli=False):
192210
await self.websocket.send(self._audio_received())
193211
# add an audio message to local buffer only when running from cli
194212
if from_cli:
195-
await self._audio_buffer.put(message)
213+
async with self._audio_buffer_lock:
214+
self._audio_buffer.extend(message)
196215
# Implicit name for all inbound binary messages.
197216
# We must manually set it for event handler subscribed
198217
# to `ServerMessageType.AddAudio` messages to work.
@@ -226,6 +245,13 @@ async def _consumer(self, message, from_cli=False):
226245

227246
if message_type == ServerMessageType.ConversationStarted:
228247
self._flag_conversation_started()
248+
if message_type == ServerMessageType.ResponseStarted:
249+
self._flag_response_started()
250+
if message_type in [
251+
ServerMessageType.ResponseCompleted,
252+
ServerMessageType.ResponseInterrupted,
253+
]:
254+
self._response_started.clear()
229255
elif message_type == ServerMessageType.AudioAdded:
230256
self._buffer_semaphore.release()
231257
elif message_type == ServerMessageType.ConversationEnded:
@@ -313,20 +339,31 @@ async def _producer_handler(self, interactions: List[Interaction]):
313339
Controls the producer loop for sending messages to the server.
314340
"""
315341
await self._conversation_started.wait()
316-
if interactions[0].stream.name == "<stdin>":
342+
# Stream audio from microphone when running from the terminal and input is not piped
343+
if (
344+
sys.stdin.isatty()
345+
and hasattr(interactions[0].stream, "name")
346+
and interactions[0].stream.name == "<stdin>"
347+
):
317348
return await self._read_from_microphone()
318349

319350
for interaction in interactions:
320-
async for message in self._stream_producer(
321-
interaction.stream, self.audio_settings.chunk_size
322-
):
323-
try:
324-
await self.websocket.send(message)
325-
except Exception as e:
326-
LOGGER.error(f"error sending message: {e}")
327-
return
328-
if interaction.callback:
329-
interaction.callback(self)
351+
try:
352+
async for message in self._stream_producer(
353+
interaction.stream, self.audio_settings.chunk_size
354+
):
355+
try:
356+
await self.websocket.send(message)
357+
except Exception as e:
358+
LOGGER.error(f"Error sending message: {e}")
359+
return
360+
361+
if interaction.callback:
362+
LOGGER.debug("Executing callback for interaction.")
363+
interaction.callback(self)
364+
365+
except Exception as e:
366+
LOGGER.error(f"Error processing interaction: {e}")
330367

331368
await self.websocket.send(self._end_of_audio())
332369
await self._wait_for_conversation_ended()
@@ -339,26 +376,38 @@ async def _playback_handler(self):
339376
stream = _pyaudio.open(
340377
format=pyaudio.paInt16,
341378
channels=1,
342-
rate=self.audio_settings.sample_rate,
343-
frames_per_buffer=128,
379+
rate=self.playback_settings.sample_rate,
380+
frames_per_buffer=self.playback_settings.chunk_size,
344381
output=True,
345382
)
383+
chunk_size = self.playback_settings.chunk_size
384+
346385
try:
347-
while True:
348-
if self._session_needs_closing or self._conversation_ended.is_set():
349-
break
386+
while not self._session_needs_closing or self._conversation_ended.is_set():
387+
# Wait for the server to start sending audio
388+
await self._response_started.wait()
389+
390+
# Ensure enough data is added to the buffer before starting playback
391+
await asyncio.sleep(self.playback_settings.buffering / 1000)
392+
393+
# Start playback
350394
try:
351-
audio_message = await self._audio_buffer.get()
352-
stream.write(audio_message)
353-
self._audio_buffer.task_done()
354-
# read from buffer at a constant rate
355-
await asyncio.sleep(0.005)
395+
while self._audio_buffer:
396+
if len(self._audio_buffer) >= chunk_size:
397+
async with self._audio_buffer_lock:
398+
audio_chunk = bytes(self._audio_buffer[:chunk_size])
399+
self._audio_buffer = self._audio_buffer[chunk_size:]
400+
stream.write(audio_chunk)
401+
await asyncio.sleep(0.005)
356402
except Exception as e:
357-
LOGGER.error(f"Error during audio playback: {e}")
403+
LOGGER.error(f"Error during audio playback: {e}", exc_info=True)
358404
raise e
405+
406+
except asyncio.CancelledError:
407+
LOGGER.info("Playback handler cancelled.")
359408
finally:
360-
stream.close()
361409
stream.stop_stream()
410+
stream.close()
362411
_pyaudio.terminate()
363412

364413
def _call_middleware(self, event_name, *args):
@@ -482,7 +531,6 @@ async def _communicate(self, interactions: List[Interaction], from_cli=False):
482531

483532
# Run the playback task that plays audio messages to the user when started from cli
484533
if from_cli:
485-
self._audio_buffer = asyncio.Queue()
486534
tasks.append(asyncio.create_task(self._playback_handler()))
487535

488536
(done, pending) = await asyncio.wait(
@@ -509,6 +557,7 @@ async def run(
509557
conversation_config: ConversationConfig = None,
510558
from_cli: bool = False,
511559
tools: Optional[List[ToolFunctionParam]] = None,
560+
playback_settings: PlaybackSettings = PlaybackSettings(),
512561
):
513562
"""
514563
Begin a new recognition session.
@@ -528,13 +577,18 @@ async def run(
528577
:param tools: Optional list of tool functions.
529578
:type tools: List[ToolFunctionParam]
530579
580+
:param playback_settings: Configuration for the playback stream.
581+
:type playback_settings: models.PlaybackSettings
582+
531583
:raises Exception: Can raise any exception returned by the
532584
consumer/producer tasks.
585+
533586
"""
534587
self.client_seq_no = 0
535588
self.server_seq_no = 0
536589
self.conversation_config = conversation_config
537590
self.audio_settings = audio_settings
591+
self.playback_settings = playback_settings
538592
self.tools = tools
539593

540594
await self._init_synchronization_primitives()

0 commit comments

Comments
 (0)