Skip to content

Commit e4d5c18

Browse files
Wrap websocket flush in timeout (#2908)
1 parent 29a0df4 commit e4d5c18

File tree

1 file changed

+105
-27
lines changed

1 file changed

+105
-27
lines changed

crates/client-api/src/routes/subscribe.rs

Lines changed: 105 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ where
182182
}
183183

184184
const LIVELINESS_TIMEOUT: Duration = Duration::from_secs(60);
185+
const SEND_TIMEOUT: Duration = Duration::from_secs(5);
185186

186187
async fn ws_client_actor(client: ClientConnection, ws: WebSocketStream, sendrx: MeteredReceiver<SerializableMessage>) {
187188
// ensure that even if this task gets cancelled, we always cleanup the connection
@@ -266,12 +267,12 @@ async fn ws_client_actor_inner(
266267
Some(Ok(m)) => Item::Message(ClientMessage::from_message(m)),
267268
Some(Err(error)) => {
268269
log::warn!("Websocket receive error: {}", error);
269-
continue;
270+
break;
270271
}
271272
// the client sent us a close frame
272273
None => {
273-
break
274-
},
274+
break;
275+
}
275276
},
276277

277278
// If we have an outgoing message to send, send it off.
@@ -311,11 +312,29 @@ async fn ws_client_actor_inner(
311312
// now we flush all the messages to the socket
312313
(ws.flush().await, msg_buffer)
313314
};
315+
// Build a future that both times out and drives the send.
316+
//
317+
// Note that if flushing cannot immediately complete for whatever reason,
318+
// it will wait without polling the other futures in the `select!` arms.
319+
// Among other things, this means our liveness tick will not be polled.
320+
//
321+
// To avoid waiting indefinitely, we wrap the send in a timeout.
322+
// A timeout is treated as an unresponsive client and we drop the connection.
323+
let send_all = tokio::time::timeout(SEND_TIMEOUT, send_all);
314324
// Flush the websocket while continuing to poll the `handle_queue`,
315325
// to avoid deadlocks or delays due to enqueued futures holding resources.
316326
let send_all = also_poll(send_all, make_progress(&mut current_message));
317327
let t1 = Instant::now();
318-
let (send_all_result, buf) = send_all.await;
328+
let (send_all_result, buf) = match send_all.await {
329+
Ok((send_all_result, buf)) => {
330+
(send_all_result, buf)
331+
}
332+
Err(e) => {
333+
// Our send timed out; drop client without trying to send them a Close
334+
log::warn!("send_all timed out: {e}");
335+
break;
336+
}
337+
};
319338
msg_buffer = buf;
320339
if let Err(error) = send_all_result {
321340
log::warn!("Websocket send error: {error}")
@@ -335,13 +354,33 @@ async fn ws_client_actor_inner(
335354
Err(NoSuchModule) => {
336355
// Send a close frame while continuing to poll the `handle_queue`,
337356
// to avoid deadlocks or delays due to enqueued futures holding resources.
338-
let close = also_poll(
339-
ws.close(Some(CloseFrame { code: CloseCode::Away, reason: "module exited".into() })),
340-
make_progress(&mut current_message),
341-
);
342-
if let Err(e) = close.await {
343-
log::warn!("error closing: {e:#}")
344-
}
357+
let close = ws.close(Some(CloseFrame { code: CloseCode::Away, reason: "module exited".into() }));
358+
// Wrap the close in a timeout
359+
let close = tokio::time::timeout(SEND_TIMEOUT, close);
360+
match also_poll(close, make_progress(&mut current_message)).await {
361+
Ok(Err(e)) => {
362+
log::warn!("error closing websocket: {e:#}")
363+
}
364+
Err(e) => {
365+
// Our send timed out; drop client without trying to send them a Close.
366+
//
367+
// Is it correct to break if a reducer is still in progress?
368+
// Answer: Yes it is.
369+
//
370+
// If a reducer is currently being executed,
371+
// we are waiting for the `current_message` future to complete.
372+
// When we break, the task completes and this future is dropped.
373+
//
374+
// Notably though the reducer itself will run to completion,
375+
// however when it tries to notify this task that it is done,
376+
// it will encounter a closed sender in `JobThread::run`,
377+
// dropping the value that it's trying to send.
378+
// In particular it will not throw an error or panic.
379+
log::warn!("websocket close timed out: {e}");
380+
break;
381+
}
382+
_ => {}
383+
};
345384
closed = true;
346385
}
347386
}
@@ -352,10 +391,29 @@ async fn ws_client_actor_inner(
352391
_ = liveness_check_interval.tick() => {
353392
// If we received a pong at some point, send a fresh ping.
354393
if mem::take(&mut got_pong) {
394+
// Build a future that both times out and drives the send.
395+
//
396+
// Note that if the send cannot immediately complete for whatever reason,
397+
// it will wait without polling the other futures in the `select!` arms.
398+
// Among other things, this means we won't poll the websocket for a Close frame.
399+
//
400+
// To avoid waiting indefinitely, we wrap the ping in a timeout.
401+
// A timeout is treated as an unresponsive client and we drop the connection.
402+
let ping = ws.send(WsMessage::Ping(Bytes::new()));
403+
let ping_with_timeout = tokio::time::timeout(SEND_TIMEOUT, ping);
404+
355405
// Send a ping message while continuing to poll the `handle_queue`,
356406
// to avoid deadlocks or delays due to enqueued futures holding resources.
357-
if let Err(e) = also_poll(ws.send(WsMessage::Ping(Bytes::new())), make_progress(&mut current_message)).await {
358-
log::warn!("error sending ping: {e:#}");
407+
match also_poll(ping_with_timeout, make_progress(&mut current_message)).await {
408+
Ok(Err(e)) => {
409+
log::warn!("error sending ping: {e:#}");
410+
}
411+
Err(e) => {
412+
// Our ping timed out; drop them without trying to send them a Close
413+
log::warn!("ping timed out after: {e}");
414+
break;
415+
}
416+
_ => {}
359417
}
360418
continue;
361419
} else {
@@ -380,13 +438,22 @@ async fn ws_client_actor_inner(
380438
Item::HandleResult(res) => {
381439
if let Err(e) = res {
382440
if let MessageHandleError::Execution(err) = e {
383-
log::error!("{err:#}");
441+
log::error!("reducer execution error: {err:#}");
384442
// Serialize the message and keep a handle to the buffer.
385443
let (msg_alloc, msg_data) = serialize(msg_buffer, err, client.config);
386444

387-
// Buffer the message without necessarily sending it.
388-
if let Err(error) = ws.send(datamsg_to_wsmsg(msg_data)).await {
389-
log::warn!("Websocket send error: {error}")
445+
let send = async { ws.send(datamsg_to_wsmsg(msg_data)).await };
446+
let send = tokio::time::timeout(SEND_TIMEOUT, send);
447+
448+
match send.await {
449+
Ok(Err(error)) => {
450+
log::warn!("Websocket send error: {error}")
451+
}
452+
Err(error) => {
453+
log::warn!("send timed out after: {error}");
454+
break;
455+
}
456+
_ => {}
390457
}
391458

392459
// At this point,
@@ -399,16 +466,23 @@ async fn ws_client_actor_inner(
399466

400467
continue;
401468
}
402-
log::debug!("Client caused error on text message: {}", e);
403-
if let Err(e) = ws
404-
.close(Some(CloseFrame {
405-
code: CloseCode::Error,
406-
reason: format!("{e:#}").into(),
407-
}))
408-
.await
409-
{
410-
log::warn!("error closing websocket: {e:#}")
411-
};
469+
log::warn!("Client caused error on text message: {}", e);
470+
let close = ws.close(Some(CloseFrame {
471+
code: CloseCode::Error,
472+
reason: format!("{e:#}").into(),
473+
}));
474+
475+
// Wrap the close in a timeout
476+
match tokio::time::timeout(SEND_TIMEOUT, close).await {
477+
Ok(Err(e)) => {
478+
log::warn!("error closing websocket: {e:#}")
479+
}
480+
Err(e) => {
481+
log::warn!("send timed out after: {e}");
482+
break;
483+
}
484+
_ => {}
485+
}
412486
}
413487
}
414488
Item::Message(ClientMessage::Ping(_message)) => {
@@ -439,6 +513,10 @@ async fn ws_client_actor_inner(
439513
.with_label_values(&addr)
440514
.inc();
441515
}
516+
517+
// Can't we just break out of the loop here?
518+
// Not, if we want tungstenite to send a close frame back to the client.
519+
// That will only happen once `ws.next()` returns `None`.
442520
closed = true;
443521
}
444522
}

0 commit comments

Comments
 (0)