Skip to content

Commit a392c33

Browse files
mjchen0danieldegrasse
authored andcommitted
bluetooth: fix bug when destroying tx queue buffers on disconnect
Channel tx_queue purging on disconnect was inconsistently handled by the different channels: iso, l2cap, l2cap_br. iso channels handled purging in the tx_data_pull hook. l2cap and l2cap_br did the purging in channel delete functions and did not expect tx_data_pull to be called for a disconnected channel. Their data_pull functions could return a ptr to a net_buf that was still on the tx_queue, which is problematic when the conn tx_processor unrefs the returned buffer resulting in multiple calls to the buf destroy function. To make things consistent and correct, remove the code that tries to purge tx_queues in the tx_processor and only do purging in the channels themselves when they are deleted/disconnected. Also refactor and clarify referencing of the net_buf returned by tx_data_pull. It was confusing who had a reference and when, which could vary depending on the length of the original buffer. There are three cases: the buffer length is less than the tx.mps, greater the mps but less than the mtu so requiring segementation but not fragmentation, or greater than both mps and mtu so requiring both segmentation and fragmentation. The conn layer would increase the refcnt if the length was greater than the mtu, but not have any awareness of whether the net_buf was still on the tx_queue or not. Now it is the tx_data_pull callbacks responsibitity to increment the reference count if it is returning a pointer to a net_buf that it is still keeping on the tx_queue for segmentation purposes. The conn layer will now always transfer that reference into a fragment view and not conditional it on the length relative to the mtu, and always decrement the reference to the parent when the fragment is destroyed. So there is no risk of decrementing a reference to a net buf that might still be on a tx_queue, which simplifies error handling in particular. Also add error handling paths for when asserts are not enabled. Signed-off-by: Mike J. Chen <mjchen@google.com>
1 parent a132ebf commit a392c33

File tree

5 files changed

+98
-104
lines changed

5 files changed

+98
-104
lines changed

subsys/bluetooth/host/classic/l2cap_br.c

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1592,6 +1592,8 @@ struct net_buf *l2cap_br_data_pull(struct bt_conn *conn, size_t amount, size_t *
15921592
return NULL;
15931593
}
15941594

1595+
__ASSERT_NO_MSG(conn->state == BT_CONN_CONNECTED);
1596+
15951597
struct bt_l2cap_br_chan *br_chan;
15961598

15971599
br_chan = CONTAINER_OF(pdu_ready, struct bt_l2cap_br_chan, _pdu_ready);
@@ -1609,27 +1611,31 @@ struct net_buf *l2cap_br_data_pull(struct bt_conn *conn, size_t amount, size_t *
16091611

16101612
__ASSERT(tx_pdu, "signaled ready but no PDUs in the TX queue");
16111613

1612-
struct net_buf *pdu = CONTAINER_OF(tx_pdu, struct net_buf, node);
1614+
struct net_buf *q_pdu = CONTAINER_OF(tx_pdu, struct net_buf, node);
16131615

1614-
if (bt_buf_has_view(pdu)) {
1615-
LOG_ERR("already have view on %p", pdu);
1616+
if (bt_buf_has_view(q_pdu)) {
1617+
LOG_ERR("already have view on %p", q_pdu);
16161618
return NULL;
16171619
}
16181620

1621+
struct net_buf *pdu = net_buf_ref(q_pdu);
1622+
16191623
/* We can't interleave ACL fragments from different channels for the
16201624
* same ACL conn -> we have to wait until a full L2 PDU is transferred
16211625
* before switching channels.
16221626
*/
16231627
bool last_frag = amount >= pdu->len;
16241628

16251629
if (last_frag) {
1626-
LOG_DBG("last frag, removing %p", pdu);
1630+
LOG_DBG("last frag, removing %p", q_pdu);
16271631
__maybe_unused bool found;
16281632

1629-
found = sys_slist_find_and_remove(&br_chan->_pdu_tx_queue, &pdu->node);
1633+
found = sys_slist_find_and_remove(&br_chan->_pdu_tx_queue, &q_pdu->node);
16301634

16311635
__ASSERT_NO_MSG(found);
16321636

1637+
net_buf_unref(q_pdu);
1638+
16331639
LOG_DBG("chan %p done", br_chan);
16341640
lower_data_ready(br_chan);
16351641

subsys/bluetooth/host/conn.c

Lines changed: 39 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -649,7 +649,7 @@ static bool is_acl_conn(struct bt_conn *conn)
649649
}
650650

651651
static int send_buf(struct bt_conn *conn, struct net_buf *buf,
652-
size_t len, void *cb, void *ud)
652+
size_t len, bt_conn_tx_cb_t cb, void *ud)
653653
{
654654
struct net_buf *frag = NULL;
655655
struct bt_conn_tx *tx = NULL;
@@ -659,13 +659,15 @@ static int send_buf(struct bt_conn *conn, struct net_buf *buf,
659659
if (buf->len == 0) {
660660
__ASSERT_NO_MSG(0);
661661

662-
return -EMSGSIZE;
662+
err = -EMSGSIZE;
663+
goto error_return;
663664
}
664665

665666
if (bt_buf_has_view(buf)) {
666667
__ASSERT_NO_MSG(0);
667668

668-
return -EIO;
669+
err = -EIO;
670+
goto error_return;
669671
}
670672

671673
LOG_DBG("conn %p buf %p len %zu buf->len %u cb %p ud %p",
@@ -680,7 +682,8 @@ static int send_buf(struct bt_conn *conn, struct net_buf *buf,
680682
*/
681683
__ASSERT(0, "No controller bufs");
682684

683-
return -ENOMEM;
685+
err = -ENOMEM;
686+
goto error_return;
684687
}
685688

686689
/* Allocate and set the TX context */
@@ -689,27 +692,27 @@ static int send_buf(struct bt_conn *conn, struct net_buf *buf,
689692
/* See big comment above */
690693
if (!tx) {
691694
__ASSERT(0, "No TX context");
692-
693-
return -ENOMEM;
695+
k_sem_give(bt_conn_get_pkts(conn));
696+
err = -ENOMEM;
697+
goto error_return;
694698
}
695699

696700
tx->cb = cb;
697701
tx->user_data = ud;
698702

699703
uint16_t frag_len = MIN(conn_mtu(conn), len);
700704

701-
__ASSERT_NO_MSG(buf->ref == 1);
705+
/* Check that buf->ref is 1 or 2. It would be 1 if this
706+
* was the only reference (e.g. buf was removed
707+
* from the conn tx_queue). It would be 2 if the
708+
* tx_data_pull kept it on the tx_queue for segmentation.
709+
*/
710+
__ASSERT_NO_MSG((buf->ref == 1) || (buf->ref == 2));
702711

703-
if (buf->len > frag_len) {
704-
LOG_DBG("keep %p around", buf);
705-
frag = get_data_frag(net_buf_ref(buf), frag_len);
706-
} else {
707-
LOG_DBG("move %p ref in", buf);
708-
/* Move the ref into `frag` for the last TX. That way `buf` will
709-
* get destroyed when `frag` is destroyed.
710-
*/
711-
frag = get_data_frag(buf, frag_len);
712-
}
712+
/* The reference is always transferred to the frag, so when
713+
* the frag is destroyed, the parent reference is decremented.
714+
*/
715+
frag = get_data_frag(buf, frag_len);
713716

714717
/* Caller is supposed to check we have all resources to send */
715718
__ASSERT_NO_MSG(frag != NULL);
@@ -723,7 +726,7 @@ static int send_buf(struct bt_conn *conn, struct net_buf *buf,
723726
conn->next_is_frag = false;
724727
}
725728

726-
LOG_DBG("send frag: buf %p len %d", buf, frag_len);
729+
LOG_DBG("send frag: buf %p len %d", frag, frag_len);
727730

728731
/* At this point, the buffer is either a fragment or a full HCI packet.
729732
* The flags are also valid.
@@ -766,15 +769,26 @@ static int send_buf(struct bt_conn *conn, struct net_buf *buf,
766769
*/
767770
net_buf_unref(frag);
768771

769-
/* `buf` might not get destroyed right away, and its `tx`
770-
* pointer will still be reachable. Make sure that we don't try
771-
* to use the destroyed context later.
772+
/* `buf` might not get destroyed right away because it may
773+
* still be on a conn tx_queue, and its `tx` pointer will still
774+
* be reachable. Make sure that we don't try to use the
775+
* destroyed context later.
772776
*/
773777
conn_tx_destroy(conn, tx);
774778
k_sem_give(bt_conn_get_pkts(conn));
775779

776780
/* Merge HCI driver errors */
777781
return -EIO;
782+
783+
error_return:
784+
/* Runtime handling of fatal errors when ASSERTS are disabled.
785+
* Unref the buf and invoke callback with the error.
786+
*/
787+
net_buf_unref(buf);
788+
if (cb) {
789+
cb(conn, ud, err);
790+
}
791+
return err;
778792
}
779793

780794
static struct k_poll_signal conn_change =
@@ -956,8 +970,8 @@ struct bt_conn *get_conn_ready(void)
956970
sys_slist_remove(&bt_dev.le.conn_ready, prev, &conn->_conn_ready);
957971
(void)atomic_set(&conn->_conn_ready_lock, 0);
958972

959-
/* Append connection to list if it still has data */
960-
if (conn->has_data(conn)) {
973+
/* Append connection to list if it is connected and still has data */
974+
if (conn->has_data(conn) && (conn->state == BT_CONN_CONNECTED)) {
961975
LOG_DBG("appending %p to back of TX queue", conn);
962976
bt_conn_data_ready(conn);
963977
}
@@ -985,30 +999,6 @@ static void acl_get_and_clear_cb(struct bt_conn *conn, struct net_buf *buf,
985999
}
9861000
#endif /* defined(CONFIG_BT_CONN) */
9871001

988-
/* Acts as a "null-routed" bt_send(). This fn will decrease the refcount of
989-
* `buf` and call the user callback with an error code.
990-
*/
991-
static void destroy_and_callback(struct bt_conn *conn,
992-
struct net_buf *buf,
993-
bt_conn_tx_cb_t cb,
994-
void *ud)
995-
{
996-
if (!cb) {
997-
conn->get_and_clear_cb(conn, buf, &cb, &ud);
998-
}
999-
1000-
LOG_DBG("pop: cb %p userdata %p", cb, ud);
1001-
1002-
/* bt_send() would've done an unref. Do it here also, so the buffer is
1003-
* hopefully destroyed and the user callback can allocate a new one.
1004-
*/
1005-
net_buf_unref(buf);
1006-
1007-
if (cb) {
1008-
cb(conn, ud, -ESHUTDOWN);
1009-
}
1010-
}
1011-
10121002
static volatile bool _suspend_tx;
10131003

10141004
#if defined(CONFIG_BT_TESTING)
@@ -1051,17 +1041,7 @@ void bt_conn_tx_processor(void)
10511041

10521042
if (conn->state != BT_CONN_CONNECTED) {
10531043
LOG_WRN("conn %p: not connected", conn);
1054-
1055-
/* Call the user callbacks & destroy (final-unref) the buffers
1056-
* we were supposed to send.
1057-
*/
1058-
buf = conn->tx_data_pull(conn, SIZE_MAX, &buf_len);
1059-
while (buf) {
1060-
destroy_and_callback(conn, buf, cb, ud);
1061-
buf = conn->tx_data_pull(conn, SIZE_MAX, &buf_len);
1062-
}
1063-
1064-
goto exit;
1044+
goto raise_and_exit;
10651045
}
10661046

10671047
/* now that we are guaranteed resources, we can pull data from the upper
@@ -1095,25 +1075,12 @@ void bt_conn_tx_processor(void)
10951075
int err = send_buf(conn, buf, buf_len, cb, ud);
10961076

10971077
if (err) {
1098-
/* -EIO means `unrecoverable error`. It can be an assertion that
1099-
* failed or an error from the HCI driver.
1100-
*
1101-
* -ENOMEM means we thought we had all the resources to send the
1102-
* buf (ie. TX context + controller buffer) but one of them was
1103-
* not available. This is likely due to a failure of
1104-
* assumption, likely that we have been pre-empted somehow and
1105-
* that `tx_processor()` has been re-entered.
1106-
*
1107-
* In both cases, we destroy the buffer and mark the connection
1108-
* as dead.
1109-
*/
11101078
LOG_ERR("Fatal error (%d). Disconnecting %p", err, conn);
1111-
destroy_and_callback(conn, buf, cb, ud);
11121079
bt_conn_disconnect(conn, BT_HCI_ERR_REMOTE_USER_TERM_CONN);
1113-
11141080
goto exit;
11151081
}
11161082

1083+
raise_and_exit:
11171084
/* Always kick the TX work. It will self-suspend if it doesn't get
11181085
* resources or there is nothing left to send.
11191086
*/

subsys/bluetooth/host/conn_internal.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,10 +288,22 @@ struct bt_conn {
288288
#endif
289289

290290
/* Callback into the higher-layers (L2CAP / ISO) to return a buffer for
291-
* sending `amount` of bytes to HCI.
291+
* sending `amount` of bytes to HCI. Will only be called when
292+
* the state is connected. The higher-layer is responsible for purging
293+
* the remaining buffers on disconnect.
292294
*
293295
* Scheduling from which channel to pull (e.g. for L2CAP) is done at the
294296
* upper layer's discretion.
297+
*
298+
* Details about the returned net_buf when it is not NULL:
299+
* - If the net_buf->len <= *length, then the net_buf has been removed
300+
* from the tx_queue of the connection and the caller is now the
301+
* owner of the only reference to the net_buf.
302+
* - Otherwise, the net_buf is still on the tx_queue of the connection,
303+
* and the callback has incremented the reference count to account
304+
* for it having a reference still.
305+
* - The caller must consume *length bytes from the net_buf before
306+
* calling this function again.
295307
*/
296308
struct net_buf * (*tx_data_pull)(struct bt_conn *conn,
297309
size_t amount,

subsys/bluetooth/host/iso.c

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -454,10 +454,18 @@ void bt_iso_connected(struct bt_conn *iso)
454454
static void bt_iso_chan_disconnected(struct bt_iso_chan *chan, uint8_t reason)
455455
{
456456
const uint8_t conn_type = chan->iso->iso.info.type;
457+
struct net_buf *buf;
458+
457459
LOG_DBG("%p, reason 0x%02x", chan, reason);
458460

459461
__ASSERT(chan->iso != NULL, "NULL conn for iso chan %p", chan);
460462

463+
/* release buffers from tx_queue */
464+
while ((buf = k_fifo_get(&chan->iso->iso.txq, K_NO_WAIT))) {
465+
__ASSERT_NO_MSG(!bt_buf_has_view(buf));
466+
net_buf_unref(buf);
467+
}
468+
461469
bt_iso_chan_set_state(chan, BT_ISO_STATE_DISCONNECTED);
462470
bt_conn_set_state(chan->iso, BT_CONN_DISCONNECT_COMPLETE);
463471

@@ -775,7 +783,8 @@ void bt_iso_recv(struct bt_conn *iso, struct net_buf *buf, uint8_t flags)
775783
static bool iso_has_data(struct bt_conn *conn)
776784
{
777785
#if defined(CONFIG_BT_ISO_TX)
778-
return !k_fifo_is_empty(&conn->iso.txq);
786+
return ((conn->iso.chan->state == BT_ISO_STATE_CONNECTED) &&
787+
!k_fifo_is_empty(&conn->iso.txq));
779788
#else /* !CONFIG_BT_ISO_TX */
780789
return false;
781790
#endif /* CONFIG_BT_ISO_TX */
@@ -789,45 +798,45 @@ static struct net_buf *iso_data_pull(struct bt_conn *conn, size_t amount, size_t
789798
/* Leave the PDU buffer in the queue until we have sent all its
790799
* fragments.
791800
*/
792-
struct net_buf *frag = k_fifo_peek_head(&conn->iso.txq);
801+
struct net_buf *q_frag = k_fifo_peek_head(&conn->iso.txq);
793802

794-
if (!frag) {
803+
if (!q_frag) {
795804
BT_ISO_DATA_DBG("signaled ready but no frag available");
796805
/* Service other connections */
797806
bt_tx_irq_raise();
798807

799808
return NULL;
800809
}
801810

802-
if (conn->iso.chan->state != BT_ISO_STATE_CONNECTED) {
803-
__maybe_unused struct net_buf *b = k_fifo_get(&conn->iso.txq, K_NO_WAIT);
811+
__ASSERT_NO_MSG(conn->state == BT_CONN_CONNECTED);
804812

813+
if (conn->iso.chan->state != BT_ISO_STATE_CONNECTED) {
805814
LOG_DBG("channel has been disconnected");
806-
__ASSERT_NO_MSG(b == frag);
807-
808-
net_buf_unref(b);
809815

810816
/* Service other connections */
811817
bt_tx_irq_raise();
812818

813819
return NULL;
814820
}
815821

816-
if (bt_buf_has_view(frag)) {
822+
if (bt_buf_has_view(q_frag)) {
817823
/* This should not happen. conn.c should wait until the view is
818824
* destroyed before requesting more data.
819825
*/
820826
LOG_DBG("already have view");
821827
return NULL;
822828
}
823829

830+
struct net_buf *frag = net_buf_ref(q_frag);
824831
bool last_frag = amount >= frag->len;
825832

826833
if (last_frag) {
827-
__maybe_unused struct net_buf *b = k_fifo_get(&conn->iso.txq, K_NO_WAIT);
834+
q_frag = k_fifo_get(&conn->iso.txq, K_NO_WAIT);
828835

829836
BT_ISO_DATA_DBG("last frag, pop buf");
830-
__ASSERT_NO_MSG(b == frag);
837+
__ASSERT_NO_MSG(q_frag == frag);
838+
839+
net_buf_unref(q_frag);
831840
}
832841

833842
*length = frag->len;

0 commit comments

Comments
 (0)