Skip to content

Commit 7c6e99c

Browse files
mikechristiemartinkpetersen
authored andcommitted
scsi: iscsi: Fix conn cleanup and stop race during iscsid restart
If iscsid is doing a stop_conn at the same time the kernel is starting error recovery we can hit a race that allows the cleanup work to run on a valid connection. In the race, iscsi_if_stop_conn sees the cleanup bit set, but it calls flush_work on the clean_work before iscsi_conn_error_event has queued it. The flush then returns before the queueing and so the cleanup_work can run later and disconnect/stop a conn while it's in a connected state. The patch: Commit 0ab7104 ("scsi: iscsi: Perform connection failure entirely in kernel space") added the late stop_conn call bug originally, and the patch: Commit 23d6fef ("scsi: iscsi: Fix in-kernel conn failure handling") attempted to fix it but only fixed the normal EH case and left the above race for the iscsid restart case. For the normal EH case we don't hit the race because we only signal userspace to start recovery after we have done the queueing, so the flush will always catch the queued work or see it completed. For iscsid restart cases like boot, we can hit the race because iscsid will call down to the kernel before the kernel has signaled any error, so both code paths can be running at the same time. This adds a lock around the setting of the cleanup bit and queueing so they happen together. Link: https://lore.kernel.org/r/20220408001314.5014-6-michael.christie@oracle.com Fixes: 0ab7104 ("scsi: iscsi: Perform connection failure entirely in kernel space") Tested-by: Manish Rangankar <mrangankar@marvell.com> Reviewed-by: Lee Duncan <lduncan@suse.com> Reviewed-by: Chris Leech <cleech@redhat.com> Signed-off-by: Mike Christie <michael.christie@oracle.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
1 parent 0aadafb commit 7c6e99c

File tree

2 files changed

+19
-0
lines changed

2 files changed

+19
-0
lines changed

drivers/scsi/scsi_transport_iscsi.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2240,9 +2240,12 @@ static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn,
22402240
bool is_active)
22412241
{
22422242
/* Check if this was a conn error and the kernel took ownership */
2243+
spin_lock_irq(&conn->lock);
22432244
if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
2245+
spin_unlock_irq(&conn->lock);
22442246
iscsi_ep_disconnect(conn, is_active);
22452247
} else {
2248+
spin_unlock_irq(&conn->lock);
22462249
ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n");
22472250
mutex_unlock(&conn->ep_mutex);
22482251

@@ -2289,9 +2292,12 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
22892292
/*
22902293
* Figure out if it was the kernel or userspace initiating this.
22912294
*/
2295+
spin_lock_irq(&conn->lock);
22922296
if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
2297+
spin_unlock_irq(&conn->lock);
22932298
iscsi_stop_conn(conn, flag);
22942299
} else {
2300+
spin_unlock_irq(&conn->lock);
22952301
ISCSI_DBG_TRANS_CONN(conn,
22962302
"flush kernel conn cleanup.\n");
22972303
flush_work(&conn->cleanup_work);
@@ -2300,7 +2306,9 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport,
23002306
* Only clear for recovery to avoid extra cleanup runs during
23012307
* termination.
23022308
*/
2309+
spin_lock_irq(&conn->lock);
23032310
clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags);
2311+
spin_unlock_irq(&conn->lock);
23042312
}
23052313
ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop done.\n");
23062314
return 0;
@@ -2321,7 +2329,9 @@ static void iscsi_cleanup_conn_work_fn(struct work_struct *work)
23212329
*/
23222330
if (conn->state != ISCSI_CONN_BOUND && conn->state != ISCSI_CONN_UP) {
23232331
ISCSI_DBG_TRANS_CONN(conn, "Got error while conn is already failed. Ignoring.\n");
2332+
spin_lock_irq(&conn->lock);
23242333
clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags);
2334+
spin_unlock_irq(&conn->lock);
23252335
mutex_unlock(&conn->ep_mutex);
23262336
return;
23272337
}
@@ -2376,6 +2386,7 @@ iscsi_alloc_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
23762386
conn->dd_data = &conn[1];
23772387

23782388
mutex_init(&conn->ep_mutex);
2389+
spin_lock_init(&conn->lock);
23792390
INIT_LIST_HEAD(&conn->conn_list);
23802391
INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn);
23812392
conn->transport = transport;
@@ -2578,9 +2589,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
25782589
struct iscsi_uevent *ev;
25792590
struct iscsi_internal *priv;
25802591
int len = nlmsg_total_size(sizeof(*ev));
2592+
unsigned long flags;
25812593

2594+
spin_lock_irqsave(&conn->lock, flags);
25822595
if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags))
25832596
queue_work(iscsi_conn_cleanup_workq, &conn->cleanup_work);
2597+
spin_unlock_irqrestore(&conn->lock, flags);
25842598

25852599
priv = iscsi_if_transport_lookup(conn->transport);
25862600
if (!priv)
@@ -3723,11 +3737,14 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport,
37233737
return -EINVAL;
37243738

37253739
mutex_lock(&conn->ep_mutex);
3740+
spin_lock_irq(&conn->lock);
37263741
if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) {
3742+
spin_unlock_irq(&conn->lock);
37273743
mutex_unlock(&conn->ep_mutex);
37283744
ev->r.retcode = -ENOTCONN;
37293745
return 0;
37303746
}
3747+
spin_unlock_irq(&conn->lock);
37313748

37323749
switch (nlh->nlmsg_type) {
37333750
case ISCSI_UEVENT_BIND_CONN:

include/scsi/scsi_transport_iscsi.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,8 @@ struct iscsi_cls_conn {
211211
struct mutex ep_mutex;
212212
struct iscsi_endpoint *ep;
213213

214+
/* Used when accessing flags and queueing work. */
215+
spinlock_t lock;
214216
unsigned long flags;
215217
struct work_struct cleanup_work;
216218

0 commit comments

Comments
 (0)