
Commit 82a8a30

ps-ushankar authored and axboe committed
ublk: improve detection and handling of ublk server exit
There are currently two ways in which ublk server exit is detected by ublk_drv:

1. uring_cmd cancellation. If there are any outstanding uring_cmds which have not been completed to the ublk server when it exits, io_uring calls the uring_cmd callback with a special cancellation flag as the issuing task is exiting.

2. I/O timeout. This is needed in addition to the above to handle the "saturated queue" case, when all I/Os for a given queue are in the ublk server, and therefore there are no outstanding uring_cmds to cancel when the ublk server exits.

There are a couple of issues with this approach:

- It is complex and inelegant to have two methods to detect the same condition.

- The second method detects ublk server exit only after a long delay (~30s, the default timeout assigned by the block layer). This delays the nosrv behavior from kicking in, and any potential subsequent recovery of the device.

The second issue is brought to light by the new test_generic_06, which will be added in a following patch. It fails before this fix:

selftests: ublk: test_generic_06.sh
dev id is 0
dd: error writing '/dev/ublkb0': Input/output error
1+0 records in
0+0 records out
0 bytes copied, 30.0611 s, 0.0 kB/s
DEAD
dd took 31 seconds to exit (>= 5s tolerance)!
generic_06 : [FAIL]

Fix this by instead detecting and handling ublk server exit in the character file release callback. This has several advantages:

- This one place can handle both saturated and unsaturated queues. Thus, it replaces both preexisting methods of detecting ublk server exit.

- It runs quickly on ublk server exit - there is no 30s delay.

- It starts the process of removing task references in ublk_drv. This is needed if we want to relax restrictions in the driver, like the rule that only one thread may serve each queue.

There is also the disadvantage that the character file release callback can also be triggered by an intentional close of the file, which is a significant behavior change. Preexisting ublk servers (libublksrv) depend on the ability to open/close the file multiple times. To address this, only transition to a nosrv state if the file is released while the ublk device is live. This allows programs to open/close the file multiple times during setup. It is still a behavior change if a ublk server decides to close/reopen the file while the device is LIVE (i.e. while it is responsible for serving I/O), but that would be highly unusual. This behavior is in line with what is done by FUSE, which is very similar to ublk in that a userspace daemon provides services traditionally provided by the kernel.

With this change in, the new test (and all other selftests, and all ublksrv tests) pass:

selftests: ublk: test_generic_06.sh
dev id is 0
dd: error writing '/dev/ublkb0': Input/output error
1+0 records in
0+0 records out
0 bytes copied, 0.0376731 s, 0.0 kB/s
DEAD
generic_06 : [PASS]

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250416035444.99569-6-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
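In outline, the new detection path works as follows. This is a condensed sketch of the ublk_ch_release() logic from the diff below, with the quiesce/unquiesce calls and the stop/quiesce/fail-IO branches collapsed into a comment; treat it as a reading aid, not the verbatim code:

static int ublk_ch_release(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = filp->private_data;
	struct gendisk *disk = ublk_get_disk(ub);	/* NULL unless live */
	int i;

	if (!disk)
		goto out;	/* setup-time open/close: nothing to do */

	/*
	 * Stage 2 of cancellation: all uring_cmds are already done (the
	 * file cannot be released until they are), so abort outstanding
	 * requests locklessly before taking ub->mutex.
	 */
	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
		struct ublk_queue *ubq = ublk_get_queue(ub, i);

		ubq->canceling = true;
		ublk_abort_queue(ub, ubq);
	}
	blk_mq_kick_requeue_list(disk->queue);

	mutex_lock(&ub->mutex);
	if (ub->ub_disk) {
		/* transition to nosrv: stop the device, quiesce it for
		 * recovery, or fail all I/O, per the recovery flags */
	}
	mutex_unlock(&ub->mutex);

	ublk_put_disk(disk);
	ublk_reset_ch_dev(ub);
out:
	clear_bit(UB_STATE_OPEN, &ub->state);
	return 0;
}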
Parent: 728cbac


drivers/block/ublk_drv.c

Lines changed: 123 additions & 100 deletions
@@ -199,8 +199,6 @@ struct ublk_device {
 	struct completion	completion;
 	unsigned int		nr_queues_ready;
 	unsigned int		nr_privileged_daemon;
-
-	struct work_struct	nosrv_work;
 };
 
 /* header of ublk_params */
@@ -209,8 +207,9 @@ struct ublk_params_header {
 	__u32	types;
 };
 
-static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
-
+static void ublk_stop_dev_unlocked(struct ublk_device *ub);
+static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
+static void __ublk_quiesce_dev(struct ublk_device *ub);
 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
 		struct ublk_queue *ubq, int tag, size_t offset);
 static inline unsigned int ublk_req_build_flags(struct request *req);
@@ -1336,8 +1335,6 @@ static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
 {
 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
-	unsigned int nr_inflight = 0;
-	int i;
 
 	if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
 		if (!ubq->timeout) {
@@ -1348,26 +1345,6 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
 		return BLK_EH_DONE;
 	}
 
-	if (!ubq_daemon_is_dying(ubq))
-		return BLK_EH_RESET_TIMER;
-
-	for (i = 0; i < ubq->q_depth; i++) {
-		struct ublk_io *io = &ubq->ios[i];
-
-		if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
-			nr_inflight++;
-	}
-
-	/* cancelable uring_cmd can't help us if all commands are in-flight */
-	if (nr_inflight == ubq->q_depth) {
-		struct ublk_device *ub = ubq->dev;
-
-		if (ublk_abort_requests(ub, ubq)) {
-			schedule_work(&ub->nosrv_work);
-		}
-		return BLK_EH_DONE;
-	}
-
 	return BLK_EH_RESET_TIMER;
 }
 
@@ -1525,13 +1502,105 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
 	ub->nr_privileged_daemon = 0;
 }
 
+static struct gendisk *ublk_get_disk(struct ublk_device *ub)
+{
+	struct gendisk *disk;
+
+	spin_lock(&ub->lock);
+	disk = ub->ub_disk;
+	if (disk)
+		get_device(disk_to_dev(disk));
+	spin_unlock(&ub->lock);
+
+	return disk;
+}
+
+static void ublk_put_disk(struct gendisk *disk)
+{
+	if (disk)
+		put_device(disk_to_dev(disk));
+}
+
 static int ublk_ch_release(struct inode *inode, struct file *filp)
 {
 	struct ublk_device *ub = filp->private_data;
+	struct gendisk *disk;
+	int i;
+
+	/*
+	 * disk isn't attached yet, either device isn't live, or it has
+	 * been removed already, so we needn't do anything
+	 */
+	disk = ublk_get_disk(ub);
+	if (!disk)
+		goto out;
+
+	/*
+	 * All uring_cmd are done now, so abort any request outstanding to
+	 * the ublk server
+	 *
+	 * This can be done in a lockless way because the ublk server is
+	 * gone
+	 *
+	 * More importantly, we have to provide forward progress guarantee
+	 * without holding ub->mutex, otherwise control task grabbing
+	 * ub->mutex triggers deadlock
+	 *
+	 * All requests may be inflight, so ->canceling may not be set, set
+	 * it now.
+	 */
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+		struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+		ubq->canceling = true;
+		ublk_abort_queue(ub, ubq);
+	}
+	blk_mq_kick_requeue_list(disk->queue);
+
+	/*
+	 * All inflight requests have been completed or requeued and any new
+	 * request will be failed or requeued via `->canceling` now, so it is
+	 * fine to grab ub->mutex now.
+	 */
+	mutex_lock(&ub->mutex);
+
+	/* double check after grabbing lock */
+	if (!ub->ub_disk)
+		goto unlock;
+
+	/*
+	 * Transition the device to the nosrv state. What exactly this
+	 * means depends on the recovery flags
+	 */
+	blk_mq_quiesce_queue(disk->queue);
+	if (ublk_nosrv_should_stop_dev(ub)) {
+		/*
+		 * Allow any pending/future I/O to pass through quickly
+		 * with an error. This is needed because del_gendisk
+		 * waits for all pending I/O to complete
+		 */
+		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+			ublk_get_queue(ub, i)->force_abort = true;
+		blk_mq_unquiesce_queue(disk->queue);
+
+		ublk_stop_dev_unlocked(ub);
+	} else {
+		if (ublk_nosrv_dev_should_queue_io(ub)) {
+			__ublk_quiesce_dev(ub);
+		} else {
+			ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
+			for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+				ublk_get_queue(ub, i)->fail_io = true;
+		}
+		blk_mq_unquiesce_queue(disk->queue);
+	}
+unlock:
+	mutex_unlock(&ub->mutex);
+	ublk_put_disk(disk);
 
 	/* all uring_cmd has been done now, reset device & ubq */
 	ublk_reset_ch_dev(ub);
-
+out:
 	clear_bit(UB_STATE_OPEN, &ub->state);
 	return 0;
 }
@@ -1627,37 +1696,22 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 }
 
 /* Must be called when queue is frozen */
-static bool ublk_mark_queue_canceling(struct ublk_queue *ubq)
+static void ublk_mark_queue_canceling(struct ublk_queue *ubq)
 {
-	bool canceled;
-
 	spin_lock(&ubq->cancel_lock);
-	canceled = ubq->canceling;
-	if (!canceled)
+	if (!ubq->canceling)
 		ubq->canceling = true;
 	spin_unlock(&ubq->cancel_lock);
-
-	return canceled;
 }
 
-static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
+static void ublk_start_cancel(struct ublk_queue *ubq)
 {
-	bool was_canceled = ubq->canceling;
-	struct gendisk *disk;
-
-	if (was_canceled)
-		return false;
-
-	spin_lock(&ub->lock);
-	disk = ub->ub_disk;
-	if (disk)
-		get_device(disk_to_dev(disk));
-	spin_unlock(&ub->lock);
+	struct ublk_device *ub = ubq->dev;
+	struct gendisk *disk = ublk_get_disk(ub);
 
 	/* Our disk has been dead */
 	if (!disk)
-		return false;
-
+		return;
 	/*
 	 * Now we are serialized with ublk_queue_rq()
 	 *
@@ -1666,15 +1720,9 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
 	 * touch completed uring_cmd
 	 */
 	blk_mq_quiesce_queue(disk->queue);
-	was_canceled = ublk_mark_queue_canceling(ubq);
-	if (!was_canceled) {
-		/* abort queue is for making forward progress */
-		ublk_abort_queue(ub, ubq);
-	}
+	ublk_mark_queue_canceling(ubq);
 	blk_mq_unquiesce_queue(disk->queue);
-	put_device(disk_to_dev(disk));
-
-	return !was_canceled;
+	ublk_put_disk(disk);
 }
 
 static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
@@ -1698,15 +1746,24 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
 /*
  * The ublk char device won't be closed when calling cancel fn, so both
  * ublk device and queue are guaranteed to be live
+ *
+ * Two-stage cancel:
+ *
+ * - make every active uring_cmd done in ->cancel_fn()
+ *
+ * - abort inflight ublk IO requests in the ublk char device release
+ *   handler, which depends on the 1st stage because the device can only
+ *   be closed once all uring_cmd are done
+ *
+ * Do _not_ try to acquire ub->mutex before all inflight requests are
+ * aborted, otherwise deadlock may be caused.
  */
 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	struct ublk_queue *ubq = pdu->ubq;
 	struct task_struct *task;
-	struct ublk_device *ub;
-	bool need_schedule;
 	struct ublk_io *io;
 
 	if (WARN_ON_ONCE(!ubq))
@@ -1719,16 +1776,12 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
 	if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
 		return;
 
-	ub = ubq->dev;
-	need_schedule = ublk_abort_requests(ub, ubq);
+	if (!ubq->canceling)
+		ublk_start_cancel(ubq);
 
 	io = &ubq->ios[pdu->tag];
 	WARN_ON_ONCE(io->cmd != cmd);
 	ublk_cancel_cmd(ubq, io, issue_flags);
-
-	if (need_schedule) {
-		schedule_work(&ub->nosrv_work);
-	}
 }
 
 static inline bool ublk_queue_ready(struct ublk_queue *ubq)
@@ -1787,13 +1840,11 @@ static void __ublk_quiesce_dev(struct ublk_device *ub)
 			__func__, ub->dev_info.dev_id,
 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
 			"LIVE" : "QUIESCED");
-	blk_mq_quiesce_queue(ub->ub_disk->queue);
 	/* mark every queue as canceling */
 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
 		ublk_get_queue(ub, i)->canceling = true;
 	ublk_wait_tagset_rqs_idle(ub);
 	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
-	blk_mq_unquiesce_queue(ub->ub_disk->queue);
 }
 
 static void ublk_force_abort_dev(struct ublk_device *ub)
@@ -1830,50 +1881,25 @@ static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
 	return disk;
 }
 
-static void ublk_stop_dev(struct ublk_device *ub)
+static void ublk_stop_dev_unlocked(struct ublk_device *ub)
+	__must_hold(&ub->mutex)
 {
 	struct gendisk *disk;
 
-	mutex_lock(&ub->mutex);
 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
-		goto unlock;
+		return;
+
 	if (ublk_nosrv_dev_should_queue_io(ub))
 		ublk_force_abort_dev(ub);
 	del_gendisk(ub->ub_disk);
 	disk = ublk_detach_disk(ub);
 	put_disk(disk);
-unlock:
-	mutex_unlock(&ub->mutex);
-	ublk_cancel_dev(ub);
 }
 
-static void ublk_nosrv_work(struct work_struct *work)
+static void ublk_stop_dev(struct ublk_device *ub)
 {
-	struct ublk_device *ub =
-		container_of(work, struct ublk_device, nosrv_work);
-	int i;
-
-	if (ublk_nosrv_should_stop_dev(ub)) {
-		ublk_stop_dev(ub);
-		return;
-	}
-
 	mutex_lock(&ub->mutex);
-	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
-		goto unlock;
-
-	if (ublk_nosrv_dev_should_queue_io(ub)) {
-		__ublk_quiesce_dev(ub);
-	} else {
-		blk_mq_quiesce_queue(ub->ub_disk->queue);
-		ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
-		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
-			ublk_get_queue(ub, i)->fail_io = true;
-		}
-		blk_mq_unquiesce_queue(ub->ub_disk->queue);
-	}
-
-unlock:
+	ublk_stop_dev_unlocked(ub);
 	mutex_unlock(&ub->mutex);
 	ublk_cancel_dev(ub);
 }
@@ -2502,7 +2528,6 @@ static void ublk_remove(struct ublk_device *ub)
 	bool unprivileged;
 
 	ublk_stop_dev(ub);
-	cancel_work_sync(&ub->nosrv_work);
 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
 	unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
 	ublk_put_device(ub);
@@ -2787,7 +2812,6 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 		goto out_unlock;
 	mutex_init(&ub->mutex);
 	spin_lock_init(&ub->lock);
-	INIT_WORK(&ub->nosrv_work, ublk_nosrv_work);
 
 	ret = ublk_alloc_dev_number(ub, header->dev_id);
 	if (ret < 0)
@@ -2919,7 +2943,6 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
 static int ublk_ctrl_stop_dev(struct ublk_device *ub)
 {
 	ublk_stop_dev(ub);
-	cancel_work_sync(&ub->nosrv_work);
 	return 0;
 }
 
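To make the open/close behavior change concrete, here is a minimal userspace sketch from a ublk server's point of view. It is illustrative only: it assumes a ublk device 0 already exists and elides the control-command setup (UBLK_CMD_ADD_DEV, queue initialization, UBLK_CMD_START_DEV) and the actual I/O handling loop.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/*
	 * During setup, the char device may be opened and closed any
	 * number of times (libublksrv relies on this): ublk_ch_release()
	 * finds no attached disk and returns without doing anything.
	 */
	int fd = open("/dev/ublkc0", O_RDWR);

	if (fd < 0) {
		perror("open /dev/ublkc0");
		return 1;
	}
	close(fd);

	/* ... UBLK_CMD_START_DEV issued elsewhere; device is now LIVE ... */

	/*
	 * While LIVE, a release is treated as ublk server exit: the device
	 * is stopped, quiesced for recovery, or switched to FAIL_IO,
	 * depending on its recovery flags.
	 */
	fd = open("/dev/ublkc0", O_RDWR);
	if (fd >= 0)
		close(fd);
	return 0;
}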
