Skip to content

Commit f7f70f4

Browse files
Ruozhu Li authored and Christoph Hellwig committed
nvme: fix regression when disconnecting a recovering ctrl
We encountered a problem where the disconnect command hangs. After analyzing the log and stack, we found that the triggering process is as follows:

CPU0                                CPU1
                                    nvme_rdma_error_recovery_work
                                      nvme_rdma_teardown_io_queues
nvme_do_delete_ctrl                     nvme_stop_queues
  nvme_remove_namespaces
  --clear ctrl->namespaces
                                        nvme_start_queues
                                        --no ns in ctrl->namespaces
    nvme_ns_remove                      return(because ctrl is deleting)
      blk_freeze_queue
        blk_mq_freeze_queue_wait
        --wait for ns to unquiesce to clean inflight IO, hang forever

This problem was not found in older kernels because we would flush err work in nvme_stop_ctrl before nvme_remove_namespaces. It does not seem to have been modified for functional reasons, so the patch can be reverted to solve the problem.

Revert commit 794a4cb ("nvme: remove the .stop_ctrl callout")

Signed-off-by: Ruozhu Li <liruozhu@huawei.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
1 parent 1629de0 commit f7f70f4

File tree

4 files changed

+19
-6
lines changed

4 files changed

+19
-6
lines changed

drivers/nvme/host/core.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4595,6 +4595,8 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
45954595
nvme_stop_failfast_work(ctrl);
45964596
flush_work(&ctrl->async_event_work);
45974597
cancel_work_sync(&ctrl->fw_act_work);
4598+
if (ctrl->ops->stop_ctrl)
4599+
ctrl->ops->stop_ctrl(ctrl);
45984600
}
45994601
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
46004602

drivers/nvme/host/nvme.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,7 @@ struct nvme_ctrl_ops {
502502
void (*free_ctrl)(struct nvme_ctrl *ctrl);
503503
void (*submit_async_event)(struct nvme_ctrl *ctrl);
504504
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
505+
void (*stop_ctrl)(struct nvme_ctrl *ctrl);
505506
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
506507
void (*print_device_info)(struct nvme_ctrl *ctrl);
507508
};

drivers/nvme/host/rdma.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,6 +1048,14 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
10481048
}
10491049
}
10501050

1051+
/*
 * RDMA transport implementation of the nvme_ctrl_ops .stop_ctrl hook.
 *
 * Synchronously cancels the controller's error-recovery work (err_work)
 * and its delayed reconnect work (reconnect_work). The core invokes this
 * hook from nvme_stop_ctrl(); per the commit description, that placement
 * stops error recovery before nvme_remove_namespaces() runs, instead of
 * only later in nvme_rdma_shutdown_ctrl() where these two calls used to be.
 *
 * NOTE(review): cancel_work_sync() presumably also drops an err_work that
 * is queued but has not started yet - confirm no caller relies on that
 * recovery pass actually running.
 */
static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
1052+
{
1053+
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
1054+
1055+
cancel_work_sync(&ctrl->err_work);
1056+
cancel_delayed_work_sync(&ctrl->reconnect_work);
1057+
}
1058+
10511059
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
10521060
{
10531061
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -2252,9 +2260,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
22522260

22532261
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
22542262
{
2255-
cancel_work_sync(&ctrl->err_work);
2256-
cancel_delayed_work_sync(&ctrl->reconnect_work);
2257-
22582263
nvme_rdma_teardown_io_queues(ctrl, shutdown);
22592264
nvme_stop_admin_queue(&ctrl->ctrl);
22602265
if (shutdown)
@@ -2304,6 +2309,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
23042309
.submit_async_event = nvme_rdma_submit_async_event,
23052310
.delete_ctrl = nvme_rdma_delete_ctrl,
23062311
.get_address = nvmf_get_address,
2312+
.stop_ctrl = nvme_rdma_stop_ctrl,
23072313
};
23082314

23092315
/*

drivers/nvme/host/tcp.c

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2193,9 +2193,6 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
21932193

21942194
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
21952195
{
2196-
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
2197-
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2198-
21992196
nvme_tcp_teardown_io_queues(ctrl, shutdown);
22002197
nvme_stop_admin_queue(ctrl);
22012198
if (shutdown)
@@ -2235,6 +2232,12 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
22352232
nvme_tcp_reconnect_or_remove(ctrl);
22362233
}
22372234

2235+
/*
 * TCP transport implementation of the nvme_ctrl_ops .stop_ctrl hook.
 *
 * Synchronously cancels the controller's error-recovery work (err_work)
 * and its delayed connect work (connect_work). The core invokes this hook
 * from nvme_stop_ctrl(); per the commit description, that placement stops
 * error recovery before nvme_remove_namespaces() runs, instead of only
 * later in nvme_tcp_teardown_ctrl() where these two calls used to be.
 *
 * NOTE(review): cancel_work_sync() presumably also drops an err_work that
 * is queued but has not started yet - confirm no caller relies on that
 * recovery pass actually running.
 */
static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
2236+
{
2237+
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
2238+
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2239+
}
2240+
22382241
static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
22392242
{
22402243
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
@@ -2556,6 +2559,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
25562559
.submit_async_event = nvme_tcp_submit_async_event,
25572560
.delete_ctrl = nvme_tcp_delete_ctrl,
25582561
.get_address = nvmf_get_address,
2562+
.stop_ctrl = nvme_tcp_stop_ctrl,
25592563
};
25602564

25612565
static bool

0 commit comments

Comments
 (0)