Skip to content
This repository was archived by the owner on Nov 8, 2023. It is now read-only.

Commit be647e2

Browse files
committed
nvme: use srcu for iterating namespace list
The nvme pci driver synchronizes with all the namespace queues during a reset to ensure that there's no pending timeout work. Meanwhile the timeout work potentially iterates those same namespaces to freeze their queues. Each of those namespace iterations use the same read lock. If a write lock should somehow get between the synchronize and freeze steps, then forward progress is deadlocked. We had been relying on the nvme controller state machine to ensure the reset work wouldn't conflict with timeout work. That guarantee may be a bit fragile to rely on, so iterate the namespace lists without taking potentially circular locks, as reported by lockdep. Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/ Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com> Tested-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Keith Busch <kbusch@kernel.org>
1 parent 1bd293f commit be647e2

File tree

4 files changed

+83
-56
lines changed

4 files changed

+83
-56
lines changed

drivers/nvme/host/core.c

Lines changed: 59 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -678,7 +678,7 @@ static void nvme_free_ns(struct kref *kref)
678678
kfree(ns);
679679
}
680680

681-
static inline bool nvme_get_ns(struct nvme_ns *ns)
681+
bool nvme_get_ns(struct nvme_ns *ns)
682682
{
683683
return kref_get_unless_zero(&ns->kref);
684684
}
@@ -3684,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
36843684
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
36853685
{
36863686
struct nvme_ns *ns, *ret = NULL;
3687+
int srcu_idx;
36873688

3688-
down_read(&ctrl->namespaces_rwsem);
3689-
list_for_each_entry(ns, &ctrl->namespaces, list) {
3689+
srcu_idx = srcu_read_lock(&ctrl->srcu);
3690+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
36903691
if (ns->head->ns_id == nsid) {
36913692
if (!nvme_get_ns(ns))
36923693
continue;
@@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
36963697
if (ns->head->ns_id > nsid)
36973698
break;
36983699
}
3699-
up_read(&ctrl->namespaces_rwsem);
3700+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
37003701
return ret;
37013702
}
37023703
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
@@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
37103711

37113712
list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
37123713
if (tmp->head->ns_id < ns->head->ns_id) {
3713-
list_add(&ns->list, &tmp->list);
3714+
list_add_rcu(&ns->list, &tmp->list);
37143715
return;
37153716
}
37163717
}
@@ -3776,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
37763777
if (nvme_update_ns_info(ns, info))
37773778
goto out_unlink_ns;
37783779

3779-
down_write(&ctrl->namespaces_rwsem);
3780+
mutex_lock(&ctrl->namespaces_lock);
37803781
/*
37813782
* Ensure that no namespaces are added to the ctrl list after the queues
37823783
* are frozen, thereby avoiding a deadlock between scan and reset.
37833784
*/
37843785
if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
3785-
up_write(&ctrl->namespaces_rwsem);
3786+
mutex_unlock(&ctrl->namespaces_lock);
37863787
goto out_unlink_ns;
37873788
}
37883789
nvme_ns_add_to_ctrl_list(ns);
3789-
up_write(&ctrl->namespaces_rwsem);
3790+
mutex_unlock(&ctrl->namespaces_lock);
3791+
synchronize_srcu(&ctrl->srcu);
37903792
nvme_get_ctrl(ctrl);
37913793

37923794
if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
@@ -3809,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
38093811

38103812
out_cleanup_ns_from_list:
38113813
nvme_put_ctrl(ctrl);
3812-
down_write(&ctrl->namespaces_rwsem);
3813-
list_del_init(&ns->list);
3814-
up_write(&ctrl->namespaces_rwsem);
3814+
mutex_lock(&ctrl->namespaces_lock);
3815+
list_del_rcu(&ns->list);
3816+
mutex_unlock(&ctrl->namespaces_lock);
3817+
synchronize_srcu(&ctrl->srcu);
38153818
out_unlink_ns:
38163819
mutex_lock(&ctrl->subsys->lock);
38173820
list_del_rcu(&ns->siblings);
@@ -3861,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
38613864
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
38623865
del_gendisk(ns->disk);
38633866

3864-
down_write(&ns->ctrl->namespaces_rwsem);
3865-
list_del_init(&ns->list);
3866-
up_write(&ns->ctrl->namespaces_rwsem);
3867+
mutex_lock(&ns->ctrl->namespaces_lock);
3868+
list_del_rcu(&ns->list);
3869+
mutex_unlock(&ns->ctrl->namespaces_lock);
3870+
synchronize_srcu(&ns->ctrl->srcu);
38673871

38683872
if (last_path)
38693873
nvme_mpath_shutdown_disk(ns->head);
@@ -3953,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
39533957
struct nvme_ns *ns, *next;
39543958
LIST_HEAD(rm_list);
39553959

3956-
down_write(&ctrl->namespaces_rwsem);
3960+
mutex_lock(&ctrl->namespaces_lock);
39573961
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
39583962
if (ns->head->ns_id > nsid)
3959-
list_move_tail(&ns->list, &rm_list);
3963+
list_splice_init_rcu(&ns->list, &rm_list,
3964+
synchronize_rcu);
39603965
}
3961-
up_write(&ctrl->namespaces_rwsem);
3966+
mutex_unlock(&ctrl->namespaces_lock);
3967+
synchronize_srcu(&ctrl->srcu);
39623968

39633969
list_for_each_entry_safe(ns, next, &rm_list, list)
39643970
nvme_ns_remove(ns);
3965-
39663971
}
39673972

39683973
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -4132,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
41324137
/* this is a no-op when called from the controller reset handler */
41334138
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
41344139

4135-
down_write(&ctrl->namespaces_rwsem);
4136-
list_splice_init(&ctrl->namespaces, &ns_list);
4137-
up_write(&ctrl->namespaces_rwsem);
4140+
mutex_lock(&ctrl->namespaces_lock);
4141+
list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
4142+
mutex_unlock(&ctrl->namespaces_lock);
4143+
synchronize_srcu(&ctrl->srcu);
41384144

41394145
list_for_each_entry_safe(ns, next, &ns_list, list)
41404146
nvme_ns_remove(ns);
@@ -4582,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev)
45824588
key_put(ctrl->tls_key);
45834589
nvme_free_cels(ctrl);
45844590
nvme_mpath_uninit(ctrl);
4591+
cleanup_srcu_struct(&ctrl->srcu);
45854592
nvme_auth_stop(ctrl);
45864593
nvme_auth_free(ctrl);
45874594
__free_page(ctrl->discard_page);
@@ -4614,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
46144621
ctrl->passthru_err_log_enabled = false;
46154622
clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
46164623
spin_lock_init(&ctrl->lock);
4624+
mutex_init(&ctrl->namespaces_lock);
4625+
4626+
ret = init_srcu_struct(&ctrl->srcu);
4627+
if (ret)
4628+
return ret;
4629+
46174630
mutex_init(&ctrl->scan_lock);
46184631
INIT_LIST_HEAD(&ctrl->namespaces);
46194632
xa_init(&ctrl->cels);
4620-
init_rwsem(&ctrl->namespaces_rwsem);
46214633
ctrl->dev = dev;
46224634
ctrl->ops = ops;
46234635
ctrl->quirks = quirks;
@@ -4697,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
46974709
out:
46984710
if (ctrl->discard_page)
46994711
__free_page(ctrl->discard_page);
4712+
cleanup_srcu_struct(&ctrl->srcu);
47004713
return ret;
47014714
}
47024715
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -4705,61 +4718,66 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
47054718
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
47064719
{
47074720
struct nvme_ns *ns;
4721+
int srcu_idx;
47084722

4709-
down_read(&ctrl->namespaces_rwsem);
4710-
list_for_each_entry(ns, &ctrl->namespaces, list)
4723+
srcu_idx = srcu_read_lock(&ctrl->srcu);
4724+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
47114725
blk_mark_disk_dead(ns->disk);
4712-
up_read(&ctrl->namespaces_rwsem);
4726+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
47134727
}
47144728
EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
47154729

47164730
void nvme_unfreeze(struct nvme_ctrl *ctrl)
47174731
{
47184732
struct nvme_ns *ns;
4733+
int srcu_idx;
47194734

4720-
down_read(&ctrl->namespaces_rwsem);
4721-
list_for_each_entry(ns, &ctrl->namespaces, list)
4735+
srcu_idx = srcu_read_lock(&ctrl->srcu);
4736+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
47224737
blk_mq_unfreeze_queue(ns->queue);
4723-
up_read(&ctrl->namespaces_rwsem);
4738+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
47244739
clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
47254740
}
47264741
EXPORT_SYMBOL_GPL(nvme_unfreeze);
47274742

47284743
int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
47294744
{
47304745
struct nvme_ns *ns;
4746+
int srcu_idx;
47314747

4732-
down_read(&ctrl->namespaces_rwsem);
4733-
list_for_each_entry(ns, &ctrl->namespaces, list) {
4748+
srcu_idx = srcu_read_lock(&ctrl->srcu);
4749+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
47344750
timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
47354751
if (timeout <= 0)
47364752
break;
47374753
}
4738-
up_read(&ctrl->namespaces_rwsem);
4754+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
47394755
return timeout;
47404756
}
47414757
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
47424758

47434759
void nvme_wait_freeze(struct nvme_ctrl *ctrl)
47444760
{
47454761
struct nvme_ns *ns;
4762+
int srcu_idx;
47464763

4747-
down_read(&ctrl->namespaces_rwsem);
4748-
list_for_each_entry(ns, &ctrl->namespaces, list)
4764+
srcu_idx = srcu_read_lock(&ctrl->srcu);
4765+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
47494766
blk_mq_freeze_queue_wait(ns->queue);
4750-
up_read(&ctrl->namespaces_rwsem);
4767+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
47514768
}
47524769
EXPORT_SYMBOL_GPL(nvme_wait_freeze);
47534770

47544771
void nvme_start_freeze(struct nvme_ctrl *ctrl)
47554772
{
47564773
struct nvme_ns *ns;
4774+
int srcu_idx;
47574775

47584776
set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
4759-
down_read(&ctrl->namespaces_rwsem);
4760-
list_for_each_entry(ns, &ctrl->namespaces, list)
4777+
srcu_idx = srcu_read_lock(&ctrl->srcu);
4778+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
47614779
blk_freeze_queue_start(ns->queue);
4762-
up_read(&ctrl->namespaces_rwsem);
4780+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
47634781
}
47644782
EXPORT_SYMBOL_GPL(nvme_start_freeze);
47654783

@@ -4802,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
48024820
void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
48034821
{
48044822
struct nvme_ns *ns;
4823+
int srcu_idx;
48054824

4806-
down_read(&ctrl->namespaces_rwsem);
4807-
list_for_each_entry(ns, &ctrl->namespaces, list)
4825+
srcu_idx = srcu_read_lock(&ctrl->srcu);
4826+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
48084827
blk_sync_queue(ns->queue);
4809-
up_read(&ctrl->namespaces_rwsem);
4828+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
48104829
}
48114830
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
48124831

drivers/nvme/host/ioctl.c

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -789,15 +789,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
789789
bool open_for_write)
790790
{
791791
struct nvme_ns *ns;
792-
int ret;
792+
int ret, srcu_idx;
793793

794-
down_read(&ctrl->namespaces_rwsem);
794+
srcu_idx = srcu_read_lock(&ctrl->srcu);
795795
if (list_empty(&ctrl->namespaces)) {
796796
ret = -ENOTTY;
797797
goto out_unlock;
798798
}
799799

800-
ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
800+
ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
801801
if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
802802
dev_warn(ctrl->device,
803803
"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
@@ -807,15 +807,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
807807

808808
dev_warn(ctrl->device,
809809
"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
810-
kref_get(&ns->kref);
811-
up_read(&ctrl->namespaces_rwsem);
810+
if (!nvme_get_ns(ns)) {
811+
ret = -ENXIO;
812+
goto out_unlock;
813+
}
814+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
812815

813816
ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
814817
nvme_put_ns(ns);
815818
return ret;
816819

817820
out_unlock:
818-
up_read(&ctrl->namespaces_rwsem);
821+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
819822
return ret;
820823
}
821824

drivers/nvme/host/multipath.c

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -151,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq)
151151
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
152152
{
153153
struct nvme_ns *ns;
154+
int srcu_idx;
154155

155-
down_read(&ctrl->namespaces_rwsem);
156-
list_for_each_entry(ns, &ctrl->namespaces, list) {
156+
srcu_idx = srcu_read_lock(&ctrl->srcu);
157+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
157158
if (!ns->head->disk)
158159
continue;
159160
kblockd_schedule_work(&ns->head->requeue_work);
160161
if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
161162
disk_uevent(ns->head->disk, KOBJ_CHANGE);
162163
}
163-
up_read(&ctrl->namespaces_rwsem);
164+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
164165
}
165166

166167
static const char *nvme_ana_state_names[] = {
@@ -194,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
194195
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
195196
{
196197
struct nvme_ns *ns;
198+
int srcu_idx;
197199

198-
down_read(&ctrl->namespaces_rwsem);
199-
list_for_each_entry(ns, &ctrl->namespaces, list) {
200+
srcu_idx = srcu_read_lock(&ctrl->srcu);
201+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
200202
nvme_mpath_clear_current_path(ns);
201203
kblockd_schedule_work(&ns->head->requeue_work);
202204
}
203-
up_read(&ctrl->namespaces_rwsem);
205+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
204206
}
205207

206208
void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -681,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
681683
u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
682684
unsigned *nr_change_groups = data;
683685
struct nvme_ns *ns;
686+
int srcu_idx;
684687

685688
dev_dbg(ctrl->device, "ANA group %d: %s.\n",
686689
le32_to_cpu(desc->grpid),
@@ -692,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
692695
if (!nr_nsids)
693696
return 0;
694697

695-
down_read(&ctrl->namespaces_rwsem);
696-
list_for_each_entry(ns, &ctrl->namespaces, list) {
698+
srcu_idx = srcu_read_lock(&ctrl->srcu);
699+
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
697700
unsigned nsid;
698701
again:
699702
nsid = le32_to_cpu(desc->nsids[n]);
@@ -706,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
706709
if (ns->head->ns_id > nsid)
707710
goto again;
708711
}
709-
up_read(&ctrl->namespaces_rwsem);
712+
srcu_read_unlock(&ctrl->srcu, srcu_idx);
710713
return 0;
711714
}
712715

drivers/nvme/host/nvme.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,8 @@ struct nvme_ctrl {
282282
struct blk_mq_tag_set *tagset;
283283
struct blk_mq_tag_set *admin_tagset;
284284
struct list_head namespaces;
285-
struct rw_semaphore namespaces_rwsem;
285+
struct mutex namespaces_lock;
286+
struct srcu_struct srcu;
286287
struct device ctrl_device;
287288
struct device *device; /* char device */
288289
#ifdef CONFIG_NVME_HWMON
@@ -1160,6 +1161,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
11601161
struct nvme_command *cmd, int status);
11611162
struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
11621163
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
1164+
bool nvme_get_ns(struct nvme_ns *ns);
11631165
void nvme_put_ns(struct nvme_ns *ns);
11641166

11651167
static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)

0 commit comments

Comments
 (0)