Skip to content

Commit 5c977f1

Browse files
bmarzins authored and Mikulas Patocka committed
dm-mpath: Don't grab work_mutex while probing paths
Grabbing the work_mutex keeps probe_active_paths() from running at the same time as multipath_message(). The only messages that could interfere with probing the paths are "disable_group", "enable_group", and "switch_group". These messages could force multipath to pick a new pathgroup while probe_active_paths() was probing the current pathgroup. If the multipath device has a hardware handler, and it switches active pathgroups while there is outstanding IO to a path device, it's possible that IO to the path will fail, even if the path would be usable if it was in the active pathgroup. To avoid this, do not clear the current pathgroup for the *_group messages while probe_active_paths() is running. Instead set a flag, and probe_active_paths() will clear the current pathgroup when it finishes probing the paths. For this to work correctly, multipath needs to check current_pg before next_pg in choose_pgpath(), but before this patch next_pg was only ever set when current_pg was cleared, so this doesn't change the current behavior when paths aren't being probed. Even with this change, it is still possible to switch pathgroups while the probe is running, but only if all the paths have failed, and the probe function will skip them as well in this case. If multiple DM_MPATH_PROBE_PATHS requests are received at once, there is no point in repeatedly issuing test IOs. Instead, the later probes should wait for the current probe to complete. If the current pathgroup is still the same as the one that was just checked, the other probes should skip probing and just check the number of valid paths. Finally, probing the paths should quit early if the multipath device is trying to suspend, instead of continuing to issue test IOs, delaying the suspend.
While this patch will not change the behavior of existing multipath users which don't use the DM_MPATH_PROBE_PATHS ioctl, when that ioctl is used, the behavior of the "disable_group", "enable_group", and "switch_group" messages can change subtly. When these messages return, the next IO to the multipath device will no longer be guaranteed to choose a new pathgroup. Instead, choosing a new pathgroup could be delayed by an in-progress DM_MPATH_PROBE_PATHS ioctl. The userspace multipath tools make no assumptions about what will happen to IOs after sending these messages, so this change will not affect already released versions of them, even if the DM_MPATH_PROBE_PATHS ioctl is run alongside them. Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
1 parent 241b9b5 commit 5c977f1

File tree

1 file changed

+78
-40
lines changed

1 file changed

+78
-40
lines changed

drivers/md/dm-mpath.c

Lines changed: 78 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ struct multipath {
7979
struct pgpath *current_pgpath;
8080
struct priority_group *current_pg;
8181
struct priority_group *next_pg; /* Switch to this PG if set */
82+
struct priority_group *last_probed_pg;
8283

8384
atomic_t nr_valid_paths; /* Total number of usable paths */
8485
unsigned int nr_priority_groups;
@@ -87,6 +88,7 @@ struct multipath {
8788
const char *hw_handler_name;
8889
char *hw_handler_params;
8990
wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
91+
wait_queue_head_t probe_wait; /* Wait for probing paths */
9092
unsigned int pg_init_retries; /* Number of times to retry pg_init */
9193
unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */
9294
atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */
@@ -100,6 +102,7 @@ struct multipath {
100102
struct bio_list queued_bios;
101103

102104
struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
105+
bool is_suspending;
103106
};
104107

105108
/*
@@ -132,6 +135,8 @@ static void queue_if_no_path_timeout_work(struct timer_list *t);
132135
#define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */
133136
#define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */
134137
#define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */
138+
#define MPATHF_DELAY_PG_SWITCH 7 /* Delay switching pg if it still has paths */
139+
#define MPATHF_NEED_PG_SWITCH 8 /* Need to switch pgs after the delay has ended */
135140

136141
static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m)
137142
{
@@ -254,6 +259,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
254259
atomic_set(&m->pg_init_count, 0);
255260
m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
256261
init_waitqueue_head(&m->pg_init_wait);
262+
init_waitqueue_head(&m->probe_wait);
257263

258264
return 0;
259265
}
@@ -413,30 +419,29 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
413419
goto failed;
414420
}
415421

422+
/* Don't change PG until it has no remaining paths */
423+
pg = READ_ONCE(m->current_pg);
424+
if (pg) {
425+
pgpath = choose_path_in_pg(m, pg, nr_bytes);
426+
if (!IS_ERR_OR_NULL(pgpath))
427+
return pgpath;
428+
}
429+
416430
/* Were we instructed to switch PG? */
417431
if (READ_ONCE(m->next_pg)) {
418432
spin_lock_irqsave(&m->lock, flags);
419433
pg = m->next_pg;
420434
if (!pg) {
421435
spin_unlock_irqrestore(&m->lock, flags);
422-
goto check_current_pg;
436+
goto check_all_pgs;
423437
}
424438
m->next_pg = NULL;
425439
spin_unlock_irqrestore(&m->lock, flags);
426440
pgpath = choose_path_in_pg(m, pg, nr_bytes);
427441
if (!IS_ERR_OR_NULL(pgpath))
428442
return pgpath;
429443
}
430-
431-
/* Don't change PG until it has no remaining paths */
432-
check_current_pg:
433-
pg = READ_ONCE(m->current_pg);
434-
if (pg) {
435-
pgpath = choose_path_in_pg(m, pg, nr_bytes);
436-
if (!IS_ERR_OR_NULL(pgpath))
437-
return pgpath;
438-
}
439-
444+
check_all_pgs:
440445
/*
441446
* Loop through priority groups until we find a valid path.
442447
* First time we skip PGs marked 'bypassed'.
@@ -1439,15 +1444,19 @@ static int action_dev(struct multipath *m, dev_t dev, action_fn action)
14391444
* Temporarily try to avoid having to use the specified PG
14401445
*/
14411446
static void bypass_pg(struct multipath *m, struct priority_group *pg,
1442-
bool bypassed)
1447+
bool bypassed, bool can_be_delayed)
14431448
{
14441449
unsigned long flags;
14451450

14461451
spin_lock_irqsave(&m->lock, flags);
14471452

14481453
pg->bypassed = bypassed;
1449-
m->current_pgpath = NULL;
1450-
m->current_pg = NULL;
1454+
if (can_be_delayed && test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags))
1455+
set_bit(MPATHF_NEED_PG_SWITCH, &m->flags);
1456+
else {
1457+
m->current_pgpath = NULL;
1458+
m->current_pg = NULL;
1459+
}
14511460

14521461
spin_unlock_irqrestore(&m->lock, flags);
14531462

@@ -1476,8 +1485,12 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
14761485
if (--pgnum)
14771486
continue;
14781487

1479-
m->current_pgpath = NULL;
1480-
m->current_pg = NULL;
1488+
if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags))
1489+
set_bit(MPATHF_NEED_PG_SWITCH, &m->flags);
1490+
else {
1491+
m->current_pgpath = NULL;
1492+
m->current_pg = NULL;
1493+
}
14811494
m->next_pg = pg;
14821495
}
14831496
spin_unlock_irqrestore(&m->lock, flags);
@@ -1507,7 +1520,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
15071520
break;
15081521
}
15091522

1510-
bypass_pg(m, pg, bypassed);
1523+
bypass_pg(m, pg, bypassed, true);
15111524
return 0;
15121525
}
15131526

@@ -1561,7 +1574,7 @@ static void pg_init_done(void *data, int errors)
15611574
* Probably doing something like FW upgrade on the
15621575
* controller so try the other pg.
15631576
*/
1564-
bypass_pg(m, pg, true);
1577+
bypass_pg(m, pg, true, false);
15651578
break;
15661579
case SCSI_DH_RETRY:
15671580
/* Wait before retrying. */
@@ -1741,7 +1754,11 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
17411754
static void multipath_presuspend(struct dm_target *ti)
17421755
{
17431756
struct multipath *m = ti->private;
1757+
unsigned long flags;
17441758

1759+
spin_lock_irqsave(&m->lock, flags);
1760+
m->is_suspending = true;
1761+
spin_unlock_irqrestore(&m->lock, flags);
17451762
/* FIXME: bio-based shouldn't need to always disable queue_if_no_path */
17461763
if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti))
17471764
queue_if_no_path(m, false, true, __func__);
@@ -1765,6 +1782,7 @@ static void multipath_resume(struct dm_target *ti)
17651782
unsigned long flags;
17661783

17671784
spin_lock_irqsave(&m->lock, flags);
1785+
m->is_suspending = false;
17681786
if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) {
17691787
set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
17701788
clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
@@ -1845,10 +1863,10 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
18451863

18461864
DMEMIT("%u ", m->nr_priority_groups);
18471865

1848-
if (m->next_pg)
1849-
pg_num = m->next_pg->pg_num;
1850-
else if (m->current_pg)
1866+
if (m->current_pg)
18511867
pg_num = m->current_pg->pg_num;
1868+
else if (m->next_pg)
1869+
pg_num = m->next_pg->pg_num;
18521870
else
18531871
pg_num = (m->nr_priority_groups ? 1 : 0);
18541872

@@ -2077,35 +2095,55 @@ static int probe_path(struct pgpath *pgpath)
20772095
static int probe_active_paths(struct multipath *m)
20782096
{
20792097
struct pgpath *pgpath;
2080-
struct priority_group *pg;
2098+
struct priority_group *pg = NULL;
20812099
unsigned long flags;
20822100
int r = 0;
20832101

2084-
mutex_lock(&m->work_mutex);
2085-
20862102
spin_lock_irqsave(&m->lock, flags);
2087-
if (test_bit(MPATHF_QUEUE_IO, &m->flags))
2088-
pg = NULL;
2089-
else
2090-
pg = m->current_pg;
2103+
if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) {
2104+
wait_event_lock_irq(m->probe_wait,
2105+
!test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags),
2106+
m->lock);
2107+
/*
2108+
* if we waited because a probe was already in progress,
2109+
* and it probed the current active pathgroup, don't
2110+
* reprobe. Just return the number of valid paths
2111+
*/
2112+
if (m->current_pg == m->last_probed_pg)
2113+
goto skip_probe;
2114+
}
2115+
if (!m->current_pg || m->is_suspending ||
2116+
test_bit(MPATHF_QUEUE_IO, &m->flags))
2117+
goto skip_probe;
2118+
set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags);
2119+
pg = m->last_probed_pg = m->current_pg;
20912120
spin_unlock_irqrestore(&m->lock, flags);
20922121

2093-
if (pg) {
2094-
list_for_each_entry(pgpath, &pg->pgpaths, list) {
2095-
if (!pgpath->is_active)
2096-
continue;
2122+
list_for_each_entry(pgpath, &pg->pgpaths, list) {
2123+
if (pg != READ_ONCE(m->current_pg) ||
2124+
READ_ONCE(m->is_suspending))
2125+
goto out;
2126+
if (!pgpath->is_active)
2127+
continue;
20972128

2098-
r = probe_path(pgpath);
2099-
if (r < 0)
2100-
goto out;
2101-
}
2129+
r = probe_path(pgpath);
2130+
if (r < 0)
2131+
goto out;
21022132
}
21032133

2104-
if (!atomic_read(&m->nr_valid_paths))
2105-
r = -ENOTCONN;
2106-
21072134
out:
2108-
mutex_unlock(&m->work_mutex);
2135+
spin_lock_irqsave(&m->lock, flags);
2136+
clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags);
2137+
if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) {
2138+
m->current_pgpath = NULL;
2139+
m->current_pg = NULL;
2140+
}
2141+
skip_probe:
2142+
if (r == 0 && !atomic_read(&m->nr_valid_paths))
2143+
r = -ENOTCONN;
2144+
spin_unlock_irqrestore(&m->lock, flags);
2145+
if (pg)
2146+
wake_up(&m->probe_wait);
21092147
return r;
21102148
}
21112149

0 commit comments

Comments
 (0)