Skip to content

Commit ce85a1e

Browse files
author
Darrick J. Wong
committed
xfs: stabilize fs summary counters for online fsck
If the fscounters scrubber notices incorrect summary counters, it's entirely possible that scrub is simply racing with other threads that are updating the incore counters. There isn't a good way to stabilize percpu counters or set ourselves up to observe live updates with hooks like we do for the quotacheck or nlinks scanners, so we instead choose to freeze the filesystem long enough to walk the incore per-AG structures. Past me thought that it was going to be commonplace to have to freeze the filesystem to perform some kind of repair and set up a whole separate infrastructure to freeze the filesystem in such a way that userspace could not unfreeze while we were running. This involved adding a mutex and freeze_super/thaw_super functions and dealing with the fact that the VFS freeze/thaw functions can free the VFS superblock references on return. This was all very overwrought, since fscounters turned out to be the only user of scrub freezes, and it doesn't require the log to quiesce, only the incore superblock counters. We prevent other threads from changing the freeze level by calling freeze_super_excl with a custom freeze cookie to keep everyone else out of the filesystem. The end result is that fscounters should be much more efficient. When we're checking a busy system and we can't stabilize the counters, the custom freeze will do less work, which should result in less downtime. Repair should be similarly speedy, but that's in a later patch. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Dave Chinner <dchinner@redhat.com>
1 parent 59ba4fd commit ce85a1e

File tree

4 files changed

+183
-38
lines changed

4 files changed

+183
-38
lines changed

fs/xfs/scrub/fscounters.c

Lines changed: 151 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// SPDX-License-Identifier: GPL-2.0+
1+
// SPDX-License-Identifier: GPL-2.0-or-later
22
/*
33
* Copyright (C) 2019-2023 Oracle. All Rights Reserved.
44
* Author: Darrick J. Wong <djwong@kernel.org>
@@ -8,6 +8,8 @@
88
#include "xfs_shared.h"
99
#include "xfs_format.h"
1010
#include "xfs_trans_resv.h"
11+
#include "xfs_log_format.h"
12+
#include "xfs_trans.h"
1113
#include "xfs_mount.h"
1214
#include "xfs_alloc.h"
1315
#include "xfs_ialloc.h"
@@ -16,6 +18,7 @@
1618
#include "xfs_ag.h"
1719
#include "xfs_rtalloc.h"
1820
#include "xfs_inode.h"
21+
#include "xfs_icache.h"
1922
#include "scrub/scrub.h"
2023
#include "scrub/common.h"
2124
#include "scrub/trace.h"
@@ -53,6 +56,7 @@ struct xchk_fscounters {
5356
uint64_t frextents;
5457
unsigned long long icount_min;
5558
unsigned long long icount_max;
59+
bool frozen;
5660
};
5761

5862
/*
@@ -123,6 +127,82 @@ xchk_fscount_warmup(
123127
return error;
124128
}
125129

130+
static inline int
131+
xchk_fsfreeze(
132+
struct xfs_scrub *sc)
133+
{
134+
int error;
135+
136+
error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
137+
trace_xchk_fsfreeze(sc, error);
138+
return error;
139+
}
140+
141+
static inline int
142+
xchk_fsthaw(
143+
struct xfs_scrub *sc)
144+
{
145+
int error;
146+
147+
/* This should always succeed, we have a kernel freeze */
148+
error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
149+
trace_xchk_fsthaw(sc, error);
150+
return error;
151+
}
152+
153+
/*
154+
* We couldn't stabilize the filesystem long enough to sample all the variables
155+
* that comprise the summary counters and compare them to the percpu counters.
156+
* We need to disable all writer threads, which means taking the first two
157+
* freeze levels to put userspace to sleep, and the third freeze level to
158+
* prevent background threads from starting new transactions. Take one level
159+
* more to prevent other callers from unfreezing the filesystem while we run.
160+
*/
161+
STATIC int
162+
xchk_fscounters_freeze(
163+
struct xfs_scrub *sc)
164+
{
165+
struct xchk_fscounters *fsc = sc->buf;
166+
int error = 0;
167+
168+
if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
169+
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
170+
mnt_drop_write_file(sc->file);
171+
}
172+
173+
/* Try to grab a kernel freeze. */
174+
while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
175+
if (xchk_should_terminate(sc, &error))
176+
return error;
177+
178+
delay(HZ / 10);
179+
}
180+
if (error)
181+
return error;
182+
183+
fsc->frozen = true;
184+
return 0;
185+
}
186+
187+
/* Thaw the filesystem after checking or repairing fscounters. */
188+
STATIC void
189+
xchk_fscounters_cleanup(
190+
void *buf)
191+
{
192+
struct xchk_fscounters *fsc = buf;
193+
struct xfs_scrub *sc = fsc->sc;
194+
int error;
195+
196+
if (!fsc->frozen)
197+
return;
198+
199+
error = xchk_fsthaw(sc);
200+
if (error)
201+
xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
202+
else
203+
fsc->frozen = false;
204+
}
205+
126206
int
127207
xchk_setup_fscounters(
128208
struct xfs_scrub *sc)
@@ -140,6 +220,7 @@ xchk_setup_fscounters(
140220
sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
141221
if (!sc->buf)
142222
return -ENOMEM;
223+
sc->buf_cleanup = xchk_fscounters_cleanup;
143224
fsc = sc->buf;
144225
fsc->sc = sc;
145226

@@ -150,7 +231,18 @@ xchk_setup_fscounters(
150231
if (error)
151232
return error;
152233

153-
return xchk_trans_alloc(sc, 0);
234+
/*
235+
* Pause all writer activity in the filesystem while we're scrubbing to
236+
* reduce the likelihood of background perturbations to the counters
237+
* throwing off our calculations.
238+
*/
239+
if (sc->flags & XCHK_TRY_HARDER) {
240+
error = xchk_fscounters_freeze(sc);
241+
if (error)
242+
return error;
243+
}
244+
245+
return xfs_trans_alloc_empty(sc->mp, &sc->tp);
154246
}
155247

156248
/*
@@ -290,8 +382,7 @@ xchk_fscount_aggregate_agcounts(
290382
if (fsc->ifree > fsc->icount) {
291383
if (tries--)
292384
goto retry;
293-
xchk_set_incomplete(sc);
294-
return 0;
385+
return -EDEADLOCK;
295386
}
296387

297388
return 0;
@@ -367,6 +458,8 @@ xchk_fscount_count_frextents(
367458
* Otherwise, we /might/ have a problem. If the change in the summations is
368459
* more than we want to tolerate, the filesystem is probably busy and we should
369460
* just send back INCOMPLETE and see if userspace will try again.
461+
*
462+
* If we're repairing then we require an exact match.
370463
*/
371464
static inline bool
372465
xchk_fscount_within_range(
@@ -396,21 +489,7 @@ xchk_fscount_within_range(
396489
if (expected >= min_value && expected <= max_value)
397490
return true;
398491

399-
/*
400-
* If the difference between the two summations is too large, the fs
401-
* might just be busy and so we'll mark the scrub incomplete. Return
402-
* true here so that we don't mark the counter corrupt.
403-
*
404-
* XXX: In the future when userspace can grant scrub permission to
405-
* quiesce the filesystem to solve the outsized variance problem, this
406-
* check should be moved up and the return code changed to signal to
407-
* userspace that we need quiesce permission.
408-
*/
409-
if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
410-
xchk_set_incomplete(sc);
411-
return true;
412-
}
413-
492+
/* Everything else is bad. */
414493
return false;
415494
}
416495

@@ -422,6 +501,7 @@ xchk_fscounters(
422501
struct xfs_mount *mp = sc->mp;
423502
struct xchk_fscounters *fsc = sc->buf;
424503
int64_t icount, ifree, fdblocks, frextents;
504+
bool try_again = false;
425505
int error;
426506

427507
/* Snapshot the percpu counters. */
@@ -431,9 +511,26 @@ xchk_fscounters(
431511
frextents = percpu_counter_sum(&mp->m_frextents);
432512

433513
/* No negative values, please! */
434-
if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0)
514+
if (icount < 0 || ifree < 0)
435515
xchk_set_corrupt(sc);
436516

517+
/*
518+
* If the filesystem is not frozen, the counter summation calls above
519+
* can race with xfs_mod_freecounter, which subtracts a requested space
520+
* reservation from the counter and undoes the subtraction if that made
521+
* the counter go negative. Therefore, it's possible to see negative
522+
* values here, and we should only flag that as a corruption if we
523+
* froze the fs. This is much more likely to happen with frextents
524+
* since there are no reserved pools.
525+
*/
526+
if (fdblocks < 0 || frextents < 0) {
527+
if (!fsc->frozen)
528+
return -EDEADLOCK;
529+
530+
xchk_set_corrupt(sc);
531+
return 0;
532+
}
533+
437534
/* See if icount is obviously wrong. */
438535
if (icount < fsc->icount_min || icount > fsc->icount_max)
439536
xchk_set_corrupt(sc);
@@ -446,12 +543,6 @@ xchk_fscounters(
446543
if (frextents > mp->m_sb.sb_rextents)
447544
xchk_set_corrupt(sc);
448545

449-
/*
450-
* XXX: We can't quiesce percpu counter updates, so exit early.
451-
* This can be re-enabled when we gain exclusive freeze functionality.
452-
*/
453-
return 0;
454-
455546
/*
456547
* If ifree exceeds icount by more than the minimum variance then
457548
* something's probably wrong with the counters.
@@ -463,8 +554,6 @@ xchk_fscounters(
463554
error = xchk_fscount_aggregate_agcounts(sc, fsc);
464555
if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
465556
return error;
466-
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
467-
return 0;
468557

469558
/* Count the free extents counter for rt volumes. */
470559
error = xchk_fscount_count_frextents(sc, fsc);
@@ -473,20 +562,45 @@ xchk_fscounters(
473562
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
474563
return 0;
475564

476-
/* Compare the in-core counters with whatever we counted. */
477-
if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
478-
xchk_set_corrupt(sc);
565+
/*
566+
* Compare the in-core counters with whatever we counted. If the fs is
567+
* frozen, we treat the discrepancy as a corruption because the freeze
568+
* should have stabilized the counter values. Otherwise, we need
569+
* userspace to call us back having granted us freeze permission.
570+
*/
571+
if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
572+
fsc->icount)) {
573+
if (fsc->frozen)
574+
xchk_set_corrupt(sc);
575+
else
576+
try_again = true;
577+
}
479578

480-
if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
481-
xchk_set_corrupt(sc);
579+
if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
580+
if (fsc->frozen)
581+
xchk_set_corrupt(sc);
582+
else
583+
try_again = true;
584+
}
482585

483586
if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
484-
fsc->fdblocks))
485-
xchk_set_corrupt(sc);
587+
fsc->fdblocks)) {
588+
if (fsc->frozen)
589+
xchk_set_corrupt(sc);
590+
else
591+
try_again = true;
592+
}
486593

487594
if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
488-
fsc->frextents))
489-
xchk_set_corrupt(sc);
595+
fsc->frextents)) {
596+
if (fsc->frozen)
597+
xchk_set_corrupt(sc);
598+
else
599+
try_again = true;
600+
}
601+
602+
if (try_again)
603+
return -EDEADLOCK;
490604

491605
return 0;
492606
}

fs/xfs/scrub/scrub.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,10 @@ xchk_teardown(
184184
xchk_irele(sc, sc->ip);
185185
sc->ip = NULL;
186186
}
187-
if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
187+
if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
188+
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
188189
mnt_drop_write_file(sc->file);
190+
}
189191
if (sc->buf) {
190192
if (sc->buf_cleanup)
191193
sc->buf_cleanup(sc->buf);
@@ -505,6 +507,8 @@ xfs_scrub_metadata(
505507
error = mnt_want_write_file(sc->file);
506508
if (error)
507509
goto out_sc;
510+
511+
sc->flags |= XCHK_HAVE_FREEZE_PROT;
508512
}
509513

510514
/* Set up for the operation. */

fs/xfs/scrub/scrub.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ struct xfs_scrub {
106106

107107
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
108108
#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */
109+
#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
109110
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
110111
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
111112
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */

fs/xfs/scrub/trace.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
9898

9999
#define XFS_SCRUB_STATE_STRINGS \
100100
{ XCHK_TRY_HARDER, "try_harder" }, \
101+
{ XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
101102
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
102103
{ XCHK_NEED_DRAIN, "need_drain" }, \
103104
{ XREP_ALREADY_FIXED, "already_fixed" }
@@ -693,6 +694,31 @@ TRACE_EVENT(xchk_fscounters_within_range,
693694
__entry->old_value)
694695
)
695696

697+
DECLARE_EVENT_CLASS(xchk_fsfreeze_class,
698+
TP_PROTO(struct xfs_scrub *sc, int error),
699+
TP_ARGS(sc, error),
700+
TP_STRUCT__entry(
701+
__field(dev_t, dev)
702+
__field(unsigned int, type)
703+
__field(int, error)
704+
),
705+
TP_fast_assign(
706+
__entry->dev = sc->mp->m_super->s_dev;
707+
__entry->type = sc->sm->sm_type;
708+
__entry->error = error;
709+
),
710+
TP_printk("dev %d:%d type %s error %d",
711+
MAJOR(__entry->dev), MINOR(__entry->dev),
712+
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
713+
__entry->error)
714+
);
715+
#define DEFINE_XCHK_FSFREEZE_EVENT(name) \
716+
DEFINE_EVENT(xchk_fsfreeze_class, name, \
717+
TP_PROTO(struct xfs_scrub *sc, int error), \
718+
TP_ARGS(sc, error))
719+
DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze);
720+
DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw);
721+
696722
TRACE_EVENT(xchk_refcount_incorrect,
697723
TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec,
698724
xfs_nlink_t seen),

0 commit comments

Comments
 (0)