Commit f768b35

Merge tag 'xfs-6.3-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs percpu counter fixes from Darrick Wong:
 "We discovered a filesystem summary counter corruption problem that was
  traced to cpu hot-remove racing with the call to percpu_counter_sum
  that sets the free block count in the superblock when writing it to
  disk. The root cause is that percpu_counter_sum doesn't cull from
  dying cpus and hence misses those counter values if the cpu shutdown
  hooks have not yet run to merge the values.

  I'm hoping this is a fairly painless fix to the problem, since the
  dying cpu mask should generally be empty. It's been in for-next for a
  week without any complaints from the bots.

   - Fix a race in the percpu counters summation code where the
     summation failed to add in the values for any CPUs that were dying
     but not yet dead. This fixes some minor discrepancies and incorrect
     assertions when running generic/650"

* tag 'xfs-6.3-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  pcpcntr: remove percpu_counter_sum_all()
  fork: remove use of percpu_counter_sum_all
  pcpcntrs: fix dying cpu summation race
  cpumask: introduce for_each_cpu_or
2 parents d704426 + e9b60c7 commit f768b35
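To make the race concrete before the per-file diffs, here is a minimal, hypothetical sketch (not code from this commit; locking omitted, helper names invented for illustration) contrasting the old summation loop with the fixed one:

/* Old behaviour: a CPU that has left cpu_online_mask but whose hotplug
 * "dead" callback has not yet folded its per-cpu delta into fbc->count
 * is silently skipped, so its contribution goes missing. */
static s64 sum_counter_racy(struct percpu_counter *fbc)
{
	s64 ret = fbc->count;
	int cpu;

	for_each_cpu(cpu, cpu_online_mask)
		ret += *per_cpu_ptr(fbc->counters, cpu);
	return ret;
}

/* Fixed behaviour: dying CPUs stay in the iteration mask, so nothing
 * falls through the gap between going offline and the dead notifier. */
static s64 sum_counter_fixed(struct percpu_counter *fbc)
{
	s64 ret = fbc->count;
	int cpu;

	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask)
		ret += *per_cpu_ptr(fbc->counters, cpu);
	return ret;
}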

6 files changed: +77 −34 lines changed


include/linux/cpumask.h

Lines changed: 17 additions & 0 deletions
@@ -350,6 +350,23 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
 #define for_each_cpu_andnot(cpu, mask1, mask2) \
 	for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
 
+/**
+ * for_each_cpu_or - iterate over every cpu present in either mask
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask1: the first cpumask pointer
+ * @mask2: the second cpumask pointer
+ *
+ * This saves a temporary CPU mask in many places. It is equivalent to:
+ *	struct cpumask tmp;
+ *	cpumask_or(&tmp, &mask1, &mask2);
+ *	for_each_cpu(cpu, &tmp)
+ *		...
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_or(cpu, mask1, mask2) \
+	for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
+
 /**
  * cpumask_any_but - return a "random" in a cpumask, but not this one.
  * @mask: the cpumask to search
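As a usage illustration (hypothetical caller, not part of this commit), the new macro walks the union of two masks without building a temporary cpumask:

#include <linux/cpumask.h>
#include <linux/printk.h>

static void report_online_or_dying(void)
{
	unsigned int cpu;

	/* Visits each CPU set in either mask exactly once. */
	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask)
		pr_info("cpu%u is online or dying\n", cpu);
}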

include/linux/find.h

Lines changed: 37 additions & 0 deletions
@@ -14,6 +14,8 @@ unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long
 					unsigned long nbits, unsigned long start);
 unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
 					unsigned long nbits, unsigned long start);
+unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
+					unsigned long nbits, unsigned long start);
 unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
 					 unsigned long start);
 extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
@@ -127,6 +129,36 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1,
 }
 #endif
 
+#ifndef find_next_or_bit
+/**
+ * find_next_or_bit - find the next set bit in either memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_or_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = (*addr1 | *addr2) & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_next_or_bit(addr1, addr2, size, offset);
+}
+#endif
+
 #ifndef find_next_zero_bit
 /**
  * find_next_zero_bit - find the next cleared bit in a memory region
@@ -536,6 +568,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned
 		(bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
 		(bit)++)
 
+#define for_each_or_bit(bit, addr1, addr2, size) \
+	for ((bit) = 0; \
+	     (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
+	     (bit)++)
+
 /* same as for_each_set_bit() but use bit as value to start with */
 #define for_each_set_bit_from(bit, addr, size) \
 	for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
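A small, hypothetical sketch of the bitmap-level helpers this header adds; the bitmap names, sizes, and set bits are made up for illustration:

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/find.h>
#include <linux/printk.h>

#define DEMO_NBITS 16

static void walk_union_of_bitmaps(void)
{
	DECLARE_BITMAP(a, DEMO_NBITS) = { 0 };
	DECLARE_BITMAP(b, DEMO_NBITS) = { 0 };
	unsigned long bit;

	__set_bit(1, a);
	__set_bit(3, a);
	__set_bit(3, b);
	__set_bit(5, b);

	/* Visits bits 1, 3 and 5: the set bits of (a | b). */
	for_each_or_bit(bit, a, b, DEMO_NBITS)
		pr_info("bit %lu is set in a or b\n", bit);

	/* Equivalent one-shot lookup: first set bit of (a | b) at or
	 * after offset 2, i.e. bit 3 here. */
	bit = find_next_or_bit(a, b, DEMO_NBITS, 2);
}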

include/linux/percpu_counter.h

Lines changed: 0 additions & 6 deletions
@@ -45,7 +45,6 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
-s64 percpu_counter_sum_all(struct percpu_counter *fbc);
 int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
 void percpu_counter_sync(struct percpu_counter *fbc);
 
@@ -196,11 +195,6 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 	return percpu_counter_read(fbc);
 }
 
-static inline s64 percpu_counter_sum_all(struct percpu_counter *fbc)
-{
-	return percpu_counter_read(fbc);
-}
-
 static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
 {
 	return true;

kernel/fork.c

Lines changed: 0 additions & 5 deletions
@@ -755,11 +755,6 @@ static void check_mm(struct mm_struct *mm)
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
 		long x = percpu_counter_sum(&mm->rss_stat[i]);
 
-		if (likely(!x))
-			continue;
-
-		/* Making sure this is not due to race with CPU offlining. */
-		x = percpu_counter_sum_all(&mm->rss_stat[i]);
 		if (unlikely(x))
 			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
 				 mm, resident_page_types[i], x);

lib/find_bit.c

Lines changed: 9 additions & 0 deletions
@@ -182,6 +182,15 @@ unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned l
 EXPORT_SYMBOL(_find_next_andnot_bit);
 #endif
 
+#ifndef find_next_or_bit
+unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
+					unsigned long nbits, unsigned long start)
+{
+	return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_or_bit);
+#endif
+
 #ifndef find_next_zero_bit
 unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
 					 unsigned long start)

lib/percpu_counter.c

Lines changed: 14 additions & 23 deletions
@@ -122,44 +122,35 @@ void percpu_counter_sync(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(percpu_counter_sync);
 
-static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc,
-			      const struct cpumask *cpu_mask)
+/*
+ * Add up all the per-cpu counts, return the result. This is a more accurate
+ * but much slower version of percpu_counter_read_positive().
+ *
+ * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
+ * from CPUs that are in the process of being taken offline. Dying cpus have
+ * been removed from the online mask, but may not have had the hotplug dead
+ * notifier called to fold the percpu count back into the global counter sum.
+ * By including dying CPUs in the iteration mask, we avoid this race condition
+ * so __percpu_counter_sum() just does the right thing when CPUs are being taken
+ * offline.
+ */
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&fbc->lock, flags);
 	ret = fbc->count;
-	for_each_cpu(cpu, cpu_mask) {
+	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
 	raw_spin_unlock_irqrestore(&fbc->lock, flags);
 	return ret;
 }
-
-/*
- * Add up all the per-cpu counts, return the result. This is a more accurate
- * but much slower version of percpu_counter_read_positive()
- */
-s64 __percpu_counter_sum(struct percpu_counter *fbc)
-{
-	return __percpu_counter_sum_mask(fbc, cpu_online_mask);
-}
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-/*
- * This is slower version of percpu_counter_sum as it traverses all possible
- * cpus. Use this only in the cases where accurate data is needed in the
- * presense of CPUs getting offlined.
- */
-s64 percpu_counter_sum_all(struct percpu_counter *fbc)
-{
-	return __percpu_counter_sum_mask(fbc, cpu_possible_mask);
-}
-EXPORT_SYMBOL(percpu_counter_sum_all);
-
 int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
 			  struct lock_class_key *key)
 {
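For completeness, a minimal caller-side sketch (hypothetical module-init style function, error handling trimmed, names invented) of the API that remains after percpu_counter_sum_all() is gone; percpu_counter_sum() alone is now expected to stay accurate across concurrent CPU offlining:

#include <linux/percpu_counter.h>
#include <linux/printk.h>
#include <linux/gfp.h>

static struct percpu_counter demo_counter;

static int demo_counter_init(void)
{
	int err = percpu_counter_init(&demo_counter, 0, GFP_KERNEL);

	if (err)
		return err;

	percpu_counter_add(&demo_counter, 128);

	/* Sums fbc->count plus the per-cpu deltas of online and dying
	 * CPUs, so no separate "sum all possible CPUs" fallback is
	 * needed any more. */
	pr_info("demo counter: %lld\n",
		(long long)percpu_counter_sum(&demo_counter));

	percpu_counter_destroy(&demo_counter);
	return 0;
}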
