Skip to content

Commit 3d25216

Browse files
committed
fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex
pipe_readable(), pipe_writable(), and pipe_poll() can read "pipe->head" and "pipe->tail" outside of "pipe->mutex" critical section. When the head and the tail are read individually in that order, there is a window for interruption between the two reads in which both the head and the tail can be updated by concurrent readers and writers. One of the problematic scenarios observed with hackbench running multiple groups on a large server on a particular pipe inode is as follows: pipe->head = 36 pipe->tail = 36 hackbench-118762 [057] ..... 1029.550548: pipe_write: *wakes up: pipe not full* hackbench-118762 [057] ..... 1029.550548: pipe_write: head: 36 -> 37 [tail: 36] hackbench-118762 [057] ..... 1029.550548: pipe_write: *wake up next reader 118740* hackbench-118762 [057] ..... 1029.550548: pipe_write: *wake up next writer 118768* hackbench-118768 [206] ..... 1029.55055X: pipe_write: *writer wakes up* hackbench-118768 [206] ..... 1029.55055X: pipe_write: head = READ_ONCE(pipe->head) [37] ... CPU 206 interrupted (exact wakeup was not traced but 118768 did read head at 37 in traces) hackbench-118740 [057] ..... 1029.550558: pipe_read: *reader wakes up: pipe is not empty* hackbench-118740 [057] ..... 1029.550558: pipe_read: tail: 36 -> 37 [head = 37] hackbench-118740 [057] ..... 1029.550559: pipe_read: *pipe is empty; wakeup writer 118768* hackbench-118740 [057] ..... 1029.550559: pipe_read: *sleeps* hackbench-118766 [185] ..... 1029.550592: pipe_write: *New writer comes in* hackbench-118766 [185] ..... 1029.550592: pipe_write: head: 37 -> 38 [tail: 37] hackbench-118766 [185] ..... 1029.550592: pipe_write: *wakes up reader 118766* hackbench-118740 [185] ..... 1029.550598: pipe_read: *reader wakes up; pipe not empty* hackbench-118740 [185] ..... 1029.550599: pipe_read: tail: 37 -> 38 [head: 38] hackbench-118740 [185] ..... 1029.550599: pipe_read: *pipe is empty* hackbench-118740 [185] ..... 1029.550599: pipe_read: *reader sleeps; wakeup writer 118768* ... CPU 206 switches back to writer hackbench-118768 [206] ..... 1029.550601: pipe_write: tail = READ_ONCE(pipe->tail) [38] hackbench-118768 [206] ..... 1029.550601: pipe_write: pipe_full()? (u32)(37 - 38) >= 16? Yes hackbench-118768 [206] ..... 1029.550601: pipe_write: *writer goes back to sleep* [ Tasks 118740 and 118768 can then indefinitely wait on each other. ] The unsigned arithmetic in pipe_occupancy() wraps around when "pipe->tail > pipe->head" leading to pipe_full() returning true despite the pipe being empty. The case of genuine wraparound of "pipe->head" is handled since pipe buffer has data allowing readers to make progress until the pipe->tail wraps too after which the reader will wakeup a sleeping writer, however, mistaking the pipe to be full when it is in fact empty can lead to readers and writers waiting on each other indefinitely. This issue became more problematic and surfaced as a hang in hackbench after the optimization in commit aaec5a9 ("pipe_read: don't wake up the writer if the pipe is still full") significantly reduced the number of spurious wakeups of writers that had previously helped mask the issue. To avoid missing any updates between the reads of "pipe->head" and "pipe->write", unionize the two with a single unsigned long "pipe->head_tail" member that can be loaded atomically. Using "pipe->head_tail" to read the head and the tail ensures the lockless checks do not miss any updates to the head or the tail and since those two are only updated under "pipe->mutex", it ensures that the head is always ahead of, or equal to the tail resulting in correct calculations. [ prateek: commit log, testing on x86 platforms. ] Reported-and-debugged-by: Swapnil Sapkal <swapnil.sapkal@amd.com> Closes: https://lore.kernel.org/lkml/e813814e-7094-4673-bc69-731af065a0eb@amd.com/ Reported-by: Alexey Gladkov <legion@kernel.org> Closes: https://lore.kernel.org/all/Z8Wn0nTvevLRG_4m@example.org/ Fixes: 8cefc10 ("pipe: Use head and tail pointers for the ring, not cursor and length") Tested-by: Swapnil Sapkal <swapnil.sapkal@amd.com> Reviewed-by: Oleg Nesterov <oleg@redhat.com> Tested-by: Alexey Gladkov <legion@kernel.org> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 99fa936 commit 3d25216

File tree

2 files changed

+45
-13
lines changed

2 files changed

+45
-13
lines changed

fs/pipe.c

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -210,11 +210,10 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
210210
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
211211
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
212212
{
213-
unsigned int head = READ_ONCE(pipe->head);
214-
unsigned int tail = READ_ONCE(pipe->tail);
213+
union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
215214
unsigned int writers = READ_ONCE(pipe->writers);
216215

217-
return !pipe_empty(head, tail) || !writers;
216+
return !pipe_empty(idx.head, idx.tail) || !writers;
218217
}
219218

220219
static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
@@ -417,11 +416,10 @@ static inline int is_packetized(struct file *file)
417416
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
418417
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
419418
{
420-
unsigned int head = READ_ONCE(pipe->head);
421-
unsigned int tail = READ_ONCE(pipe->tail);
419+
union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
422420
unsigned int max_usage = READ_ONCE(pipe->max_usage);
423421

424-
return !pipe_full(head, tail, max_usage) ||
422+
return !pipe_full(idx.head, idx.tail, max_usage) ||
425423
!READ_ONCE(pipe->readers);
426424
}
427425

@@ -659,7 +657,7 @@ pipe_poll(struct file *filp, poll_table *wait)
659657
{
660658
__poll_t mask;
661659
struct pipe_inode_info *pipe = filp->private_data;
662-
unsigned int head, tail;
660+
union pipe_index idx;
663661

664662
/* Epoll has some historical nasty semantics, this enables them */
665663
WRITE_ONCE(pipe->poll_usage, true);
@@ -680,19 +678,18 @@ pipe_poll(struct file *filp, poll_table *wait)
680678
* if something changes and you got it wrong, the poll
681679
* table entry will wake you up and fix it.
682680
*/
683-
head = READ_ONCE(pipe->head);
684-
tail = READ_ONCE(pipe->tail);
681+
idx.head_tail = READ_ONCE(pipe->head_tail);
685682

686683
mask = 0;
687684
if (filp->f_mode & FMODE_READ) {
688-
if (!pipe_empty(head, tail))
685+
if (!pipe_empty(idx.head, idx.tail))
689686
mask |= EPOLLIN | EPOLLRDNORM;
690687
if (!pipe->writers && filp->f_pipe != pipe->w_counter)
691688
mask |= EPOLLHUP;
692689
}
693690

694691
if (filp->f_mode & FMODE_WRITE) {
695-
if (!pipe_full(head, tail, pipe->max_usage))
692+
if (!pipe_full(idx.head, idx.tail, pipe->max_usage))
696693
mask |= EPOLLOUT | EPOLLWRNORM;
697694
/*
698695
* Most Unices do not set EPOLLERR for FIFOs but on Linux they

include/linux/pipe_fs_i.h

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,33 @@ struct pipe_buffer {
3131
unsigned long private;
3232
};
3333

34+
/*
35+
* Really only alpha needs 32-bit fields, but
36+
* might as well do it for 64-bit architectures
37+
* since that's what we've historically done,
38+
* and it makes 'head_tail' always be a simple
39+
* 'unsigned long'.
40+
*/
41+
#ifdef CONFIG_64BIT
42+
typedef unsigned int pipe_index_t;
43+
#else
44+
typedef unsigned short pipe_index_t;
45+
#endif
46+
47+
/*
48+
* We have to declare this outside 'struct pipe_inode_info',
49+
* but then we can't use 'union pipe_index' for an anonymous
50+
* union, so we end up having to duplicate this declaration
51+
* below. Annoying.
52+
*/
53+
union pipe_index {
54+
unsigned long head_tail;
55+
struct {
56+
pipe_index_t head;
57+
pipe_index_t tail;
58+
};
59+
};
60+
3461
/**
3562
* struct pipe_inode_info - a linux kernel pipe
3663
* @mutex: mutex protecting the whole thing
@@ -58,8 +85,16 @@ struct pipe_buffer {
5885
struct pipe_inode_info {
5986
struct mutex mutex;
6087
wait_queue_head_t rd_wait, wr_wait;
61-
unsigned int head;
62-
unsigned int tail;
88+
89+
/* This has to match the 'union pipe_index' above */
90+
union {
91+
unsigned long head_tail;
92+
struct {
93+
pipe_index_t head;
94+
pipe_index_t tail;
95+
};
96+
};
97+
6398
unsigned int max_usage;
6499
unsigned int ring_size;
65100
unsigned int nr_accounted;

0 commit comments

Comments
 (0)