Skip to content

Commit 024996d

Browse files
davidhildenbrand authored and Ma Wupeng committed
mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY properly
stable inclusion from stable-v6.6.30 commit 9e898211704cade0616fa876108d366f88a9624f bugzilla: https://gitee.com/openeuler/kernel/issues/I9MPZ8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9e898211704cade0616fa876108d366f88a9624f -------------------------------- [ Upstream commit 631426ba1d45a8672b177ee85ad4cabe760dd131 ] Darrick reports that in some cases where pread() would fail with -EIO and mmap()+access would generate a SIGBUS signal, MADV_POPULATE_READ / MADV_POPULATE_WRITE will keep retrying forever and not fail with -EFAULT. While the madvise() call can be interrupted by a signal, this is not the desired behavior. MADV_POPULATE_READ / MADV_POPULATE_WRITE should behave like page faults in that case: fail and not retry forever. A reproducer can be found at [1]. The reason is that __get_user_pages(), as called by faultin_vma_page_range(), will not handle VM_FAULT_RETRY in a proper way: it will simply return 0 when VM_FAULT_RETRY happened, making madvise_populate()->faultin_vma_page_range() retry again and again, never setting FOLL_TRIED->FAULT_FLAG_TRIED for __get_user_pages(). __get_user_pages_locked() does what we want, but duplicating that logic in faultin_vma_page_range() feels wrong. So let's use __get_user_pages_locked() instead, that will detect VM_FAULT_RETRY and set FOLL_TRIED when retrying, making the fault handler return VM_FAULT_SIGBUS (VM_FAULT_ERROR) at some point, propagating -EFAULT from faultin_page() to __get_user_pages(), all the way to madvise_populate(). But, there is an issue: __get_user_pages_locked() will end up re-taking the MM lock and then __get_user_pages() will do another VMA lookup. In the meantime, the VMA layout could have changed and we'd fail with different error codes than we'd want to. 
As __get_user_pages() will currently do a new VMA lookup either way, let it do the VMA handling in a different way, controlled by a new FOLL_MADV_POPULATE flag, effectively moving these checks from madvise_populate() + faultin_page_range() in there. With this change, Darrick's reproducer properly fails with -EFAULT, as documented for MADV_POPULATE_READ / MADV_POPULATE_WRITE. [1] https://lore.kernel.org/all/20240313171936.GN1927156@frogsfrogsfrogs/ Link: https://lkml.kernel.org/r/20240314161300.382526-1-david@redhat.com Link: https://lkml.kernel.org/r/20240314161300.382526-2-david@redhat.com Fixes: 4ca9b38 ("mm/madvise: introduce MADV_POPULATE_(READ|WRITE) to prefault page tables") Signed-off-by: David Hildenbrand <david@redhat.com> Reported-by: Darrick J. Wong <djwong@kernel.org> Closes: https://lore.kernel.org/all/20240311223815.GW1927156@frogsfrogsfrogs/ Cc: Darrick J. Wong <djwong@kernel.org> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Sasha Levin <sashal@kernel.org> Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
1 parent 583ef6b commit 024996d

File tree

3 files changed

+40
-41
lines changed

3 files changed

+40
-41
lines changed

mm/gup.c

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,6 +1204,22 @@ static long __get_user_pages(struct mm_struct *mm,
12041204

12051205
/* first iteration or cross vma bound */
12061206
if (!vma || start >= vma->vm_end) {
1207+
/*
1208+
* MADV_POPULATE_(READ|WRITE) wants to handle VMA
1209+
* lookups+error reporting differently.
1210+
*/
1211+
if (gup_flags & FOLL_MADV_POPULATE) {
1212+
vma = vma_lookup(mm, start);
1213+
if (!vma) {
1214+
ret = -ENOMEM;
1215+
goto out;
1216+
}
1217+
if (check_vma_flags(vma, gup_flags)) {
1218+
ret = -EINVAL;
1219+
goto out;
1220+
}
1221+
goto retry;
1222+
}
12071223
vma = gup_vma_lookup(mm, start);
12081224
if (!vma && in_gate_area(mm, start)) {
12091225
ret = get_gate_page(mm, start & PAGE_MASK,
@@ -1670,35 +1686,35 @@ long populate_vma_page_range(struct vm_area_struct *vma,
16701686
}
16711687

16721688
/*
1673-
* faultin_vma_page_range() - populate (prefault) page tables inside the
1674-
* given VMA range readable/writable
1689+
* faultin_page_range() - populate (prefault) page tables inside the
1690+
* given range readable/writable
16751691
*
16761692
* This takes care of mlocking the pages, too, if VM_LOCKED is set.
16771693
*
1678-
* @vma: target vma
1694+
* @mm: the mm to populate page tables in
16791695
* @start: start address
16801696
* @end: end address
16811697
* @write: whether to prefault readable or writable
16821698
* @locked: whether the mmap_lock is still held
16831699
*
1684-
* Returns either number of processed pages in the vma, or a negative error
1685-
* code on error (see __get_user_pages()).
1700+
* Returns either number of processed pages in the MM, or a negative error
1701+
* code on error (see __get_user_pages()). Note that this function reports
1702+
* errors related to VMAs, such as incompatible mappings, as expected by
1703+
* MADV_POPULATE_(READ|WRITE).
16861704
*
1687-
* vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
1688-
* covered by the VMA. If it's released, *@locked will be set to 0.
1705+
* The range must be page-aligned.
1706+
*
1707+
* mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
16891708
*/
1690-
long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
1691-
unsigned long end, bool write, int *locked)
1709+
long faultin_page_range(struct mm_struct *mm, unsigned long start,
1710+
unsigned long end, bool write, int *locked)
16921711
{
1693-
struct mm_struct *mm = vma->vm_mm;
16941712
unsigned long nr_pages = (end - start) / PAGE_SIZE;
16951713
int gup_flags;
16961714
long ret;
16971715

16981716
VM_BUG_ON(!PAGE_ALIGNED(start));
16991717
VM_BUG_ON(!PAGE_ALIGNED(end));
1700-
VM_BUG_ON_VMA(start < vma->vm_start, vma);
1701-
VM_BUG_ON_VMA(end > vma->vm_end, vma);
17021718
mmap_assert_locked(mm);
17031719

17041720
/*
@@ -1710,19 +1726,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
17101726
* a poisoned page.
17111727
* !FOLL_FORCE: Require proper access permissions.
17121728
*/
1713-
gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
1729+
gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
1730+
FOLL_MADV_POPULATE;
17141731
if (write)
17151732
gup_flags |= FOLL_WRITE;
17161733

1717-
/*
1718-
* We want to report -EINVAL instead of -EFAULT for any permission
1719-
* problems or incompatible mappings.
1720-
*/
1721-
if (check_vma_flags(vma, gup_flags))
1722-
return -EINVAL;
1723-
1724-
ret = __get_user_pages(mm, start, nr_pages, gup_flags,
1725-
NULL, locked);
1734+
ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
1735+
gup_flags);
17261736
lru_add_drain();
17271737
return ret;
17281738
}

mm/internal.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -694,9 +694,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio);
694694
void unmap_mapping_folio(struct folio *folio);
695695
extern long populate_vma_page_range(struct vm_area_struct *vma,
696696
unsigned long start, unsigned long end, int *locked);
697-
extern long faultin_vma_page_range(struct vm_area_struct *vma,
698-
unsigned long start, unsigned long end,
699-
bool write, int *locked);
697+
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
698+
unsigned long end, bool write, int *locked);
700699
extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
701700
unsigned long bytes);
702701

@@ -1127,10 +1126,13 @@ enum {
11271126
FOLL_FAST_ONLY = 1 << 20,
11281127
/* allow unlocking the mmap lock */
11291128
FOLL_UNLOCKABLE = 1 << 21,
1129+
/* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
1130+
FOLL_MADV_POPULATE = 1 << 22,
11301131
};
11311132

11321133
#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
1133-
FOLL_FAST_ONLY | FOLL_UNLOCKABLE)
1134+
FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
1135+
FOLL_MADV_POPULATE)
11341136

11351137
/*
11361138
* Indicates for which pages that are write-protected in the page table,

mm/madvise.c

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -938,27 +938,14 @@ static long madvise_populate(struct vm_area_struct *vma,
938938
{
939939
const bool write = behavior == MADV_POPULATE_WRITE;
940940
struct mm_struct *mm = vma->vm_mm;
941-
unsigned long tmp_end;
942941
int locked = 1;
943942
long pages;
944943

945944
*prev = vma;
946945

947946
while (start < end) {
948-
/*
949-
* We might have temporarily dropped the lock. For example,
950-
* our VMA might have been split.
951-
*/
952-
if (!vma || start >= vma->vm_end) {
953-
vma = vma_lookup(mm, start);
954-
if (!vma)
955-
return -ENOMEM;
956-
}
957-
958-
tmp_end = min_t(unsigned long, end, vma->vm_end);
959947
/* Populate (prefault) page tables readable/writable. */
960-
pages = faultin_vma_page_range(vma, start, tmp_end, write,
961-
&locked);
948+
pages = faultin_page_range(mm, start, end, write, &locked);
962949
if (!locked) {
963950
mmap_read_lock(mm);
964951
locked = 1;
@@ -979,7 +966,7 @@ static long madvise_populate(struct vm_area_struct *vma,
979966
pr_warn_once("%s: unhandled return value: %ld\n",
980967
__func__, pages);
981968
fallthrough;
982-
case -ENOMEM:
969+
case -ENOMEM: /* No VMA or out of memory. */
983970
return -ENOMEM;
984971
}
985972
}

0 commit comments

Comments
 (0)