
Commit c68d332

thejhopsiff authored and committed
mm/hugetlb: unshare page tables during VMA split, not before
commit 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 upstream.

Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split(). This happens before the VMA lock and rmap locks are
taken - which is too early: it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.

Fix it by explicitly calling into the hugetlb unshare logic from
__split_vma() in the same place where THP splitting also happens. At that
point, both the VMA and the rmap(s) are write-locked.

An annoying detail is that we can now call into the helper
hugetlb_unshare_pmds() from two different locking contexts:

1. from hugetlb_split(), holding:
    - mmap lock (exclusively)
    - VMA lock
    - file rmap lock (exclusively)

2. from hugetlb_unshare_all_pmds(), which I think is designed to be able
   to call us with only the mmap lock held (in shared mode), but currently
   only runs while holding the mmap lock (exclusively) and the VMA lock

Backporting note: This commit fixes a racy protection that was introduced
in commit b30c14c ("hugetlb: unshare some PMDs when splitting VMAs"); that
commit claimed to fix an issue introduced in 5.13, but it should actually
also go all the way back.

[jannh@google.com: v2]
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
Fixes: 39dde65 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org> [b30c14c: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[stable backport: added missing include]
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
(cherry picked from commit 9cf5b2a3b72c23fb7b84736d5d19ee6ea718762b)
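As a worked example of the PUD_SIZE alignment reasoning above: a minimal
userspace sketch, assuming x86-64's 1 GiB PUD_SIZE, of the range computation
that hugetlb_split() performs in the mm/hugetlb.c hunk below. The split
address and everything outside the patch itself are hypothetical.

#include <stdio.h>

#define PUD_SIZE (1UL << 30)		/* 1 GiB on x86-64 (assumption) */
#define PUD_MASK (~(PUD_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x7f1240200000UL;	/* hypothetical split point */

	if (addr & ~PUD_MASK) {
		/* The split breaks PUD_SIZE alignment, so PMDs shared
		 * anywhere in the surrounding PUD_SIZE interval would
		 * straddle the split and must be unshared first. */
		unsigned long floor = addr & PUD_MASK;
		unsigned long ceil = floor + PUD_SIZE;

		printf("unshare PMDs in [%#lx, %#lx)\n", floor, ceil);
	}
	return 0;
}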
1 parent 5a77cff commit c68d332

File tree

5 files changed: +57, -16 lines

include/linux/hugetlb.h

Lines changed: 3 additions & 0 deletions
@@ -272,6 +272,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 bool is_hugetlb_entry_migration(pte_t pte);
 bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -465,6 +466,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 
 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
 
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #ifndef pgd_write

mm/hugetlb.c

Lines changed: 44 additions & 16 deletions
@@ -87,7 +87,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
+		unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static void hugetlb_free_folio(struct folio *folio)
@@ -5071,26 +5071,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
 		return -EINVAL;
+	return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
 	/*
 	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
 	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
 	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+	 * This function is called in the middle of a VMA split operation, with
+	 * MM, VMA and rmap all write-locked to prevent concurrent page table
+	 * walks (except hardware and gup_fast()).
 	 */
+	vma_assert_write_locked(vma);
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
 	if (addr & ~PUD_MASK) {
-		/*
-		 * hugetlb_vm_op_split is called right before we attempt to
-		 * split the VMA. We will need to unshare PMDs in the old and
-		 * new VMAs, so let's unshare before we split.
-		 */
 		unsigned long floor = addr & PUD_MASK;
 		unsigned long ceil = floor + PUD_SIZE;
 
-		if (floor >= vma->vm_start && ceil <= vma->vm_end)
-			hugetlb_unshare_pmds(vma, floor, ceil);
+		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+			/*
+			 * Locking:
+			 * Use take_locks=false here.
+			 * The file rmap lock is already held.
+			 * The hugetlb VMA lock can't be taken when we already
+			 * hold the file rmap lock, and we don't need it because
+			 * its purpose is to synchronize against concurrent page
+			 * table walks, which are not possible thanks to the
+			 * locks held by our caller.
+			 */
+			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+		}
 	}
-
-	return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7491,9 +7505,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
 	}
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 				unsigned long start,
-				unsigned long end)
+				unsigned long end,
+				bool take_locks)
 {
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -7517,8 +7538,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
 				start, end);
 	mmu_notifier_invalidate_range_start(&range);
-	hugetlb_vma_lock_write(vma);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (take_locks) {
+		hugetlb_vma_lock_write(vma);
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	}
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = hugetlb_walk(vma, address, sz);
 		if (!ptep)
@@ -7528,8 +7553,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	hugetlb_vma_unlock_write(vma);
+	if (take_locks) {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 	/*
 	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
 	 * Documentation/mm/mmu_notifier.rst.
@@ -7544,7 +7571,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+			/* take_locks = */ true);
 }
 
 #ifdef CONFIG_CMA
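A note on the take_locks design: one helper now serves a caller that has
already taken the locks (hugetlb_split()) and a caller that needs the helper
to take them (hugetlb_unshare_all_pmds()). The userspace sketch below is a
minimal analogue of that conditional-locking flow; every name in it is made
up for illustration and nothing here is a kernel API.

#include <pthread.h>

static pthread_mutex_t rmap_lock = PTHREAD_MUTEX_INITIALIZER;

static void unshare_range(unsigned long start, unsigned long end,
			  int take_locks)
{
	if (take_locks)
		pthread_mutex_lock(&rmap_lock);
	/* else: the caller must already hold rmap_lock and must have
	 * excluded all other concurrent walkers itself. */

	(void)start;
	(void)end;	/* the actual range walk is elided in this sketch */

	if (take_locks)
		pthread_mutex_unlock(&rmap_lock);
}

int main(void)
{
	/* Like hugetlb_unshare_all_pmds(): let the helper take the locks. */
	unshare_range(0, 4096, /* take_locks = */ 1);

	/* Like hugetlb_split(): the lock is already held, so pass false. */
	pthread_mutex_lock(&rmap_lock);
	unshare_range(0, 4096, /* take_locks = */ 0);
	pthread_mutex_unlock(&rmap_lock);
	return 0;
}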

mm/vma.c

Lines changed: 7 additions & 0 deletions
@@ -416,7 +416,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	init_vma_prep(&vp, vma);
 	vp.insert = new;
 	vma_prepare(&vp);
+
+	/*
+	 * Get rid of huge pages and shared page tables straddling the split
+	 * boundary.
+	 */
 	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_split(vma, addr);
 
 	if (new_below) {
 		vma->vm_start = addr;

mm/vma_internal.h

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/huge_mm.h>
+#include <linux/hugetlb.h>
 #include <linux/hugetlb_inline.h>
 #include <linux/kernel.h>
 #include <linux/khugepaged.h>

tools/testing/vma/vma_internal.h

Lines changed: 2 additions & 0 deletions
@@ -735,6 +735,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 	(void)adjust_next;
 }
 
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
 	mas_destroy(&vmi->mas);
