
Commit b709eb8

lorenzo-stoakes authored and Peter Zijlstra committed
perf: map pages in advance
We are adjusting struct page to make it smaller, removing unneeded fields which correctly belong to struct folio.

Two of those fields are page->index and page->mapping. Perf currently makes use of both of these. This is unnecessary, and this patch eliminates it.

Perf establishes its own internally controlled memory-mapped pages using vm_ops hooks. The first page in the mapping is the read/write user control page, and the rest of the mapping consists of read-only pages.

The VMA is backed by kernel memory, either from the buddy allocator or vmalloc depending on configuration. It is intended to be mapped read/write, but because it has a page_mkwrite() hook, vma_wants_writenotify() indicates that it should be mapped read-only.

When a write fault occurs, the provided page_mkwrite() hook, perf_mmap_fault() (which does double duty handling read faults as well), uses the vmf->pgoff field to determine whether this is the first page, allowing for the desired read/write first page, read-only rest mapping.

For this to work, the implementation has to carefully work around faulting logic. When a page is write-faulted, the fault() hook is called first, then its page_mkwrite() hook is called (to allow for dirty tracking in file systems).

On fault we set the folio's mapping in perf_mmap_fault(); this is because when do_page_mkwrite() is subsequently invoked, it treats a missing mapping as an indicator that the fault should be retried. We also set the folio's index so that, given the folio is being treated as faux user memory, it correctly references its offset within the VMA.

This explains why the mapping and index fields are used - but it's not necessary.

We preallocate pages when perf_mmap() is called for the first time via rb_alloc(), and further allocate auxiliary pages via rb_aux_alloc() as needed if the mapping requires it. This allocation is done in the f_ops->mmap() hook provided in perf_mmap(), so we can instead simply map all the memory right away here - there is no point in handling (read) page faults when we neither demand-page nor need to be notified about them (perf does not).

This patch therefore changes this logic to map everything when the mmap() hook is called, establishing a PFN map. It implements vm_ops->pfn_mkwrite() to provide the required read/write vs. read-only behaviour, which does not require the previously implemented workarounds.

While it is not ideal to use a VM_PFNMAP here, doing anything else would result in the page_mkwrite() hook needing to be provided, which requires the same page->mapping hack this patch seeks to undo. It would also result in the pages being treated as folios and placed on the rmap, which really does not make sense for these mappings.

Semantically it makes sense to establish this as some kind of special mapping, as the pages are managed by perf and are not strictly user pages, but currently the only means by which we can do so functionally, while maintaining the required R/W and R/O behaviour, is a PFN map.

There should be no change to actual functionality as a result of this change.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250103153151.124163-1-lorenzo.stoakes@oracle.com
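For context, here is a minimal userspace sketch (not part of this patch) of how such a perf ring-buffer mapping is consumed; the event configuration and buffer size are arbitrary assumptions chosen only for illustration. It shows the behaviour the patch preserves: the first page (struct perf_event_mmap_page) may be written, e.g. to advance data_tail, while the data pages that follow are read-only.

/*
 * Illustrative userspace consumer of a perf mmap ring buffer.
 * Assumptions (not taken from the patch): a software CPU-clock event,
 * a buffer of 1 control page + 8 data pages.
 */
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        size_t page = (size_t)sysconf(_SC_PAGESIZE);
        size_t len = (1 + 8) * page;    /* control page + 2^3 data pages */
        void *base;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_CPU_CLOCK;
        attr.sample_period = 100000;
        attr.sample_type = PERF_SAMPLE_IP;
        attr.disabled = 1;

        fd = (int)syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (base == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        struct perf_event_mmap_page *meta = base;

        /* Writing the control page is allowed: consumers advance data_tail. */
        __atomic_store_n(&meta->data_tail,
                         __atomic_load_n(&meta->data_head, __ATOMIC_ACQUIRE),
                         __ATOMIC_RELEASE);

        /*
         * Writing into the data pages (base + page onwards) would hit the
         * pfn_mkwrite() hook and be refused with SIGBUS.
         */

        munmap(base, len);
        close(fd);
        return 0;
}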
1 parent 6d64273 commit b709eb8

2 files changed, 82 insertions(+), 55 deletions(-)


kernel/events/core.c

Lines changed: 81 additions & 37 deletions
@@ -6277,41 +6277,6 @@ void perf_event_update_userpage(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
 
-static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
-{
-        struct perf_event *event = vmf->vma->vm_file->private_data;
-        struct perf_buffer *rb;
-        vm_fault_t ret = VM_FAULT_SIGBUS;
-
-        if (vmf->flags & FAULT_FLAG_MKWRITE) {
-                if (vmf->pgoff == 0)
-                        ret = 0;
-                return ret;
-        }
-
-        rcu_read_lock();
-        rb = rcu_dereference(event->rb);
-        if (!rb)
-                goto unlock;
-
-        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
-                goto unlock;
-
-        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
-        if (!vmf->page)
-                goto unlock;
-
-        get_page(vmf->page);
-        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
-        vmf->page->index = vmf->pgoff;
-
-        ret = 0;
-unlock:
-        rcu_read_unlock();
-
-        return ret;
-}
-
 static void ring_buffer_attach(struct perf_event *event,
                                struct perf_buffer *rb)
 {
@@ -6551,13 +6516,87 @@ static void perf_mmap_close(struct vm_area_struct *vma)
                 ring_buffer_put(rb); /* could be last */
 }
 
+static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
+{
+        /* The first page is the user control page, others are read-only. */
+        return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
+}
+
 static const struct vm_operations_struct perf_mmap_vmops = {
         .open = perf_mmap_open,
         .close = perf_mmap_close, /* non mergeable */
-        .fault = perf_mmap_fault,
-        .page_mkwrite = perf_mmap_fault,
+        .pfn_mkwrite = perf_mmap_pfn_mkwrite,
 };
 
+static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
+{
+        unsigned long nr_pages = vma_pages(vma);
+        int err = 0;
+        unsigned long pagenum;
+
+        /*
+         * We map this as a VM_PFNMAP VMA.
+         *
+         * This is not ideal as this is designed broadly for mappings of PFNs
+         * referencing memory-mapped I/O ranges or non-system RAM i.e. for which
+         * !pfn_valid(pfn).
+         *
+         * We are mapping kernel-allocated memory (memory we manage ourselves)
+         * which would more ideally be mapped using vm_insert_page() or a
+         * similar mechanism, that is as a VM_MIXEDMAP mapping.
+         *
+         * However this won't work here, because:
+         *
+         * 1. It uses vma->vm_page_prot, but this field has not been completely
+         *    setup at the point of the f_op->mmap() hook, so we are unable to
+         *    indicate that this should be mapped CoW in order that the
+         *    mkwrite() hook can be invoked to make the first page R/W and the
+         *    rest R/O as desired.
+         *
+         * 2. Anything other than a VM_PFNMAP of valid PFNs will result in
+         *    vm_normal_page() returning a struct page * pointer, which means
+         *    vm_ops->page_mkwrite() will be invoked rather than
+         *    vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
+         *    to work around retry logic in the fault handler, however this
+         *    field is no longer allowed to be used within struct page.
+         *
+         * 3. Having a struct page * made available in the fault logic also
+         *    means that the page gets put on the rmap and becomes
+         *    inappropriately accessible and subject to map and ref counting.
+         *
+         * Ideally we would have a mechanism that could explicitly express our
+         * desires, but this is not currently the case, so we instead use
+         * VM_PFNMAP.
+         *
+         * We manage the lifetime of these mappings with internal refcounts (see
+         * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
+         * this mapping is maintained correctly.
+         */
+        for (pagenum = 0; pagenum < nr_pages; pagenum++) {
+                unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
+                struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
+
+                if (page == NULL) {
+                        err = -EINVAL;
+                        break;
+                }
+
+                /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
+                err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
+                                      vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
+                if (err)
+                        break;
+        }
+
+#ifdef CONFIG_MMU
+        /* Clear any partial mappings on error. */
+        if (err)
+                zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
+#endif
+
+        return err;
+}
+
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
         struct perf_event *event = file->private_data;
@@ -6682,6 +6721,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                         goto again;
                 }
 
+                /* We need the rb to map pages. */
+                rb = event->rb;
                 goto unlock;
         }
 
@@ -6776,6 +6817,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
         vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
         vma->vm_ops = &perf_mmap_vmops;
 
+        if (!ret)
+                ret = map_range(rb, vma);
+
         if (event->pmu->event_mapped)
                 event->pmu->event_mapped(event, vma->vm_mm);

kernel/events/ring_buffer.c

Lines changed: 1 addition & 18 deletions
@@ -643,7 +643,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx)
         struct page *page = virt_to_page(rb->aux_pages[idx]);
 
         ClearPagePrivate(page);
-        page->mapping = NULL;
         __free_page(page);
 }
 
@@ -819,7 +818,6 @@ static void perf_mmap_free_page(void *addr)
 {
         struct page *page = virt_to_page(addr);
 
-        page->mapping = NULL;
         __free_page(page);
 }
 
@@ -890,28 +888,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
         return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
 }
 
-static void perf_mmap_unmark_page(void *addr)
-{
-        struct page *page = vmalloc_to_page(addr);
-
-        page->mapping = NULL;
-}
-
 static void rb_free_work(struct work_struct *work)
 {
         struct perf_buffer *rb;
-        void *base;
-        int i, nr;
 
         rb = container_of(work, struct perf_buffer, work);
-        nr = data_page_nr(rb);
-
-        base = rb->user_page;
-        /* The '<=' counts in the user page. */
-        for (i = 0; i <= nr; i++)
-                perf_mmap_unmark_page(base + (i * PAGE_SIZE));
 
-        vfree(base);
+        vfree(rb->user_page);
         kfree(rb);
 }
