Commit 7c5afcf

add direct map removal patches to secret hiding CI
Add an updated version of relevant patches from my v4 direct map removal series [1]. Updated here means:

- Drop all selftests patches, as they are irrelevant for our CI
- Address comments from David about squashing commits
- Rebase on top of Fuad's v7

[1]: https://lore.kernel.org/kvm/20250221160728.1584559-1-roypat@amazon.co.uk/

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
1 parent 7e34e20 · commit 7c5afcf

File tree: 2 files changed, +386 -0 lines changed

File 1 of 2 (new file): 208 additions & 0 deletions

From 138b7a4c83c43b42851cb8fec2bbdbaadd960241 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Fri, 7 Feb 2025 11:16:06 +0000
Subject: [PATCH 1/2] mm: introduce AS_NO_DIRECT_MAP

Add AS_NO_DIRECT_MAP for mappings where direct map entries of folios are
set to not-present. Currently, mappings that match this description are
secretmem mappings (memfd_secret()). Later, some guest_memfd
configurations will also fall into this category.

Reject this new type of mapping in all locations that currently reject
secretmem mappings, on the assumption that if secretmem mappings are
rejected somewhere, it is precisely because of an inability to deal with
folios without direct map entries, and then make memfd_secret() use
AS_NO_DIRECT_MAP on its address_space to drop its special
vma_is_secretmem()/secretmem_mapping() checks.

This drops an optimization in gup_fast_folio_allowed() where
secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is
enabled by default since commit b758fe6df50d ("mm/secretmem: make it on
by default"), so the secretmem check did not actually end up elided in
most cases anymore anyway.

Use a new flag instead of overloading AS_INACCESSIBLE (which is already
set by guest_memfd) because not all guest_memfd mappings will end up
being direct map removed (e.g. in pKVM setups, parts of guest_memfd that
can be mapped to userspace should also be GUP-able, and generally not
have restrictions on who can access them).

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 include/linux/pagemap.h   | 16 ++++++++++++++++
 include/linux/secretmem.h | 18 ------------------
 lib/buildid.c             |  4 ++--
 mm/gup.c                  | 14 +++-----------
 mm/mlock.c                |  2 +-
 mm/secretmem.c            |  6 +-----
 6 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 47bfc6b1b632..903b41e89cf8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -210,6 +210,7 @@ enum mapping_flags {
         AS_STABLE_WRITES = 7,   /* must wait for writeback before modifying
                                    folio contents */
         AS_INACCESSIBLE = 8,    /* Do not attempt direct R/W access to the mapping */
+        AS_NO_DIRECT_MAP = 9,   /* Folios in the mapping are not in the direct map */
         /* Bits 16-25 are used for FOLIO_ORDER */
         AS_FOLIO_ORDER_BITS = 5,
         AS_FOLIO_ORDER_MIN = 16,
@@ -335,6 +336,21 @@ static inline bool mapping_inaccessible(struct address_space *mapping)
         return test_bit(AS_INACCESSIBLE, &mapping->flags);
 }

+static inline void mapping_set_no_direct_map(struct address_space *mapping)
+{
+        set_bit(AS_NO_DIRECT_MAP, &mapping->flags);
+}
+
+static inline bool mapping_no_direct_map(struct address_space *mapping)
+{
+        return test_bit(AS_NO_DIRECT_MAP, &mapping->flags);
+}
+
+static inline bool vma_is_no_direct_map(const struct vm_area_struct *vma)
+{
+        return vma->vm_file && mapping_no_direct_map(vma->vm_file->f_mapping);
+}
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
         return mapping->gfp_mask;
diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index e918f96881f5..0ae1fb057b3d 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -4,28 +4,10 @@

 #ifdef CONFIG_SECRETMEM

-extern const struct address_space_operations secretmem_aops;
-
-static inline bool secretmem_mapping(struct address_space *mapping)
-{
-        return mapping->a_ops == &secretmem_aops;
-}
-
-bool vma_is_secretmem(struct vm_area_struct *vma);
 bool secretmem_active(void);

 #else

-static inline bool vma_is_secretmem(struct vm_area_struct *vma)
-{
-        return false;
-}
-
-static inline bool secretmem_mapping(struct address_space *mapping)
-{
-        return false;
-}
-
 static inline bool secretmem_active(void)
 {
         return false;
diff --git a/lib/buildid.c b/lib/buildid.c
index c4b0f376fb34..33f173a607ad 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -65,8 +65,8 @@ static int freader_get_folio(struct freader *r, loff_t file_off)

         freader_put_folio(r);

-        /* reject secretmem folios created with memfd_secret() */
-        if (secretmem_mapping(r->file->f_mapping))
+        /* reject secretmem folios created with memfd_secret() or guest_memfd() */
+        if (mapping_no_direct_map(r->file->f_mapping))
                 return -EFAULT;

         r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT);
diff --git a/mm/gup.c b/mm/gup.c
index 3883b307780e..b1483a876740 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1283,7 +1283,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
         if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
                 return -EOPNOTSUPP;

-        if (vma_is_secretmem(vma))
+        if (vma_is_no_direct_map(vma))
                 return -EFAULT;

         if (write) {
@@ -2786,7 +2786,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
 {
         bool reject_file_backed = false;
         struct address_space *mapping;
-        bool check_secretmem = false;
         unsigned long mapping_flags;

         /*
@@ -2798,14 +2797,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
                 reject_file_backed = true;

         /* We hold a folio reference, so we can safely access folio fields. */
-
-        /* secretmem folios are always order-0 folios. */
-        if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
-                check_secretmem = true;
-
-        if (!reject_file_backed && !check_secretmem)
-                return true;
-
         if (WARN_ON_ONCE(folio_test_slab(folio)))
                 return false;

@@ -2847,8 +2838,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
          * At this point, we know the mapping is non-null and points to an
          * address_space object.
          */
-        if (check_secretmem && secretmem_mapping(mapping))
+        if (mapping_no_direct_map(mapping))
                 return false;
+
         /* The only remaining allowed file system is shmem. */
         return !reject_file_backed || shmem_mapping(mapping);
 }
diff --git a/mm/mlock.c b/mm/mlock.c
index cde076fa7d5e..a43f308be70d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,

         if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
             is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
-            vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
+            vma_is_dax(vma) || vma_is_no_direct_map(vma) || (oldflags & VM_DROPPABLE))
                 /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
                 goto out;

diff --git a/mm/secretmem.c b/mm/secretmem.c
index 1b0a214ee558..ea4c04d469b1 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -136,11 +136,6 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
         return 0;
 }

-bool vma_is_secretmem(struct vm_area_struct *vma)
-{
-        return vma->vm_ops == &secretmem_vm_ops;
-}
-
 static const struct file_operations secretmem_fops = {
         .release = secretmem_release,
         .mmap = secretmem_mmap,
@@ -214,6 +209,7 @@ static struct file *secretmem_file_create(unsigned long flags)

         mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
         mapping_set_unevictable(inode->i_mapping);
+        mapping_set_no_direct_map(inode->i_mapping);

         inode->i_op = &secretmem_iops;
         inode->i_mapping->a_ops = &secretmem_aops;
--
2.48.1
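
The pagemap.h helpers above (mapping_set_no_direct_map(), mapping_no_direct_map(), vma_is_no_direct_map()) are the whole interface of this patch. As a minimal sketch (not part of the patch, and with hypothetical example_* names), this is how an in-kernel user would opt an address_space into AS_NO_DIRECT_MAP at inode setup time, mirroring secretmem_file_create(), and how generic code is expected to reject such mappings:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Flag the mapping when the inode is created, before any folio is faulted
 * in, mirroring what secretmem_file_create() does after this patch. */
static void example_setup_inode(struct inode *inode)
{
        mapping_set_unevictable(inode->i_mapping);
        mapping_set_no_direct_map(inode->i_mapping);
}

/* Generic code that needs the folio to be reachable through the direct map
 * (GUP, build-ID parsing, ...) bails out, exactly like the call sites
 * converted above. */
static int example_may_touch_via_direct_map(struct vm_area_struct *vma)
{
        if (vma_is_no_direct_map(vma))
                return -EFAULT;
        return 0;
}

The check_vma_flags() and mlock_fixup() hunks above follow the same pattern as the second helper.
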
File 2 of 2 (new file): 178 additions & 0 deletions

From 9bbc39f9c7622f0060d395b1063a564c24926d8d Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat@amazon.co.uk>
Date: Fri, 7 Feb 2025 14:33:01 +0000
Subject: [PATCH 2/2] KVM: guest_memfd: Add flag to remove from direct map

Add KVM_GMEM_NO_DIRECT_MAP flag for the KVM_CREATE_GUEST_MEMFD() ioctl.
When set, guest_memfd folios will be removed from the direct map after
preparation, with direct map entries only restored when the folios are
freed.

To ensure these folios do not end up in places where the kernel cannot
deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct
address_space if KVM_GMEM_NO_DIRECT_MAP is requested.

Add KVM_CAP_GMEM_NO_DIRECT_MAP to let userspace discover whether
guest_memfd supports KVM_GMEM_NO_DIRECT_MAP. Support depends on
guest_memfd itself being supported, but also on whether KVM can
manipulate the direct map at page granularity at all (possible most of
the time; arm64 is a notable outlier where it is impossible if the
direct map has been set up using hugepages, as arm64 cannot break these
apart due to break-before-make semantics).

Note that this flag causes removal of direct map entries for all
guest_memfd folios, independent of whether they are "shared" or
"private" (although current guest_memfd only supports either all folios
in the "shared" state, or all folios in the "private" state if
!IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM)). The use case for also removing
direct map entries of the shared parts of guest_memfd is a special type
of non-CoCo VM where host userspace is trusted to have access to all of
guest memory, but where Spectre-style transient execution attacks
through the host kernel's direct map should still be mitigated.

Note that KVM retains access to guest memory via userspace mappings of
guest_memfd, which are reflected back into KVM's memslots via
userspace_addr. This is needed for things like MMIO emulation on x86_64
to work. Previous iterations attempted to instead have KVM temporarily
restore direct map entries whenever such an access to guest memory was
needed, but this turned out to have a significant performance impact, as
well as additional complexity due to needing to refcount direct map
reinsertion operations and making them play nicely with gmem
truncations.

This iteration also doesn't have KVM perform TLB flushes after direct
map manipulations. This is because TLB flushes resulted in an up to 40x
elongation of page faults in guest_memfd (scaling with the number of
CPU cores), or a 5x elongation of memory population. On the one hand,
TLB flushes are not needed for functional correctness (the virt->phys
mapping technically stays "correct"; the kernel simply must not use it
for a while), so this is a correct optimization to make. On the other
hand, it means that the desired protection from Spectre-style attacks
is not perfect, as an attacker could try to prevent a stale TLB entry
from getting evicted, keeping it alive until the page it refers to is
used by the guest for some sensitive data, and then targeting it using
a Spectre gadget.

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 include/uapi/linux/kvm.h |  3 +++
 virt/kvm/guest_memfd.c   | 28 +++++++++++++++++++++++++++-
 virt/kvm/kvm_main.c      |  5 +++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 117937a895da..fb02a93546d8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -930,6 +930,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
 #define KVM_CAP_X86_GUEST_MODE 238
 #define KVM_CAP_GMEM_SHARED_MEM 239
+#define KVM_CAP_GMEM_NO_DIRECT_MAP 240

 struct kvm_irq_routing_irqchip {
         __u32 irqchip;
@@ -1573,6 +1574,8 @@ struct kvm_create_guest_memfd {
         __u64 reserved[6];
 };

+#define KVM_GMEM_NO_DIRECT_MAP (1ULL << 0)
+
 #define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)

 struct kvm_pre_fault_memory {
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fbf89e643add..a2b96bc51391 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
 #include <linux/kvm_host.h>
 #include <linux/pagemap.h>
 #include <linux/anon_inodes.h>
+#include <linux/set_memory.h>

 #include "kvm_mm.h"

@@ -50,8 +51,23 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
         return 0;
 }

+static bool kvm_gmem_test_no_direct_map(struct inode *inode)
+{
+        return ((unsigned long) inode->i_private) & KVM_GMEM_NO_DIRECT_MAP;
+}
+
 static inline void kvm_gmem_mark_prepared(struct folio *folio)
 {
+        struct inode *inode = folio_inode(folio);
+
+        if (kvm_gmem_test_no_direct_map(inode)) {
+                int r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
+                                                     false);
+
+                if (!r)
+                        folio_set_private(folio);
+        }
+
         folio_mark_uptodate(folio);
 }

@@ -478,6 +494,10 @@ static void kvm_gmem_free_folio(struct folio *folio)
         kvm_pfn_t pfn = page_to_pfn(page);
         int order = folio_order(folio);

+        if (folio_test_private(folio))
+                WARN_ON_ONCE(set_direct_map_valid_noflush(folio_page(folio, 0),
+                                                          folio_nr_pages(folio), true));
+
         kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
 }
 #endif
@@ -551,6 +571,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
         /* Unmovable mappings are supposed to be marked unevictable as well. */
         WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

+        if (flags & KVM_GMEM_NO_DIRECT_MAP)
+                mapping_set_no_direct_map(inode->i_mapping);
+
         kvm_get_kvm(kvm);
         gmem->kvm = kvm;
         xa_init(&gmem->bindings);
@@ -570,7 +593,10 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
 {
         loff_t size = args->size;
         u64 flags = args->flags;
-        u64 valid_flags = 0;
+        u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP;
+
+        if (!can_set_direct_map())
+                valid_flags &= ~KVM_GMEM_NO_DIRECT_MAP;

         if (flags & ~valid_flags)
                 return -EINVAL;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3e40acb9f5c0..32ca1c921ab0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -65,6 +65,7 @@
 #include <trace/events/kvm.h>

 #include <linux/kvm_dirty_ring.h>
+#include <linux/set_memory.h>


 /* Worst case buffer size needed for holding an integer. */
@@ -4823,6 +4824,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
                 return kvm_supported_mem_attributes(kvm);
 #endif
 #ifdef CONFIG_KVM_PRIVATE_MEM
+        case KVM_CAP_GMEM_NO_DIRECT_MAP:
+                if (!can_set_direct_map())
+                        return false;
+                fallthrough;
         case KVM_CAP_GUEST_MEMFD:
                 return !kvm || kvm_arch_has_private_mem(kvm);
 #endif
--
2.48.1
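
For context, a minimal userspace sketch of how a VMM would consume the new UAPI: probe KVM_CAP_GMEM_NO_DIRECT_MAP and, if it is reported, pass KVM_GMEM_NO_DIRECT_MAP to KVM_CREATE_GUEST_MEMFD(). This is not part of the patches; it assumes a kernel with both patches applied and the matching <linux/kvm.h> UAPI header (the capability and flag values exist only with patch 2), and the 1 GiB size is an arbitrary example.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
        if (kvm < 0) {
                perror("open /dev/kvm");
                return 1;
        }

        int vm = ioctl(kvm, KVM_CREATE_VM, 0);
        if (vm < 0) {
                perror("KVM_CREATE_VM");
                return 1;
        }

        /* The capability is only reported when the kernel can manipulate the
         * direct map at page granularity (e.g. not on arm64 with a
         * hugepage-mapped direct map). */
        if (ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_GMEM_NO_DIRECT_MAP) <= 0) {
                fprintf(stderr, "KVM_GMEM_NO_DIRECT_MAP not supported here\n");
                return 1;
        }

        struct kvm_create_guest_memfd gmem = {
                .size  = 1ULL << 30,             /* example: 1 GiB of guest memory */
                .flags = KVM_GMEM_NO_DIRECT_MAP, /* drop folios from the direct map */
        };

        int gmem_fd = ioctl(vm, KVM_CREATE_GUEST_MEMFD, &gmem);
        if (gmem_fd < 0) {
                perror("KVM_CREATE_GUEST_MEMFD");
                return 1;
        }

        /* gmem_fd now backs guest memory whose folios are unmapped from the
         * kernel's direct map once they have been prepared. */
        printf("created guest_memfd (fd %d) with KVM_GMEM_NO_DIRECT_MAP\n", gmem_fd);
        return 0;
}

The fd is then bound into a memslot via KVM_SET_USER_MEMORY_REGION2 just like any other guest_memfd; the flag only changes what happens to the folios' direct map entries on the kernel side.
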
