Skip to content

Commit 66dab75

Browse files
committed
sandbox: Drop all capabilities that don't make sense in userns
When unsharing a user namespace, we get a full set of capabilities, of which a ton don't make sense to keep. Why drop them? Because it's possible that other tools check if they have the required capabilities to run, like systing now checking if it is invoked with CAP_BPF. If we don't drop CAP_BPF, systing will think it's able to attach BPF programs even though in reality it can't as CAP_BPF in a user namespace doesn't actually allow you to attach BPF programs. While we're at it, let's be a bit more thorough with the capability logic and make sure we modify all capability sets to only contain the capabilities we want to keep.
1 parent 8fe5df4 commit 66dab75

File tree

2 files changed

+102
-20
lines changed

2 files changed

+102
-20
lines changed

mkosi/qemu.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,7 @@ def start_virtiofsd(
350350
"--sandbox=chroot",
351351
f"--inode-file-handles={'prefer' if os.getuid() == 0 and not uidmap else 'never'}",
352352
"--log-level=error",
353+
"--modcaps=-mknod",
353354
] # fmt: skip
354355

355356
if selinux:

mkosi/sandbox.py

Lines changed: 101 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,21 @@
2222
AT_RECURSIVE = 0x8000
2323
AT_SYMLINK_NOFOLLOW = 0x100
2424
BTRFS_SUPER_MAGIC = 0x9123683E
25+
CAP_CHOWN = 0
26+
CAP_DAC_OVERRIDE = 1
27+
CAP_DAC_READ_SEARCH = 2
28+
CAP_FOWNER = 3
29+
CAP_FSETID = 4
30+
CAP_SETGID = 6
31+
CAP_SETUID = 7
32+
CAP_SETPCAP = 8
33+
CAP_NET_BIND_SERVICE = 10
2534
CAP_NET_ADMIN = 12
35+
CAP_SYS_CHROOT = 18
36+
CAP_SYS_PTRACE = 19
2637
CAP_SYS_ADMIN = 21
38+
CAP_SYS_RESOURCE = 24
39+
CAP_SETFCAP = 31
2740
CLONE_NEWIPC = 0x08000000
2841
CLONE_NEWNET = 0x40000000
2942
CLONE_NEWNS = 0x00020000
@@ -59,7 +72,10 @@
5972
OPEN_TREE_CLONE = 1
6073
OVERLAYFS_SUPER_MAGIC = 0x794C7630
6174
PR_CAP_AMBIENT = 47
75+
PR_CAP_AMBIENT_IS_SET = 1
6276
PR_CAP_AMBIENT_RAISE = 2
77+
PR_CAP_AMBIENT_LOWER = 3
78+
PR_CAPBSET_DROP = 24
6379
# These definitions are taken from the libseccomp headers
6480
SCMP_ACT_ALLOW = 0x7FFF0000
6581
SCMP_ACT_ERRNO = 0x00050000
@@ -171,41 +187,75 @@ def umount2(path: str, flags: int = 0) -> None:
171187
oserror("umount2", path)
172188

173189

174-
def cap_permitted_to_ambient() -> None:
175-
"""
176-
When unsharing a user namespace and mapping the current user to itself, the user has a full
177-
set of capabilities in the user namespace. This allows the user to do mounts after unsharing a
178-
mount namespace for example. However, these capabilities are lost again when the user executes
179-
a subprocess. As we also want subprocesses invoked by the user to be able to mount stuff, we
180-
make sure the capabilities are inherited by adding all the user's capabilities to the inherited
181-
and ambient capabilities set, which makes sure that they are passed down to subprocesses.
182-
"""
190+
def capability_mask(capabilities: list[int]) -> int:
191+
mask = 0
192+
193+
for cap in capabilities:
194+
mask |= 1 << cap
195+
196+
return mask
197+
198+
199+
def drop_capabilities(*, keep: list[int]) -> None:
200+
# First, fetch the permitted capabilities and AND them
201+
# with the ones we want to keep to get the final list
202+
# of capabilities.
203+
183204
header = cap_user_header_t(LINUX_CAPABILITY_VERSION_3, 0)
184205
payload = (cap_user_data_t * LINUX_CAPABILITY_U32S_3)()
185206

186207
if libc.capget(ctypes.addressof(header), ctypes.byref(payload)) < 0:
187208
oserror("capget")
188209

189-
payload[0].inheritable = payload[0].permitted
190-
payload[1].inheritable = payload[1].permitted
210+
permitted = payload[1].permitted << 32 | payload[0].permitted
211+
permitted &= capability_mask(keep)
191212

192-
if libc.capset(ctypes.addressof(header), ctypes.byref(payload)) < 0:
193-
oserror("capset")
194-
195-
effective = payload[1].effective << 32 | payload[0].effective
213+
# Next, drop unwanted capabilities from the bounding set as
214+
# later we'll drop the capability that lets us do so (CAP_SETPCAP).
196215

197216
with open("/proc/sys/kernel/cap_last_cap", "rb") as f:
198217
last_cap = int(f.read())
199218

200-
libc.prctl.argtypes = (ctypes.c_int, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong)
219+
libc.prctl.argtypes = (ctypes.c_int, ctypes.c_ulong)
201220

202221
for cap in range(ctypes.sizeof(ctypes.c_uint64) * 8):
203222
if cap > last_cap:
204223
break
205224

206-
if effective & (1 << cap) and libc.prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0:
225+
if not (permitted & (1 << cap)) and libc.prctl(PR_CAPBSET_DROP, cap) < 0:
207226
oserror("prctl")
208227

228+
# Now, modify the permitted, effective and inheritable
229+
# capability sets with capset().
230+
231+
payload[0].permitted = permitted
232+
payload[1].permitted = permitted >> 32
233+
payload[0].effective = permitted
234+
payload[1].effective = permitted >> 32
235+
payload[0].inheritable = permitted
236+
payload[1].inheritable = permitted >> 32
237+
238+
if libc.capset(ctypes.addressof(header), ctypes.byref(payload)) < 0:
239+
oserror("capset")
240+
241+
# Finally, modify the ambient set using the associated prctl() calls.
242+
243+
libc.prctl.argtypes = (ctypes.c_int, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong)
244+
245+
for cap in range(ctypes.sizeof(ctypes.c_uint64) * 8):
246+
if cap > last_cap:
247+
break
248+
249+
if permitted & (1 << cap):
250+
if libc.prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0:
251+
oserror("prctl")
252+
else:
253+
r = libc.prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, cap, 0, 0)
254+
if r < 0:
255+
oserror("prctl")
256+
if r > 0 and libc.prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, cap, 0, 0) < 0:
257+
oserror("prctl")
258+
209259

210260
def have_effective_cap(capability: int) -> bool:
211261
with open("/proc/self/status", "rb") as f:
@@ -464,15 +514,46 @@ def become_user(uid: int, gid: int) -> None:
464514
raise OSError(rc, os.strerror(rc))
465515

466516

467-
def acquire_privileges(*, become_root: bool = False) -> bool:
517+
def acquire_privileges(*, become_root: bool = False, network: bool = False) -> bool:
468518
if have_effective_cap(CAP_SYS_ADMIN) and (os.getuid() == 0 or not become_root):
469519
return False
470520

471521
if become_root:
472522
become_user(0, 0)
473523
else:
474524
become_user(os.getuid(), os.getgid())
475-
cap_permitted_to_ambient()
525+
526+
# When unsharing a user namespace, the process user has a full set of capabilities in the new user
527+
# namespace. This allows the process to do mounts after unsharing a mount namespace for example. However,
528+
# these capabilities are lost again when the user executes a subprocess. As we also want subprocesses
529+
# invoked by the user to be able to mount stuff, we make sure the capabilities we are interested in are
530+
# inherited across execve() by adding all of these capabilities to the inheritable and ambient capability
531+
# sets, which makes sure that they are passed down to subprocesses, regardless of whether we're uid 0 in the user
532+
# namespace or not.
533+
534+
caps = [
535+
CAP_CHOWN,
536+
CAP_DAC_OVERRIDE,
537+
CAP_DAC_READ_SEARCH,
538+
CAP_FOWNER,
539+
CAP_FSETID,
540+
CAP_SETGID,
541+
CAP_SETUID,
542+
CAP_SETPCAP,
543+
CAP_SYS_CHROOT,
544+
CAP_SYS_PTRACE,
545+
CAP_SYS_ADMIN,
546+
CAP_SYS_RESOURCE,
547+
CAP_SETFCAP,
548+
]
549+
if network:
550+
# If we're unsharing the network namespace, we want CAP_NET_BIND_SERVICE and CAP_NET_ADMIN as well.
551+
caps += [
552+
CAP_NET_BIND_SERVICE,
553+
CAP_NET_ADMIN,
554+
]
555+
556+
drop_capabilities(keep=caps)
476557

477558
return True
478559

@@ -1031,7 +1112,7 @@ def main(argv: list[str] = sys.argv[1:]) -> None:
10311112
if unshare_ipc:
10321113
namespaces |= CLONE_NEWIPC
10331114

1034-
userns = acquire_privileges(become_root=become_root)
1115+
userns = acquire_privileges(become_root=become_root, network=bool(namespaces & CLONE_NEWNET))
10351116

10361117
seccomp_suppress(
10371118
# If we're root in a user namespace with a single user, we're still not going to be able to

0 commit comments

Comments
 (0)