|
22 | 22 | AT_RECURSIVE = 0x8000 |
23 | 23 | AT_SYMLINK_NOFOLLOW = 0x100 |
24 | 24 | BTRFS_SUPER_MAGIC = 0x9123683E |
| 25 | +CAP_CHOWN = 0 |
| 26 | +CAP_DAC_OVERRIDE = 1 |
| 27 | +CAP_DAC_READ_SEARCH = 2 |
| 28 | +CAP_FOWNER = 3 |
| 29 | +CAP_FSETID = 4 |
| 30 | +CAP_SETGID = 6 |
| 31 | +CAP_SETUID = 7 |
| 32 | +CAP_SETPCAP = 8 |
| 33 | +CAP_NET_BIND_SERVICE = 10 |
25 | 34 | CAP_NET_ADMIN = 12 |
| 35 | +CAP_SYS_CHROOT = 18 |
| 36 | +CAP_SYS_PTRACE = 19 |
26 | 37 | CAP_SYS_ADMIN = 21 |
| 38 | +CAP_SYS_RESOURCE = 24 |
| 39 | +CAP_SETFCAP = 31 |
27 | 40 | CLONE_NEWIPC = 0x08000000 |
28 | 41 | CLONE_NEWNET = 0x40000000 |
29 | 42 | CLONE_NEWNS = 0x00020000 |
|
59 | 72 | OPEN_TREE_CLONE = 1 |
60 | 73 | OVERLAYFS_SUPER_MAGIC = 0x794C7630 |
61 | 74 | PR_CAP_AMBIENT = 47 |
| 75 | +PR_CAP_AMBIENT_IS_SET = 1 |
62 | 76 | PR_CAP_AMBIENT_RAISE = 2 |
| 77 | +PR_CAP_AMBIENT_LOWER = 3 |
| 78 | +PR_CAPBSET_DROP = 24 |
63 | 79 | # These definitions are taken from the libseccomp headers |
64 | 80 | SCMP_ACT_ALLOW = 0x7FFF0000 |
65 | 81 | SCMP_ACT_ERRNO = 0x00050000 |
@@ -171,41 +187,75 @@ def umount2(path: str, flags: int = 0) -> None: |
171 | 187 | oserror("umount2", path) |
172 | 188 |
|
173 | 189 |
|
174 | | -def cap_permitted_to_ambient() -> None: |
175 | | - """ |
176 | | - When unsharing a user namespace and mapping the current user to itself, the user has a full |
177 | | - set of capabilities in the user namespace. This allows the user to do mounts after unsharing a |
178 | | - mount namespace for example. However, these capabilities are lost again when the user executes |
179 | | - a subprocess. As we also want subprocesses invoked by the user to be able to mount stuff, we |
180 | | - make sure the capabilities are inherited by adding all the user's capabilities to the inherited |
181 | | - and ambient capabilities set, which makes sure that they are passed down to subprocesses. |
182 | | - """ |
| 190 | +def capability_mask(capabilities: list[int]) -> int: |
| 191 | + mask = 0 |
| 192 | + |
| 193 | + for cap in capabilities: |
| 194 | + mask |= 1 << cap |
| 195 | + |
| 196 | + return mask |
| 197 | + |
| 198 | + |
| 199 | +def drop_capabilities(*, keep: list[int]) -> None: |
| 200 | + # First, fetch the permitted capabilities and AND them |
| 201 | + # with the ones we want to keep to get the final list |
| 202 | + # of capabilities. |
| 203 | + |
183 | 204 | header = cap_user_header_t(LINUX_CAPABILITY_VERSION_3, 0) |
184 | 205 | payload = (cap_user_data_t * LINUX_CAPABILITY_U32S_3)() |
185 | 206 |
|
186 | 207 | if libc.capget(ctypes.addressof(header), ctypes.byref(payload)) < 0: |
187 | 208 | oserror("capget") |
188 | 209 |
|
189 | | - payload[0].inheritable = payload[0].permitted |
190 | | - payload[1].inheritable = payload[1].permitted |
| 210 | + permitted = payload[1].permitted << 32 | payload[0].permitted |
| 211 | + permitted &= capability_mask(keep) |
191 | 212 |
|
192 | | - if libc.capset(ctypes.addressof(header), ctypes.byref(payload)) < 0: |
193 | | - oserror("capset") |
194 | | - |
195 | | - effective = payload[1].effective << 32 | payload[0].effective |
| 213 | + # Next, drop unwanted capabilities from the bounding set as |
| 214 | + # later we'll drop the capability that lets us do so (CAP_SETPCAP). |
196 | 215 |
|
197 | 216 | with open("/proc/sys/kernel/cap_last_cap", "rb") as f: |
198 | 217 | last_cap = int(f.read()) |
199 | 218 |
|
200 | | - libc.prctl.argtypes = (ctypes.c_int, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong) |
| 219 | + libc.prctl.argtypes = (ctypes.c_int, ctypes.c_ulong) |
201 | 220 |
|
202 | 221 | for cap in range(ctypes.sizeof(ctypes.c_uint64) * 8): |
203 | 222 | if cap > last_cap: |
204 | 223 | break |
205 | 224 |
|
206 | | - if effective & (1 << cap) and libc.prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0: |
| 225 | + if not (permitted & (1 << cap)) and libc.prctl(PR_CAPBSET_DROP, cap) < 0: |
207 | 226 | oserror("prctl") |
208 | 227 |
|
| 228 | + # Now, modify the permitted, effective and inheritable |
| 229 | + # capability sets with capset(). |
| 230 | + |
| 231 | + payload[0].permitted = permitted |
| 232 | + payload[1].permitted = permitted >> 32 |
| 233 | + payload[0].effective = permitted |
| 234 | + payload[1].effective = permitted >> 32 |
| 235 | + payload[0].inheritable = permitted |
| 236 | + payload[1].inheritable = permitted >> 32 |
| 237 | + |
| 238 | + if libc.capset(ctypes.addressof(header), ctypes.byref(payload)) < 0: |
| 239 | + oserror("capset") |
| 240 | + |
| 241 | + # Finally, modify the ambient set using the associated prctl() calls. |
| 242 | + |
| 243 | + libc.prctl.argtypes = (ctypes.c_int, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong, ctypes.c_ulong) |
| 244 | + |
| 245 | + for cap in range(ctypes.sizeof(ctypes.c_uint64) * 8): |
| 246 | + if cap > last_cap: |
| 247 | + break |
| 248 | + |
| 249 | + if permitted & (1 << cap): |
| 250 | + if libc.prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0: |
| 251 | + oserror("prctl") |
| 252 | + else: |
| 253 | + r = libc.prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, cap, 0, 0) |
| 254 | + if r < 0: |
| 255 | + oserror("prctl") |
| 256 | + if r > 0 and libc.prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, cap, 0, 0) < 0: |
| 257 | + oserror("prctl") |
| 258 | + |
209 | 259 |
|
210 | 260 | def have_effective_cap(capability: int) -> bool: |
211 | 261 | with open("/proc/self/status", "rb") as f: |
@@ -464,15 +514,46 @@ def become_user(uid: int, gid: int) -> None: |
464 | 514 | raise OSError(rc, os.strerror(rc)) |
465 | 515 |
|
466 | 516 |
|
467 | | -def acquire_privileges(*, become_root: bool = False) -> bool: |
| 517 | +def acquire_privileges(*, become_root: bool = False, network: bool = False) -> bool: |
468 | 518 | if have_effective_cap(CAP_SYS_ADMIN) and (os.getuid() == 0 or not become_root): |
469 | 519 | return False |
470 | 520 |
|
471 | 521 | if become_root: |
472 | 522 | become_user(0, 0) |
473 | 523 | else: |
474 | 524 | become_user(os.getuid(), os.getgid()) |
475 | | - cap_permitted_to_ambient() |
| 525 | + |
| 526 | + # When unsharing a user namespace, the process user has a full set of capabilities in the new user |
| 527 | + # namespace. This allows the process to do mounts after unsharing a mount namespace for example. However, |
| 528 | + # these capabilities are lost again when the user executes a subprocess. As we also want subprocesses |
| 529 | + # invoked by the user to be able to mount stuff, we make sure the capabilities we are interested in are |
| 530 | + # inherited across execve() by adding all of these capabilities to the inheritable and ambient capability |
| 531 | + # sets, which makes sure that they are passed down to subprocesses, regardless of whether we're uid 0 in the user |
| 532 | + # namespace or not. |
| 533 | + |
| 534 | + caps = [ |
| 535 | + CAP_CHOWN, |
| 536 | + CAP_DAC_OVERRIDE, |
| 537 | + CAP_DAC_READ_SEARCH, |
| 538 | + CAP_FOWNER, |
| 539 | + CAP_FSETID, |
| 540 | + CAP_SETGID, |
| 541 | + CAP_SETUID, |
| 542 | + CAP_SETPCAP, |
| 543 | + CAP_SYS_CHROOT, |
| 544 | + CAP_SYS_PTRACE, |
| 545 | + CAP_SYS_ADMIN, |
| 546 | + CAP_SYS_RESOURCE, |
| 547 | + CAP_SETFCAP, |
| 548 | + ] |
| 549 | + if network: |
| 550 | + # If we're unsharing the network namespace, we want CAP_NET_BIND_SERVICE and CAP_NET_ADMIN as well. |
| 551 | + caps += [ |
| 552 | + CAP_NET_BIND_SERVICE, |
| 553 | + CAP_NET_ADMIN, |
| 554 | + ] |
| 555 | + |
| 556 | + drop_capabilities(keep=caps) |
476 | 557 |
|
477 | 558 | return True |
478 | 559 |
|
@@ -1031,7 +1112,7 @@ def main(argv: list[str] = sys.argv[1:]) -> None: |
1031 | 1112 | if unshare_ipc: |
1032 | 1113 | namespaces |= CLONE_NEWIPC |
1033 | 1114 |
|
1034 | | - userns = acquire_privileges(become_root=become_root) |
| 1115 | + userns = acquire_privileges(become_root=become_root, network=bool(namespaces & CLONE_NEWNET)) |
1035 | 1116 |
|
1036 | 1117 | seccomp_suppress( |
1037 | 1118 | # If we're root in a user namespace with a single user, we're still not going to be able to |
|
0 commit comments