Skip to content

Commit e662459

Browse files
authored
[ci] Enable Cgroup support in CI for core (#51454)
Enable a CI environment with a writable cgroupv2. To use this environment for testing, pass the `--privileged` flag in your test definitions. To let Bazel read and write /sys/fs/cgroup in your tests, also pass the `--build-type cgroup` option.
Signed-off-by: Ibrahim Rabbani <irabbani@anyscale.com>
1 parent 5e05c2f commit e662459

11 files changed

+124
-4
lines changed

.bazelrc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,8 @@ build:ubsan --linkopt -fsanitize=undefined
210210
build:ubsan --linkopt -fno-sanitize-recover=all
211211
build:ubsan --per_file_copt="-external/com_github_grpc_grpc/.*@-fsanitize=undefined"
212212

213+
build:cgroup --sandbox_writable_path=/sys/fs/cgroup --config=llvm
214+
213215
# Import local specific llvm config options, which can be generated by
214216
# ci/env/install-llvm-dependencies.sh
215217
try-import %workspace%/.llvm-local.bazelrc

.buildkite/cicd.rayci.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,15 @@ steps:
1313
depends_on:
1414
- oss-ci-base_test
1515
- forge
16+
- label: ":coral: reef: privileged container tests"
17+
commands:
18+
- bazel run //ci/ray_ci:test_in_docker --
19+
//ci/ray_ci:test_privileged ci
20+
--cache-test-results
21+
--build-name oss-ci-base_test
22+
--build-type cgroup
23+
--privileged
24+
instance_type: small
25+
depends_on:
26+
- oss-ci-base_test
27+
- forge

ci/ray_ci/BUILD.bazel

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,3 +208,16 @@ py_test(
208208
ci_require("pytest"),
209209
],
210210
)
211+
212+
# This test is only run on linux machines
213+
# with docker containers that have --privileged
214+
# enabled.
215+
py_test(
216+
name = "test_privileged",
217+
size = "small",
218+
srcs = ["test_privileged.py"],
219+
tags = [
220+
"team:ci"
221+
],
222+
deps = [ci_require("pytest")],
223+
)

ci/ray_ci/container.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from typing import List, Tuple, Optional
77

8+
89
_CUDA_COPYRIGHT = """
910
==========
1011
== CUDA ==

ci/ray_ci/linux_container.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@ def __init__(
1919
volumes: Optional[List[str]] = None,
2020
envs: Optional[List[str]] = None,
2121
tmp_filesystem: Optional[str] = None,
22+
privileged: bool = False,
2223
) -> None:
2324
super().__init__(docker_tag, volumes, envs)
2425

2526
if tmp_filesystem is not None:
2627
if tmp_filesystem != "tmpfs":
2728
raise ValueError("Only tmpfs is supported for tmp filesystem")
2829
self.tmp_filesystem = tmp_filesystem
30+
self.privileged = privileged
2931

3032
def install_ray(
3133
self, build_type: Optional[str] = None, mask: Optional[str] = None
@@ -78,16 +80,18 @@ def get_run_command_extra_args(
7880
"--mount",
7981
f"type={self.tmp_filesystem},destination=/tmp",
8082
]
81-
for cap in _DOCKER_CAP_ADD:
82-
extra_args += ["--cap-add", cap]
83+
if self.privileged:
84+
extra_args += ["--privileged"]
85+
else:
86+
for cap in _DOCKER_CAP_ADD:
87+
extra_args += ["--cap-add", cap]
8388
if gpu_ids:
8489
extra_args += ["--gpus", f'"device={",".join(map(str, gpu_ids))}"']
8590
extra_args += [
8691
"--workdir",
8792
"/rayci",
8893
"--shm-size=2.5gb",
8994
]
90-
9195
return extra_args
9296

9397
def get_artifact_mount(self) -> Tuple[str, str]:

ci/ray_ci/linux_tester_container.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def __init__(
1818
build_type: Optional[str] = None,
1919
install_mask: Optional[str] = None,
2020
tmp_filesystem: Optional[str] = None,
21+
privileged: bool = False,
2122
) -> None:
2223
LinuxContainer.__init__(
2324
self,
@@ -28,6 +29,7 @@ def __init__(
2829
"/var/run/docker.sock:/var/run/docker.sock",
2930
],
3031
tmp_filesystem=tmp_filesystem,
32+
privileged=privileged,
3133
)
3234
TesterContainer.__init__(
3335
self,

ci/ray_ci/test_privileged.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import os
2+
import pytest
3+
import sys
4+
5+
from pathlib import Path
6+
7+
# In privileged containers, we expect the following
8+
# cgroupv1 is disabled
9+
# cgroupv2 is enabled and mounted on /sys/fs/cgroup
10+
# the user running tests has read and write access to the cgroup subtree
11+
# memory and cpu controllers are enabled
12+
13+
_MOUNT_FILE_PATH = "/proc/mounts"
14+
_CGROUP2_PATH = "/sys/fs/cgroup"
15+
_CTRL_FILE = "cgroup.controllers"
16+
_EXPECTED_CTRLS = ["memory", "cpu"]
17+
18+
19+
# mount file format:
20+
# cgroup /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime 0 0
21+
def test_only_cgroupv2_mounted_rw():
    """Check that cgroupv2 is mounted read-write and that cgroupv1 is absent.

    Scans the mount table at ``_MOUNT_FILE_PATH``. Each entry is expected to
    be whitespace-separated, for example::

        cgroup /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime 0 0

    The test passes only if some entry has filesystem type ``cgroup2``, is
    mounted on ``_CGROUP2_PATH`` with the ``rw`` option, and no entry has
    filesystem type ``cgroup`` (cgroupv1).
    """
    found_cgroupv2 = False
    found_cgroupv1 = False
    with open(Path(_MOUNT_FILE_PATH)) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 4:
                # Blank or truncated entry: there is nothing to inspect, and
                # indexing into it would raise IndexError.
                continue
            mount_point, fs_type, options = fields[1], fields[2], fields[3]
            if fs_type == "cgroup":
                found_cgroupv1 = True
            elif (
                fs_type == "cgroup2"
                and mount_point == _CGROUP2_PATH
                # Match "rw" as a whole option; a substring test would also
                # accept any option that merely contains the letters "rw".
                and "rw" in options.split(",")
            ):
                found_cgroupv2 = True
    # Separate assertions so a failure says which expectation was violated.
    assert found_cgroupv2, f"cgroup2 is not mounted read-write on {_CGROUP2_PATH}"
    assert not found_cgroupv1, "a cgroupv1 hierarchy is mounted"
32+
33+
34+
def test_cgroupv2_rw_for_test_user():
    """Check that the user running the tests can read and write the cgroup root."""
    # all() stops at the first denied mode, like the chained `and` it replaces.
    assert all(os.access(_CGROUP2_PATH, mode) for mode in (os.R_OK, os.W_OK))
36+
37+
38+
def test_cgroupv2_controllers_enabled():
    """Check that every controller in ``_EXPECTED_CTRLS`` is listed as enabled.

    Reads the ``_CTRL_FILE`` file under ``_CGROUP2_PATH``, which must contain
    exactly one line of whitespace-separated controller names.
    """
    ctrl_file_path = os.path.join(_CGROUP2_PATH, _CTRL_FILE)
    with open(ctrl_file_path) as ctrl_file:
        lines = list(ctrl_file)
    assert len(lines) == 1
    (ctrl_line,) = lines
    enabled_ctrls = ctrl_line.split()
    for expected_ctrl in _EXPECTED_CTRLS:
        is_enabled = expected_ctrl in enabled_ctrls
        assert (
            is_enabled
        ), f"Expected {expected_ctrl} to be enabled for cgroups2, but it is not"
47+
48+
49+
if __name__ == "__main__":
50+
sys.exit(pytest.main(["-v", __file__]))

ci/ray_ci/test_tester.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,28 @@ def test_get_tag_matcher() -> None:
4444
)
4545

4646

47+
def test_linux_privileged() -> None:
    """Check that ``privileged=True`` reaches the container and its run args.

    ``install_ray`` is patched out so that building the container does not
    attempt a real installation.
    """
    install_ray_target = (
        "ci.ray_ci.linux_tester_container.LinuxTesterContainer.install_ray"
    )
    container_kwargs = dict(
        team="core",
        operating_system="linux",
        workers=3,
        worker_id=1,
        parallelism_per_worker=2,
        network=None,
        gpus=0,
        tmp_filesystem=None,
        privileged=True,
    )
    with mock.patch(install_ray_target, return_value=None):
        container = _get_container(**container_kwargs)
        # The flag must be stored on the container and emitted as a docker arg.
        assert container.privileged
        assert "--privileged" in container.get_run_command_extra_args()
67+
68+
4769
def test_get_container() -> None:
4870
with mock.patch(
4971
"ci.ray_ci.linux_tester_container.LinuxTesterContainer.install_ray",

ci/ray_ci/tester.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@
159159
"asan-clang",
160160
"ubsan",
161161
"tsan-clang",
162+
"cgroup",
162163
# java build types
163164
"java",
164165
# do not build ray
@@ -188,6 +189,13 @@
188189
type=str,
189190
help=("Filesystem to use for /tmp"),
190191
)
192+
@click.option(
193+
"--privileged",
194+
is_flag=True,
195+
show_default=True,
196+
default=False,
197+
help="Run the test in a privileged Docker container",
198+
)
191199
def main(
192200
targets: List[str],
193201
team: str,
@@ -212,6 +220,7 @@ def main(
212220
install_mask: Optional[str],
213221
bisect_run_test_target: Optional[str],
214222
tmp_filesystem: Optional[str],
223+
privileged: bool,
215224
) -> None:
216225
if not bazel_workspace_dir:
217226
raise Exception("Please use `bazelisk run //ci/ray_ci`")
@@ -241,6 +250,7 @@ def main(
241250
build_type=build_type,
242251
skip_ray_installation=skip_ray_installation,
243252
install_mask=install_mask,
253+
privileged=privileged,
244254
)
245255
if build_only:
246256
sys.exit(0)
@@ -295,6 +305,7 @@ def _get_container(
295305
build_type: Optional[str] = None,
296306
install_mask: Optional[str] = None,
297307
skip_ray_installation: bool = False,
308+
privileged: bool = False,
298309
) -> TesterContainer:
299310
shard_count = workers * parallelism_per_worker
300311
shard_start = worker_id * parallelism_per_worker
@@ -316,6 +327,7 @@ def _get_container(
316327
build_type=build_type,
317328
tmp_filesystem=tmp_filesystem,
318329
install_mask=install_mask,
330+
privileged=privileged,
319331
)
320332

321333
if operating_system == "windows":

ci/ray_ci/tester_container.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,8 @@ def _run_tests_in_docker(
252252
test_cmd += "--config=ubsan "
253253
if self.build_type == "tsan-clang":
254254
test_cmd += "--config=tsan-clang "
255+
if self.build_type == "cgroup":
256+
test_cmd += "--config=cgroup "
255257
for env in test_envs:
256258
test_cmd += f"--test_env {env} "
257259
if test_arg:

0 commit comments

Comments
 (0)