Skip to content

Commit a2e476e

Browse files
authored
update calc_mem.py script used to obtain within-container memory limit for cgroup v2 (#114)
* Update the calc_mem.py script used to obtain the within-container memory limit so that it supports cgroup v2: the memory limit is now obtained from the new read-only file, falling back to a psutil-based approach. Additionally, allow calc_mem.py to return memory values in kb or b. Also update the CPU limit detection for cgroup v2 in calc_mem.py and util.misc, and add psutil to the conda requirements. * Expand the unit tests of util.misc.available_cpu_count() to include tests where cgroup v2 is used, where cgroup v1 is used, where limits are imposed (or not) on either, and where multiprocessing.cpu_count() is used as the fallback in the event a hex bitmask cannot be found in the usual fallback of /proc/self/status. * Explicitly monkeypatch os.path to test cgroup v2 in a cgroup v1 environment.
1 parent 4201ae5 commit a2e476e

File tree

4 files changed

+174
-36
lines changed

4 files changed

+174
-36
lines changed

docker/calc_mem.py

Lines changed: 78 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,32 @@
44
Print result to stdout.
55
"""
66

7+
import math
78
import argparse
89
import sys
910
import os
11+
import re
12+
import logging
1013
import os.path
1114
import multiprocessing
1215

16+
import psutil
17+
18+
#from util.misc import available_cpu_count # use the version of available_cpu_count() from viral-core/util/misc.py
19+
20+
log = logging.getLogger(__name__)
21+
1322
parser = argparse.ArgumentParser('Calculated memory allocated to the process')
14-
parser.add_argument('mem_unit', choices=('mb', 'gb'), help='memory units')
23+
parser.add_argument('mem_unit', choices=('b', 'kb', 'mb', 'gb'), help='memory units')
1524
parser.add_argument('mem_fraction', type=int, help='what fraction of total memory to report')
1625
parser.add_argument('--per-cpu', dest="per_cpu", action='store_true', help='Calculate memory per-CPU.')
1726
args = parser.parse_args()
1827

1928
if not (1 <= args.mem_fraction <= 100):
2029
raise RuntimeError("mem_fraction should be in the range [1,100]")
2130

22-
unit2factor = {'k': 1024, 'm': 1024*1024, 'g': 1024*1024*1024}
31+
unit2factor = {'b': 1, 'k': 1024, 'm': 1024*1024, 'g': 1024*1024*1024}
32+
MAX_INT32 = (2 ** 31)-1
2333

2434
def available_cpu_count():
2535
"""
@@ -33,24 +43,42 @@ def available_cpu_count():
3343

3444
cgroup_cpus = MAX_INT32
3545
try:
36-
def slurp_file(fname):
37-
with open(fname) as f:
38-
return f.read()
39-
def get_cpu_val(name):
40-
return float(slurp_file('/sys/fs/cgroup/cpu/cpu.'+name).strip())
41-
cfs_quota = get_cpu_val('cfs_quota_us')
42-
if cfs_quota > 0:
43-
cfs_period = get_cpu_val('cfs_period_us')
44-
log.debug('cfs_quota %s, cfs_period %s', cfs_quota, cfs_period)
45-
cgroup_cpus = max(1, int(cfs_quota / cfs_period))
46+
def _load(path, encoding="utf-8"):
47+
""" Loads a file content """
48+
with open(path, 'r', encoding=encoding, newline="") as handle:
49+
tmp = handle.read()
50+
return tmp
51+
52+
# cgroup CPU count determination (w/ v2) adapted from:
53+
# https://github.com/conan-io/conan/blob/2.9.2/conan/tools/build/cpu.py#L31-L54
54+
#
55+
# see also:
56+
# https://docs.kernel.org/scheduler/sched-bwc.html
57+
58+
# This is necessary to determine docker cpu_count
59+
cfs_quota_us = cfs_period_us = 0
60+
# cgroup v2
61+
if os.path.exists("/sys/fs/cgroup/cgroup.controllers"):
62+
cpu_max = _load("/sys/fs/cgroup/cpu.max").split()
63+
if cpu_max[0] != "max":
64+
if len(cpu_max) == 1:
65+
cfs_quota_us, cfs_period_us = int(cpu_max[0]), 100_000
66+
else:
67+
cfs_quota_us, cfs_period_us = map(int, cpu_max)
68+
# cgroup v1
69+
else:
70+
cfs_quota_us = int(_load("/sys/fs/cgroup/cpu/cpu.cfs_quota_us"))
71+
cfs_period_us = int(_load("/sys/fs/cgroup/cpu/cpu.cfs_period_us"))
72+
73+
log.debug('cfs_quota_us %s, cfs_period_us %s', cfs_quota_us, cfs_period_us)
74+
if cfs_quota_us > 0 and cfs_period_us > 0:
75+
cgroup_cpus = max(1, int(math.ceil(cfs_quota_us / cfs_period_us)))
4676
except Exception as e:
4777
pass
4878

4979
proc_cpus = MAX_INT32
5080
try:
51-
with open('/proc/self/status') as f:
52-
status = f.read()
53-
m = re.search(r'(?m)^Cpus_allowed:\s*(.*)$', status)
81+
m = re.search(r'(?m)^Cpus_allowed:\s*(.*)$', _load('/proc/self/status'))
5482
if m:
5583
res = bin(int(m.group(1).replace(',', ''), 16)).count('1')
5684
if res > 0:
@@ -75,15 +103,44 @@ def mem_from_proc_meminfo():
75103

76104
def mem_from_cgroups(cgroups_memlimit_fnames=None):
    """Return the total memory, in bytes, as given by cgroups.

    Each candidate limit file is tried in order and the first usable
    numeric limit is returned.

    Args:
        cgroups_memlimit_fnames: optional iterable of paths to cgroup
            memory-limit files. When omitted, the standard cgroup v1
            and cgroup v2 locations are used.

    Returns:
        The memory limit as an int, or sys.maxsize (an obviously-wrong
        large value) if no candidate file yields a numeric limit.

    See:
        (cgroup v1) https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
        (cgroup v2) https://www.kernel.org/doc/Documentation/cgroup-v2.txt
    """
    if cgroups_memlimit_fnames is None:
        cgroups_memlimit_fnames = (
            '/sys/fs/cgroup/memory/memory.limit_in_bytes',  # cgroup v1
            '/sys/fs/cgroup/memory.max',                    # cgroup v2
        )

    # try the various potential cgroup memory info paths
    for cgroups_memlimit_fname in cgroups_memlimit_fnames:
        try:
            with open(cgroups_memlimit_fname) as f:
                val = f.read().strip()
        except OSError:
            # missing, unreadable, or not a regular file: try the next one
            continue

        # "max" means no limit is imposed; an empty file carries no
        # information either
        if not val or val == "max":
            continue

        try:
            # the limit files hold a plain byte count, so no unit
            # conversion is needed
            return int(val)
        except ValueError:
            # unexpected content: ignore this file rather than crash
            continue

    return sys.maxsize
85123

86-
mem_in_bytes = min(mem_from_proc_meminfo(), mem_from_cgroups())
124+
def mem_from_psutil(metric_name="total"):
    """Use psutil to get a memory metric by name in a cross-platform way.

    Args:
        metric_name: name of the attribute to read from the object
            returned by psutil.virtual_memory() (default: "total").

    Returns:
        The metric converted to int, or sys.maxsize (an obviously-wrong
        large value) in the event the value cannot be obtained: psutil
        cannot be imported, the psutil query fails, or the metric is
        absent or None.

    For available metrics, see:
        https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory
    """
    try:
        # imported locally so this helper still returns the sentinel
        # where psutil cannot be imported
        import psutil
    except ImportError:
        return sys.maxsize

    try:
        mem_info = psutil.virtual_memory()
    except (OSError, RuntimeError, psutil.Error):
        # the platform query failed; report "unknown" via the sentinel
        return sys.maxsize

    value = getattr(mem_info, metric_name, None)
    if value is None:
        # the requested metric does not exist or has no value
        return sys.maxsize

    return int(value)
135+
136+
# of the memory values obtained, use the smallest value
137+
# this results in obviously-wrong values obtained from sys.maxsize
138+
# in mem_from_cgroups() or mem_from_psutil() falling in precedence
139+
mem_in_bytes = min(
140+
mem_from_psutil(),
141+
mem_from_proc_meminfo(),
142+
mem_from_cgroups()
143+
)
87144

88145
if args.per_cpu:
89146
mem_in_bytes = mem_in_bytes/available_cpu_count()

requirements-conda.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ novoalign=3.09.04
1818
parallel>=20190922
1919
picard=2.25.6
2020
pigz>=2.4
21+
psutil>=6.1.0
2122
prinseq>=0.20.4
2223
samtools>=1.16.1
2324
trimmomatic>=0.38

test/unit/test_util_misc.py

Lines changed: 67 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os, random, collections
66
import unittest
77
import subprocess
8+
import multiprocessing
89
import util.misc
910
import util.file
1011
import pytest
@@ -284,15 +285,76 @@ def test_chk():
284285

285286
def test_available_cpu_count(monkeypatch_function_result):
286287
reported_cpu_count = util.misc.available_cpu_count()
287-
288288
assert reported_cpu_count >= int(os.environ.get('PYTEST_XDIST_WORKER_COUNT', '1'))
289+
assert util.misc.available_cpu_count() == reported_cpu_count
289290

290-
with monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_quota_us', patch_result='1'), \
291-
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_period_us', patch_result='1'):
291+
# cgroup v2 limited to 1 cpu
292+
with monkeypatch_function_result(os.path.exists, "/sys/fs/cgroup/cgroup.controllers", patch_result=True, patch_module=os.path), \
293+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu.max', patch_result="100000 100000"):
292294
assert util.misc.available_cpu_count() == 1
293295

294-
assert util.misc.available_cpu_count() == reported_cpu_count
296+
# cgroup v2 limited to 2 cpu
297+
with monkeypatch_function_result(os.path.exists, "/sys/fs/cgroup/cgroup.controllers", patch_result=True, patch_module=os.path), \
298+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu.max', patch_result="200000 100000"):
299+
assert util.misc.available_cpu_count() == 2
300+
301+
# cgroup v2 with no CPU limit imposed on cgroup
302+
# (fall back to /proc/self/status method, with limit imposed there):
303+
# 'Cpus_allowed: d' = 0b1101 bitmask (meaning execution allowed on 3 CPUs)
304+
with monkeypatch_function_result(os.path.exists, "/sys/fs/cgroup/cgroup.controllers", patch_result=True, patch_module=os.path), \
305+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu.max', patch_result="max 100000"), \
306+
monkeypatch_function_result(util.file.slurp_file, '/proc/self/status', patch_result='Cpus_allowed: d'):
307+
assert util.misc.available_cpu_count() == 3
308+
309+
# cgroup v1 limited to 2 CPUs
310+
with monkeypatch_function_result(os.path.exists, "/sys/fs/cgroup/cgroup.controllers", patch_result=False, patch_module=os.path), \
311+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_quota_us', patch_result='200000'), \
312+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_period_us', patch_result='100000'):
313+
314+
assert util.misc.available_cpu_count() == 2
315+
316+
# cgroup v1 limited to 1 CPU
317+
with monkeypatch_function_result(os.path.exists, "/sys/fs/cgroup/cgroup.controllers", patch_result=False, patch_module=os.path), \
318+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_quota_us', patch_result='1'), \
319+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_period_us', patch_result='1'):
320+
321+
assert util.misc.available_cpu_count() == 1
295322

296-
with monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_quota_us', patch_result='-1'), \
323+
# cgroup v1 with no limit imposed on the cgroup
324+
# (fall back to /proc/self/status method, with limit imposed there):
325+
# 'Cpus_allowed: c' = 0b1100 bitmask (meaning execution allowed on 2 CPUs)
326+
with monkeypatch_function_result(os.path.exists, "/sys/fs/cgroup/cgroup.controllers", patch_result=False, patch_module=os.path), \
327+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_quota_us', patch_result='-1'), \
328+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_period_us', patch_result='1'), \
329+
monkeypatch_function_result(util.file.slurp_file, '/proc/self/status', patch_result='Cpus_allowed: c'):
330+
331+
assert util.misc.available_cpu_count() == 2
332+
333+
# cgroup v1 with no limit imposed on the cgroup or via /proc/self/status
334+
# (fall back to /proc/self/status method, with no limit imposed there)
335+
with monkeypatch_function_result(os.path.exists, "/sys/fs/cgroup/cgroup.controllers", patch_result=False, patch_module=os.path), \
336+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_quota_us', patch_result='-1'), \
297337
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_period_us', patch_result='1'):
338+
339+
assert util.misc.available_cpu_count() == reported_cpu_count
340+
341+
# cgroup v1 with no limit imposed on the cgroup
342+
# with 'Cpus_allowed' not present in /proc/self/status
343+
# (fall back to multiprocessing.cpu_count() method)
344+
with monkeypatch_function_result(os.path.exists, "/sys/fs/cgroup/cgroup.controllers", patch_result=False, patch_module=os.path), \
345+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_quota_us', patch_result='-1'), \
346+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_period_us', patch_result='1'), \
347+
monkeypatch_function_result(util.file.slurp_file, '/proc/self/status', patch_result='unexpected_key: 1'):
348+
298349
assert util.misc.available_cpu_count() == reported_cpu_count
350+
351+
# cgroup v1 with no limit imposed on the cgroup
352+
# with 'Cpus_allowed' not present in /proc/self/status
353+
# (fall back to multiprocessing.cpu_count() method with CPU count of 2 reported)
354+
with monkeypatch_function_result(os.path.exists, "/sys/fs/cgroup/cgroup.controllers", patch_result=False, patch_module=os.path), \
355+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_quota_us', patch_result='-1'), \
356+
monkeypatch_function_result(util.file.slurp_file, '/sys/fs/cgroup/cpu/cpu.cfs_period_us', patch_result='1'), \
357+
monkeypatch_function_result(util.file.slurp_file, '/proc/self/status', patch_result='unexpected_key: 1'), \
358+
monkeypatch_function_result(multiprocessing.cpu_count, patch_result=2, patch_module=multiprocessing):
359+
360+
assert util.misc.available_cpu_count() == 2

util/misc.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
'''A few miscellaneous tools. '''
2+
import math
23
import collections
34
import contextlib
45
import itertools, functools, operator
@@ -334,21 +335,38 @@ def available_cpu_count():
334335

335336
cgroup_cpus = MAX_INT32
336337
try:
337-
def get_cpu_val(name):
338-
return float(util.file.slurp_file('/sys/fs/cgroup/cpu/cpu.'+name).strip())
339-
cfs_quota = get_cpu_val('cfs_quota_us')
340-
if cfs_quota > 0:
341-
cfs_period = get_cpu_val('cfs_period_us')
342-
log.debug('cfs_quota %s, cfs_period %s', cfs_quota, cfs_period)
343-
cgroup_cpus = max(1, int(cfs_quota / cfs_period))
338+
# cgroup CPU count determination (w/ v2) adapted from:
339+
# https://github.com/conan-io/conan/blob/2.9.2/conan/tools/build/cpu.py#L31-L54
340+
#
341+
# see also:
342+
# https://docs.kernel.org/scheduler/sched-bwc.html
343+
344+
# This is necessary to determine docker cpu_count
345+
cfs_quota_us = cfs_period_us = 0
346+
# cgroup v2
347+
if os.path.exists("/sys/fs/cgroup/cgroup.controllers"):
348+
log.debug("cgroup v2 detected")
349+
cpu_max = util.file.slurp_file("/sys/fs/cgroup/cpu.max").split()
350+
if cpu_max[0] != "max":
351+
if len(cpu_max) == 1:
352+
cfs_quota_us, cfs_period_us = int(cpu_max[0]), 100_000
353+
else:
354+
cfs_quota_us, cfs_period_us = map(int, cpu_max)
355+
# cgroup v1
356+
else:
357+
log.debug("cgroup v1 detected")
358+
cfs_quota_us = int(util.file.slurp_file("/sys/fs/cgroup/cpu/cpu.cfs_quota_us"))
359+
cfs_period_us = int(util.file.slurp_file("/sys/fs/cgroup/cpu/cpu.cfs_period_us"))
360+
361+
log.debug('cfs_quota_us %s, cfs_period_us %s', cfs_quota_us, cfs_period_us)
362+
if cfs_quota_us > 0 and cfs_period_us > 0:
363+
cgroup_cpus = max(1, int(math.ceil(cfs_quota_us / cfs_period_us)))
344364
except Exception as e:
345365
pass
346366

347367
proc_cpus = MAX_INT32
348368
try:
349-
with open('/proc/self/status') as f:
350-
status = f.read()
351-
m = re.search(r'(?m)^Cpus_allowed:\s*(.*)$', status)
369+
m = re.search(r'(?m)^Cpus_allowed:\s*(.*)$', util.file.slurp_file('/proc/self/status'))
352370
if m:
353371
res = bin(int(m.group(1).replace(',', ''), 16)).count('1')
354372
if res > 0:

0 commit comments

Comments
 (0)