Skip to content

Commit 6a88fa2

Browse files
committed
ADD: support for docker container metrics under cgroupsv2.
1 parent a5327f0 commit 6a88fa2

File tree

3 files changed

+210
-105
lines changed

3 files changed

+210
-105
lines changed

CHANGELOG

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ Release Notes - Clusterd - Version 1.11.0-0.6.1
22
-----------------------------------------------
33

44
* FIX: Missing mesos-agent metrics when using cgroupsv2.
5+
* ADD: Support for docker container metrics under cgroupsv2.
56

67

78
Release Notes - Clusterd - Version 1.11.0-0.6.0

src/slave/containerizer/docker.cpp

Lines changed: 207 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -2079,12 +2079,19 @@ Future<ResourceStatistics> DockerContainerizerProcess::usage(
20792079
ResourceStatistics result;
20802080

20812081
#ifdef __linux__
2082-
const Try<ResourceStatistics> cgroupStats = cgroupsStatistics(pid);
2083-
if (cgroupStats.isError()) {
2084-
return Failure("Failed to collect cgroup stats: " + cgroupStats.error());
2082+
if (!cgroups2::enabled()) {
2083+
const Try<ResourceStatistics> cgroupStats = cgroupsStatistics(pid);
2084+
if (cgroupStats.isError()) {
2085+
return Failure("Failed to collect cgroup stats: " + cgroupStats.error());
2086+
}
2087+
result = cgroupStats.get();
2088+
} else {
2089+
const Try<ResourceStatistics> cgroupStats = cgroupsv2Statistics(containerId);
2090+
if (cgroupStats.isError()) {
2091+
return Failure("Failed to collect cgroupv2 stats: " + cgroupStats.error());
2092+
}
2093+
result = cgroupStats.get();
20852094
}
2086-
2087-
result = cgroupStats.get();
20882095
#endif // __linux__
20892096

20902097
Option<double> cpuRequest, cpuLimit, memLimit;
@@ -2221,134 +2228,229 @@ Future<ResourceStatistics> DockerContainerizerProcess::usage(
22212228
}));
22222229
}
22232230

2231+
// Returns the cgroups v2 path of the process `pid`, i.e. the text that
// follows the "0::" prefix in the matching line of `/proc/<pid>/cgroup`.
//
// Returns an Error if:
//   - the file cannot be opened,
//   - no line starts with "0::" and carries a non-empty path, or
//   - the path carries the " (deleted)" suffix, which the kernel appends
//     when the cgroup of a zombie process has already been removed.
Try<std::string> DockerContainerizerProcess::getCgroupV2Path(pid_t pid) const
{
  const std::string path = "/proc/" + std::to_string(pid) + "/cgroup";

  std::ifstream file(path);
  if (!file.is_open()) {
    return Error("Error open cgroup file: " + path);
  }

  // Prefix of the unified (v2) hierarchy entry.
  const std::string prefix = "0::";

  // Marker appended by the kernel to the path of a removed cgroup.
  const std::string deleted = " (deleted)";

  std::string line;
  while (std::getline(file, line)) {
    if (line.compare(0, prefix.size(), prefix) != 0) {
      continue;
    }

    // The prefix check already fixes where the path starts, so there is
    // no need to search for the "::" separator again.
    std::string cgroup = line.substr(prefix.size());
    if (cgroup.empty()) {
      continue;
    }

    // A removed cgroup has no stat files left to read; fail here with a
    // clear message instead of handing back a path that does not exist.
    if (cgroup.size() >= deleted.size() &&
        cgroup.compare(
            cgroup.size() - deleted.size(), deleted.size(), deleted) == 0) {
      return Error(
          "Cgroup '" + cgroup + "' of PID " + std::to_string(pid) +
          " has been removed");
    }

    return cgroup;
  }

  return Error("Could not find cgroup for PID " + std::to_string(pid));
}
2250+
2251+
// Collects CPU and memory statistics for the Docker container
// `containerId` from the cgroups v2 hierarchy.
//
// The container's cgroup is looked up through the pid recorded for the
// container (see `getCgroupV2Path`) and appended to
// `flags.cgroups_hierarchy`.
//
// Returns an Error if the container is unknown, has no pid, lives in the
// system root cgroup, or if its cgroup path or its CPU / memory stats
// cannot be read. On non-Linux platforms an Error is always returned.
Try<ResourceStatistics> DockerContainerizerProcess::cgroupsv2Statistics(ContainerID containerId) const
{
#ifndef __linux__
  return Error("Does not support cgroups on non-linux platform");
#else
  if (!containers_.contains(containerId)) {
    return Error("Unknown container " + stringify(containerId));
  }

  Container* container = containers_.at(containerId);

  // Without a pid there is no `/proc/<pid>/cgroup` to consult; calling
  // `get()` on an unset pid must be avoided.
  if (container->pid.isNone()) {
    return Error(
        "Container " + stringify(containerId) + " has no known pid");
  }

  const pid_t pid = container->pid.get();

  Try<std::string> cgPath = getCgroupV2Path(pid);
  if (cgPath.isError()) {
    return Error(cgPath.error());
  }

  // Same defensive check as the cgroups v1 code path: never report the
  // statistics of the system root cgroup as those of a container.
  if (cgPath.get() == stringify(os::PATH_SEPARATOR)) {
    return Error(
        "Process '" + stringify(pid) +
        "' should not be in the system root cgroup (being destroyed?)");
  }

  // NOTE(review): this builds `<cgroups_hierarchy><cgroup path>`; confirm
  // that the `cgroups2::` helpers expect this form of cgroup identifier.
  std::stringstream sc;
  sc << flags.cgroups_hierarchy << cgPath.get();
  const string cgroup = sc.str();

  Try<cgroups2::cpu::Stats> cpuStats = cgroups2::cpu::stats(cgroup);
  if (cpuStats.isError()) {
    return Error("Failed to get cgroup CPU stats: " + cpuStats.error());
  }

  ResourceStatistics usage;

  // Stamp the sample, as the cgroups v1 code path does.
  usage.set_timestamp(Clock::now().secs());

  usage.set_cpus_user_time_secs(cpuStats->user_time.secs());
  usage.set_cpus_system_time_secs(cpuStats->system_time.secs());

  if (cpuStats->periods.isSome()) {
    usage.set_cpus_nr_periods(*cpuStats->periods);
  }
  if (cpuStats->throttled.isSome()) {
    usage.set_cpus_nr_throttled(*cpuStats->throttled);
  }
  if (cpuStats->throttle_time.isSome()) {
    usage.set_cpus_throttled_time_secs(cpuStats->throttle_time->secs());
  }

  if (cpuStats->periods.isNone()
      || cpuStats->throttled.isNone()
      || cpuStats->throttle_time.isNone()) {
    LOG(ERROR) << "cpu throttling stats missing for cgroup '" << cgroup << "'"
                  " despite the 'cpu' controller being enabled";
  }

  Try<cgroups2::memory::Stats> memoryStats = cgroups2::memory::stats(cgroup);
  if (memoryStats.isError()) {
    return Error("Failed to get cgroup memory stats: " + memoryStats.error());
  }

  // Kernel memory usage.
  usage.set_mem_kmem_usage_bytes(memoryStats->kernel.bytes());

  // Kernel TCP buffers usage.
  usage.set_mem_kmem_tcp_usage_bytes(memoryStats->sock.bytes());

  // Page cache usage (reported under both the 'file' and 'cache' fields).
  usage.set_mem_file_bytes(memoryStats->file.bytes());
  usage.set_mem_cache_bytes(memoryStats->file.bytes());

  // Anonymous memory usage (reported under both the 'anon' and 'rss' fields).
  usage.set_mem_anon_bytes(memoryStats->anon.bytes());
  usage.set_mem_rss_bytes(memoryStats->anon.bytes());

  // File mapped memory usage.
  usage.set_mem_mapped_file_bytes(memoryStats->file_mapped.bytes());

  // Total unevictable memory.
  usage.set_mem_unevictable_bytes(memoryStats->unevictable.bytes());

  return usage;
#endif // __linux__
}
2326+
22242327

22252328
Try<ResourceStatistics> DockerContainerizerProcess::cgroupsStatistics(
22262329
pid_t pid) const
22272330
{
22282331
#ifndef __linux__
22292332
return Error("Does not support cgroups on non-linux platform");
22302333
#else
2231-
ResourceStatistics result;
22322334

2233-
if (!cgroups2::enabled()) {
2234-
static const Result<string> cpuacctHierarchy = cgroups::hierarchy("cpuacct");
2235-
static const Result<string> memHierarchy = cgroups::hierarchy("memory");
22362335

2237-
// NOTE: Normally, a Docker container should be in its own cgroup.
2238-
// However, a zombie process (exited but not reaped) will be
2239-
// temporarily moved into the system root cgroup. We add some
2240-
// defensive check here to make sure we are not reporting statistics
2241-
// for the root cgroup. See MESOS-8480 for details.
2242-
const string systemRootCgroup = stringify(os::PATH_SEPARATOR);
2336+
static const Result<string> cpuacctHierarchy = cgroups::hierarchy("cpuacct");
2337+
static const Result<string> memHierarchy = cgroups::hierarchy("memory");
22432338

2244-
if (cpuacctHierarchy.isError()) {
2245-
return Error(
2246-
"Failed to determine the cgroup 'cpuacct' subsystem hierarchy: " +
2247-
cpuacctHierarchy.error());
2248-
}
2339+
// NOTE: Normally, a Docker container should be in its own cgroup.
2340+
// However, a zombie process (exited but not reaped) will be
2341+
// temporarily moved into the system root cgroup. We add some
2342+
// defensive check here to make sure we are not reporting statistics
2343+
// for the root cgroup. See MESOS-8480 for details.
2344+
const string systemRootCgroup = stringify(os::PATH_SEPARATOR);
22492345

2250-
if (memHierarchy.isError()) {
2251-
return Error(
2252-
"Failed to determine the cgroup 'memory' subsystem hierarchy: " +
2253-
memHierarchy.error());
2254-
}
2346+
if (cpuacctHierarchy.isError()) {
2347+
return Error(
2348+
"Failed to determine the cgroup 'cpuacct' subsystem hierarchy: " +
2349+
cpuacctHierarchy.error());
2350+
}
22552351

2256-
const Result<string> cpuacctCgroup = cgroups::cpuacct::cgroup(pid);
2257-
if (cpuacctCgroup.isError()) {
2258-
return Error(
2259-
"Failed to determine cgroup for the 'cpuacct' subsystem: " +
2260-
cpuacctCgroup.error());
2261-
} else if (cpuacctCgroup.isNone()) {
2262-
return Error("Unable to find 'cpuacct' cgroup subsystem");
2263-
} else if (cpuacctCgroup.get() == systemRootCgroup) {
2352+
if (memHierarchy.isError()) {
2353+
return Error(
2354+
"Failed to determine the cgroup 'memory' subsystem hierarchy: " +
2355+
memHierarchy.error());
2356+
}
2357+
2358+
const Result<string> cpuacctCgroup = cgroups::cpuacct::cgroup(pid);
2359+
if (cpuacctCgroup.isError()) {
2360+
return Error(
2361+
"Failed to determine cgroup for the 'cpuacct' subsystem: " +
2362+
cpuacctCgroup.error());
2363+
} else if (cpuacctCgroup.isNone()) {
2364+
return Error("Unable to find 'cpuacct' cgroup subsystem");
2365+
} else if (cpuacctCgroup.get() == systemRootCgroup) {
2366+
return Error(
2367+
"Process '" + stringify(pid) +
2368+
"' should not be in the system root cgroup (being destroyed?)");
2369+
}
2370+
2371+
const Result<string> memCgroup = cgroups::memory::cgroup(pid);
2372+
if (memCgroup.isError()) {
2373+
return Error(
2374+
"Failed to determine cgroup for the 'memory' subsystem: " +
2375+
memCgroup.error());
2376+
} else if (memCgroup.isNone()) {
2377+
return Error("Unable to find 'memory' cgroup subsystem");
2378+
} else if (memCgroup.get() == systemRootCgroup) {
2379+
return Error(
2380+
"Process '" + stringify(pid) +
2381+
"' should not be in the system root cgroup (being destroyed?)");
2382+
}
2383+
2384+
const Try<cgroups::cpuacct::Stats> cpuAcctStat =
2385+
cgroups::cpuacct::stat(cpuacctHierarchy.get(), cpuacctCgroup.get());
2386+
2387+
if (cpuAcctStat.isError()) {
2388+
return Error("Failed to get cpu.stat: " + cpuAcctStat.error());
2389+
}
2390+
2391+
const Try<hashmap<string, uint64_t>> memStats =
2392+
cgroups::stat(memHierarchy.get(), memCgroup.get(), "memory.stat");
2393+
2394+
if (memStats.isError()) {
2395+
return Error(
2396+
"Error getting memory statistics from cgroups memory subsystem: " +
2397+
memStats.error());
2398+
}
2399+
2400+
if (!memStats->contains("rss")) {
2401+
return Error("cgroups memory stats does not contain 'rss' data");
2402+
}
2403+
2404+
ResourceStatistics result;
2405+
result.set_timestamp(Clock::now().secs());
2406+
result.set_cpus_system_time_secs(cpuAcctStat->system.secs());
2407+
result.set_cpus_user_time_secs(cpuAcctStat->user.secs());
2408+
result.set_mem_rss_bytes(memStats->at("rss"));
2409+
2410+
// Add the cpu.stat information only if CFS is enabled.
2411+
if (flags.cgroups_enable_cfs) {
2412+
static const Result<string> cpuHierarchy = cgroups::hierarchy("cpu");
2413+
2414+
if (cpuHierarchy.isError()) {
22642415
return Error(
2265-
"Process '" + stringify(pid) +
2266-
"' should not be in the system root cgroup (being destroyed?)");
2416+
"Failed to determine the cgroup 'cpu' subsystem hierarchy: " +
2417+
cpuHierarchy.error());
22672418
}
22682419

2269-
const Result<string> memCgroup = cgroups::memory::cgroup(pid);
2270-
if (memCgroup.isError()) {
2420+
const Result<string> cpuCgroup = cgroups::cpu::cgroup(pid);
2421+
if (cpuCgroup.isError()) {
22712422
return Error(
2272-
"Failed to determine cgroup for the 'memory' subsystem: " +
2273-
memCgroup.error());
2274-
} else if (memCgroup.isNone()) {
2275-
return Error("Unable to find 'memory' cgroup subsystem");
2276-
} else if (memCgroup.get() == systemRootCgroup) {
2423+
"Failed to determine cgroup for the 'cpu' subsystem: " +
2424+
cpuCgroup.error());
2425+
} else if (cpuCgroup.isNone()) {
2426+
return Error("Unable to find 'cpu' cgroup subsystem");
2427+
} else if (cpuCgroup.get() == systemRootCgroup) {
22772428
return Error(
22782429
"Process '" + stringify(pid) +
22792430
"' should not be in the system root cgroup (being destroyed?)");
22802431
}
22812432

2282-
const Try<cgroups::cpuacct::Stats> cpuAcctStat =
2283-
cgroups::cpuacct::stat(cpuacctHierarchy.get(), cpuacctCgroup.get());
2433+
const Try<hashmap<string, uint64_t>> stat =
2434+
cgroups::stat(cpuHierarchy.get(), cpuCgroup.get(), "cpu.stat");
22842435

2285-
if (cpuAcctStat.isError()) {
2286-
return Error("Failed to get cpu.stat: " + cpuAcctStat.error());
2436+
if (stat.isError()) {
2437+
return Error("Failed to read cpu.stat: " + stat.error());
22872438
}
22882439

2289-
const Try<hashmap<string, uint64_t>> memStats =
2290-
cgroups::stat(memHierarchy.get(), memCgroup.get(), "memory.stat");
2291-
2292-
if (memStats.isError()) {
2293-
return Error(
2294-
"Error getting memory statistics from cgroups memory subsystem: " +
2295-
memStats.error());
2440+
Option<uint64_t> nr_periods = stat->get("nr_periods");
2441+
if (nr_periods.isSome()) {
2442+
result.set_cpus_nr_periods(nr_periods.get());
22962443
}
22972444

2298-
if (!memStats->contains("rss")) {
2299-
return Error("cgroups memory stats does not contain 'rss' data");
2445+
Option<uint64_t> nr_throttled = stat->get("nr_throttled");
2446+
if (nr_throttled.isSome()) {
2447+
result.set_cpus_nr_throttled(nr_throttled.get());
23002448
}
23012449

2302-
result.set_timestamp(Clock::now().secs());
2303-
result.set_cpus_system_time_secs(cpuAcctStat->system.secs());
2304-
result.set_cpus_user_time_secs(cpuAcctStat->user.secs());
2305-
result.set_mem_rss_bytes(memStats->at("rss"));
2306-
2307-
// Add the cpu.stat information only if CFS is enabled.
2308-
if (flags.cgroups_enable_cfs) {
2309-
static const Result<string> cpuHierarchy = cgroups::hierarchy("cpu");
2310-
2311-
if (cpuHierarchy.isError()) {
2312-
return Error(
2313-
"Failed to determine the cgroup 'cpu' subsystem hierarchy: " +
2314-
cpuHierarchy.error());
2315-
}
2316-
2317-
const Result<string> cpuCgroup = cgroups::cpu::cgroup(pid);
2318-
if (cpuCgroup.isError()) {
2319-
return Error(
2320-
"Failed to determine cgroup for the 'cpu' subsystem: " +
2321-
cpuCgroup.error());
2322-
} else if (cpuCgroup.isNone()) {
2323-
return Error("Unable to find 'cpu' cgroup subsystem");
2324-
} else if (cpuCgroup.get() == systemRootCgroup) {
2325-
return Error(
2326-
"Process '" + stringify(pid) +
2327-
"' should not be in the system root cgroup (being destroyed?)");
2328-
}
2329-
2330-
const Try<hashmap<string, uint64_t>> stat =
2331-
cgroups::stat(cpuHierarchy.get(), cpuCgroup.get(), "cpu.stat");
2332-
2333-
if (stat.isError()) {
2334-
return Error("Failed to read cpu.stat: " + stat.error());
2335-
}
2336-
2337-
Option<uint64_t> nr_periods = stat->get("nr_periods");
2338-
if (nr_periods.isSome()) {
2339-
result.set_cpus_nr_periods(nr_periods.get());
2340-
}
2341-
2342-
Option<uint64_t> nr_throttled = stat->get("nr_throttled");
2343-
if (nr_throttled.isSome()) {
2344-
result.set_cpus_nr_throttled(nr_throttled.get());
2345-
}
2346-
2347-
Option<uint64_t> throttled_time = stat->get("throttled_time");
2348-
if (throttled_time.isSome()) {
2349-
result.set_cpus_throttled_time_secs(
2350-
Nanoseconds(throttled_time.get()).secs());
2351-
}
2450+
Option<uint64_t> throttled_time = stat->get("throttled_time");
2451+
if (throttled_time.isSome()) {
2452+
result.set_cpus_throttled_time_secs(
2453+
Nanoseconds(throttled_time.get()).secs());
23522454
}
23532455
}
23542456

src/slave/containerizer/docker.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,8 @@ class DockerContainerizerProcess
298298
#endif // __linux__
299299

300300
// Collects cgroups v1 statistics ('cpuacct', 'memory' and, when CFS is
// enabled, 'cpu') for the cgroups that process `pid` belongs to.
Try<ResourceStatistics> cgroupsStatistics(pid_t pid) const;
301+
// Collects cgroups v2 CPU and memory statistics for the container
// `containerId`, located through the pid recorded for that container.
Try<ResourceStatistics> cgroupsv2Statistics(ContainerID containerId) const;
302+
// Returns the path found in the "0::" entry of `/proc/<pid>/cgroup`, or an
// Error if the file cannot be opened or holds no such entry.
Try<std::string> getCgroupV2Path(pid_t pid) const;
301303

302304
// Call back for when the executor exits. This will trigger
303305
// container destroy.

0 commit comments

Comments
 (0)