From e1ce37b06e6d516d9081e2c11891bf3731694c84 Mon Sep 17 00:00:00 2001 From: Will Andrews Date: Fri, 9 Aug 2024 18:39:36 +0000 Subject: [PATCH] process-exporter: collect mmap statistics if enabled This enables tracking maps per process group, as well as supporting alerts based on what % of the max a group is using. --- README.md | 11 +++++++ cmd/process-exporter/main.go | 3 ++ collector/process_collector.go | 55 ++++++++++++++++++++++++++++++++++ proc/grouper.go | 3 ++ proc/grouper_test.go | 54 ++++++++++++++++----------------- proc/read.go | 41 +++++++++++++++++++++++-- proc/tracker_test.go | 8 ++--- 7 files changed, 142 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 7be56369..05ab6a63 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,11 @@ Number of context switches based on /proc/[pid]/status fields voluntary_ctxt_swi and nonvoluntary_ctxt_switches. The extra label `ctxswitchtype` can have two values: `voluntary` and `nonvoluntary`. +### mmap_count gauge + +If gathering of the maps file is enabled (`-gather-mmaps`), this is the largest number of +memory-map entries held by any single process in the group. + ### memory_bytes gauge Number of bytes of memory used. The extra label `memtype` can have three values: @@ -355,6 +360,12 @@ Same as minor_page_faults_total, but broken down per-thread subgroup. Same as context_switches_total, but broken down per-thread subgroup. +## Global metrics + +### max_map_count gauge + +If gathering of the maps file is enabled (`-gather-mmaps`), this is the kernel's per-process limit on memory-map entries, read from `/proc/sys/vm/max_map_count`; it is reported as 0 when gathering is disabled. 
+ ## Instrumentation cost process-exporter will consume CPU in proportion to the number of processes in diff --git a/cmd/process-exporter/main.go b/cmd/process-exporter/main.go index 6dfc0a08..7ccc2821 100644 --- a/cmd/process-exporter/main.go +++ b/cmd/process-exporter/main.go @@ -163,6 +163,8 @@ func main() { "if a proc is tracked, track with it any children that aren't part of their own group") threads = flag.Bool("threads", true, "report on per-threadname metrics as well") + mmaps = flag.Bool("gather-mmaps", true, + "gather metrics from maps file, which contains mmap info") smaps = flag.Bool("gather-smaps", true, "gather metrics from smaps file, which contains proportional resident memory size") man = flag.Bool("man", false, @@ -244,6 +246,7 @@ func main() { ProcFSPath: *procfsPath, Children: *children, Threads: *threads, + GatherMMaps: *mmaps, GatherSMaps: *smaps, Namer: matchnamer, Recheck: *recheck, diff --git a/collector/process_collector.go b/collector/process_collector.go index 8f41ba47..17d7c52c 100644 --- a/collector/process_collector.go +++ b/collector/process_collector.go @@ -1,7 +1,11 @@ package collector import ( + "bufio" "log" + "os" + "strconv" + "strings" "time" common "github.com/ncabatoff/process-exporter" @@ -52,6 +56,12 @@ var ( []string{"groupname", "ctxswitchtype"}, nil) + mmapCountDesc = prometheus.NewDesc( + "namedprocess_namegroup_mmap_count", + "maximum number of mmap entries in use", + []string{"groupname"}, + nil) + membytesDesc = prometheus.NewDesc( "namedprocess_namegroup_memory_bytes", "number of bytes of memory in use", @@ -106,6 +116,12 @@ var ( nil, nil) + maxMapCountDesc = prometheus.NewDesc( + "namedprocess_max_map_count", + "maximum number of map entries allowed per process", + nil, + nil) + threadWchanDesc = prometheus.NewDesc( "namedprocess_namegroup_threads_wchan", "Number of threads in this group waiting on each wchan", @@ -159,6 +175,7 @@ type ( ProcFSPath string Children bool Threads bool + GatherMMaps bool GatherSMaps 
// maxMapCountPath is the procfs file from which the per-process limit on
// memory-map entries is read.
const maxMapCountPath = "/proc/sys/vm/max_map_count"

// GetMaxMapCount returns the per-process limit on memory-map entries as
// published in /proc/sys/vm/max_map_count.
//
// It returns 0 and a non-nil error when the file cannot be opened or read,
// or when its first line is not a decimal integer.
func GetMaxMapCount() (int, error) {
	return readMaxMapCount(maxMapCountPath)
}

// readMaxMapCount parses the first line of the file at path as an integer.
// Surrounding whitespace is ignored and a trailing newline is optional.
func readMaxMapCount(path string) (int, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	// A Scanner yields the last line even when it is not newline-terminated;
	// ReadString('\n') would report io.EOF for that input and the value
	// would be thrown away.
	sc := bufio.NewScanner(f)
	sc.Scan()
	if err := sc.Err(); err != nil {
		return 0, err
	}

	// An empty file leaves the token empty, which Atoi rejects, so no
	// separate "no data" branch is needed.
	val, err := strconv.Atoi(strings.TrimSpace(sc.Text()))
	if err != nil {
		// Atoi may return a clamped value on range errors; callers only
		// ever see 0 alongside an error.
		return 0, err
	}
	return val, nil
}
{ + p.maxMapCount = max_map_count + } else { + p.scrapePartialErrors++ + } + } + ch <- prometheus.MustNewConstMetric(maxMapCountDesc, + prometheus.GaugeValue, float64(p.maxMapCount)) ch <- prometheus.MustNewConstMetric(scrapeErrorsDesc, prometheus.CounterValue, float64(p.scrapeErrors)) ch <- prometheus.MustNewConstMetric(scrapeProcReadErrorsDesc, diff --git a/proc/grouper.go b/proc/grouper.go index bbf240ca..6c224cf1 100644 --- a/proc/grouper.go +++ b/proc/grouper.go @@ -68,6 +68,9 @@ func groupadd(grp Group, ts Update) Group { grp.Memory.ResidentBytes += ts.Memory.ResidentBytes grp.Memory.VirtualBytes += ts.Memory.VirtualBytes grp.Memory.VmSwapBytes += ts.Memory.VmSwapBytes + if grp.Memory.MmapCount < ts.Memory.MmapCount { + grp.Memory.MmapCount = ts.Memory.MmapCount + } grp.Memory.ProportionalBytes += ts.Memory.ProportionalBytes grp.Memory.ProportionalSwapBytes += ts.Memory.ProportionalSwapBytes if ts.Filedesc.Open != -1 { diff --git a/proc/grouper_test.go b/proc/grouper_test.go index 826d1e97..6b6bb485 100644 --- a/proc/grouper_test.go +++ b/proc/grouper_test.go @@ -45,30 +45,30 @@ func TestGrouperBasic(t *testing.T) { }{ { []IDInfo{ - piinfost(p1, n1, Counts{1, 2, 3, 4, 5, 6, 0, 0}, Memory{7, 8, 0, 0, 0}, + piinfost(p1, n1, Counts{1, 2, 3, 4, 5, 6, 0, 0}, Memory{7, 8, 0, 0, 0, 0}, Filedesc{4, 400}, 2, States{Other: 1}), - piinfost(p2, n2, Counts{2, 3, 4, 5, 6, 7, 0, 0}, Memory{8, 9, 0, 0, 0}, + piinfost(p2, n2, Counts{2, 3, 4, 5, 6, 7, 0, 0}, Memory{8, 9, 0, 0, 0, 0}, Filedesc{40, 400}, 3, States{Waiting: 1}), }, GroupByName{ - "g1": Group{Counts{}, States{Other: 1}, msi{}, 1, Memory{7, 8, 0, 0, 0}, starttime, + "g1": Group{Counts{}, States{Other: 1}, msi{}, 1, Memory{7, 8, 0, 0, 0, 0}, starttime, 4, 0.01, 2, nil}, - "g2": Group{Counts{}, States{Waiting: 1}, msi{}, 1, Memory{8, 9, 0, 0, 0}, starttime, + "g2": Group{Counts{}, States{Waiting: 1}, msi{}, 1, Memory{8, 9, 0, 0, 0, 0}, starttime, 40, 0.1, 3, nil}, }, }, { []IDInfo{ piinfost(p1, n1, Counts{2, 3, 4, 5, 
6, 7, 0, 0}, - Memory{6, 7, 0, 0, 0}, Filedesc{100, 400}, 4, States{Zombie: 1}), + Memory{6, 7, 0, 0, 0, 0}, Filedesc{100, 400}, 4, States{Zombie: 1}), piinfost(p2, n2, Counts{4, 5, 6, 7, 8, 9, 0, 0}, - Memory{9, 8, 0, 0, 0}, Filedesc{400, 400}, 2, States{Running: 1}), + Memory{9, 8, 0, 0, 0, 0}, Filedesc{400, 400}, 2, States{Running: 1}), }, GroupByName{ "g1": Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{Zombie: 1}, msi{}, 1, - Memory{6, 7, 0, 0, 0}, starttime, 100, 0.25, 4, nil}, + Memory{6, 7, 0, 0, 0, 0}, starttime, 100, 0.25, 4, nil}, "g2": Group{Counts{2, 2, 2, 2, 2, 2, 0, 0}, States{Running: 1}, msi{}, 1, - Memory{9, 8, 0, 0, 0}, starttime, 400, 1, 2, nil}, + Memory{9, 8, 0, 0, 0, 0}, starttime, 400, 1, 2, nil}, }, }, } @@ -95,10 +95,10 @@ func TestGrouperProcJoin(t *testing.T) { }{ { []IDInfo{ - piinfo(p1, n1, Counts{1, 2, 3, 4, 5, 6, 0, 0}, Memory{3, 4, 0, 0, 0}, Filedesc{4, 400}, 2), + piinfo(p1, n1, Counts{1, 2, 3, 4, 5, 6, 0, 0}, Memory{3, 4, 0, 0, 0, 0}, Filedesc{4, 400}, 2), }, GroupByName{ - "g1": Group{Counts{}, States{}, msi{}, 1, Memory{3, 4, 0, 0, 0}, starttime, 4, 0.01, 2, nil}, + "g1": Group{Counts{}, States{}, msi{}, 1, Memory{3, 4, 0, 0, 0, 0}, starttime, 4, 0.01, 2, nil}, }, }, { // The counts for pid2 won't be factored into the total yet because we only add @@ -106,24 +106,24 @@ func TestGrouperProcJoin(t *testing.T) { // affected though. 
[]IDInfo{ piinfost(p1, n1, Counts{3, 4, 5, 6, 7, 8, 0, 0}, - Memory{3, 4, 0, 0, 0}, Filedesc{4, 400}, 2, States{Running: 1}), + Memory{3, 4, 0, 0, 0, 0}, Filedesc{4, 400}, 2, States{Running: 1}), piinfost(p2, n2, Counts{1, 1, 1, 1, 1, 1, 0, 0}, - Memory{1, 2, 0, 0, 0}, Filedesc{40, 400}, 3, States{Sleeping: 1}), + Memory{1, 2, 0, 0, 0, 0}, Filedesc{40, 400}, 3, States{Sleeping: 1}), }, GroupByName{ "g1": Group{Counts{2, 2, 2, 2, 2, 2, 0, 0}, States{Running: 1, Sleeping: 1}, msi{}, 2, - Memory{4, 6, 0, 0, 0}, starttime, 44, 0.1, 5, nil}, + Memory{4, 6, 0, 0, 0, 0}, starttime, 44, 0.1, 5, nil}, }, }, { []IDInfo{ piinfost(p1, n1, Counts{4, 5, 6, 7, 8, 9, 0, 0}, - Memory{1, 5, 0, 0, 0}, Filedesc{4, 400}, 2, States{Running: 1}), + Memory{1, 5, 0, 0, 0, 0}, Filedesc{4, 400}, 2, States{Running: 1}), piinfost(p2, n2, Counts{2, 2, 2, 2, 2, 2, 0, 0}, - Memory{2, 4, 0, 0, 0}, Filedesc{40, 400}, 3, States{Running: 1}), + Memory{2, 4, 0, 0, 0, 0}, Filedesc{40, 400}, 3, States{Running: 1}), }, GroupByName{ "g1": Group{Counts{4, 4, 4, 4, 4, 4, 0, 0}, States{Running: 2}, msi{}, 2, - Memory{3, 9, 0, 0, 0}, starttime, 44, 0.1, 5, nil}, + Memory{3, 9, 0, 0, 0, 0}, starttime, 44, 0.1, 5, nil}, }, }, } @@ -150,18 +150,18 @@ func TestGrouperNonDecreasing(t *testing.T) { }{ { []IDInfo{ - piinfo(p1, n1, Counts{3, 4, 5, 6, 7, 8, 0, 0}, Memory{3, 4, 0, 0, 0}, Filedesc{4, 400}, 2), - piinfo(p2, n2, Counts{1, 1, 1, 1, 1, 1, 0, 0}, Memory{1, 2, 0, 0, 0}, Filedesc{40, 400}, 3), + piinfo(p1, n1, Counts{3, 4, 5, 6, 7, 8, 0, 0}, Memory{3, 4, 0, 0, 0, 0}, Filedesc{4, 400}, 2), + piinfo(p2, n2, Counts{1, 1, 1, 1, 1, 1, 0, 0}, Memory{1, 2, 0, 0, 0, 0}, Filedesc{40, 400}, 3), }, GroupByName{ - "g1": Group{Counts{}, States{}, msi{}, 2, Memory{4, 6, 0, 0, 0}, starttime, 44, 0.1, 5, nil}, + "g1": Group{Counts{}, States{}, msi{}, 2, Memory{4, 6, 0, 0, 0, 0}, starttime, 44, 0.1, 5, nil}, }, }, { []IDInfo{ - piinfo(p1, n1, Counts{4, 5, 6, 7, 8, 9, 0, 0}, Memory{1, 5, 0, 0, 0}, Filedesc{4, 400}, 2), + 
piinfo(p1, n1, Counts{4, 5, 6, 7, 8, 9, 0, 0}, Memory{1, 5, 0, 0, 0, 0}, Filedesc{4, 400}, 2), }, GroupByName{ - "g1": Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, msi{}, 1, Memory{1, 5, 0, 0, 0}, starttime, 4, 0.01, 2, nil}, + "g1": Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, msi{}, 1, Memory{1, 5, 0, 0, 0, 0}, starttime, 4, 0.01, 2, nil}, }, }, { []IDInfo{}, @@ -193,19 +193,19 @@ func TestGrouperRemoveEmptyGroups(t *testing.T) { }{ { []IDInfo{ - piinfo(p1, n1, Counts{3, 4, 5, 6, 7, 8, 0, 0}, Memory{3, 4, 0, 0, 0}, Filedesc{4, 400}, 2), - piinfo(p2, n2, Counts{1, 1, 1, 1, 1, 1, 0, 0}, Memory{1, 2, 0, 0, 0}, Filedesc{40, 400}, 3), + piinfo(p1, n1, Counts{3, 4, 5, 6, 7, 8, 0, 0}, Memory{3, 4, 0, 0, 0, 0}, Filedesc{4, 400}, 2), + piinfo(p2, n2, Counts{1, 1, 1, 1, 1, 1, 0, 0}, Memory{1, 2, 0, 0, 0, 0}, Filedesc{40, 400}, 3), }, GroupByName{ - n1: Group{Counts{}, States{}, msi{}, 1, Memory{3, 4, 0, 0, 0}, starttime, 4, 0.01, 2, nil}, - n2: Group{Counts{}, States{}, msi{}, 1, Memory{1, 2, 0, 0, 0}, starttime, 40, 0.1, 3, nil}, + n1: Group{Counts{}, States{}, msi{}, 1, Memory{3, 4, 0, 0, 0, 0}, starttime, 4, 0.01, 2, nil}, + n2: Group{Counts{}, States{}, msi{}, 1, Memory{1, 2, 0, 0, 0, 0}, starttime, 40, 0.1, 3, nil}, }, }, { []IDInfo{ - piinfo(p1, n1, Counts{4, 5, 6, 7, 8, 9, 0, 0}, Memory{1, 5, 0, 0, 0}, Filedesc{4, 400}, 2), + piinfo(p1, n1, Counts{4, 5, 6, 7, 8, 9, 0, 0}, Memory{1, 5, 0, 0, 0, 0}, Filedesc{4, 400}, 2), }, GroupByName{ - n1: Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, msi{}, 1, Memory{1, 5, 0, 0, 0}, starttime, 4, 0.01, 2, nil}, + n1: Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, msi{}, 1, Memory{1, 5, 0, 0, 0, 0}, starttime, 4, 0.01, 2, nil}, }, }, { []IDInfo{}, diff --git a/proc/read.go b/proc/read.go index 7358aa1b..f1200ad3 100644 --- a/proc/read.go +++ b/proc/read.go @@ -1,7 +1,9 @@ package proc import ( + "bytes" "fmt" + "io" "os" "path/filepath" "strconv" @@ -53,6 +55,7 @@ type ( ResidentBytes uint64 VirtualBytes uint64 VmSwapBytes 
uint64 + MmapCount uint64 ProportionalBytes uint64 ProportionalSwapBytes uint64 } @@ -193,6 +196,7 @@ type ( procfs.FS BootTime uint64 MountPoint string + GatherMMaps bool GatherSMaps bool debug bool } @@ -464,6 +468,30 @@ func (p proc) GetStates() (States, error) { return s, nil } +func (p proc) GetMmapCount() (uint64, error) { + buf := make([]byte, 32*1024) + count := 0 + lineSep := []byte{'\n'} + id, err := p.GetProcID() + if err != nil { + return 0, err + } + r, _ := os.Open(fmt.Sprintf("/proc/%d/maps", id.Pid)) + defer r.Close() + + for { + c, err := r.Read(buf) + count += bytes.Count(buf[:c], lineSep) + + switch { + case err == io.EOF: + return uint64(count), nil + case err != nil: + return 0, err + } + } +} + // GetMetrics returns the current metrics for the proc. The results are // not cached. func (p proc) GetMetrics() (Metrics, int, error) { @@ -506,6 +534,15 @@ func (p proc) GetMetrics() (Metrics, int, error) { VmSwapBytes: uint64(status.VmSwap), } + if p.proccache.fs.GatherMMaps { + count, err := p.GetMmapCount() + if err == nil { + memory.MmapCount = count + } else { + softerrors |= 1 + } + } + if p.proccache.fs.GatherSMaps { smaps, err := p.Proc.ProcSMapsRollup() if err != nil { @@ -592,7 +629,7 @@ func NewFS(mountPoint string, debug bool) (*FS, error) { if err != nil { return nil, err } - return &FS{fs, stat.BootTime, mountPoint, false, debug}, nil + return &FS{fs, stat.BootTime, mountPoint, false, false, debug}, nil } func (fs *FS) threadFs(pid int) (*FS, error) { @@ -601,7 +638,7 @@ func (fs *FS) threadFs(pid int) (*FS, error) { if err != nil { return nil, err } - return &FS{tfs, fs.BootTime, mountPoint, fs.GatherSMaps, false}, nil + return &FS{tfs, fs.BootTime, mountPoint, fs.GatherMMaps, fs.GatherSMaps, false}, nil } // AllProcs implements Source. 
diff --git a/proc/tracker_test.go b/proc/tracker_test.go index 4a295160..9af76b08 100644 --- a/proc/tracker_test.go +++ b/proc/tracker_test.go @@ -99,15 +99,15 @@ func TestTrackerMetrics(t *testing.T) { want Update }{ { - piinfost(p, n, Counts{1, 2, 3, 4, 5, 6, 0, 0}, Memory{7, 8, 0, 0, 0}, + piinfost(p, n, Counts{1, 2, 3, 4, 5, 6, 0, 0}, Memory{7, 8, 0, 0, 0, 0}, Filedesc{1, 10}, 9, States{Sleeping: 1}), - Update{n, Delta{}, Memory{7, 8, 0, 0, 0}, Filedesc{1, 10}, tm, + Update{n, Delta{}, Memory{7, 8, 0, 0, 0, 0}, Filedesc{1, 10}, tm, 9, States{Sleeping: 1}, msi{}, nil}, }, { - piinfost(p, n, Counts{2, 3, 4, 5, 6, 7, 0, 0}, Memory{1, 2, 0, 0, 0}, + piinfost(p, n, Counts{2, 3, 4, 5, 6, 7, 0, 0}, Memory{1, 2, 0, 0, 0, 0}, Filedesc{2, 20}, 1, States{Running: 1}), - Update{n, Delta{1, 1, 1, 1, 1, 1, 0, 0}, Memory{1, 2, 0, 0, 0}, + Update{n, Delta{1, 1, 1, 1, 1, 1, 0, 0}, Memory{1, 2, 0, 0, 0, 0}, Filedesc{2, 20}, tm, 1, States{Running: 1}, msi{}, nil}, }, }