From be5807f6345cfe8763f494125129a5fdf61ef5b4 Mon Sep 17 00:00:00 2001 From: jnywong Date: Mon, 19 May 2025 11:36:29 +0100 Subject: [PATCH 1/5] Add User Group Diagnostics dashboard --- dashboards/group.jsonnet | 196 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 dashboards/group.jsonnet diff --git a/dashboards/group.jsonnet b/dashboards/group.jsonnet new file mode 100644 index 0000000..aff5709 --- /dev/null +++ b/dashboards/group.jsonnet @@ -0,0 +1,196 @@ +#!/usr/bin/env -S jsonnet -J ../vendor +local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-v11.1.0/main.libsonnet'; +local dashboard = grafonnet.dashboard; +local ts = grafonnet.panel.timeSeries; +local prometheus = grafonnet.query.prometheus; + +local common = import './common.libsonnet'; + +local memoryUsage = + common.tsOptions + + ts.new('Memory Usage') + + ts.panelOptions.withDescription( + ||| + Per group memory usage + + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. 
+ ||| + ) + + ts.standardOptions.withUnit('bytes') + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + container_memory_working_set_bytes{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"} + * on (namespace, pod) group_left(annotation_hub_jupyter_org_username) + group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*", pod=~"jupyter-.*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup) + group( + label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"}, + "annotation_hub_jupyter_org_username", "$1", "username", "(.+)") + ) by (annotation_hub_jupyter_org_username, usergroup, namespace) + ) by (usergroup, namespace) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + + +local cpuUsage = + common.tsOptions + + ts.new('CPU Usage') + + ts.panelOptions.withDescription( + ||| + Per group CPU usage + + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. 
+ ||| + ) + + ts.standardOptions.withUnit('percentunit') + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + # exclude name="" because the same container can be reported + # with both no name and `name=k8s_...`, + # in which case sum() by (pod) reports double the actual metric + irate(container_cpu_usage_seconds_total{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"}[5m]) + * on (namespace, pod) group_left(annotation_hub_jupyter_org_username) + group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup) + group( + label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"}, + "annotation_hub_jupyter_org_username", "$1", "username", "(.+)") + ) by (annotation_hub_jupyter_org_username, usergroup, namespace) + ) by (usergroup, namespace) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + +local homedirSharedUsage = + common.tsOptions + + ts.new('Home Directory Usage (on shared home directories)') + + ts.panelOptions.withDescription( + ||| + Per group home directory size, when using a shared home directory. + + Requires https://github.com/yuvipanda/prometheus-dirsize-exporter and https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. 
+ ||| + ) + + ts.standardOptions.withUnit('bytes') + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + max( + dirsize_total_size_bytes{namespace=~"$hub_name"} + ) by (namespace, directory) + * on (namespace, directory) group_left(usergroup) + group( + label_replace( + jupyterhub_user_group_info{namespace=~"$hub_name", username_escaped=~".*", usergroup=~"$user_group"}, + "directory", "$1", "username_escaped", "(.+)") + ) by (directory, namespace, usergroup) + ) by (namespace, usergroup) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + +local memoryRequests = + common.tsOptions + + ts.new('Memory Requests') + + ts.panelOptions.withDescription( + ||| + Per group memory requests + + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. + ||| + ) + + ts.standardOptions.withUnit('bytes') + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + kube_pod_container_resource_requests{resource="memory", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod) + group_left(annotation_hub_jupyter_org_username) group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup) + group( + label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"}, + "annotation_hub_jupyter_org_username", "$1", "username", "(.+)") + ) by (annotation_hub_jupyter_org_username, usergroup, namespace) + ) by (usergroup, namespace) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + +local cpuRequests = + common.tsOptions + + ts.new('CPU Requests') + + ts.panelOptions.withDescription( + ||| + Per group CPU requests + + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. 
+ ||| + ) + + ts.standardOptions.withUnit('percentunit') + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod) + group_left(annotation_hub_jupyter_org_username) group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup) + group( + label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"}, + "annotation_hub_jupyter_org_username", "$1", "username", "(.+)") + ) by (annotation_hub_jupyter_org_username, usergroup, namespace) + ) by (usergroup, namespace) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + +dashboard.new('User Group Diagnostics Dashboard') ++ dashboard.withTags(['jupyterhub']) ++ dashboard.withUid('group-diagnostics-dashboard') ++ dashboard.withEditable(true) ++ dashboard.withVariables([ + common.variables.prometheus, + common.variables.hub_name, + common.variables.user_group, +]) ++ dashboard.withPanels( + grafonnet.util.grid.makeGrid( + [ + memoryUsage, + cpuUsage, + homedirSharedUsage, + memoryRequests, + cpuRequests, + ], + panelWidth=24, + panelHeight=12, + ) +) From f57aca77171184bfd922e62abcd2f630eac7b676 Mon Sep 17 00:00:00 2001 From: jnywong Date: Mon, 19 May 2025 11:36:57 +0100 Subject: [PATCH 2/5] Update to use unescaped usernames from pod annotations --- dashboards/user.jsonnet | 74 +++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 33 deletions(-) mode change 100755 => 100644 dashboards/user.jsonnet diff --git a/dashboards/user.jsonnet b/dashboards/user.jsonnet old mode 100755 new mode 100644 index bda8bd5..a7a007d --- a/dashboards/user.jsonnet +++ b/dashboards/user.jsonnet @@ -11,7 +11,7 @@ local memoryUsage 
= + ts.new('Memory Usage') + ts.panelOptions.withDescription( ||| - Per-user per-server memory usage + Per user memory usage ||| ) + ts.standardOptions.withUnit('bytes') @@ -20,18 +20,15 @@ local memoryUsage = '$PROMETHEUS_DS', ||| sum( - # exclude name="" because the same container can be reported - # with both no name and `name=k8s_...`, - # in which case sum() by (pod) reports double the actual metric - container_memory_working_set_bytes{name!="", instance=~"$instance"} - * on (namespace, pod) group_left(container) - group( - kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub", pod=~"$user_pod"} - ) by (pod, namespace) - ) by (pod, namespace) + container_memory_working_set_bytes{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"} + * on (namespace, pod) group_left(annotation_hub_jupyter_org_username) + group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name", pod=~"jupyter-.*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + ) by (annotation_hub_jupyter_org_username, namespace) ||| ) - + prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'), ]); @@ -40,7 +37,7 @@ local cpuUsage = + ts.new('CPU Usage') + ts.panelOptions.withDescription( ||| - Per-user per-server CPU usage + Per user CPU usage ||| ) + ts.standardOptions.withUnit('percentunit') @@ -52,15 +49,15 @@ local cpuUsage = # exclude name="" because the same container can be reported # with both no name and `name=k8s_...`, # in which case sum() by (pod) reports double the actual metric - irate(container_cpu_usage_seconds_total{name!="", instance=~"$instance"}[5m]) - * on (namespace, pod) group_left(container) + irate(container_cpu_usage_seconds_total{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"}[5m]) + * on (namespace, pod) group_left(annotation_hub_jupyter_org_username) group( - 
kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub", pod=~"$user_pod"} - ) by (pod, namespace) - ) by (pod, namespace) + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + ) by (annotation_hub_jupyter_org_username, namespace) ||| ) - + prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'), ]); local homedirSharedUsage = @@ -87,11 +84,17 @@ local homedirSharedUsage = '$PROMETHEUS_DS', ||| max( - dirsize_total_size_bytes{namespace="$hub"} - ) by (directory, namespace) + dirsize_total_size_bytes{namespace=~"$hub_name"} + * on (namespace, directory) group_left(username) + group( + label_replace( + jupyterhub_user_group_info{namespace=~"$hub_name", username=~"$user_name", username_escaped=~".*"}, + "directory", "$1", "username_escaped", "(.+)") + ) by (directory, namespace, username) + ) by (namespace, username) ||| ) - + prometheus.withLegendFormat('{{ directory }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ username }} - ({{ namespace }})'), ]); local memoryRequests = @@ -99,7 +102,7 @@ local memoryRequests = + ts.new('Memory Requests') + ts.panelOptions.withDescription( ||| - Per-user per-server memory Requests + Per-user memory requests ||| ) + ts.standardOptions.withUnit('bytes') @@ -108,11 +111,14 @@ local memoryRequests = '$PROMETHEUS_DS', ||| sum( - kube_pod_container_resource_requests{resource="memory", namespace=~"$hub", node=~"$instance"} - ) by (pod, namespace) + kube_pod_container_resource_requests{resource="memory", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod) + group_left(annotation_hub_jupyter_org_username) group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + ) by 
(annotation_hub_jupyter_org_username, namespace) ||| ) - + prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'), ]); local cpuRequests = @@ -120,7 +126,7 @@ local cpuRequests = + ts.new('CPU Requests') + ts.panelOptions.withDescription( ||| - Per-user per-server CPU Requests + Per user CPU requests ||| ) + ts.standardOptions.withUnit('percentunit') @@ -129,22 +135,24 @@ local cpuRequests = '$PROMETHEUS_DS', ||| sum( - kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub", node=~"$instance"} - ) by (pod, namespace) + kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod) + group_left(annotation_hub_jupyter_org_username) group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + ) by (annotation_hub_jupyter_org_username, namespace) ||| ) - + prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'), ]); dashboard.new('User Diagnostics Dashboard') + dashboard.withTags(['jupyterhub']) -+ dashboard.withUid('user-pod-diagnostics-dashboard') ++ dashboard.withUid('user-diagnostics-dashboard') + dashboard.withEditable(true) + dashboard.withVariables([ common.variables.prometheus, - common.variables.hub, - common.variables.user_pod, - common.variables.instance, + common.variables.hub_name, + common.variables.user_name, ]) + dashboard.withPanels( grafonnet.util.grid.makeGrid( From 40d4231e487fe706ee439c27ad356bd8af9af3a9 Mon Sep 17 00:00:00 2001 From: jnywong Date: Mon, 19 May 2025 11:37:14 +0100 Subject: [PATCH 3/5] Update variables --- dashboards/common.libsonnet | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/dashboards/common.libsonnet 
b/dashboards/common.libsonnet index 34fc57f..d1dcbb8 100644 --- a/dashboards/common.libsonnet +++ b/dashboards/common.libsonnet @@ -63,12 +63,33 @@ local var = grafonnet.dashboard.variable; + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}') , - user_pod: - var.query.new('user_pod') + hub_name: + var.query.new('hub_name') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}') + , + namespace: + var.query.new('namespace') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + var.query.queryTypes.withLabelValues('namespace', 'kube_pod_labels') + , + user_group: + var.query.new('user_group') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + var.query.queryTypes.withLabelValues('usergroup', 'jupyterhub_user_group_info') + , + user_name: + var.query.new('user_name') + var.query.withDatasourceFromVariable(self.prometheus) + var.query.selectionOptions.withMulti() + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') - + var.query.queryTypes.withLabelValues('pod', 'kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub"}') + + var.query.queryTypes.withLabelValues('annotation_hub_jupyter_org_username', 'kube_pod_annotations{ namespace=~"$hub_name"}') , // Queries should use the 'instance' label when querying metrics that // come from collectors present on each node - such as node_exporter or From 
fed3645d539f3ce9c11fcea3a53bc62f2c8f7481 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 May 2025 10:38:26 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dashboards/common.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dashboards/common.libsonnet b/dashboards/common.libsonnet index d1dcbb8..40d0b23 100644 --- a/dashboards/common.libsonnet +++ b/dashboards/common.libsonnet @@ -67,9 +67,9 @@ local var = grafonnet.dashboard.variable; var.query.new('hub_name') + var.query.withDatasourceFromVariable(self.prometheus) + var.query.selectionOptions.withMulti() - + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}') - , + , namespace: var.query.new('namespace') + var.query.withDatasourceFromVariable(self.prometheus) @@ -83,7 +83,7 @@ local var = grafonnet.dashboard.variable; + var.query.selectionOptions.withMulti() + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + var.query.queryTypes.withLabelValues('usergroup', 'jupyterhub_user_group_info') - , + , user_name: var.query.new('user_name') + var.query.withDatasourceFromVariable(self.prometheus) From 37b1e5f58579d9c23d037c00bc85ea771b59c071 Mon Sep 17 00:00:00 2001 From: jnywong Date: Fri, 6 Jun 2025 12:11:00 +0100 Subject: [PATCH 5/5] Add more info to panel descriptions --- dashboards/group.jsonnet | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/dashboards/group.jsonnet b/dashboards/group.jsonnet index aff5709..2b526a4 100644 --- a/dashboards/group.jsonnet +++ b/dashboards/group.jsonnet @@ -11,10 +11,12 @@ local memoryUsage = + ts.new('Memory Usage') + 
ts.panelOptions.withDescription( ||| - Per group memory usage + Per group memory usage. + + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to - be set up. + be set up. If the panels show no data, then please try selecting another time range where usage was active. ||| ) + ts.standardOptions.withUnit('bytes') @@ -47,8 +49,10 @@ local cpuUsage = ||| Per group CPU usage + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to - be set up. + be set up. If the panels show no data, then please try selecting another time range where usage was active. ||| ) + ts.standardOptions.withUnit('percentunit') @@ -83,6 +87,8 @@ local homedirSharedUsage = ||| Per group home directory size, when using a shared home directory. + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. + Requires https://github.com/yuvipanda/prometheus-dirsize-exporter and https://github.com/2i2c-org/jupyterhub-groups-exporter to be set up. ||| @@ -115,8 +121,10 @@ local memoryRequests = ||| Per group memory requests + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to - be set up. + be set up. If the panels show no data, then please try selecting another time range where usage was active. 
||| ) + ts.standardOptions.withUnit('bytes') @@ -147,8 +155,10 @@ local cpuRequests = ||| Per group CPU requests + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to - be set up. + be set up. If the panels show no data, then please try selecting another time range where usage was active. ||| ) + ts.standardOptions.withUnit('percentunit')