Skip to content

Add "User Group Diagnostics" Grafana dashboard #149

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions dashboards/common.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,33 @@ local var = grafonnet.dashboard.variable;
+ var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*')
+ var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}')
,
user_pod:
var.query.new('user_pod')
hub_name:
var.query.new('hub_name')
+ var.query.withDatasourceFromVariable(self.prometheus)
+ var.query.selectionOptions.withMulti()
+ var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*')
+ var.query.queryTypes.withLabelValues('pod', 'kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub"}')
+ var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}')
,
// Template variable `namespace`.
// Multi-select query variable with an "All" option whose value is the regex
// `.*`. Its options are the values of the `namespace` label found on the
// `kube_pod_labels` metric, read from the datasource referenced by
// `self.prometheus`.
namespace:
var.query.new('namespace')
+ var.query.withDatasourceFromVariable(self.prometheus)
+ var.query.selectionOptions.withMulti()
+ var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*')
+ var.query.queryTypes.withLabelValues('namespace', 'kube_pod_labels')
,
// Template variable `user_group`.
// Multi-select query variable with an "All" option whose value is the regex
// `.*`. Its options are the values of the `usergroup` label on
// `jupyterhub_user_group_info`, read from the datasource referenced by
// `self.prometheus`.
//
// The lookup is restricted to `namespace=~"$hub_name"`, the same filter the
// `user_name` variable and the group panels apply, so the dropdown only
// offers groups that exist in the selected hub(s).
// NOTE(review): this makes the variable depend on `hub_name`; any dashboard
// using `user_group` must also declare `hub_name` before it — confirm.
user_group:
  var.query.new('user_group')
  + var.query.withDatasourceFromVariable(self.prometheus)
  + var.query.selectionOptions.withMulti()
  + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*')
  + var.query.queryTypes.withLabelValues('usergroup', 'jupyterhub_user_group_info{namespace=~"$hub_name"}')
,
// Template variable `user_name`.
// Multi-select query variable with an "All" option whose value is the regex
// `.*`. Its options are the values of the
// `annotation_hub_jupyter_org_username` label on `kube_pod_annotations`,
// limited to namespaces matching the `$hub_name` variable, and read from the
// datasource referenced by `self.prometheus`.
user_name:
var.query.new('user_name')
+ var.query.withDatasourceFromVariable(self.prometheus)
+ var.query.selectionOptions.withMulti()
+ var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*')
+ var.query.queryTypes.withLabelValues('annotation_hub_jupyter_org_username', 'kube_pod_annotations{ namespace=~"$hub_name"}')
,
// Queries should use the 'instance' label when querying metrics that
// come from collectors present on each node - such as node_exporter or
Expand Down
206 changes: 206 additions & 0 deletions dashboards/group.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#!/usr/bin/env -S jsonnet -J ../vendor
local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-v11.1.0/main.libsonnet';
local dashboard = grafonnet.dashboard;
local ts = grafonnet.panel.timeSeries;
local prometheus = grafonnet.query.prometheus;

local common = import './common.libsonnet';

// Time-series panel: working-set memory of `jupyter-*` pods, summed per
// (usergroup, namespace).
//
// The query chains two joins:
//   1. container memory -> kube_pod_annotations on (namespace, pod), which
//      attaches the `annotation_hub_jupyter_org_username` label to each
//      container series;
//   2. that result -> jupyterhub_user_group_info on (namespace, username),
//      which attaches the `usergroup` label. `label_replace` copies the
//      exporter's `username` label into `annotation_hub_jupyter_org_username`
//      so both sides share a join key.
//
// Only the second join lists `usergroup` in `group_left(...)`: the right-hand
// side of the first join is grouped by pod, namespace and username only, so
// it has no `usergroup` label to contribute.
//
// `name!=""` excludes the unnamed duplicate series cAdvisor can report for
// the same container, which would otherwise double the sum.
local memoryUsage =
  common.tsOptions
  + ts.new('Memory Usage')
  + ts.panelOptions.withDescription(
    |||
      Per group memory usage.

      User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default.

      Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to
      be set up. If the panels show no data, then please try selecting another time range where usage was active.
    |||
  )
  + ts.standardOptions.withUnit('bytes')
  + ts.queryOptions.withTargets([
    prometheus.new(
      '$PROMETHEUS_DS',
      |||
        sum(
          container_memory_working_set_bytes{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"}
            * on (namespace, pod) group_left(annotation_hub_jupyter_org_username)
            group(
              kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*", pod=~"jupyter-.*"}
            ) by (pod, namespace, annotation_hub_jupyter_org_username)
          * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup)
          group(
            label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"},
              "annotation_hub_jupyter_org_username", "$1", "username", "(.+)")
          ) by (annotation_hub_jupyter_org_username, usergroup, namespace)
        ) by (usergroup, namespace)
      |||
    )
    + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'),
  ]);


// Time-series panel: CPU usage rate of `jupyter-*` pods, summed per
// (usergroup, namespace).
//
// The query takes `irate` over a 5m window of
// `container_cpu_usage_seconds_total`, joins it to kube_pod_annotations on
// (namespace, pod) to obtain the username label, then joins to
// jupyterhub_user_group_info on (namespace, username) to obtain `usergroup`.
// `label_replace` copies the exporter's `username` label into
// `annotation_hub_jupyter_org_username` so both sides share a join key.
//
// The container selector is filtered by `namespace=~"$hub_name"`, like the
// memory panel, so `irate` is only evaluated for the selected hub(s). The
// join already discards other namespaces, so the result is unchanged.
local cpuUsage =
  common.tsOptions
  + ts.new('CPU Usage')
  + ts.panelOptions.withDescription(
    |||
      Per group CPU usage

      User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default.

      Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to
      be set up. If the panels show no data, then please try selecting another time range where usage was active.
    |||
  )
  + ts.standardOptions.withUnit('percentunit')
  + ts.queryOptions.withTargets([
    prometheus.new(
      '$PROMETHEUS_DS',
      |||
        sum(
          # exclude name="" because the same container can be reported
          # with both no name and `name=k8s_...`,
          # in which case sum() by (pod) reports double the actual metric
          irate(container_cpu_usage_seconds_total{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"}[5m])
            * on (namespace, pod) group_left(annotation_hub_jupyter_org_username)
            group(
              kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*"}
            ) by (pod, namespace, annotation_hub_jupyter_org_username)
          * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup)
          group(
            label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"},
              "annotation_hub_jupyter_org_username", "$1", "username", "(.+)")
          ) by (annotation_hub_jupyter_org_username, usergroup, namespace)
        ) by (usergroup, namespace)
      |||
    )
    + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'),
  ]);

// Time-series panel: size of shared home directories, summed per
// (namespace, usergroup).
//
// `dirsize_total_size_bytes` is reduced with `max` per (namespace, directory)
// and joined to jupyterhub_user_group_info on (namespace, directory), where
// `label_replace` copies the exporter's `username_escaped` label into a
// `directory` label to act as the join key.
local homedirSharedUsage =
  // Markdown shown as the panel description.
  local panelDescription = |||
    Per group home directory size, when using a shared home directory.

    User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default.

    Requires https://github.com/yuvipanda/prometheus-dirsize-exporter and https://github.com/2i2c-org/jupyterhub-groups-exporter to
    be set up.
  |||;

  // PromQL expression producing one series per (namespace, usergroup).
  local sizeByGroupQuery = |||
    sum(
      max(
        dirsize_total_size_bytes{namespace=~"$hub_name"}
      ) by (namespace, directory)
      * on (namespace, directory) group_left(usergroup)
      group(
        label_replace(
          jupyterhub_user_group_info{namespace=~"$hub_name", username_escaped=~".*", usergroup=~"$user_group"},
          "directory", "$1", "username_escaped", "(.+)")
      ) by (directory, namespace, usergroup)
    ) by (namespace, usergroup)
  |||;

  local sizeTarget =
    prometheus.new('$PROMETHEUS_DS', sizeByGroupQuery)
    + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})');

  common.tsOptions
  + ts.new('Home Directory Usage (on shared home directories)')
  + ts.panelOptions.withDescription(panelDescription)
  + ts.standardOptions.withUnit('bytes')
  + ts.queryOptions.withTargets([sizeTarget]);

// Time-series panel: memory requested by `jupyter-*` pods, summed per
// (usergroup, namespace).
//
// `kube_pod_container_resource_requests{resource="memory"}` is joined to
// kube_pod_annotations on (namespace, pod) to obtain the username label, then
// to jupyterhub_user_group_info on (namespace, username) to obtain
// `usergroup`. `label_replace` copies the exporter's `username` label into
// `annotation_hub_jupyter_org_username` so both sides share a join key.
local memoryRequests =
  // Markdown shown as the panel description.
  local panelDescription = |||
    Per group memory requests

    User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default.

    Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to
    be set up. If the panels show no data, then please try selecting another time range where usage was active.
  |||;

  // PromQL expression producing one series per (usergroup, namespace).
  local requestsByGroupQuery = |||
    sum(
      kube_pod_container_resource_requests{resource="memory", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod)
      group_left(annotation_hub_jupyter_org_username) group(
        kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*"}
      ) by (pod, namespace, annotation_hub_jupyter_org_username)
      * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup)
      group(
        label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"},
          "annotation_hub_jupyter_org_username", "$1", "username", "(.+)")
      ) by (annotation_hub_jupyter_org_username, usergroup, namespace)
    ) by (usergroup, namespace)
  |||;

  local requestsTarget =
    prometheus.new('$PROMETHEUS_DS', requestsByGroupQuery)
    + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})');

  common.tsOptions
  + ts.new('Memory Requests')
  + ts.panelOptions.withDescription(panelDescription)
  + ts.standardOptions.withUnit('bytes')
  + ts.queryOptions.withTargets([requestsTarget]);

// Time-series panel: CPU requested by `jupyter-*` pods, summed per
// (usergroup, namespace).
//
// `kube_pod_container_resource_requests{resource="cpu"}` is joined to
// kube_pod_annotations on (namespace, pod) to obtain the username label, then
// to jupyterhub_user_group_info on (namespace, username) to obtain
// `usergroup`. `label_replace` copies the exporter's `username` label into
// `annotation_hub_jupyter_org_username` so both sides share a join key.
local cpuRequests =
  // Markdown shown as the panel description.
  local panelDescription = |||
    Per group CPU requests

    User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default.

    Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to
    be set up. If the panels show no data, then please try selecting another time range where usage was active.
  |||;

  // PromQL expression producing one series per (usergroup, namespace).
  local requestsByGroupQuery = |||
    sum(
      kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod)
      group_left(annotation_hub_jupyter_org_username) group(
        kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*"}
      ) by (pod, namespace, annotation_hub_jupyter_org_username)
      * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup)
      group(
        label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"},
          "annotation_hub_jupyter_org_username", "$1", "username", "(.+)")
      ) by (annotation_hub_jupyter_org_username, usergroup, namespace)
    ) by (usergroup, namespace)
  |||;

  local requestsTarget =
    prometheus.new('$PROMETHEUS_DS', requestsByGroupQuery)
    + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})');

  common.tsOptions
  + ts.new('CPU Requests')
  + ts.panelOptions.withDescription(panelDescription)
  + ts.standardOptions.withUnit('percentunit')
  + ts.queryOptions.withTargets([requestsTarget]);

// Template variables exposed by the dashboard, in declaration order.
local templateVariables = [
  common.variables.prometheus,
  common.variables.hub_name,
  common.variables.user_group,
];

// Panels in display order; makeGrid lays them out with the given
// panelWidth / panelHeight.
local groupPanels = [
  memoryUsage,
  cpuUsage,
  homedirSharedUsage,
  memoryRequests,
  cpuRequests,
];

// File result: the "User Group Diagnostics Dashboard" object.
dashboard.new('User Group Diagnostics Dashboard')
+ dashboard.withTags(['jupyterhub'])
+ dashboard.withUid('group-diagnostics-dashboard')
+ dashboard.withEditable(true)
+ dashboard.withVariables(templateVariables)
+ dashboard.withPanels(
  grafonnet.util.grid.makeGrid(
    groupPanels,
    panelWidth=24,
    panelHeight=12,
  )
)
74 changes: 41 additions & 33 deletions dashboards/user.jsonnet
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ local memoryUsage =
+ ts.new('Memory Usage')
+ ts.panelOptions.withDescription(
|||
Per-user per-server memory usage
Per user memory usage
|||
)
+ ts.standardOptions.withUnit('bytes')
Expand All @@ -20,18 +20,15 @@ local memoryUsage =
'$PROMETHEUS_DS',
|||
sum(
# exclude name="" because the same container can be reported
# with both no name and `name=k8s_...`,
# in which case sum() by (pod) reports double the actual metric
container_memory_working_set_bytes{name!="", instance=~"$instance"}
* on (namespace, pod) group_left(container)
group(
kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub", pod=~"$user_pod"}
) by (pod, namespace)
) by (pod, namespace)
container_memory_working_set_bytes{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"}
* on (namespace, pod) group_left(annotation_hub_jupyter_org_username)
group(
kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name", pod=~"jupyter-.*"}
) by (pod, namespace, annotation_hub_jupyter_org_username)
) by (annotation_hub_jupyter_org_username, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
+ prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'),
]);


Expand All @@ -40,7 +37,7 @@ local cpuUsage =
+ ts.new('CPU Usage')
+ ts.panelOptions.withDescription(
|||
Per-user per-server CPU usage
Per user CPU usage
|||
)
+ ts.standardOptions.withUnit('percentunit')
Expand All @@ -52,15 +49,15 @@ local cpuUsage =
# exclude name="" because the same container can be reported
# with both no name and `name=k8s_...`,
# in which case sum() by (pod) reports double the actual metric
irate(container_cpu_usage_seconds_total{name!="", instance=~"$instance"}[5m])
* on (namespace, pod) group_left(container)
irate(container_cpu_usage_seconds_total{name!="", pod=~"jupyter-.*"}[5m])
* on (namespace, pod) group_left(annotation_hub_jupyter_org_username)
group(
kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub", pod=~"$user_pod"}
) by (pod, namespace)
) by (pod, namespace)
kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name"}
) by (pod, namespace, annotation_hub_jupyter_org_username)
) by (annotation_hub_jupyter_org_username, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
+ prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'),
]);

local homedirSharedUsage =
Expand All @@ -87,19 +84,25 @@ local homedirSharedUsage =
'$PROMETHEUS_DS',
|||
max(
dirsize_total_size_bytes{namespace=~"$hub"}
) by (directory, namespace)
dirsize_total_size_bytes{namespace=~"$hub_name"}
* on (namespace, directory) group_left(username)
group(
label_replace(
jupyterhub_user_group_info{namespace=~"$hub_name", username_escaped=~".*"},
"directory", "$1", "username_escaped", "(.+)")
) by (directory, namespace, username)
) by (namespace, username)
|||
)
+ prometheus.withLegendFormat('{{ directory }} - ({{ namespace }})'),
+ prometheus.withLegendFormat('{{ username }} - ({{ namespace }})'),
]);

local memoryRequests =
common.tsOptions
+ ts.new('Memory Requests')
+ ts.panelOptions.withDescription(
|||
Per-user per-server memory Requests
Per-user memory requests
|||
)
+ ts.standardOptions.withUnit('bytes')
Expand All @@ -108,19 +111,22 @@ local memoryRequests =
'$PROMETHEUS_DS',
|||
sum(
kube_pod_container_resource_requests{resource="memory", namespace=~"$hub", node=~"$instance"}
) by (pod, namespace)
kube_pod_container_resource_requests{resource="memory", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod)
group_left(annotation_hub_jupyter_org_username) group(
kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name"}
) by (pod, namespace, annotation_hub_jupyter_org_username)
) by (annotation_hub_jupyter_org_username, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
+ prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'),
]);

local cpuRequests =
common.tsOptions
+ ts.new('CPU Requests')
+ ts.panelOptions.withDescription(
|||
Per-user per-server CPU Requests
Per user CPU requests
|||
)
+ ts.standardOptions.withUnit('percentunit')
Expand All @@ -129,22 +135,24 @@ local cpuRequests =
'$PROMETHEUS_DS',
|||
sum(
kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub", node=~"$instance"}
) by (pod, namespace)
kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod)
group_left(annotation_hub_jupyter_org_username) group(
kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name"}
) by (pod, namespace, annotation_hub_jupyter_org_username)
) by (annotation_hub_jupyter_org_username, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
+ prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'),
]);

dashboard.new('User Diagnostics Dashboard')
+ dashboard.withTags(['jupyterhub'])
+ dashboard.withUid('user-pod-diagnostics-dashboard')
+ dashboard.withUid('user-diagnostics-dashboard')
+ dashboard.withEditable(true)
+ dashboard.withVariables([
common.variables.prometheus,
common.variables.hub,
common.variables.user_pod,
common.variables.instance,
common.variables.hub_name,
common.variables.user_name,
])
+ dashboard.withPanels(
grafonnet.util.grid.makeGrid(
Expand Down