diff --git a/dashboards/common.libsonnet b/dashboards/common.libsonnet index 34fc57f..40d0b23 100644 --- a/dashboards/common.libsonnet +++ b/dashboards/common.libsonnet @@ -63,12 +63,33 @@ local var = grafonnet.dashboard.variable; + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}') , - user_pod: - var.query.new('user_pod') + hub_name: /* Multi-select query variable with an include-all option (customAllValue '.*'); its values are the `namespace` label of kube_service_labels{service="hub"}. NOTE(review): this is the same label-values query as the unchanged context line just above - confirm both variables are needed. */ + var.query.new('hub_name') + var.query.withDatasourceFromVariable(self.prometheus) + var.query.selectionOptions.withMulti() + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') - + var.query.queryTypes.withLabelValues('pod', 'kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub"}') + + var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}') + , + namespace: /* Multi-select query variable; its values are the `namespace` label of kube_pod_labels (no hub filter). */ + var.query.new('namespace') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + var.query.queryTypes.withLabelValues('namespace', 'kube_pod_labels') + , + user_group: /* Multi-select query variable; its values are the `usergroup` label of jupyterhub_user_group_info. */ + var.query.new('user_group') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + var.query.queryTypes.withLabelValues('usergroup', 'jupyterhub_user_group_info') + , + user_name: /* Multi-select query variable; its values are the `annotation_hub_jupyter_org_username` label of kube_pod_annotations, restricted to the namespaces selected in $hub_name. */ + var.query.new('user_name') + + var.query.withDatasourceFromVariable(self.prometheus) + + var.query.selectionOptions.withMulti() + + var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + var.query.queryTypes.withLabelValues('annotation_hub_jupyter_org_username', 'kube_pod_annotations{ namespace=~"$hub_name"}') , // Queries should use the 'instance' label when querying metrics that // come from collectors present on each node - such as node_exporter 
or diff --git a/dashboards/group.jsonnet b/dashboards/group.jsonnet new file mode 100644 index 0000000..2b526a4 --- /dev/null +++ b/dashboards/group.jsonnet @@ -0,0 +1,206 @@ +#!/usr/bin/env -S jsonnet -J ../vendor +local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-v11.1.0/main.libsonnet'; +local dashboard = grafonnet.dashboard; +local ts = grafonnet.panel.timeSeries; +local prometheus = grafonnet.query.prometheus; + +local common = import './common.libsonnet'; + +local memoryUsage = /* Time series panel titled 'Memory Usage', built on common.tsOptions; the description below is the help text shown for the panel. */ + common.tsOptions + + ts.new('Memory Usage') + + ts.panelOptions.withDescription( + ||| + Per group memory usage. + + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. + + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. If the panels show no data, then please try selecting another time range where usage was active. 
+ ||| + ) + + ts.standardOptions.withUnit('bytes') + + ts.queryOptions.withTargets([ /* Joins container working-set memory to each pod's username annotation, then to that user's group, and sums per (usergroup, namespace). The first join copies only the username label: its right-hand side is aggregated by (pod, namespace, annotation_hub_jupyter_org_username), so it has no usergroup label to copy. */ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + container_memory_working_set_bytes{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"} + * on (namespace, pod) group_left(annotation_hub_jupyter_org_username) + group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*", pod=~"jupyter-.*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup) + group( + label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"}, + "annotation_hub_jupyter_org_username", "$1", "username", "(.+)") + ) by (annotation_hub_jupyter_org_username, usergroup, namespace) + ) by (usergroup, namespace) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + + +local cpuUsage = + common.tsOptions + + ts.new('CPU Usage') + + ts.panelOptions.withDescription( + ||| + Per group CPU usage + + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. + + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. If the panels show no data, then please try selecting another time range where usage was active. 
+ ||| + ) + + ts.standardOptions.withUnit('percentunit') + + ts.queryOptions.withTargets([ /* Per-container CPU rate joined to each pod's username annotation, then to that user's group, and summed per (usergroup, namespace). The container selector is restricted to $hub_name, as in the Memory Usage panel, so it does not scan every namespace before the join. */ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + # exclude name="" because the same container can be reported + # with both no name and `name=k8s_...`, + # in which case sum() by (pod) reports double the actual metric + irate(container_cpu_usage_seconds_total{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"}[5m]) + * on (namespace, pod) group_left(annotation_hub_jupyter_org_username) + group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup) + group( + label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"}, + "annotation_hub_jupyter_org_username", "$1", "username", "(.+)") + ) by (annotation_hub_jupyter_org_username, usergroup, namespace) + ) by (usergroup, namespace) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + +local homedirSharedUsage = + common.tsOptions + + ts.new('Home Directory Usage (on shared home directories)') + + ts.panelOptions.withDescription( + ||| + Per group home directory size, when using a shared home directory. + + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. + + Requires https://github.com/yuvipanda/prometheus-dirsize-exporter and https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. 
+ ||| + ) + + ts.standardOptions.withUnit('bytes') + + ts.queryOptions.withTargets([ /* Takes the max of dirsize_total_size_bytes per (namespace, directory), maps each directory to a usergroup through the username_escaped label of jupyterhub_user_group_info (copied into a `directory` label), then sums per (namespace, usergroup). */ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + max( + dirsize_total_size_bytes{namespace=~"$hub_name"} + ) by (namespace, directory) + * on (namespace, directory) group_left(usergroup) + group( + label_replace( + jupyterhub_user_group_info{namespace=~"$hub_name", username_escaped=~".*", usergroup=~"$user_group"}, + "directory", "$1", "username_escaped", "(.+)") + ) by (directory, namespace, usergroup) + ) by (namespace, usergroup) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + +local memoryRequests = + common.tsOptions + + ts.new('Memory Requests') + + ts.panelOptions.withDescription( + ||| + Per group memory requests + + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. + + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. If the panels show no data, then please try selecting another time range where usage was active. 
+ ||| + ) + + ts.standardOptions.withUnit('bytes') + + ts.queryOptions.withTargets([ /* Memory requests of pods whose name starts with jupyter-, joined to each pod's username annotation, then to that user's group, and summed per (usergroup, namespace). */ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + kube_pod_container_resource_requests{resource="memory", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod) + group_left(annotation_hub_jupyter_org_username) group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup) + group( + label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"}, + "annotation_hub_jupyter_org_username", "$1", "username", "(.+)") + ) by (annotation_hub_jupyter_org_username, usergroup, namespace) + ) by (usergroup, namespace) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + +local cpuRequests = + common.tsOptions + + ts.new('CPU Requests') + + ts.panelOptions.withDescription( + ||| + Per group CPU requests + + User groups are derived from authenticator managed groups where available, e.g. GitHub teams. If a user is a member of multiple groups, then they will be assigned to the group 'other' by default. + + Requires https://github.com/2i2c-org/jupyterhub-groups-exporter to + be set up. If the panels show no data, then please try selecting another time range where usage was active. 
+ ||| + ) + + ts.standardOptions.withUnit('percentunit') + + ts.queryOptions.withTargets([ /* CPU requests of pods whose name starts with jupyter-, joined to each pod's username annotation, then to that user's group, and summed per (usergroup, namespace). */ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod) + group_left(annotation_hub_jupyter_org_username) group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~".*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + * on (namespace, annotation_hub_jupyter_org_username) group_left(usergroup) + group( + label_replace(jupyterhub_user_group_info{namespace=~"$hub_name", username=~".*", usergroup=~"$user_group"}, + "annotation_hub_jupyter_org_username", "$1", "username", "(.+)") + ) by (annotation_hub_jupyter_org_username, usergroup, namespace) + ) by (usergroup, namespace) + ||| + ) + + prometheus.withLegendFormat('{{ usergroup }} - ({{ namespace }})'), + ]); + +dashboard.new('User Group Diagnostics Dashboard') /* Assembles the five panels defined above with the prometheus, hub_name and user_group variables from common.libsonnet. */ ++ dashboard.withTags(['jupyterhub']) ++ dashboard.withUid('group-diagnostics-dashboard') ++ dashboard.withEditable(true) ++ dashboard.withVariables([ + common.variables.prometheus, + common.variables.hub_name, + common.variables.user_group, +]) ++ dashboard.withPanels( + grafonnet.util.grid.makeGrid( + [ + memoryUsage, + cpuUsage, + homedirSharedUsage, + memoryRequests, + cpuRequests, + ], + panelWidth=24, + panelHeight=12, + ) +) diff --git a/dashboards/user.jsonnet b/dashboards/user.jsonnet old mode 100755 new mode 100644 index e702f8c..a7a007d --- a/dashboards/user.jsonnet +++ b/dashboards/user.jsonnet @@ -11,7 +11,7 @@ local memoryUsage = + ts.new('Memory Usage') + ts.panelOptions.withDescription( ||| - Per-user per-server memory usage + Per user memory usage ||| ) + ts.standardOptions.withUnit('bytes') @@ -20,18 +20,15 @@ local memoryUsage = '$PROMETHEUS_DS', ||| sum( - # exclude name="" because the same container can be reported - # with both no name and `name=k8s_...`, - # in which case sum() by (pod) 
reports double the actual metric - container_memory_working_set_bytes{name!="", instance=~"$instance"} - * on (namespace, pod) group_left(container) - group( - kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub", pod=~"$user_pod"} - ) by (pod, namespace) - ) by (pod, namespace) + container_memory_working_set_bytes{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"} + * on (namespace, pod) group_left(annotation_hub_jupyter_org_username) + group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name", pod=~"jupyter-.*"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + ) by (annotation_hub_jupyter_org_username, namespace) ||| ) - + prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'), /* Legend shows the username annotation because the query aggregates by (annotation_hub_jupyter_org_username, namespace) rather than by pod. */ ]); @@ -40,7 +37,7 @@ local cpuUsage = + ts.new('CPU Usage') + ts.panelOptions.withDescription( ||| - Per-user per-server CPU usage + Per user CPU usage ||| ) + ts.standardOptions.withUnit('percentunit') @@ -52,15 +49,15 @@ local cpuUsage = # exclude name="" because the same container can be reported # with both no name and `name=k8s_...`, # in which case sum() by (pod) reports double the actual metric - irate(container_cpu_usage_seconds_total{name!="", instance=~"$instance"}[5m]) - * on (namespace, pod) group_left(container) + irate(container_cpu_usage_seconds_total{name!="", pod=~"jupyter-.*", namespace=~"$hub_name"}[5m]) + * on (namespace, pod) group_left(annotation_hub_jupyter_org_username) group( - kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub", pod=~"$user_pod"} - ) by (pod, namespace) - ) by (pod, namespace) + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + ) by (annotation_hub_jupyter_org_username, namespace) ||| ) - + 
prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'), ]); local homedirSharedUsage = @@ -87,11 +84,17 @@ local homedirSharedUsage = '$PROMETHEUS_DS', ||| max( - dirsize_total_size_bytes{namespace=~"$hub"} - ) by (directory, namespace) + dirsize_total_size_bytes{namespace=~"$hub_name"} + * on (namespace, directory) group_left(username) + group( + label_replace( + jupyterhub_user_group_info{namespace=~"$hub_name", username=~"$user_name", username_escaped=~".*"}, + "directory", "$1", "username_escaped", "(.+)") + ) by (directory, namespace, username) + ) by (namespace, username) ||| ) - + prometheus.withLegendFormat('{{ directory }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ username }} - ({{ namespace }})'), /* Directory sizes are labelled by username through jupyterhub_user_group_info and limited to the users selected in $user_name, like the other panels on this dashboard. */ ]); local memoryRequests = @@ -99,7 +102,7 @@ local memoryRequests = + ts.new('Memory Requests') + ts.panelOptions.withDescription( ||| - Per-user per-server memory Requests + Per-user memory requests ||| ) + ts.standardOptions.withUnit('bytes') @@ -108,11 +111,14 @@ local memoryRequests = '$PROMETHEUS_DS', ||| sum( - kube_pod_container_resource_requests{resource="memory", namespace=~"$hub", node=~"$instance"} - ) by (pod, namespace) + kube_pod_container_resource_requests{resource="memory", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod) + group_left(annotation_hub_jupyter_org_username) group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + ) by (annotation_hub_jupyter_org_username, namespace) ||| ) - + prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'), ]); local cpuRequests = @@ -120,7 +126,7 @@ local cpuRequests = + ts.new('CPU Requests') + ts.panelOptions.withDescription( ||| - Per-user per-server CPU Requests + Per user CPU 
requests ||| ) + ts.standardOptions.withUnit('percentunit') @@ -129,22 +135,24 @@ local cpuRequests = '$PROMETHEUS_DS', ||| sum( - kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub", node=~"$instance"} - ) by (pod, namespace) + kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub_name", pod=~"jupyter-.*"} * on (namespace, pod) + group_left(annotation_hub_jupyter_org_username) group( + kube_pod_annotations{namespace=~"$hub_name", annotation_hub_jupyter_org_username=~"$user_name"} + ) by (pod, namespace, annotation_hub_jupyter_org_username) + ) by (annotation_hub_jupyter_org_username, namespace) ||| ) - + prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'), + + prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'), /* Legend shows the username annotation because the query aggregates by (annotation_hub_jupyter_org_username, namespace). */ ]); dashboard.new('User Diagnostics Dashboard') + dashboard.withTags(['jupyterhub']) -+ dashboard.withUid('user-pod-diagnostics-dashboard') ++ dashboard.withUid('user-diagnostics-dashboard') /* NOTE(review): uid renamed from 'user-pod-diagnostics-dashboard'; presumably links or bookmarks that use the old uid will stop resolving - confirm this is intended. */ + dashboard.withEditable(true) + dashboard.withVariables([ common.variables.prometheus, - common.variables.hub, - common.variables.user_pod, - common.variables.instance, + common.variables.hub_name, + common.variables.user_name, ]) + dashboard.withPanels( grafonnet.util.grid.makeGrid(