Skip to content

Separate pod and user diagnostics #146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions dashboards/common.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,35 @@ local var = grafonnet.dashboard.variable;
var.datasource.new('PROMETHEUS_DS', 'prometheus')
+ var.datasource.generalOptions.showOnDashboard.withValueOnly()
,
// Limit namespaces to those that run a hub service
// `hub` dashboard variable. Its values are the `namespace` label values of
// the `kube_service_labels` series whose `service` label is "hub".
// Multi-select is enabled, and the "All" option is included with the custom
// value '.*' so it can be used directly in `=~` regex matchers.
hub:
var.query.new('hub')
+ var.query.withDatasourceFromVariable(self.prometheus)
+ var.query.selectionOptions.withMulti()
+ var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*')
+ var.query.queryTypes.withLabelValues('namespace', 'kube_service_labels{service="hub"}')
,
// `namespace` dashboard variable. Unlike `hub`, the query has no label
// filter: values are every `namespace` label value found on `kube_pod_labels`.
// Multi-select is enabled, and the "All" option uses the custom value '.*'.
namespace:
var.query.new('namespace')
+ var.query.withDatasourceFromVariable(self.prometheus)
+ var.query.selectionOptions.withMulti()
+ var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*')
+ var.query.queryTypes.withLabelValues('namespace', 'kube_pod_labels')
,
// `user_pod` dashboard variable. Values are the `pod` label values of
// `kube_pod_labels` series labelled app=jupyterhub and
// component=singleuser-server, restricted to the namespaces selected in `$hub`.
// Multi-select is enabled, and the "All" option uses the custom value '.*'.
user_pod:
var.query.new('user_pod')
+ var.query.withDatasourceFromVariable(self.prometheus)
+ var.query.selectionOptions.withMulti()
+ var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*')
+ var.query.queryTypes.withLabelValues('pod', 'kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub"}')
,
// `user_name` dashboard variable. Values are the
// `annotation_hub_jupyter_org_username` label values of `kube_pod_annotations`
// series in the namespaces selected in `$hub`.
// Multi-select is enabled, and the "All" option uses the custom value '.*'.
// NOTE(review): presumably requires kube-state-metrics to export the
// hub.jupyter.org/username pod annotation — confirm the deployment config.
user_name:
var.query.new('user_name')
+ var.query.withDatasourceFromVariable(self.prometheus)
+ var.query.selectionOptions.withMulti()
+ var.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*')
+ var.query.queryTypes.withLabelValues('annotation_hub_jupyter_org_username', 'kube_pod_annotations{ namespace=~"$hub"}')
,
// Queries should use the 'instance' label when querying metrics that
// come from collectors present on each node - such as node_exporter or
// container_ metrics, and use the 'node' label when querying metrics
Expand Down
128 changes: 128 additions & 0 deletions dashboards/pod.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/usr/bin/env -S jsonnet -J ../vendor
local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-v11.1.0/main.libsonnet';
local dashboard = grafonnet.dashboard;
local ts = grafonnet.panel.timeSeries;
local prometheus = grafonnet.query.prometheus;

local common = import './common.libsonnet';

// "Memory Usage" time-series panel, displayed in bytes.
// The single Prometheus target sums `container_memory_working_set_bytes`
// per (pod, namespace), limited to the selected `$instance` and `$namespace`,
// and joined on (namespace, pod) against `kube_pod_labels` for the selected
// namespaces. Each series is labelled "<pod> - (<namespace>)".
local memoryUsage =
common.tsOptions
+ ts.new('Memory Usage')
+ ts.panelOptions.withDescription(
|||
Per-user per-server memory usage
|||
)
+ ts.standardOptions.withUnit('bytes')
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
|||
sum(
# exclude name="" because the same container can be reported
# with both no name and `name=k8s_...`,
# in which case sum() by (pod) reports double the actual metric
container_memory_working_set_bytes{name!="", instance=~"$instance", namespace=~"$namespace"}
* on (namespace, pod) group_left(container)
group(
kube_pod_labels{namespace=~"$namespace"}
) by (pod, namespace)
) by (pod, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
]);


// "CPU Usage" time-series panel, displayed with the 'percentunit' unit.
// The single Prometheus target sums the 5m irate of
// `container_cpu_usage_seconds_total` per (pod, namespace), joined on
// (namespace, pod) against `kube_pod_labels` for the selected namespaces.
// The container metric is filtered by `$namespace` as well as `$instance`,
// matching the Memory Usage panel: the join already discards other
// namespaces, so the filter only narrows the series selected up front.
// Each series is labelled "<pod> - (<namespace>)".
local cpuUsage =
  common.tsOptions
  + ts.new('CPU Usage')
  + ts.panelOptions.withDescription(
    |||
      Per-user per-server CPU usage
    |||
  )
  + ts.standardOptions.withUnit('percentunit')
  + ts.queryOptions.withTargets([
    prometheus.new(
      '$PROMETHEUS_DS',
      |||
        sum(
          # exclude name="" because the same container can be reported
          # with both no name and `name=k8s_...`,
          # in which case sum() by (pod) reports double the actual metric
          irate(container_cpu_usage_seconds_total{name!="", instance=~"$instance", namespace=~"$namespace"}[5m])
          * on (namespace, pod) group_left(container)
          group(
              kube_pod_labels{namespace=~"$namespace"}
          ) by (pod, namespace)
        ) by (pod, namespace)
      |||
    )
    + prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
  ]);

// "Memory Requests" time-series panel, displayed in bytes.
// The single Prometheus target sums `kube_pod_container_resource_requests`
// with resource="memory" per (pod, namespace), limited to the selected
// `$namespace` and to nodes matching `$instance`.
// Each series is labelled "<pod> - (<namespace>)".
local memoryRequests =
common.tsOptions
+ ts.new('Memory Requests')
+ ts.panelOptions.withDescription(
|||
Per-user per-server memory Requests
|||
)
+ ts.standardOptions.withUnit('bytes')
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
|||
sum(
kube_pod_container_resource_requests{resource="memory", namespace=~"$namespace", node=~"$instance"}
) by (pod, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
]);

// "CPU Requests" time-series panel.
// The single Prometheus target sums `kube_pod_container_resource_requests`
// with resource="cpu" per (pod, namespace), limited to the selected
// `$namespace` and to nodes matching `$instance`.
// Each series is labelled "<pod> - (<namespace>)".
// NOTE(review): the panel unit is 'percentunit'; presumably the metric is in
// CPU cores, so 1 core would render as 100% — confirm this is intended.
local cpuRequests =
common.tsOptions
+ ts.new('CPU Requests')
+ ts.panelOptions.withDescription(
|||
Per-user per-server CPU Requests
|||
)
+ ts.standardOptions.withUnit('percentunit')
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
|||
sum(
kube_pod_container_resource_requests{resource="cpu", namespace=~"$namespace", node=~"$instance"}
) by (pod, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
]);

// Template variables exposed by the dashboard, in display order.
local podDashboardVariables = [
  common.variables.prometheus,
  common.variables.namespace,
  common.variables.instance,
];

// Panels in display order; makeGrid lays each one out 24 grid units wide
// and 12 grid units high.
local podDashboardPanels = grafonnet.util.grid.makeGrid(
  [memoryUsage, cpuUsage, memoryRequests, cpuRequests],
  panelWidth=24,
  panelHeight=12,
);

// The file's result: the editable "Pod Diagnostics Dashboard", tagged
// 'kubernetes', with a fixed UID.
dashboard.new('Pod Diagnostics Dashboard')
+ dashboard.withTags(['kubernetes'])
+ dashboard.withUid('pod-diagnostics-dashboard')
+ dashboard.withEditable(true)
+ dashboard.withVariables(podDashboardVariables)
+ dashboard.withPanels(podDashboardPanels)
56 changes: 31 additions & 25 deletions dashboards/user.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ local memoryUsage =
+ ts.new('Memory Usage')
+ ts.panelOptions.withDescription(
|||
Per-user per-server memory usage
Per-user memory usage
|||
)
+ ts.standardOptions.withUnit('bytes')
Expand All @@ -23,15 +23,15 @@ local memoryUsage =
# exclude name="" because the same container can be reported
# with both no name and `name=k8s_...`,
# in which case sum() by (pod) reports double the actual metric
container_memory_working_set_bytes{name!="", instance=~"$instance"}
* on (namespace, pod) group_left(container)
container_memory_working_set_bytes{name!="", instance=~"$instance", pod!="jupyter-deployment-service-check", pod=~"jupyter-.*"}
* on (namespace, pod) group_left(annotation_hub_jupyter_org_username)
group(
kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub", pod=~"$user_pod"}
) by (pod, namespace)
) by (pod, namespace)
kube_pod_annotations{namespace=~"$hub", annotation_hub_jupyter_org_username=~"$user_name"}
) by (pod, namespace, annotation_hub_jupyter_org_username)
) by (annotation_hub_jupyter_org_username, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
+ prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'),
]);


Expand All @@ -40,7 +40,7 @@ local cpuUsage =
+ ts.new('CPU Usage')
+ ts.panelOptions.withDescription(
|||
Per-user per-server CPU usage
Per-user CPU usage
|||
)
+ ts.standardOptions.withUnit('percentunit')
Expand All @@ -52,15 +52,15 @@ local cpuUsage =
# exclude name="" because the same container can be reported
# with both no name and `name=k8s_...`,
# in which case sum() by (pod) reports double the actual metric
irate(container_cpu_usage_seconds_total{name!="", instance=~"$instance"}[5m])
* on (namespace, pod) group_left(container)
irate(container_cpu_usage_seconds_total{name!="", instance=~"$instance", pod!="jupyter-deployment-service-check",pod=~"jupyter-.*"}[5m])
* on (namespace, pod) group_left(annotation_hub_jupyter_org_username)
group(
kube_pod_labels{label_app="jupyterhub", label_component="singleuser-server", namespace=~"$hub", pod=~"$user_pod"}
) by (pod, namespace)
) by (pod, namespace)
kube_pod_annotations{namespace=~"$hub", annotation_hub_jupyter_org_username=~"$user_name"}
) by (pod, namespace, annotation_hub_jupyter_org_username)
) by (annotation_hub_jupyter_org_username, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
+ prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'),
]);

local homedirSharedUsage =
Expand All @@ -87,7 +87,7 @@ local homedirSharedUsage =
'$PROMETHEUS_DS',
|||
max(
dirsize_total_size_bytes{namespace="$hub"}
dirsize_total_size_bytes{namespace=~"$hub"}
) by (directory, namespace)
|||
)
Expand All @@ -99,7 +99,7 @@ local memoryRequests =
+ ts.new('Memory Requests')
+ ts.panelOptions.withDescription(
|||
Per-user per-server memory Requests
Per-user memory Requests
|||
)
+ ts.standardOptions.withUnit('bytes')
Expand All @@ -108,19 +108,22 @@ local memoryRequests =
'$PROMETHEUS_DS',
|||
sum(
kube_pod_container_resource_requests{resource="memory", namespace=~"$hub", node=~"$instance"}
) by (pod, namespace)
kube_pod_container_resource_requests{resource="memory", namespace=~"$hub", node=~"$instance", pod!="jupyter-deployment-service-check", pod=~"jupyter-.*"} * on (namespace, pod)
group_left(annotation_hub_jupyter_org_username) group(
kube_pod_annotations{namespace=~"$hub", annotation_hub_jupyter_org_username=~"$user_name"}
) by (pod, namespace, annotation_hub_jupyter_org_username)
) by (annotation_hub_jupyter_org_username, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
+ prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'),
]);

local cpuRequests =
common.tsOptions
+ ts.new('CPU Requests')
+ ts.panelOptions.withDescription(
|||
Per-user per-server CPU Requests
Per-user CPU Requests
|||
)
+ ts.standardOptions.withUnit('percentunit')
Expand All @@ -129,21 +132,24 @@ local cpuRequests =
'$PROMETHEUS_DS',
|||
sum(
kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub", node=~"$instance"}
) by (pod, namespace)
kube_pod_container_resource_requests{resource="cpu", namespace=~"$hub", node=~"$instance", pod!="jupyter-deployment-service-check", pod=~"jupyter-.*"} * on (namespace, pod)
group_left(annotation_hub_jupyter_org_username) group(
kube_pod_annotations{namespace=~"$hub", annotation_hub_jupyter_org_username=~"$user_name"}
) by (pod, namespace, annotation_hub_jupyter_org_username)
) by (annotation_hub_jupyter_org_username, namespace)
|||
)
+ prometheus.withLegendFormat('{{ pod }} - ({{ namespace }})'),
+ prometheus.withLegendFormat('{{ annotation_hub_jupyter_org_username }} - ({{ namespace }})'),
]);

dashboard.new('User Diagnostics Dashboard')
+ dashboard.withTags(['jupyterhub'])
+ dashboard.withUid('user-pod-diagnostics-dashboard')
+ dashboard.withUid('user-diagnostics-dashboard')
+ dashboard.withEditable(true)
+ dashboard.withVariables([
common.variables.prometheus,
common.variables.hub,
common.variables.user_pod,
common.variables.user_name,
common.variables.instance,
])
+ dashboard.withPanels(
Expand Down
10 changes: 7 additions & 3 deletions docs/howto/user-diagnostics.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
# Look at individual user metrics

A common support request for JupyterHub admins pertains to specific issues
faced by a particular user. The "User Diagnostics" dashboard helps with this.
It also helps look for *outliers* in a hub - people using too much of a particular
resource, or not enough of a particular resource.
faced by a particular user. The "User Diagnostics" dashboard helps by displaying
metrics on a per-user level. It also helps look for *outliers* in a hub –
people using too much of a particular resource, or not enough of a particular
resource.

The "Pod Diagnostics" dashboard displays metrics on a per-user per-server level.
For example, a JupyterHub can be configured to [allow multiple named servers per user](https://jupyterhub.readthedocs.io/en/stable/howto/configuration/config-user-env.html#named-servers) running at the same time.

## Home directory size (with shared volumes)

Expand Down
6 changes: 5 additions & 1 deletion docs/tutorials/deploy.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ TO MAKE CHANGES, EDIT THE JSONNET FILE AND DEPLOY AGAIN
```

3. `kube-state-metrics` must be configured to add some labels to metrics
[since version 2.0](https://kubernetes.io/blog/2021/04/13/kube-state-metrics-v-2-0/).
[(since version 2.0)](https://kubernetes.io/blog/2021/04/13/kube-state-metrics-v-2-0/) and annotations to metrics (since version 2.2.0).
If deployed with the prometheus helm chart, the config should look like this:

```yaml
Expand All @@ -37,6 +37,10 @@ TO MAKE CHANGES, EDIT THE JSONNET FILE AND DEPLOY AGAIN
- pods=[app,component,hub.jupyter.org/username]
# allowing all labels is probably fine for nodes, since they don't churn much, unlike pods
- nodes=[*]
- services=[app,component]
metricAnnotationsAllowlist:
# collect pod annotation for unescaped usernames
- pods=[hub.jupyter.org/username]
```

```{tip}
Expand Down