Commit 2f6804a

Merge pull request #155 from projectsyn/feat/clustermesh-alerts
Implement initial alert for Cilium Clustermesh
2 parents: c50d4bd + 0c76d1a

33 files changed: +1483 -15 lines changed

.cruft.json

Lines changed: 3 additions & 2 deletions
@@ -7,7 +7,7 @@
       "name": "Cilium",
       "slug": "cilium",
       "parameter_key": "cilium",
-      "test_cases": "defaults helm-opensource olm-opensource egress-gateway bgp-control-plane kubeproxyreplacement-strict l2-announcement",
+      "test_cases": "defaults helm-opensource olm-opensource egress-gateway bgp-control-plane kubeproxyreplacement-strict l2-announcement clustermesh",
       "add_lib": "n",
       "add_pp": "n",
       "add_golden": "y",
@@ -24,7 +24,8 @@
       "github_owner": "projectsyn",
       "github_name": "component-cilium",
       "github_url": "https://github.com/projectsyn/component-cilium",
-      "_template": "https://github.com/projectsyn/commodore-component-template.git"
+      "_template": "https://github.com/projectsyn/commodore-component-template.git",
+      "_commit": "98d16f99766e6c6d97322dbe42e058f0e2bf73d0"
     }
   },
   "directory": null

.github/workflows/test.yaml

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,7 @@ jobs:
           - bgp-control-plane
           - kubeproxyreplacement-strict
           - l2-announcement
+          - clustermesh
     defaults:
       run:
         working-directory: ${{ env.COMPONENT_NAME }}
@@ -60,6 +61,7 @@ jobs:
           - bgp-control-plane
           - kubeproxyreplacement-strict
           - l2-announcement
+          - clustermesh
     defaults:
       run:
         working-directory: ${{ env.COMPONENT_NAME }}

Makefile.vars.mk

Lines changed: 1 addition & 1 deletion
@@ -57,4 +57,4 @@ KUBENT_IMAGE ?= ghcr.io/doitintl/kube-no-trouble:latest
 KUBENT_DOCKER ?= $(DOCKER_CMD) $(DOCKER_ARGS) $(root_volume) --entrypoint=/app/kubent $(KUBENT_IMAGE)
 
 instance ?= defaults
-test_instances = tests/defaults.yml tests/helm-opensource.yml tests/olm-opensource.yml tests/egress-gateway.yml tests/bgp-control-plane.yml tests/kubeproxyreplacement-strict.yml tests/l2-announcement.yml
+test_instances = tests/defaults.yml tests/helm-opensource.yml tests/olm-opensource.yml tests/egress-gateway.yml tests/bgp-control-plane.yml tests/kubeproxyreplacement-strict.yml tests/l2-announcement.yml tests/clustermesh.yml

class/cilium.yml

Lines changed: 9 additions & 12 deletions
@@ -4,6 +4,13 @@ parameters:
     opensource: cilium
     enterprise: cilium-enterprise
   =_kapitan:
+    jsonnet_input_paths:
+      - ${_base_directory}/component/aggregated-clusterroles.jsonnet
+      - ${_base_directory}/component/egress-gateway-policies.jsonnet
+      - ${_base_directory}/component/l2-announcement-policies.jsonnet
+      - ${_base_directory}/component/bgp-control-plane.jsonnet
+      - ${_base_directory}/component/ocp-manage-kube-proxy.jsonnet
+      - ${_base_directory}/component/alerts.jsonnet
     olm:
       dependencies:
         - type: https
@@ -22,12 +29,7 @@ parameters:
         input_type: jsonnet
         output_path: ${_instance}/olm/
 
-      - input_paths:
-          - ${_base_directory}/component/aggregated-clusterroles.jsonnet
-          - ${_base_directory}/component/egress-gateway-policies.jsonnet
-          - ${_base_directory}/component/l2-announcement-policies.jsonnet
-          - ${_base_directory}/component/bgp-control-plane.jsonnet
-          - ${_base_directory}/component/ocp-manage-kube-proxy.jsonnet
+      - input_paths: ${_kapitan:jsonnet_input_paths}
         input_type: jsonnet
         output_path: ${_instance}/
 
@@ -49,12 +51,7 @@ parameters:
           - ${_base_directory}/component/helm-namespace.jsonnet
         input_type: jsonnet
        output_path: ${_instance}/01_cilium_helmchart
-      - input_paths:
-          - ${_base_directory}/component/aggregated-clusterroles.jsonnet
-          - ${_base_directory}/component/egress-gateway-policies.jsonnet
-          - ${_base_directory}/component/l2-announcement-policies.jsonnet
-          - ${_base_directory}/component/bgp-control-plane.jsonnet
-          - ${_base_directory}/component/ocp-manage-kube-proxy.jsonnet
+      - input_paths: ${_kapitan:jsonnet_input_paths}
         input_type: jsonnet
         output_path: ${_instance}/
       - input_paths:

class/defaults.yml

Lines changed: 5 additions & 0 deletions
@@ -107,6 +107,11 @@ parameters:
     peerings: {}
     loadbalancer_ip_pools: {}
 
+  alerts:
+    ignoreNames: []
+    patches: {}
+    additionalRules: {}
+
   olm:
     source:
       opensource: https://github.com/isovalent/olm-for-cilium/archive/main.tar.gz

component/alerts.jsonnet

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local prom = import 'lib/prom.libsonnet';
local util = import 'util.libsonnet';

local inv = kap.inventory();
local params = inv.parameters.cilium;

local ignoreNames = com.renderArray(params.alerts.ignoreNames);

local clustermesh_enabled =
  std.get(params.cilium_helm_values, 'clustermesh', { config: { enabled: false } }).config.enabled;

local alertpatching =
  if util.isOpenshift then
    import 'lib/alert-patching.libsonnet'
  else
    {
      filterPatchRules(g, ignoreNames, patches, preserveRecordingRules, patchNames): g,
    };

local clustermesh_group = {
  name: 'cilium-clustermesh.rules',
  rules: [
    {
      local this = self,
      alert: 'CiliumClustermeshRemoteClusterNotReady',
      expr: 'cilium_clustermesh_remote_cluster_readiness_status == 0',
      'for': '10m',
      labels: {
        severity: 'critical',
      },
      annotations: {
        runbook_url:
          'https://hub.syn.tools/cilium/runbooks/CiliumClustermeshRemoteClusterNotReady.html',
        message: 'Remote cluster {{ $labels.target_cluster }} not reachable from {{ $labels.source_node_name }}',
        description: |||
          Remote cluster {{ $labels.target_cluster }} has been unreachable from
          {{ $labels.source_node_name }} on cluster {{ $labels.source_cluster }} for the
          last %s.
        ||| % this['for'],
      },
    },
  ],
};

local clustermesh_alerts = prom.PrometheusRule('cilium-clustermesh') {
  spec+: {
    groups: [
      alertpatching.filterPatchRules(
        clustermesh_group,
        ignoreNames=ignoreNames,
        patches=params.alerts.patches,
        preserveRecordingRules=true,
        patchNames=false,
      ),
    ],
  },
};

local additional_group =
  local parseRuleName(rname) =
    local rparts = std.splitLimit(rname, ':', 1);
    assert
      std.length(rparts) == 2 :
      'Expected custom alert rule to be prefixed with `record:` or `alert:`.';
    assert
      std.member([ 'alert', 'record' ], rparts[0]) :
      'Expected custom alert rule to be prefixed with `record:` or `alert:`, got `%s:`.' % rparts[0];
    { [rparts[0]]: rparts[1] };
  {
    name: 'cilium-user.rules',
    rules: [
      params.alerts.additionalRules[rname] + parseRuleName(rname)
      for rname in std.objectFields(params.alerts.additionalRules)
    ],
  };

local additional_alerts = prom.PrometheusRule('cilium-custom') {
  spec+: {
    groups: [
      alertpatching.filterPatchRules(
        additional_group,
        ignoreNames=ignoreNames,
        patches=params.alerts.patches,
        preserveRecordingRules=true,
        patchNames=false,
      ),
    ],
  },
};

{
  [if clustermesh_enabled && std.length(clustermesh_group.rules) > 0 then
    '10_clustermesh_alerts']: clustermesh_alerts,
  [if std.length(additional_group.rules) > 0 then
    '10_custom_alerts']: additional_alerts,
}
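Note that the `10_clustermesh_alerts` output above is only rendered when clustermesh is enabled in the component's Helm values. Based on the `std.get` lookup in this file, a minimal inventory snippet that activates the alert would look roughly like this (a sketch; a functional cluster mesh requires further Helm values not shown here):

[source,yaml]
----
parameters:
  cilium:
    cilium_helm_values:
      clustermesh:
        config:
          enabled: true
----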

docs/modules/ROOT/pages/references/parameters.adoc

Lines changed: 40 additions & 0 deletions
@@ -712,6 +712,46 @@ Make sure to check the upstream documentation for the version of Cilium that you
 The LoadBalancer IP address management (LB IPAM) feature is under active development and sometimes has significant changes between Cilium minor versions.
 ====
 
+== `alerts`
+
+This section allows users to configure alerts for Cilium.
+The component expects that an externally-managed Prometheus stack is running on the target cluster.
+For OpenShift 4, the component makes use of the component libraries provided by https://github.com/appuio/component-openshift4-monitoring.git[component `openshift4-monitoring`].
+On other distributions, the component expects that a component library `prom.libsonnet` is available, for example via https://github.com/projectsyn/component-prometheus.git[component `prometheus`].
+
+=== `alerts.ignoreNames`
+
+[horizontal]
+type:: list
+default:: `[]`
+
+Alerts which shouldn't be deployed.
+The list supports removal of entries by prefixing them with `~`.
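For illustration, suppressing the clustermesh alert from the config hierarchy could look like this (a sketch; the alert name comes from `component/alerts.jsonnet` above):

[source,yaml]
----
parameters:
  cilium:
    alerts:
      ignoreNames:
        - CiliumClustermeshRemoteClusterNotReady
        # a deeper hierarchy level could drop the entry again,
        # re-enabling the alert:
        # - ~CiliumClustermeshRemoteClusterNotReady
----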
+
+=== `alerts.patches`
+
+[horizontal]
+type:: dict
+default:: `{}`
+
+Patches for alerts managed by the component.
+The component expects that keys in this object match the name of an alert managed through the component.
+The value of each entry is expected to be a valid partial Prometheus rule definition.
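As a sketch, a patch that downgrades the severity of the clustermesh alert might look like this (the value is merged into the managed rule as a partial rule definition; `severity: warning` is an illustrative choice):

[source,yaml]
----
parameters:
  cilium:
    alerts:
      patches:
        CiliumClustermeshRemoteClusterNotReady:
          labels:
            severity: warning
----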
+
+=== `alerts.additionalRules`
+
+[horizontal]
+type:: dict
+default:: `{}`
+
+This parameter allows users to configure additional Prometheus recording and alerting rules.
+The component expects the keys of this object to be prefixed with either `alert:` or `record:`, and uses the prefix to decide whether to create an alerting or a recording rule.
+The suffix of the key becomes the rule name: the component injects it into the rule's `alert` or `record` field.
+The value of each entry is used as the base Prometheus rule into which that field is injected.
+
+NOTE: Parameters `alerts.ignoreNames` and `alerts.patches` are also applied to alerts defined through this parameter.
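A sketch of a custom alerting rule (the `alert:` prefix makes the component set `alert: CiliumAgentDown` on the rule; the metric selector `up{job="cilium-agent"}` is hypothetical and must match your Prometheus scrape config):

[source,yaml]
----
parameters:
  cilium:
    alerts:
      additionalRules:
        'alert:CiliumAgentDown':
          expr: 'up{job="cilium-agent"} == 0'  # hypothetical metric selector
          for: 5m
          labels:
            severity: critical
----

With this config, the component renders an alerting rule named `CiliumAgentDown` into the `cilium-user.rules` group.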
 == Example
 
 [source,yaml]
docs/modules/ROOT/pages/runbooks/CiliumClustermeshRemoteClusterNotReady.adoc

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
= CiliumClustermeshRemoteClusterNotReady

include::partial$runbooks/contribution_note.adoc[]

== icon:glasses[] Overview

This alert fires if a remote cluster is not reachable from a node for 10 minutes or longer.
This usually indicates one of the following:

. A local issue on the node prevents the Cilium agent from connecting to the remote cluster or the cluster mesh API server
. The remote cluster's cluster mesh API server is not available
. There are network issues preventing cluster mesh connectivity

TIP: Depending on the network configuration, there may be static routes on each node for the remote cluster's cluster mesh API server.

NOTE: When using KVStoreMesh, the agents on the cluster connect to the *local* cache of the remote cluster mesh API server.

== icon:bug[] Steps for debugging

NOTE: The steps in this section assume that your current Kubernetes context points to the source cluster.

TIP: This section assumes that you're running cluster mesh with the cluster mesh API server enabled.

=== Prerequisites

* `cilium` CLI, install from https://github.com/cilium/cilium-cli[icon:github[] cilium/cilium-cli]

=== Identifying the root cause

First, check the source cluster's overall cluster mesh status:

[source,bash]
----
cilium -n cilium clustermesh status --as=cluster-admin <1>
----
<1> `--as=cluster-admin` is required on VSHN Managed OpenShift and may need to be left out on other clusters.

If the output indicates that all nodes are unable to connect to the remote cluster's clustermesh API, the issue is likely either on the remote cluster or in the network between the clusters.

If the output indicates that only a few source nodes are affected, the issue is likely in the Cilium agent or the routing configuration of those nodes.

=== Investigating the cluster mesh API

The cluster mesh API runs in the `cilium` namespace as deployment `clustermesh-apiserver`.
Check that the pod is running and inspect the logs for errors:

[source,bash]
----
kubectl -n cilium get pods -l app.kubernetes.io/name=clustermesh-apiserver
kubectl -n cilium logs deploy/clustermesh-apiserver --all-containers
----

=== Checking connectivity from a Cilium agent pod

If the `cilium clustermesh status` output indicates that only a few nodes are affected, you can run a more detailed check from the nodes' agent pods.
The output of these commands shows whether the agent can connect to the cluster mesh API and whether the clustermesh certificates are still valid.

[source,bash]
----
NODE=<node name of affected node> <1>
AGENT_POD=$(kubectl -n cilium get pods --field-selector=spec.nodeName=$NODE \
  -l app.kubernetes.io/name=cilium-agent -oname)

kubectl -n cilium exec -it $AGENT_POD --as=cluster-admin -- cilium status <2>
kubectl -n cilium exec -it $AGENT_POD --as=cluster-admin -- cilium troubleshoot clustermesh <3>
----
<1> Set this to the name of an affected node's `Node` object
<2> Show a summary of the Cilium agent status.
The output of this command shows whether the agent can't reach one or more of the remote cluster's nodes.
<3> This command shows connection details for the remote cluster's cluster mesh API server, or for the local cache in case you're using KVStoreMesh.

TIP: `--as=cluster-admin` may need to be left out on some clusters.

If the output of `cilium troubleshoot clustermesh` refers to the local cluster's cluster mesh API server, it's likely that you're using KVStoreMesh.
In that case, you can check the KVStoreMesh connection to the remote cluster mesh API server in the `clustermesh-apiserver` deployment:

[source,bash]
----
kubectl -n cilium --as=cluster-admin exec -it deploy/clustermesh-apiserver -c kvstoremesh -- \
  clustermesh-apiserver kvstoremesh-dbg status <1>

kubectl -n cilium --as=cluster-admin exec -it deploy/clustermesh-apiserver -c kvstoremesh -- \
  clustermesh-apiserver kvstoremesh-dbg troubleshoot <2>
----
<1> Show a summary of the KVStoreMesh connection
<2> Show details of the KVStoreMesh connection

You can also run `cilium-health status --probe` in the agent pod to actively probe node-to-node connectivity:

[source,bash]
----
kubectl -n cilium exec -it $AGENT_POD --as=cluster-admin -- cilium-health status --probe
----

=== Checking node routing tables and connectivity

For setups which use static routes to make the nodes of the participating clusters reachable from each other, you can check the routing tables on the host and verify connectivity with `ping`.

.OpenShift 4
[source,bash]
----
NODE=<node name of affected node>
REMOTE_NODE=<ip of a node in the remote cluster>
oc -n syn-debug-nodes debug node/${NODE} --as=cluster-admin -- chroot /host ip r
oc -n syn-debug-nodes debug node/${NODE} --as=cluster-admin -- chroot /host ping -c4 ${REMOTE_NODE}
----

.Other K8s
[source,bash]
----
DEBUG_IMAGE=ghcr.io/digitalocean-packages/doks-debug:latest <1>
NODE=<node name of affected node>
REMOTE_NODE=<ip of a node in the remote cluster>
kubectl debug node/${NODE} -it --image=${DEBUG_IMAGE} -- ip r
kubectl debug node/${NODE} -it --image=${DEBUG_IMAGE} -- ping -c4 ${REMOTE_NODE} <2>
----
<1> We're using the DigitalOcean `doks-debug` image, which comes with a number of common debugging tools installed.
See https://github.com/digitalocean/doks-debug[icon:github[] digitalocean/doks-debug] for details.
<2> This command hasn't been tested yet; it's possible that your cluster configuration won't allow `ping` in node debug containers.

== icon:book[] Upstream documentation

* https://docs.cilium.io/en/latest/network/clustermesh/intro/[Cilium OSS -- Cluster Mesh documentation]
* https://docs.cilium.io/en/stable/operations/troubleshooting/#troubleshooting-clustermesh[Cilium OSS -- Troubleshooting Cluster Mesh]
* https://docs.isovalent.com/configuration-guide/cluster-mesh/operating.html[Cilium Enterprise -- Operating Cluster Mesh]
* https://docs.isovalent.com/configuration-guide/cluster-mesh/troubleshooting.html[Cilium Enterprise -- Troubleshooting Cluster Mesh]
docs/modules/ROOT/partials/nav.adoc

Lines changed: 4 additions & 0 deletions
@@ -6,3 +6,7 @@
 * xref:how-tos/upgrade-cilium-enterprise.adoc[Upgrade Cilium Enterprise]
 * OpenShift 4
 ** xref:how-tos/openshift4/upgrade-cilium-oss-to-cilium-enterprise.adoc[Upgrade Cilium OSS to Cilium Enterprise]
+
+.Runbooks
+
+* xref:runbooks/CiliumClustermeshRemoteClusterNotReady.adoc[]

jsonnetfile.jsonnet

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
{
  version: 1,
  dependencies: [
    {
      source: {
        git: {
          remote: 'https://github.com/projectsyn/jsonnet-libs',
          subdir: '',
        },
      },
      version: 'main',
      name: 'syn',
    },
  ],
}
