Skip to content

Commit ec92551

Browse files
authored
fix: Add proper startupProbe checking if startup is completed (#715)
* fix: Add proper startUp probe * changelog * clippy
1 parent c6e7fb8 commit ec92551

File tree

3 files changed

+70
-28
lines changed

3 files changed

+70
-28
lines changed

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,23 @@ All notable changes to this project will be documented in this file.
1212
- Support configuring JVM arguments ([#677]).
1313
- Aggregate emitted Kubernetes events on the CustomResources ([#677]).
1414

15-
## Changed
15+
### Changed
1616

1717
- Increased the default temporary secret lifetime for coordinators from 1 day to 15 days.
1818
This is because Trino currently does not offer a HA setup for them, a restart kills all running queries ([#694]).
1919
- Default to OCI for image metadata and product image selection ([#695]).
2020

21+
### Fixed
22+
23+
- Add a startupProbe, which checks via `/v1/info` that the coordinator/worker have finished starting.
24+
Also migrate the other probes from `tcpSocket` to `httpGet` on `/v1/info` ([#715]).
25+
2126
[#676]: https://github.com/stackabletech/trino-operator/pull/676
2227
[#677]: https://github.com/stackabletech/trino-operator/pull/677
2328
[#687]: https://github.com/stackabletech/trino-operator/pull/687
2429
[#694]: https://github.com/stackabletech/trino-operator/pull/694
2530
[#695]: https://github.com/stackabletech/trino-operator/pull/695
31+
[#715]: https://github.com/stackabletech/trino-operator/pull/715
2632

2733
## [24.11.1] - 2025-01-10
2834

rust/operator-binary/src/controller.rs

Lines changed: 62 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ use stackable_operator::{
3535
api::{
3636
apps::v1::{StatefulSet, StatefulSetSpec},
3737
core::v1::{
38-
ConfigMap, ConfigMapVolumeSource, ContainerPort, EnvVar, EnvVarSource, Probe,
39-
Secret, SecretKeySelector, Service, ServicePort, ServiceSpec, TCPSocketAction,
38+
ConfigMap, ConfigMapVolumeSource, ContainerPort, EnvVar, EnvVarSource, ExecAction,
39+
HTTPGetAction, Probe, Secret, SecretKeySelector, Service, ServicePort, ServiceSpec,
4040
Volume,
4141
},
4242
},
@@ -1061,6 +1061,8 @@ fn build_rolegroup_statefulset(
10611061
.context(AddVolumeMountSnafu)?
10621062
.add_container_ports(container_ports(trino))
10631063
.resources(merged_config.resources.clone().into())
1064+
// The probes are set on coordinators and workers
1065+
.startup_probe(startup_probe(trino))
10641066
.readiness_probe(readiness_probe(trino))
10651067
.liveness_probe(liveness_probe(trino))
10661068
.build();
@@ -1484,40 +1486,74 @@ fn container_ports(trino: &v1alpha1::TrinoCluster) -> Vec<ContainerPort> {
14841486
ports
14851487
}
14861488

1487-
fn readiness_probe(trino: &v1alpha1::TrinoCluster) -> Probe {
1488-
let port_name = if trino.expose_https_port() {
1489-
HTTPS_PORT_NAME
1490-
} else {
1491-
HTTP_PORT_NAME
1492-
};
1489+
fn startup_probe(trino: &v1alpha1::TrinoCluster) -> Probe {
1490+
Probe {
1491+
exec: Some(finished_starting_probe(trino)),
1492+
period_seconds: Some(5),
1493+
// Give the coordinator or worker 10 minutes to start up
1494+
failure_threshold: Some(120),
1495+
timeout_seconds: Some(3),
1496+
..Default::default()
1497+
}
1498+
}
14931499

1500+
fn readiness_probe(trino: &v1alpha1::TrinoCluster) -> Probe {
14941501
Probe {
1495-
initial_delay_seconds: Some(10),
1496-
period_seconds: Some(10),
1497-
failure_threshold: Some(5),
1498-
tcp_socket: Some(TCPSocketAction {
1499-
port: IntOrString::String(port_name.to_string()),
1500-
..TCPSocketAction::default()
1501-
}),
1502+
http_get: Some(http_get_probe(trino)),
1503+
period_seconds: Some(5),
1504+
failure_threshold: Some(1),
1505+
timeout_seconds: Some(3),
15021506
..Probe::default()
15031507
}
15041508
}
15051509

15061510
fn liveness_probe(trino: &v1alpha1::TrinoCluster) -> Probe {
1507-
let port_name = if trino.expose_https_port() {
1508-
HTTPS_PORT_NAME
1511+
Probe {
1512+
http_get: Some(http_get_probe(trino)),
1513+
period_seconds: Some(5),
1514+
// Coordinators are currently not highly available, so you always have a singe instance.
1515+
// Restarting it causes all queries to fail, so let's not restart it directly after the first
1516+
// probe failure, but wait for 3 failures
1517+
// NOTE: This also applies to workers
1518+
failure_threshold: Some(3),
1519+
timeout_seconds: Some(3),
1520+
..Probe::default()
1521+
}
1522+
}
1523+
1524+
/// Check that `/v1/info` returns `200`.
1525+
///
1526+
/// This is the same probe as the [upstream helm-chart](https://github.com/trinodb/charts/blob/7cd0a7bff6c52e0ee6ca6d5394cd72c150ad4379/charts/trino/templates/deployment-coordinator.yaml#L214)
1527+
/// is using.
1528+
fn http_get_probe(trino: &v1alpha1::TrinoCluster) -> HTTPGetAction {
1529+
let (schema, port_name) = if trino.expose_https_port() {
1530+
("HTTPS", HTTPS_PORT_NAME)
15091531
} else {
1510-
HTTP_PORT_NAME
1532+
("HTTP", HTTP_PORT_NAME)
15111533
};
15121534

1513-
Probe {
1514-
initial_delay_seconds: Some(30),
1515-
period_seconds: Some(10),
1516-
tcp_socket: Some(TCPSocketAction {
1517-
port: IntOrString::String(port_name.to_string()),
1518-
..TCPSocketAction::default()
1519-
}),
1520-
..Probe::default()
1535+
HTTPGetAction {
1536+
port: IntOrString::String(port_name.to_string()),
1537+
scheme: Some(schema.to_string()),
1538+
path: Some("/v1/info".to_string()),
1539+
..Default::default()
1540+
}
1541+
}
1542+
1543+
/// Wait until `/v1/info` returns `"starting":false`.
1544+
///
1545+
/// This probe works on coordinators and workers.
1546+
fn finished_starting_probe(trino: &v1alpha1::TrinoCluster) -> ExecAction {
1547+
let port = trino.exposed_port();
1548+
ExecAction {
1549+
command: Some(vec![
1550+
"/bin/bash".to_string(),
1551+
"-x".to_string(),
1552+
"-euo".to_string(),
1553+
"pipefail".to_string(),
1554+
"-c".to_string(),
1555+
format!("curl --fail --insecure https://127.0.0.1:{port}/v1/info | grep --silent '\\\"starting\\\":false'"),
1556+
]),
15211557
}
15221558
}
15231559

rust/operator-binary/src/crd/catalog/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ pub mod versioned {
4848
pub struct TrinoCatalogSpec {
4949
/// The `connector` defines which connector is used.
5050
pub connector: TrinoCatalogConnector,
51-
#[serde(default)]
5251

52+
#[serde(default)]
5353
/// The `configOverrides` allow overriding arbitrary Trino settings.
5454
/// For example, for Hive you could add `hive.metastore.username: trino`.
5555
pub config_overrides: HashMap<String, String>,

0 commit comments

Comments
 (0)