@@ -35,8 +35,8 @@ use stackable_operator::{
35
35
api:: {
36
36
apps:: v1:: { StatefulSet , StatefulSetSpec } ,
37
37
core:: v1:: {
38
- ConfigMap , ConfigMapVolumeSource , ContainerPort , EnvVar , EnvVarSource , Probe ,
39
- Secret , SecretKeySelector , Service , ServicePort , ServiceSpec , TCPSocketAction ,
38
+ ConfigMap , ConfigMapVolumeSource , ContainerPort , EnvVar , EnvVarSource , ExecAction ,
39
+ HTTPGetAction , Probe , Secret , SecretKeySelector , Service , ServicePort , ServiceSpec ,
40
40
Volume ,
41
41
} ,
42
42
} ,
@@ -1061,6 +1061,8 @@ fn build_rolegroup_statefulset(
1061
1061
. context ( AddVolumeMountSnafu ) ?
1062
1062
. add_container_ports ( container_ports ( trino) )
1063
1063
. resources ( merged_config. resources . clone ( ) . into ( ) )
1064
+ // The probes are set on coordinators and workers
1065
+ . startup_probe ( startup_probe ( trino) )
1064
1066
. readiness_probe ( readiness_probe ( trino) )
1065
1067
. liveness_probe ( liveness_probe ( trino) )
1066
1068
. build ( ) ;
@@ -1484,40 +1486,74 @@ fn container_ports(trino: &v1alpha1::TrinoCluster) -> Vec<ContainerPort> {
1484
1486
ports
1485
1487
}
1486
1488
1487
- fn readiness_probe ( trino : & v1alpha1:: TrinoCluster ) -> Probe {
1488
- let port_name = if trino. expose_https_port ( ) {
1489
- HTTPS_PORT_NAME
1490
- } else {
1491
- HTTP_PORT_NAME
1492
- } ;
1489
+ fn startup_probe ( trino : & v1alpha1:: TrinoCluster ) -> Probe {
1490
+ Probe {
1491
+ exec : Some ( finished_starting_probe ( trino) ) ,
1492
+ period_seconds : Some ( 5 ) ,
1493
+ // Give the coordinator or worker 10 minutes to start up
1494
+ failure_threshold : Some ( 120 ) ,
1495
+ timeout_seconds : Some ( 3 ) ,
1496
+ ..Default :: default ( )
1497
+ }
1498
+ }
1493
1499
1500
+ fn readiness_probe ( trino : & v1alpha1:: TrinoCluster ) -> Probe {
1494
1501
Probe {
1495
- initial_delay_seconds : Some ( 10 ) ,
1496
- period_seconds : Some ( 10 ) ,
1497
- failure_threshold : Some ( 5 ) ,
1498
- tcp_socket : Some ( TCPSocketAction {
1499
- port : IntOrString :: String ( port_name. to_string ( ) ) ,
1500
- ..TCPSocketAction :: default ( )
1501
- } ) ,
1502
+ http_get : Some ( http_get_probe ( trino) ) ,
1503
+ period_seconds : Some ( 5 ) ,
1504
+ failure_threshold : Some ( 1 ) ,
1505
+ timeout_seconds : Some ( 3 ) ,
1502
1506
..Probe :: default ( )
1503
1507
}
1504
1508
}
1505
1509
1506
1510
fn liveness_probe ( trino : & v1alpha1:: TrinoCluster ) -> Probe {
1507
- let port_name = if trino. expose_https_port ( ) {
1508
- HTTPS_PORT_NAME
1511
+ Probe {
1512
+ http_get : Some ( http_get_probe ( trino) ) ,
1513
+ period_seconds : Some ( 5 ) ,
1514
+ // Coordinators are currently not highly available, so you always have a singe instance.
1515
+ // Restarting it causes all queries to fail, so let's not restart it directly after the first
1516
+ // probe failure, but wait for 3 failures
1517
+ // NOTE: This also applies to workers
1518
+ failure_threshold : Some ( 3 ) ,
1519
+ timeout_seconds : Some ( 3 ) ,
1520
+ ..Probe :: default ( )
1521
+ }
1522
+ }
1523
+
1524
+ /// Check that `/v1/info` returns `200`.
1525
+ ///
1526
+ /// This is the same probe as the [upstream helm-chart](https://github.com/trinodb/charts/blob/7cd0a7bff6c52e0ee6ca6d5394cd72c150ad4379/charts/trino/templates/deployment-coordinator.yaml#L214)
1527
+ /// is using.
1528
+ fn http_get_probe ( trino : & v1alpha1:: TrinoCluster ) -> HTTPGetAction {
1529
+ let ( schema, port_name) = if trino. expose_https_port ( ) {
1530
+ ( "HTTPS" , HTTPS_PORT_NAME )
1509
1531
} else {
1510
- HTTP_PORT_NAME
1532
+ ( "HTTP" , HTTP_PORT_NAME )
1511
1533
} ;
1512
1534
1513
- Probe {
1514
- initial_delay_seconds : Some ( 30 ) ,
1515
- period_seconds : Some ( 10 ) ,
1516
- tcp_socket : Some ( TCPSocketAction {
1517
- port : IntOrString :: String ( port_name. to_string ( ) ) ,
1518
- ..TCPSocketAction :: default ( )
1519
- } ) ,
1520
- ..Probe :: default ( )
1535
+ HTTPGetAction {
1536
+ port : IntOrString :: String ( port_name. to_string ( ) ) ,
1537
+ scheme : Some ( schema. to_string ( ) ) ,
1538
+ path : Some ( "/v1/info" . to_string ( ) ) ,
1539
+ ..Default :: default ( )
1540
+ }
1541
+ }
1542
+
1543
+ /// Wait until `/v1/info` returns `"starting":false`.
1544
+ ///
1545
+ /// This probe works on coordinators and workers.
1546
+ fn finished_starting_probe ( trino : & v1alpha1:: TrinoCluster ) -> ExecAction {
1547
+ let port = trino. exposed_port ( ) ;
1548
+ ExecAction {
1549
+ command : Some ( vec ! [
1550
+ "/bin/bash" . to_string( ) ,
1551
+ "-x" . to_string( ) ,
1552
+ "-euo" . to_string( ) ,
1553
+ "pipefail" . to_string( ) ,
1554
+ "-c" . to_string( ) ,
1555
+ format!( "curl --fail --insecure https://127.0.0.1:{port}/v1/info | grep --silent '\\ \" starting\\ \" :false'" ) ,
1556
+ ] ) ,
1521
1557
}
1522
1558
}
1523
1559
0 commit comments