Skip to content

Commit c261516

Browse files
Change the liveness probes to use the web UI port and to fail after one minute (#491)
* Use the web UI port for liveness probes
* Use defined web UI pages for liveness probes
1 parent 8f0a7c4 commit c261516

File tree

3 files changed

+106
-18
lines changed

3 files changed

+106
-18
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file.
1414
### Changed
1515

1616
- Use new label builders ([#454]).
17+
- Change the liveness probes to use the web UI port and to fail after
18+
one minute ([#491]).
1719

1820
### Removed
1921

@@ -35,6 +37,7 @@ All notable changes to this project will be documented in this file.
3537
[#462]: https://github.com/stackabletech/hdfs-operator/pull/462
3638
[#474]: https://github.com/stackabletech/hdfs-operator/pull/474
3739
[#475]: https://github.com/stackabletech/hdfs-operator/pull/475
40+
[#491]: https://github.com/stackabletech/hdfs-operator/pull/491
3841

3942
## [23.11.0] - 2023-11-24
4043

rust/crd/src/constants.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ pub const DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration =
4949
pub const DEFAULT_DATA_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration =
5050
Duration::from_minutes_unchecked(30);
5151

52+
pub const READINESS_PROBE_INITIAL_DELAY_SECONDS: i32 = 10;
53+
pub const READINESS_PROBE_PERIOD_SECONDS: i32 = 10;
54+
pub const READINESS_PROBE_FAILURE_THRESHOLD: i32 = 3;
55+
pub const LIVENESS_PROBE_INITIAL_DELAY_SECONDS: i32 = 10;
56+
pub const LIVENESS_PROBE_PERIOD_SECONDS: i32 = 10;
57+
pub const LIVENESS_PROBE_FAILURE_THRESHOLD: i32 = 5;
58+
5259
// hdfs-site.xml
5360
pub const DFS_NAMENODE_NAME_DIR: &str = "dfs.namenode.name.dir";
5461
pub const DFS_NAMENODE_SHARED_EDITS_DIR: &str = "dfs.namenode.shared.edits.dir";

rust/operator-binary/src/container.rs

Lines changed: 96 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@ use stackable_hdfs_crd::{
1717
constants::{
1818
DATANODE_ROOT_DATA_DIR_PREFIX, DEFAULT_DATA_NODE_METRICS_PORT,
1919
DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_NAME_NODE_METRICS_PORT,
20-
JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR, LISTENER_VOLUME_NAME, LOG4J_PROPERTIES,
21-
NAMENODE_ROOT_DATA_DIR, SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC,
22-
STACKABLE_ROOT_DATA_DIR,
20+
JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR, LISTENER_VOLUME_NAME,
21+
LIVENESS_PROBE_FAILURE_THRESHOLD, LIVENESS_PROBE_INITIAL_DELAY_SECONDS,
22+
LIVENESS_PROBE_PERIOD_SECONDS, LOG4J_PROPERTIES, NAMENODE_ROOT_DATA_DIR,
23+
READINESS_PROBE_FAILURE_THRESHOLD, READINESS_PROBE_INITIAL_DELAY_SECONDS,
24+
READINESS_PROBE_PERIOD_SECONDS, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS,
25+
SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR,
2326
},
2427
storage::DataNodeStorageConfig,
2528
AnyNodeConfig, DataNodeContainer, HdfsCluster, HdfsPodRef, HdfsRole, NameNodeContainer,
@@ -35,8 +38,9 @@ use stackable_operator::{
3538
k8s_openapi::{
3639
api::core::v1::{
3740
ConfigMapKeySelector, ConfigMapVolumeSource, Container, ContainerPort,
38-
EmptyDirVolumeSource, EnvVar, EnvVarSource, ObjectFieldSelector, PersistentVolumeClaim,
39-
Probe, ResourceRequirements, TCPSocketAction, Volume, VolumeMount,
41+
EmptyDirVolumeSource, EnvVar, EnvVarSource, HTTPGetAction, ObjectFieldSelector,
42+
PersistentVolumeClaim, Probe, ResourceRequirements, TCPSocketAction, Volume,
43+
VolumeMount,
4044
},
4145
apimachinery::pkg::util::intstr::IntOrString,
4246
},
@@ -114,8 +118,21 @@ pub enum ContainerConfig {
114118
container_name: String,
115119
/// Volume mounts for config and logging.
116120
volume_mounts: ContainerVolumeDirs,
117-
/// Readiness and liveness probe service port name.
118-
tcp_socket_action_port_name: &'static str,
121+
/// Port name of the IPC/RPC port, used for the readiness probe.
122+
ipc_port_name: &'static str,
123+
/// Port name of the web UI HTTP port, used for the liveness probe.
124+
web_ui_http_port_name: &'static str,
125+
/// Port name of the web UI HTTPS port, used for the liveness probe.
126+
web_ui_https_port_name: &'static str,
127+
/// Path of the web UI URL. The path defaults to / in Kubernetes
128+
/// and the kubelet follows redirects. The default would work if
129+
/// the location header is set properly but that is not the case
130+
/// for the DataNode. On a TLS-enabled DataNode, calling
131+
/// https://127.0.0.1:9865/ redirects to the non-TLS URL
132+
/// http://127.0.0.1:9865/index.html which causes the liveness
133+
/// probe to fail. So it is best to not rely on the location
134+
/// header but instead provide the resolved path directly.
135+
web_ui_path: &'static str,
119136
/// The JMX Exporter metrics port.
120137
metrics_port: u16,
121138
},
@@ -390,11 +407,23 @@ impl ContainerConfig {
390407
cb.resources(resources);
391408
}
392409

393-
if let Some(probe) = self.tcp_socket_action_probe(10, 10) {
394-
cb.readiness_probe(probe.clone());
410+
if let Some(probe) = self.web_ui_port_probe(
411+
hdfs,
412+
LIVENESS_PROBE_PERIOD_SECONDS,
413+
LIVENESS_PROBE_INITIAL_DELAY_SECONDS,
414+
LIVENESS_PROBE_FAILURE_THRESHOLD,
415+
) {
395416
cb.liveness_probe(probe);
396417
}
397418

419+
if let Some(probe) = self.ipc_port_probe(
420+
READINESS_PROBE_PERIOD_SECONDS,
421+
READINESS_PROBE_INITIAL_DELAY_SECONDS,
422+
READINESS_PROBE_FAILURE_THRESHOLD,
423+
) {
424+
cb.readiness_probe(probe); // moved, not cloned: `probe` is owned and unused after this call
425+
}
426+
398427
Ok(cb.build())
399428
}
400429

@@ -788,24 +817,64 @@ wait_for_termination $!
788817
}
789818
}
790819

791-
/// Creates a probe for [`stackable_operator::k8s_openapi::api::core::v1::TCPSocketAction`]
792-
/// for liveness or readiness probes
793-
fn tcp_socket_action_probe(
820+
/// Creates a probe for the web UI port
821+
fn web_ui_port_probe(
794822
&self,
823+
hdfs: &HdfsCluster,
795824
period_seconds: i32,
796825
initial_delay_seconds: i32,
826+
failure_threshold: i32,
797827
) -> Option<Probe> {
798828
match self {
799829
ContainerConfig::Hdfs {
800-
tcp_socket_action_port_name,
830+
web_ui_http_port_name,
831+
web_ui_https_port_name,
832+
web_ui_path,
801833
..
802-
} => Some(Probe {
834+
} => {
835+
let http_get_action = if hdfs.has_https_enabled() {
836+
HTTPGetAction {
837+
port: IntOrString::String(web_ui_https_port_name.to_string()),
838+
scheme: Some("HTTPS".into()),
839+
path: Some(web_ui_path.to_string()),
840+
..HTTPGetAction::default()
841+
}
842+
} else {
843+
HTTPGetAction {
844+
port: IntOrString::String(web_ui_http_port_name.to_string()),
845+
scheme: Some("HTTP".into()),
846+
path: Some(web_ui_path.to_string()),
847+
..HTTPGetAction::default()
848+
}
849+
};
850+
Some(Probe {
851+
http_get: Some(http_get_action),
852+
period_seconds: Some(period_seconds),
853+
initial_delay_seconds: Some(initial_delay_seconds),
854+
failure_threshold: Some(failure_threshold),
855+
..Probe::default()
856+
})
857+
}
858+
_ => None,
859+
}
860+
}
861+
862+
/// Creates a probe for the IPC/RPC port
863+
fn ipc_port_probe(
864+
&self,
865+
period_seconds: i32,
866+
initial_delay_seconds: i32,
867+
failure_threshold: i32,
868+
) -> Option<Probe> {
869+
match self {
870+
ContainerConfig::Hdfs { ipc_port_name, .. } => Some(Probe {
803871
tcp_socket: Some(TCPSocketAction {
804-
port: IntOrString::String(String::from(*tcp_socket_action_port_name)),
872+
port: IntOrString::String(ipc_port_name.to_string()),
805873
..TCPSocketAction::default()
806874
}),
807875
period_seconds: Some(period_seconds),
808876
initial_delay_seconds: Some(initial_delay_seconds),
877+
failure_threshold: Some(failure_threshold),
809878
..Probe::default()
810879
}),
811880
_ => None,
@@ -1177,21 +1246,30 @@ impl From<HdfsRole> for ContainerConfig {
11771246
role: role.clone(),
11781247
container_name: role.to_string(),
11791248
volume_mounts: ContainerVolumeDirs::from(role),
1180-
tcp_socket_action_port_name: SERVICE_PORT_NAME_RPC,
1249+
ipc_port_name: SERVICE_PORT_NAME_RPC,
1250+
web_ui_http_port_name: SERVICE_PORT_NAME_HTTP,
1251+
web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS,
1252+
web_ui_path: "/dfshealth.html",
11811253
metrics_port: DEFAULT_NAME_NODE_METRICS_PORT,
11821254
},
11831255
HdfsRole::DataNode => Self::Hdfs {
11841256
role: role.clone(),
11851257
container_name: role.to_string(),
11861258
volume_mounts: ContainerVolumeDirs::from(role),
1187-
tcp_socket_action_port_name: SERVICE_PORT_NAME_IPC,
1259+
ipc_port_name: SERVICE_PORT_NAME_IPC,
1260+
web_ui_http_port_name: SERVICE_PORT_NAME_HTTP,
1261+
web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS,
1262+
web_ui_path: "/datanode.html",
11881263
metrics_port: DEFAULT_DATA_NODE_METRICS_PORT,
11891264
},
11901265
HdfsRole::JournalNode => Self::Hdfs {
11911266
role: role.clone(),
11921267
container_name: role.to_string(),
11931268
volume_mounts: ContainerVolumeDirs::from(role),
1194-
tcp_socket_action_port_name: SERVICE_PORT_NAME_RPC,
1269+
ipc_port_name: SERVICE_PORT_NAME_RPC,
1270+
web_ui_http_port_name: SERVICE_PORT_NAME_HTTP,
1271+
web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS,
1272+
web_ui_path: "/journalnode.html",
11951273
metrics_port: DEFAULT_JOURNAL_NODE_METRICS_PORT,
11961274
},
11971275
}

0 commit comments

Comments
 (0)