Skip to content

Commit 7cdd862

Browse files
committed
feat(talos-upgrade): make control-plane upgrade idempotent and safe on retry
The Talos upgrade script now inspects each node (control-plane, worker, and cluster-autoscaler) before attempting an upgrade. It reads both the current Talos version and the schematic version from the node via `talosctl` and skips the upgrade if they already match the desired target. This prevents re-upgrading and rebooting nodes on subsequent `terraform apply` runs when a previous upgrade stopped midway, which is especially critical on large clusters where a full re-rollout becomes increasingly risky.
1 parent 7381b0a commit 7cdd862

File tree

1 file changed

+42
-0
lines changed

1 file changed

+42
-0
lines changed

talos.tf

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,22 @@ resource "terraform_data" "upgrade_control_plane" {
9595
${local.talos_healthcheck_enabled} && talosctl --talosconfig "$talosconfig" health --server -n '${local.talos_primary_node_private_ipv4}'
9696
set -- ${join(" ", local.control_plane_private_ipv4_list)}
9797
for host in "$@"; do
98+
echo "Checking node $host..."
99+
100+
current_version=$(talosctl --talosconfig "$talosconfig" get version -n "$host" -o json | jq -r '.spec.version // empty')
101+
current_schematic=$(talosctl --talosconfig "$talosconfig" get extensions -n "$host" -o json \
102+
| jq -r 'select(.spec.metadata.name=="schematic") | .spec.metadata.version' | head -n1 || true)
103+
104+
# Skip the upgrade when the node already runs the target Talos version and schematic
105+
if [ "$${current_version:-}" = "${var.talos_version}" ] && [ "$${current_schematic:-}" = "${local.talos_schematic_id}" ]; then
106+
echo "Node $host already at target version and schematic — skipping upgrade"
107+
continue
108+
fi
109+
110+
echo "Upgrading $host to ${var.talos_version} / schematic ${local.talos_schematic_id}..."
98111
talosctl --talosconfig "$talosconfig" upgrade -n "$host" --preserve --image '${local.talos_installer_image_url}'
99112
${local.talos_healthcheck_enabled} && talosctl --talosconfig "$talosconfig" health --server -n "$host"
113+
echo "Node $host upgraded successfully"
100114
done
101115
echo "Control plane Nodes upgraded successfully"
102116
else
@@ -136,8 +150,22 @@ resource "terraform_data" "upgrade_worker" {
136150
${local.talos_healthcheck_enabled} && talosctl --talosconfig "$talosconfig" health --server -n '${local.talos_primary_node_private_ipv4}'
137151
set -- ${join(" ", local.worker_private_ipv4_list)}
138152
for host in "$@"; do
153+
echo "Checking node $host..."
154+
155+
current_version=$(talosctl --talosconfig "$talosconfig" get version -n "$host" -o json | jq -r '.spec.version // empty')
156+
current_schematic=$(talosctl --talosconfig "$talosconfig" get extensions -n "$host" -o json \
157+
| jq -r 'select(.spec.metadata.name=="schematic") | .spec.metadata.version' | head -n1 || true)
158+
159+
# Skip the upgrade when the node already runs the target Talos version and schematic
160+
if [ "$${current_version:-}" = "${var.talos_version}" ] && [ "$${current_schematic:-}" = "${local.talos_schematic_id}" ]; then
161+
echo "Node $host already at target version and schematic — skipping upgrade"
162+
continue
163+
fi
164+
165+
echo "Upgrading $host to ${var.talos_version} / schematic ${local.talos_schematic_id}..."
139166
talosctl --talosconfig "$talosconfig" upgrade -n "$host" --preserve --image '${local.talos_installer_image_url}'
140167
${local.talos_healthcheck_enabled} && talosctl --talosconfig "$talosconfig" health --server -n '${local.talos_primary_node_private_ipv4}'
168+
echo "Node $host upgraded successfully"
141169
done
142170
echo "Worker Nodes upgraded successfully"
143171
else
@@ -179,8 +207,22 @@ resource "terraform_data" "upgrade_cluster_autoscaler" {
179207
${local.talos_healthcheck_enabled} && talosctl --talosconfig "$talosconfig" health --server -n '${local.talos_primary_node_private_ipv4}'
180208
set -- ${join(" ", local.cluster_autoscaler_private_ipv4_list)}
181209
for host in "$@"; do
210+
echo "Checking node $host..."
211+
212+
current_version=$(talosctl --talosconfig "$talosconfig" get version -n "$host" -o json | jq -r '.spec.version // empty')
213+
current_schematic=$(talosctl --talosconfig "$talosconfig" get extensions -n "$host" -o json \
214+
| jq -r 'select(.spec.metadata.name=="schematic") | .spec.metadata.version' | head -n1 || true)
215+
216+
# Skip the upgrade when the node already runs the target Talos version and schematic
217+
if [ "$${current_version:-}" = "${var.talos_version}" ] && [ "$${current_schematic:-}" = "${local.talos_schematic_id}" ]; then
218+
echo "Node $host already at target version and schematic — skipping upgrade"
219+
continue
220+
fi
221+
222+
echo "Upgrading $host to ${var.talos_version} / schematic ${local.talos_schematic_id}..."
182223
talosctl --talosconfig "$talosconfig" upgrade -n "$host" --preserve --image '${local.talos_installer_image_url}'
183224
${local.talos_healthcheck_enabled} && talosctl --talosconfig "$talosconfig" health --server -n '${local.talos_primary_node_private_ipv4}'
225+
echo "Node $host upgraded successfully"
184226
done
185227
echo "Cluster Autoscaler Nodes upgraded successfully"
186228
else

0 commit comments

Comments
 (0)