Skip to content

Commit d680ca8

Browse files
authored
Add missing toleration to aws_nvidia_installer DaemonSet to support tainted GPU nodes (#3075)
2 parents 06f527d + c2a70fe commit d680ca8

File tree

2 files changed

+10
-1
lines changed

2 files changed

+10
-1
lines changed

src/_nebari/stages/kubernetes_initialize/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,9 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
89 89   for node_group in self.config.amazon_web_services.node_groups.values()
90 90   )
91 91   input_vars.gpu_node_group_names = [
92    - group for group in self.config.amazon_web_services.node_groups.keys()
   92 + group
   93 + for group in self.config.amazon_web_services.node_groups.keys()
   94 + if self.config.amazon_web_services.node_groups[group].gpu
93 95   ]
94 96   input_vars.aws_region = self.config.amazon_web_services.region
95 97

src/_nebari/stages/kubernetes_initialize/template/modules/nvidia-installer/aws-nvidia-installer.tf

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,13 @@ resource "kubernetes_daemonset" "aws_nvidia_installer" {
68 68   operator = "Exists"
69 69   effect = "NoSchedule"
70 70   }
   71 +
   72 + toleration {
   73 + key = "dedicated"
   74 + operator = "Equal"
   75 + value = "nebari"
   76 + effect = "NoSchedule"
   77 + }
71 78   }
72 79   }
73 80

0 commit comments

Comments
 (0)