Skip to content

Commit d1af051

Browse files
authored
[LoginNodes] Allow multiple login node pools (#6389)
1 parent 9882f24 commit d1af051

File tree

19 files changed

+462
-245
lines changed

19 files changed

+462
-245
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ CHANGELOG
1010
- Add support for ap-southeast-3 region.
1111
- Add security groups to login node network load balancer.
1212
- Add `AllowedIps` configuration for login nodes.
13+
- Allow multiple login node pools.
1314

1415
**BUG FIXES**
1516
- Fix validator `EfaPlacementGroupValidator` so that it does not suggest configuring a Placement Group when Capacity Blocks are used.

cli/src/pcluster/api/controllers/cluster_operations_controller.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -265,18 +265,27 @@ def describe_cluster(cluster_name, region=None):
265265

266266
def _get_login_nodes(cluster):
267267
login_nodes_status = cluster.login_nodes_status
268+
269+
# TODO Fix once the API models are updated to support multiple pools in describe-cluster response
268270
if login_nodes_status.get_login_nodes_pool_available():
269-
status = LoginNodesState.FAILED
270-
if login_nodes_status.get_status() == LoginNodesPoolState.ACTIVE:
271-
status = LoginNodesState.ACTIVE
272-
elif login_nodes_status.get_status() == LoginNodesPoolState.PENDING:
273-
status = LoginNodesState.PENDING
274-
login_nodes = LoginNodesPool(status=status)
275-
login_nodes.address = login_nodes_status.get_address()
276-
login_nodes.scheme = login_nodes_status.get_scheme()
277-
login_nodes.healthy_nodes = login_nodes_status.get_healthy_nodes()
278-
login_nodes.unhealthy_nodes = login_nodes_status.get_unhealthy_nodes()
279-
return login_nodes
271+
login_nodes = []
272+
273+
for _pool_name, pool_status in login_nodes_status.get_pool_status_dict().items():
274+
status = LoginNodesState.FAILED
275+
if pool_status.get_status() == LoginNodesPoolState.ACTIVE:
276+
status = LoginNodesState.ACTIVE
277+
elif pool_status.get_status() == LoginNodesPoolState.PENDING:
278+
status = LoginNodesState.PENDING
279+
pool = LoginNodesPool(status=status)
280+
# pool.name = pool_name
281+
pool.address = pool_status.get_address()
282+
pool.scheme = pool_status.get_scheme()
283+
pool.healthy_nodes = pool_status.get_healthy_nodes()
284+
pool.unhealthy_nodes = pool_status.get_unhealthy_nodes()
285+
login_nodes.append(pool)
286+
break
287+
288+
return login_nodes[0]
280289
return None
281290

282291

cli/src/pcluster/config/cluster_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1406,6 +1406,11 @@ def __init__(
14061406
super().__init__(**kwargs)
14071407
self.pools = pools
14081408

1409+
def _register_validators(self, context: ValidatorContext = None): # noqa: D102 #pylint: disable=unused-argument
1410+
self._register_validator(
1411+
DuplicateNameValidator, name_list=[pool.name for pool in self.pools], resource_name="Pool"
1412+
)
1413+
14091414

14101415
class HeadNode(Resource):
14111416
"""Represent the Head Node resource."""

cli/src/pcluster/config/update_policy.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,25 @@ def condition_checker_login_nodes_stop_policy(_, patch):
491491
return not patch.cluster.has_running_login_nodes()
492492

493493

494+
def condition_checker_login_nodes_pool_stop_policy(change, patch):
495+
"""Check if login nodes are running in the pool in which update was requested."""
496+
pool_name = get_pool_name_from_change_paths(change)
497+
return not patch.cluster.has_running_login_nodes(pool_name=pool_name)
498+
499+
500+
def get_pool_name_from_change_paths(change):
501+
"""
502+
Return the name of the pool in which update was requested.
503+
504+
Example path=['LoginNodes', 'Pools[pool2]', 'Ssh'].
505+
"""
506+
for path in change.path:
507+
if re.search("Pools\\[", path):
508+
_, pool_name = extract_type_and_name_from_path(path)
509+
return pool_name
510+
return ""
511+
512+
494513
# Common fail_reason messages
495514
UpdatePolicy.FAIL_REASONS = {
496515
"ebs_volume_resize": "Updating the file system after a resize operation requires commands specific to your "
@@ -501,6 +520,9 @@ def condition_checker_login_nodes_stop_policy(_, patch):
501520
"pcluster update-compute-fleet command and then run an update with the --force-update flag"
502521
),
503522
"login_nodes_running": lambda change, patch: "The update is not supported when login nodes are running",
523+
"login_nodes_pool_running": lambda change, patch: (
524+
f"The update is not supported when login nodes in {get_pool_name_from_change_paths(change)} are running"
525+
),
504526
"compute_or_login_nodes_running": lambda change, patch: (
505527
"The update is not supported when compute or login nodes are running"
506528
),
@@ -669,8 +691,7 @@ def condition_checker_login_nodes_stop_policy(_, patch):
669691
condition_checker=condition_checker_managed_fsx,
670692
)
671693

672-
673-
# Update policy for updating LoginNodes / Pools
694+
# Update policy for adding or removing a login node pool
674695
UpdatePolicy.LOGIN_NODES_POOLS = UpdatePolicy(
675696
name="LOGIN_NODES_POOLS_UPDATE_POLICY",
676697
level=6,
@@ -679,7 +700,7 @@ def condition_checker_login_nodes_stop_policy(_, patch):
679700
action_needed=UpdatePolicy.ACTIONS_NEEDED["login_nodes_stop"],
680701
)
681702

682-
# Update supported only with all login nodes down
703+
# Update supported only with all login nodes in the cluster down
683704
UpdatePolicy.LOGIN_NODES_STOP = UpdatePolicy(
684705
name="LOGIN_NODES_STOP",
685706
level=10,
@@ -688,6 +709,14 @@ def condition_checker_login_nodes_stop_policy(_, patch):
688709
action_needed=UpdatePolicy.ACTIONS_NEEDED["login_nodes_stop"],
689710
)
690711

712+
# Update supported only with all login nodes in the pool down
713+
UpdatePolicy.LOGIN_NODES_POOL_STOP = UpdatePolicy(
714+
name="LOGIN_NODES_POOL_STOP",
715+
level=10,
716+
condition_checker=condition_checker_login_nodes_pool_stop_policy,
717+
fail_reason=UpdatePolicy.FAIL_REASONS["login_nodes_pool_running"],
718+
action_needed=UpdatePolicy.ACTIONS_NEEDED["login_nodes_stop"],
719+
)
691720

692721
# Update supported only with all compute and login nodes down
693722
UpdatePolicy.COMPUTE_AND_LOGIN_NODES_STOP = UpdatePolicy(

cli/src/pcluster/models/cluster.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -284,12 +284,11 @@ def compute_fleet_status(self) -> ComputeFleetStatus:
284284

285285
@property
286286
def login_nodes_status(self):
287-
"""Status of the login nodes pool."""
287+
"""Status of the login nodes."""
288288
login_nodes_status = LoginNodesStatus(self.stack_name)
289289
if self.stack.scheduler == "slurm" and self.config.login_nodes:
290-
# This approach works since by design we have now only one pool.
291-
# We should fix this if we want to add more than a login nodes pool per cluster.
292-
login_nodes_status.retrieve_data(self.config.login_nodes.pools[0].name)
290+
login_node_pool_names = [pool.name for pool in self.config.login_nodes.pools]
291+
login_nodes_status.retrieve_data(login_node_pool_names)
293292
return login_nodes_status
294293

295294
@property
@@ -755,13 +754,18 @@ def has_running_capacity(self, updated_value: bool = False) -> bool:
755754
)
756755
return self.__has_running_capacity
757756

758-
def has_running_login_nodes(self, updated_value: bool = False) -> bool:
759-
"""Return True if the cluster has running login nodes. Note: the value will be cached."""
757+
def has_running_login_nodes(self, updated_value: bool = False, pool_name: str = None) -> bool:
758+
"""
759+
Return True if the cluster has running login nodes, or a specific pool if a pool name is provided.
760+
761+
Note: the value will be cached.
762+
"""
763+
healthy_nodes = self.login_nodes_status.get_healthy_nodes(pool_name=pool_name)
764+
unhealthy_nodes = self.login_nodes_status.get_unhealthy_nodes(pool_name=pool_name)
765+
760766
if self.__has_running_login_nodes is None or updated_value:
761767
self.__has_running_login_nodes = (
762-
self.login_nodes_status.get_healthy_nodes() is not None
763-
and self.login_nodes_status.get_unhealthy_nodes() is not None
764-
and self.login_nodes_status.get_healthy_nodes() + self.login_nodes_status.get_unhealthy_nodes() != 0
768+
healthy_nodes is not None and unhealthy_nodes is not None and healthy_nodes + unhealthy_nodes != 0
765769
)
766770
return self.__has_running_login_nodes
767771

cli/src/pcluster/models/login_nodes_status.py

Lines changed: 89 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -28,57 +28,57 @@ def __str__(self):
2828
return str(self.value)
2929

3030

31-
class LoginNodesStatus:
32-
"""Represents the status of the cluster login nodes pools."""
31+
class PoolStatus:
32+
"""Represents the status of a pool of login nodes."""
3333

34-
def __init__(self, stack_name):
35-
self._stack_name = stack_name
36-
self._login_nodes_pool_name = None
37-
self._login_nodes_pool_available = False
38-
self._load_balancer_arn = None
39-
self._target_group_arn = None
40-
self._status = None
34+
def __init__(self, stack_name, pool_name):
4135
self._dns_name = None
36+
self._status = None
4237
self._scheme = None
38+
self._pool_name = pool_name
39+
self._pool_available = False
40+
self._stack_name = stack_name
4341
self._healthy_nodes = None
4442
self._unhealthy_nodes = None
43+
self._load_balancer_arn = None
44+
self._target_group_arn = None
45+
self._retrieve_data()
4546

4647
def __str__(self):
4748
return (
4849
f'("status": "{self._status}", "address": "{self._dns_name}", "scheme": "{self._scheme}", '
49-
f'"healthyNodes": "{self._healthy_nodes}", "unhealthy_nodes": "{self._unhealthy_nodes}")'
50+
f'"healthy_nodes": "{self._healthy_nodes}", "unhealthy_nodes": "{self._unhealthy_nodes}"),'
5051
)
5152

52-
def get_login_nodes_pool_available(self):
53-
"""Return the status of a login nodes fleet."""
54-
return self._login_nodes_pool_available
53+
def get_healthy_nodes(self):
54+
"""Return the number of healthy nodes of the login node pool."""
55+
return self._healthy_nodes
56+
57+
def get_unhealthy_nodes(self):
58+
"""Return the number of unhealthy nodes of the login node pool."""
59+
return self._unhealthy_nodes
60+
61+
def get_pool_available(self):
62+
"""Return true if the pool is available."""
63+
return self._pool_available
5564

5665
def get_status(self):
57-
"""Return the status of a login nodes fleet."""
66+
"""Return the status of the login node pool."""
5867
return self._status
5968

6069
def get_address(self):
61-
"""Return the single connection address of a login nodes fleet."""
70+
"""Return the connection addresses of the login node pool."""
6271
return self._dns_name
6372

6473
def get_scheme(self):
65-
"""Return the schema of a login nodes fleet."""
74+
"""Return the schema of the login node pool."""
6675
return self._scheme
6776

68-
def get_healthy_nodes(self):
69-
"""Return the number of healthy nodes of a login nodes fleet."""
70-
return self._healthy_nodes
71-
72-
def get_unhealthy_nodes(self):
73-
"""Return the number of unhealthy nodes of a login nodes fleet."""
74-
return self._unhealthy_nodes
75-
76-
def retrieve_data(self, login_nodes_pool_name):
77+
def _retrieve_data(self):
7778
"""Initialize the class with the information related to the login nodes pool."""
78-
self._login_nodes_pool_name = login_nodes_pool_name
7979
self._retrieve_assigned_load_balancer()
8080
if self._load_balancer_arn:
81-
self._login_nodes_pool_available = True
81+
self._pool_available = True
8282
self._populate_target_groups()
8383
self._populate_target_group_health()
8484

@@ -98,7 +98,7 @@ def _load_balancer_arn_from_tags(self, tags_list):
9898
for tags in tags_list:
9999
if self._key_value_tag_found(
100100
tags, "parallelcluster:cluster-name", self._stack_name
101-
) and self._key_value_tag_found(tags, "parallelcluster:login-nodes-pool", self._login_nodes_pool_name):
101+
) and self._key_value_tag_found(tags, "parallelcluster:login-nodes-pool", self._pool_name):
102102
self._load_balancer_arn = tags.get("ResourceArn")
103103
break
104104

@@ -153,3 +153,64 @@ def _populate_target_group_health(self):
153153
"This is expected if login nodes pool creation/deletion is in progress",
154154
e,
155155
)
156+
157+
158+
class LoginNodesStatus:
159+
"""Represents the status of the cluster login nodes pools."""
160+
161+
def __init__(self, stack_name):
162+
self._stack_name = stack_name
163+
self._pool_status_dict = dict()
164+
self._login_nodes_pool_available = False
165+
self._total_healthy_nodes = None
166+
self._total_unhealthy_nodes = None
167+
168+
def __str__(self):
169+
out = ""
170+
for pool_status in self._pool_status_dict.values():
171+
out += str(pool_status)
172+
return out
173+
174+
def get_login_nodes_pool_available(self):
175+
"""Return true if a pool is available in the login nodes fleet."""
176+
return self._login_nodes_pool_available
177+
178+
def get_pool_status_dict(self):
179+
"""Return a dictionary mapping each login node pool name to respective pool status."""
180+
return self._pool_status_dict
181+
182+
def get_healthy_nodes(self, pool_name=None):
183+
"""Return the total number of healthy login nodes in the cluster or a specific pool."""
184+
healthy_nodes = (
185+
self._pool_status_dict.get(pool_name).get_healthy_nodes() if pool_name else self._total_healthy_nodes
186+
)
187+
return healthy_nodes
188+
189+
def get_unhealthy_nodes(self, pool_name=None):
190+
"""Return the total number of unhealthy login nodes in the cluster or a specific pool."""
191+
unhealthy_nodes = (
192+
self._pool_status_dict.get(pool_name).get_unhealthy_nodes() if pool_name else self._total_unhealthy_nodes
193+
)
194+
return unhealthy_nodes
195+
196+
def retrieve_data(self, login_node_pool_names):
197+
"""Initialize the class with the information related to the login node fleet."""
198+
for pool_name in login_node_pool_names:
199+
self._pool_status_dict[pool_name] = PoolStatus(self._stack_name, pool_name)
200+
self._total_healthy_nodes = sum(
201+
(
202+
pool_status.get_healthy_nodes()
203+
for pool_status in self._pool_status_dict.values()
204+
if pool_status.get_healthy_nodes()
205+
)
206+
)
207+
self._total_unhealthy_nodes = sum(
208+
(
209+
pool_status.get_unhealthy_nodes()
210+
for pool_status in self._pool_status_dict.values()
211+
if pool_status.get_unhealthy_nodes()
212+
)
213+
)
214+
self._login_nodes_pool_available = any(
215+
(pool_status.get_pool_available() for pool_status in self._pool_status_dict.values())
216+
)

0 commit comments

Comments
 (0)