-
Notifications
You must be signed in to change notification settings - Fork 315
[Develop] Introduce Global Cleanup IAM Role for ParallelCluster Build-Image #6912
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
d007c31
cf0a835
08f92f0
3b4a4f5
5201175
2fa7e4f
eec0ac3
8c398cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -10,6 +10,8 @@ | |||||
import logging | ||||||
import os as os_lib | ||||||
|
||||||
import yaml | ||||||
|
||||||
from pcluster.api.controllers.common import ( | ||||||
assert_supported_operation, | ||||||
configure_aws_region, | ||||||
|
@@ -52,14 +54,15 @@ | |||||
from pcluster.aws.common import AWSClientError | ||||||
from pcluster.aws.ec2 import Ec2Client | ||||||
from pcluster.constants import SUPPORTED_ARCHITECTURES, SUPPORTED_OSES, Operation | ||||||
from pcluster.imagebuilder_utils import ensure_default_build_image_stack_cleanup_role | ||||||
from pcluster.models.imagebuilder import ( | ||||||
BadRequestImageBuilderActionError, | ||||||
ConfigValidationError, | ||||||
ImageBuilder, | ||||||
NonExistingImageError, | ||||||
) | ||||||
from pcluster.models.imagebuilder_resources import ImageBuilderStack, NonExistingStackError | ||||||
from pcluster.utils import get_installed_version, to_utc_datetime | ||||||
from pcluster.utils import get_installed_version, get_partition, to_utc_datetime | ||||||
from pcluster.validators.common import FailureLevel | ||||||
|
||||||
LOGGER = logging.getLogger(__name__) | ||||||
|
@@ -105,6 +108,29 @@ def build_image( | |||||
validation_failure_level = validation_failure_level or ValidationLevel.ERROR | ||||||
dryrun = dryrun or False | ||||||
|
||||||
raw_cfg_str = build_image_request_content["imageConfiguration"] | ||||||
cfg_dict = yaml.safe_load(raw_cfg_str) or {} | ||||||
# If CleanupLambdaRole exists in the config, skip ensure_default_build_image_stack_cleanup_role | ||||||
has_custom_cleanup_role = cfg_dict.get("Build", {}).get("Iam", {}).get("CleanupLambdaRole") | ||||||
# If LambdaFunctionsVpcConfig exists in the config, attach the AWS-managed LambdaVPCAccess policy | ||||||
has_lambda_functions_vpc_config = cfg_dict.get("DeploymentSettings", {}).get("LambdaFunctionsVpcConfig") | ||||||
Comment on lines
+115
to
+116
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need the lambda function vpc configuraitons? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See aws-parallelcluster/cli/src/pcluster/templates/imagebuilder_stack.py Lines 741 to 742 in 6d4e24d
The cleanup role needs the |
||||||
|
||||||
if not has_custom_cleanup_role: | ||||||
try: | ||||||
account_id = AWSApi.instance().sts.get_account_id() | ||||||
ensure_default_build_image_stack_cleanup_role( | ||||||
account_id, get_partition(), attach_vpc_access_policy=bool(has_lambda_functions_vpc_config) | ||||||
) | ||||||
except AWSClientError as e: | ||||||
if e.error_code in ("AccessDenied", "AccessDeniedException", "UnauthorizedOperation"): | ||||||
raise BadRequestException( | ||||||
"Current principal lacks permissions to create or update the ParallelCluster build-image " | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [Design] Why do we need to update the role>? Didn't we agreed on having versioned roles specifically to not update the policy? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reached an agreement yesterday. Done. Changed the logic to create roles with revision in their name. |
||||||
"cleanup IAM role. " | ||||||
"Either pass `Build/Iam/CleanupLambdaRole` or grant the missing permissions to continue. " | ||||||
"For detailed instructions, please refer to our public documentation." | ||||||
) | ||||||
raise | ||||||
|
||||||
build_image_request_content = BuildImageRequestContent.from_dict(build_image_request_content) | ||||||
|
||||||
try: | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,12 +8,21 @@ | |
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES | ||
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
import json | ||
import logging | ||
import os | ||
|
||
import yaml | ||
|
||
from pcluster.aws.aws_api import AWSApi | ||
from pcluster.utils import get_url_scheme, yaml_load | ||
from pcluster.aws.common import AWSClientError | ||
from pcluster.constants import ( | ||
IAM_ROLE_PATH, | ||
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_BOOTSTRAP_TAG_KEY, | ||
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_PREFIX, | ||
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_REVISION, | ||
) | ||
from pcluster.utils import generate_string_hash, get_url_scheme, yaml_load | ||
|
||
ROOT_VOLUME_TYPE = "gp3" | ||
PCLUSTER_RESERVED_VOLUME_SIZE = 37 | ||
|
@@ -65,3 +74,175 @@ def _generate_action(action_name, commands): | |
"""Generate action in imagebuilder components.""" | ||
action = {"name": action_name, "action": "ExecuteBash", "inputs": {"commands": [commands]}} | ||
return action | ||
|
||
|
||
def get_cleanup_role_name(account_id: str) -> str: | ||
"""Return the role name including a revision number.""" | ||
hashed_account_id = generate_string_hash(account_id) | ||
return ( | ||
f"{PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_PREFIX}-{hashed_account_id}" | ||
f"-revision-{PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_REVISION}" | ||
) | ||
|
||
|
||
def _expected_inline_policy(account_id: str, partition: str): | ||
"""Return the inline policy document (JSON-serialised string).""" | ||
return json.dumps( | ||
{ | ||
"Version": "2012-10-17", | ||
"Statement": [ | ||
{ | ||
"Action": ["iam:DetachRolePolicy", "iam:DeleteRole", "iam:DeleteRolePolicy"], | ||
"Resource": f"arn:{partition}:iam::{account_id}:role/parallelcluster/*", | ||
"Effect": "Allow", | ||
}, | ||
{ | ||
"Action": ["iam:DeleteInstanceProfile", "iam:RemoveRoleFromInstanceProfile"], | ||
"Resource": f"arn:{partition}:iam::{account_id}:instance-profile/parallelcluster/*", | ||
"Effect": "Allow", | ||
}, | ||
{ | ||
"Action": "imagebuilder:DeleteInfrastructureConfiguration", | ||
"Resource": f"arn:{partition}:imagebuilder:*:{account_id}:infrastructure-configuration/" | ||
f"parallelclusterimage-*", | ||
"Effect": "Allow", | ||
}, | ||
{ | ||
"Action": ["imagebuilder:DeleteComponent"], | ||
"Resource": [f"arn:{partition}:imagebuilder:*:{account_id}:component/parallelclusterimage-*/*"], | ||
"Effect": "Allow", | ||
}, | ||
{ | ||
"Action": "imagebuilder:DeleteImageRecipe", | ||
"Resource": f"arn:{partition}:imagebuilder:*:{account_id}:image-recipe/parallelclusterimage-*/*", | ||
"Effect": "Allow", | ||
}, | ||
{ | ||
"Action": "imagebuilder:DeleteDistributionConfiguration", | ||
"Resource": f"arn:{partition}:imagebuilder:*:{account_id}:distribution-configuration/" | ||
f"parallelclusterimage-*", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. f-string not needed here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's needed. It's the same string. But due to length restriction so have to split to two lines.
|
||
"Effect": "Allow", | ||
}, | ||
{ | ||
"Action": ["imagebuilder:DeleteImage", "imagebuilder:GetImage", "imagebuilder:CancelImageCreation"], | ||
"Resource": f"arn:{partition}:imagebuilder:*:{account_id}:image/parallelclusterimage-*/*", | ||
"Effect": "Allow", | ||
}, | ||
{ | ||
"Action": "cloudformation:DeleteStack", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [Security] Can we scope this down using conditions on tags? For example, we known that every pcluster stack will have at least pcluster version tag There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ref: https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_elements_condition_operators.html
Testing if it works. |
||
"Resource": f"arn:{partition}:cloudformation:*:{account_id}:stack/*/*", | ||
"Condition": { | ||
"ForAnyValue:StringLike": {"cloudformation:ResourceTag/parallelcluster:image_id": "*"} | ||
}, | ||
"Effect": "Allow", | ||
}, | ||
# The below two permissions are required for the DeleteStackFunction Lambda to tag the | ||
# created AMI with 'parallelcluster:build_status' and 'parallelcluster:parent_image' tags | ||
{"Action": "ec2:CreateTags", "Resource": f"arn:{partition}:ec2:*::image/*", "Effect": "Allow"}, | ||
{"Action": "tag:TagResources", "Resource": f"arn:{partition}:ec2:*::image/*", "Effect": "Allow"}, | ||
{ | ||
"Action": ["lambda:DeleteFunction", "lambda:RemovePermission"], | ||
"Resource": f"arn:{partition}:lambda:*:{account_id}:function:ParallelClusterImage-*", | ||
"Effect": "Allow", | ||
}, | ||
{ | ||
"Action": "logs:DeleteLogGroup", | ||
"Resource": f"arn:{partition}:logs:*:{account_id}:log-group:/aws/lambda/ParallelClusterImage-*:*", | ||
"Effect": "Allow", | ||
}, | ||
{ | ||
"Action": [ | ||
"SNS:GetTopicAttributes", | ||
"SNS:DeleteTopic", | ||
"SNS:GetSubscriptionAttributes", | ||
"SNS:Unsubscribe", | ||
], | ||
"Resource": f"arn:{partition}:sns:*:{account_id}:ParallelClusterImage-*", | ||
"Effect": "Allow", | ||
}, | ||
], | ||
} | ||
) | ||
|
||
|
||
def ensure_default_build_image_stack_cleanup_role( | ||
account_id: str, partition="aws", attach_vpc_access_policy: bool = False | ||
) -> str: | ||
""" | ||
Ensure the global (account-wide) cleanup role exists and is at the expected revision. | ||
|
||
The function follows a safe order: | ||
1. If the role does not exist, create it without the bootstrapped tag. | ||
2. If LambdaFunctionsVpcConfig exists in the config, attach the AWS-managed LambdaVPCAccess policy. | ||
3. Attach the AWS-managed Lambda basic policy. | ||
4. Update/write the inline policy (least-privilege cleanup policy). | ||
5. Only after the inline policy succeeds, set the bootstrapped tag. | ||
|
||
This way, if step 2, 3 or 4 fails (e.g., lack of iam:PutRolePolicy permission), | ||
future invocations will keep retrying. | ||
""" | ||
iam = AWSApi.instance().iam | ||
role_name = get_cleanup_role_name(account_id) | ||
role_arn = f"arn:{partition}:iam::{account_id}:role{IAM_ROLE_PATH}{role_name}" | ||
|
||
# Assume-role trust policy | ||
assume_doc = { | ||
"Version": "2012-10-17", | ||
"Statement": [ | ||
{ | ||
"Effect": "Allow", | ||
"Principal": {"Service": "lambda.amazonaws.com"}, | ||
"Action": "sts:AssumeRole", | ||
"Condition": { | ||
"ArnLike": { | ||
"aws:SourceArn": f"arn:{partition}:lambda:*:{account_id}:function:ParallelClusterImage-*" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [Security] This condition allows every function in the stack to assume this role, but we want only a specific function to be able to. Can we use a more specific name? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No we can't, the delete stack lambda function is named like "ParallelClusterImage-15a09820-6274-11f0-8162-0e903a4ee48d". |
||
} | ||
}, | ||
} | ||
], | ||
} | ||
# Check whether the role already exists | ||
try: | ||
resp = iam.get_role(role_name=role_name) | ||
tags = {t["Key"]: t["Value"] for t in resp["Role"].get("Tags", [])} | ||
already_bootstrapped = tags.get(PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_BOOTSTRAP_TAG_KEY, "").lower() == "true" | ||
except AWSClientError as e: | ||
if e.error_code == "NoSuchEntity": | ||
logging.info("Creating default build-image stack cleanup role %s because it does not exists.", role_name) | ||
iam.create_role( | ||
RoleName=role_name, | ||
Path=IAM_ROLE_PATH, | ||
AssumeRolePolicyDocument=json.dumps(assume_doc), | ||
Description="AWS ParallelCluster build-image cleanup Lambda execution role", | ||
) | ||
already_bootstrapped = False | ||
else: | ||
raise | ||
|
||
# Attach AWSLambdaVPCAccessExecutionRole | ||
if attach_vpc_access_policy: | ||
iam.attach_role_policy( | ||
role_name, | ||
f"arn:{partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole", | ||
) | ||
|
||
if already_bootstrapped: | ||
return role_arn | ||
|
||
# Attach AWSLambdaBasicExecutionRole | ||
cleanup_role_basic_managed_policy = f"arn:{partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" | ||
iam.attach_role_policy(role_name, cleanup_role_basic_managed_policy) | ||
|
||
# Put inline policy | ||
iam.put_role_policy( | ||
role_name=role_name, | ||
policy_name="ParallelClusterCleanupInline", | ||
policy_document=_expected_inline_policy(account_id, partition), | ||
) | ||
|
||
# Set bootstrapped tag after policy write succeeds | ||
iam.tag_role( | ||
role_name=role_name, | ||
tags=[{"Key": PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_BOOTSTRAP_TAG_KEY, "Value": "true"}], | ||
) | ||
return role_arn |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[CodeStyle] Encapsulate the logic to check whether a custom role is set into a dedicated function and unit test it independently.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think it's necessary. It's a common dig logic and the logic to test if it can correctly dig is already include in the unit test
test_enable_cleanup_role_call_and_vpc_flag
.