Merge pull request #94127 from jeana-redhat/OSDOCS-14523-CAPI-AWS-parity

jeana-redhat · web-flow · commit 4dca81be3e46 · 2025-06-16T11:52:46.000-04:00
OSDOCS-14523: Porting AWS MAPI features to CAPI docs
diff --git a/machine_management/applying-autoscaling.adoc b/machine_management/applying-autoscaling.adoc
@@ -32,7 +32,7 @@ include::modules/cluster-autoscaler-cr.adoc[leveloffset=+3]
 include::modules/cluster-autoscaler-config-priority-expander.adoc[leveloffset=+3]
 
 //Labeling GPU machine sets for the cluster autoscaler
-include::modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc[leveloffset=+3]
+include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+3]
 
 :FeatureName: cluster autoscaler
 :FeatureResourceName: ClusterAutoscaler
diff --git a/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc b/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc
@@ -22,9 +22,45 @@ include::modules/capi-yaml-machine-template-aws.adoc[leveloffset=+2]
 //Sample YAML for a CAPI AWS compute machine set resource
 include::modules/capi-yaml-machine-set-aws.adoc[leveloffset=+2]
 
-// [id="cluster-api-supported-features-aws_{context}"]
-// == Enabling {aws-full} features with the Cluster API
+[id="cluster-api-supported-features-aws_{context}"]
+== Enabling {aws-full} features with the Cluster API
 
-// You can enable the following features by updating values in the Cluster API custom resource manifests.
+You can enable the following features by updating values in the Cluster API custom resource manifests.
 
-//Not sure what, if anything, we can add here at this time.
+////
+//Not yet supported, relies on Cluster API CAS support
+// Cluster autoscaler GPU labels
+include::modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc[leveloffset=+2]
+
+[role="_additional-resources"]
+.Additional resources
+* xref:../../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
+////
+
+// Elastic Fabric Adapter instances and placement group options
+include::modules/machine-feature-aws-existing-placement-group.adoc[leveloffset=+2]
+
+// Amazon EC2 Instance Metadata Service configuration options
+include::modules/machine-feature-aws-imds-options.adoc[leveloffset=+2]
+
+////
+//This link is for a note that does not apply to TP clusters, reassess for Cluster API GA
+[role="_additional-resources"]
+.Additional resources
+* xref:../../../machine_configuration/mco-update-boot-images.adoc#mco-update-boot-images[Updated boot images]
+////
+
+// Dedicated Instances configuration options
+include::modules/machine-feature-aws-dedicated-instances.adoc[leveloffset=+2]
+
+// Non-guaranteed Spot Instances and hourly cost limits
+include::modules/machine-feature-agnostic-nonguaranteed-instances.adoc[leveloffset=+2]
+
+// Capacity Reservation configuration options
+include::modules/machine-feature-agnostic-capacity-reservation.adoc[leveloffset=+2]
+
+//Adding a GPU node to a machine set (stesmith)
+include::modules/machine-feature-aws-add-nvidia-gpu-node.adoc[leveloffset=+2]
+
+// //Deploying the Node Feature Discovery Operator (stesmith)
+// include::modules/nvidia-gpu-aws-deploying-the-node-feature-discovery-operator.adoc[leveloffset=+1]
diff --git a/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-bare-metal.adoc b/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-bare-metal.adoc
@@ -22,6 +22,8 @@ include::modules/capi-yaml-machine-template-bare-metal.adoc[leveloffset=+2]
 //Sample YAML for a CAPI bare metal compute machine set resource
 include::modules/capi-yaml-machine-set-bare-metal.adoc[leveloffset=+2]
 
+////
+//Section depends on migration support
 [id="cluster-api-supported-features-bare-metal_{context}"]
 == Enabling bare metal features with the Cluster API
 
@@ -33,3 +35,4 @@ include::modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc[leve
 [role="_additional-resources"]
 .Additional resources
 * xref:../../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
+////
diff --git a/modules/capi-yaml-machine-template-aws.adoc b/modules/capi-yaml-machine-template-aws.adoc
@@ -19,12 +19,11 @@ metadata:
 spec:
   template:
     spec: # <3>
-      uncompressedUserData: true
       iamInstanceProfile: # ...
       instanceType: m5.large
       ignition:
         storageType: UnencryptedUserData
-        version: "3.2"
+        version: "3.4"
       ami:
         id: # ...
       subnet:
diff --git a/modules/machine-feature-agnostic-capacity-reservation.adoc b/modules/machine-feature-agnostic-capacity-reservation.adoc
@@ -0,0 +1,70 @@
+// Module included in the following assemblies:
+//
+// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc
+// There are parallel features in Azure so this module is set up for reuse.
+
+ifeval::["{context}" == "cluster-api-config-options-aws"]
+:aws:
+endif::[]
+
+:_mod-docs-content-type: CONCEPT
+[id="machine-feature-agnostic-capacity-reservation_{context}"]
+= Capacity Reservation configuration options
+
+{product-title} version {product-version} and later supports
+ifdef::azure[on-demand Capacity Reservation with Capacity Reservation groups on {azure-full} clusters.]
+ifdef::aws[Capacity Reservations on {aws-full} clusters, including On-Demand Capacity Reservations and Capacity Blocks for ML.]
+
+You can deploy machines on any available resources that match the parameters of a capacity request that you define.
+These parameters specify the 
+ifdef::azure[VM size,]
+ifdef::aws[instance type,]
+region, and number of instances that you want to reserve.
+If your 
+ifdef::azure[{azure-short} subscription quota]
+ifdef::aws[Capacity Reservation]
+can accommodate the capacity request, the deployment succeeds.
+
+include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template]
+
+ifdef::azure[]
+[NOTE]
+====
+You cannot change an existing Capacity Reservation configuration for a machine set. 
+To use a different Capacity Reservation group, you must replace the machine set and the machines that the previous machine set deployed.
+====
+endif::azure[]
+
+.Sample Capacity Reservation configuration
+[source,yaml]
+----
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta2
+kind: AWSMachineTemplate
+# ...
+spec:
+  template:
+    spec:
+      capacityReservationId: <capacity_reservation> # <1>
+      marketType: <market_type> # <2>
+# ...
+----
+<1> Specifies the ID of the
+ifdef::azure[Capacity Reservation group]
+ifdef::aws[Capacity Block for ML or On-Demand Capacity Reservation]
+that you want to deploy machines on.
+ifdef::aws[]
+<2> Specifies the market type to use.
+The following values are valid:
+`CapacityBlock`:: Use this market type with Capacity Blocks for ML.
+`OnDemand`:: Use this market type with On-Demand Capacity Reservations.
+`Spot`:: Use this market type with Spot Instances.
+This option is not compatible with Capacity Reservations.
+endif::aws[]
+
+For more information, including limitations and suggested use cases for this offering, see
+ifdef::azure[link:https://learn.microsoft.com/en-us/azure/virtual-machines/capacity-reservation-overview[On-demand Capacity Reservation] in the {azure-full} documentation.]
+ifdef::aws[link:https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/capacity-reservation-overview.html[On-Demand Capacity Reservations and Capacity Blocks for ML] in the {aws-short} documentation.]
+
+ifeval::["{context}" == "cluster-api-config-options-aws"]
+:!aws:
+endif::[]
diff --git a/modules/machine-feature-agnostic-nonguaranteed-instances.adoc b/modules/machine-feature-agnostic-nonguaranteed-instances.adoc
@@ -0,0 +1,67 @@
+// Module included in the following assemblies:
+//
+// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc
+// There are parallel features in Azure and GCP so this module is set up for reuse.
+
+ifeval::["{context}" == "cluster-api-config-options-aws"]
+:aws:
+endif::[]
+
+:_mod-docs-content-type: CONCEPT
+[id="machine-feature-agnostic-nonguaranteed-instances_{context}"]
+ifdef::aws[= Non-guaranteed Spot Instances and hourly cost limits]
+
+ifdef::aws[]
+You can deploy machines as non-guaranteed Spot Instances on {aws-first}. 
+Spot Instances use spare AWS EC2 capacity and are less expensive than On-Demand Instances. 
+You can use Spot Instances for workloads that can tolerate interruptions, such as batch or stateless, horizontally scalable workloads.
+endif::aws[]
+
+include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template]
+
+ifdef::aws[]
+[IMPORTANT]
+====
+AWS EC2 can reclaim the capacity for a Spot Instance at any time. 
+====
+
+.Sample Spot Instance configuration
+[source,yaml]
+----
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta2
+kind: AWSMachineTemplate
+# ...
+spec:
+  template:
+    spec:
+      spotMarketOptions: # <1>
+        maxPrice: <price_per_hour> # <2>
+# ...
+----
+<1> Specifies the use of Spot Instances.
+<2> Optional: Specifies an hourly cost limit in US dollars for the Spot Instance. 
+For example, setting the `<price_per_hour>` value to `2.50` limits the cost of the Spot Instance to USD 2.50 per hour.
+When this value is not set, the maximum price defaults to the On-Demand Instance price.
++
+[WARNING]
+====
+Setting a specific `maxPrice: <price_per_hour>` value might increase the frequency of interruptions compared to using the default On-Demand Instance price.
+It is strongly recommended to use the default On-Demand Instance price and to not set the maximum price for Spot Instances.
+====
+
+Interruptions can occur when using Spot Instances for the following reasons:
+
+* The instance price exceeds your maximum price
+* The demand for Spot Instances increases
+* The supply of Spot Instances decreases
+
+AWS issues a warning to the user two minutes before it interrupts a Spot Instance.
+{product-title} begins to remove the workloads from the affected instances when AWS issues the termination warning.
+
+When AWS terminates an instance, a termination handler running on the Spot Instance node deletes the machine resource. 
+To satisfy the compute machine set `replicas` quantity, the compute machine set creates a machine that requests a Spot Instance.
+endif::aws[]
+
+ifeval::["{context}" == "cluster-api-config-options-aws"]
+:!aws:
+endif::[]
diff --git a/modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc b/modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc
@@ -1,7 +1,5 @@
 // Module included in the following assemblies:
 //
-// * machine_management/applying-autoscaling.adoc
-// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc
 
 :_mod-docs-content-type: CONCEPT
 [id="machine-feature-agnostic-options-label-gpu-autoscaler_{context}"]
diff --git a/modules/machine-feature-aws-add-nvidia-gpu-node.adoc b/modules/machine-feature-aws-add-nvidia-gpu-node.adoc
@@ -0,0 +1,65 @@
+// Module included in the following assemblies:
+//
+// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc
+
+:_mod-docs-content-type: CONCEPT
+[id="machine-feature-aws-add-nvidia-gpu-node_{context}"]
+= GPU-enabled machine options
+
+You can deploy GPU-enabled compute machines on {aws-first}.
+The following sample configuration uses an link:https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing[{aws-short} G4dn instance type], which includes an NVIDIA Tesla T4 Tensor Core GPU, as an example.
+
+For more information about supported instance types, see the following pages in the NVIDIA documentation:
+
+* link:https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html[NVIDIA GPU Operator Community support matrix]
+
+* link:https://docs.nvidia.com/ai-enterprise/latest/product-support-matrix/index.html[NVIDIA AI Enterprise support matrix]
+
+include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template-and-machine-set]
+
+// Cluster API machine template spec
+.Sample GPU-enabled machine template configuration
+[source,yaml]
+----
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta2
+kind: AWSMachineTemplate
+# ...
+spec:
+  template:
+    spec:
+      instanceType: g4dn.xlarge # <1>
+# ...
+----
+<1> Specifies a G4dn instance type.
+
+// Cluster API machine set spec
+.Sample GPU-enabled machine set configuration
+[source,yaml]
+----
+apiVersion: cluster.x-k8s.io/v1beta1
+kind: MachineSet
+metadata:
+  name: <cluster_name>-gpu-<region> # <1>
+  namespace: openshift-cluster-api
+  labels:
+    cluster.x-k8s.io/cluster-name: <cluster_name>
+spec:
+  clusterName: <cluster_name>
+  replicas: 1
+  selector:
+    matchLabels:
+      test: example
+      cluster.x-k8s.io/cluster-name: <cluster_name>
+      cluster.x-k8s.io/set-name: <cluster_name>-gpu-<region> # <2>
+  template:
+    metadata:
+      labels:
+        test: example
+        cluster.x-k8s.io/cluster-name: <cluster_name>
+        cluster.x-k8s.io/set-name: <cluster_name>-gpu-<region> # <3>
+        node-role.kubernetes.io/<role>: ""
+# ...
+----
+<1> Specifies a name that includes the `gpu` role. The name includes the cluster name as a prefix and the region as a suffix.
+<2> Specifies a selector label that matches the machine set name.
+<3> Specifies a template label that matches the machine set name.
diff --git a/modules/machine-feature-aws-dedicated-instances.adoc b/modules/machine-feature-aws-dedicated-instances.adoc
@@ -0,0 +1,33 @@
+// Module included in the following assemblies:
+//
+// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc
+
+:_mod-docs-content-type: CONCEPT
+[id="machine-feature-aws-dedicated-instances_{context}"]
+= Dedicated Instance configuration options
+
+You can deploy machines that are backed by Dedicated Instances on {aws-first} clusters. 
+
+Dedicated Instances run in a virtual private cloud (VPC) on hardware that is dedicated to a single customer. 
+These Amazon EC2 instances are physically isolated at the host hardware level. 
+The isolation of Dedicated Instances occurs even if the instances belong to different AWS accounts that are linked to a single payer account. 
+However, other instances that are not dedicated can share hardware with Dedicated Instances if they belong to the same AWS account.
+
+{product-title} supports instances with public or dedicated tenancy.
+
+include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template]
+
+.Sample Dedicated Instances configuration
+[source,yaml]
+----
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta2
+kind: AWSMachineTemplate
+# ...
+spec:
+  template:
+    spec:
+      tenancy: dedicated # <1>
+# ...
+----
+<1> Specifies using instances with dedicated tenancy that run on single-tenant hardware.
+If you do not specify this value, instances with public tenancy that run on shared hardware are used by default.
diff --git a/modules/machine-feature-aws-existing-placement-group.adoc b/modules/machine-feature-aws-existing-placement-group.adoc
@@ -0,0 +1,58 @@
+// Module included in the following assemblies:
+//
+// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc
+
+:_mod-docs-content-type: CONCEPT
+[id="machine-feature-aws-existing-placement-group_{context}"]
+= Elastic Fabric Adapter instances and placement group options
+
+You can deploy compute machines on link:https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html[Elastic Fabric Adapter] (EFA) instances within an existing AWS placement group.
+
+EFA instances do not require placement groups, and you can use placement groups for purposes other than configuring an EFA. 
+The following example uses an EFA and placement group together to demonstrate a configuration that can improve network performance for machines within the specified placement group.
+
+include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template]
+
+.Sample EFA instance and placement group configuration
+[source,yaml]
+----
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta2
+kind: AWSMachineTemplate
+# ...
+spec:
+  template:
+    spec:
+      instanceType: <supported_instance_type> # <1>
+      networkInterfaceType: efa # <2>
+      placementGroupName: <placement_group> # <3>
+      placementGroupPartition: <placement_group_partition_number> # <4>
+# ...
+----
+<1> Specifies an instance type that link:https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types[supports EFAs].
+<2> Specifies the `efa` network interface type.
+<3> Specifies the name of the existing AWS placement group to deploy machines in.
+<4> Optional: Specifies the partition number of the existing AWS placement group where you want your machines deployed. 
+
+[NOTE]
+====
+Ensure that the link:https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html#limitations-placement-groups[rules and limitations] for the type of placement group that you create are compatible with your intended use case.
+====
+
+////
+The MAPI version of this has additional parameters in the providerSpec:
+
+----
+placement:
+  availabilityZone: <zone> # <3>
+  region: <region> # <4>
+----
+<3> Specifies the zone, for example, `us-east-1a`.
+<4> Specifies the region, for example, `us-east-1`.
+
+Do we need to say anything specific about this, or is this just redundant with the failure domain?
+
+Note: 
+CAPI has networkInterfaceType: efa
+MAPI has networkInterfaceType: EFA
+Capitalization matters!
+////
diff --git a/modules/machine-feature-aws-imds-options.adoc b/modules/machine-feature-aws-imds-options.adoc
diff --git a/snippets/apply-machine-configuration-method.adoc b/snippets/apply-machine-configuration-method.adoc