diff --git a/docs/admin/runai-setup/self-hosted/bcm/files/metallb.txt b/docs/admin/runai-setup/self-hosted/bcm/files/metallb.txt new file mode 100644 index 0000000000..39bbbeaac6 --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/files/metallb.txt @@ -0,0 +1,29 @@ + + --- + apiVersion: metallb.io/v1beta1 + kind: L2Advertisement + metadata: + name: l2-ingress + namespace: metallb-system + spec: + ipAddressPools: + - ingress-pool + nodeSelectors: + - matchLabels: + node-role.kubernetes.io/runai-system: "true" + + --- + apiVersion: metallb.io/v1beta1 + kind: IPAddressPool + metadata: + name: ingress-pool + namespace: metallb-system + spec: + addresses: + - 192.168.0.250-192.168.0.251 # Example of two ip address - + autoAssign: false + serviceAllocation: + priority: 50 + namespaces: + - ingress-nginx + - knative-serving diff --git a/docs/admin/runai-setup/self-hosted/bcm/files/networkoperator.txt b/docs/admin/runai-setup/self-hosted/bcm/files/networkoperator.txt new file mode 100644 index 0000000000..59585feae3 --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/files/networkoperator.txt @@ -0,0 +1,24 @@ + + deployCR: true + nfd: + enabled: true + ofedDriver: + deploy: false + psp: + enabled: false + rdmaSharedDevicePlugin: + deploy: false + secondaryNetwork: + cniPlugins: + deploy: true + deploy: true + ipamPlugin: + deploy: false + multus: + deploy: true + nvIpam: + deploy: true + sriovDevicePlugin: + deploy: false + sriovNetworkOperator: + enabled: true diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-1.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-1.png new file mode 100644 index 0000000000..8a3ecba5f6 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-1.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-10.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-10.png new file mode 100644 index 0000000000..dfe50ce18c Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-10.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-11.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-11.png new file mode 100644 index 0000000000..9fb30c80e0 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-11.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-12.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-12.png new file mode 100644 index 0000000000..aa062b7129 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-12.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-13.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-13.png new file mode 100644 index 0000000000..c51c73e6fe Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-13.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-14.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-14.png new file mode 100644 index 0000000000..8648f07603 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-14.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-15.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-15.png new file mode 100644 index 0000000000..9f73640170 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-15.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-16.png 
b/docs/admin/runai-setup/self-hosted/bcm/images/image-16.png new file mode 100644 index 0000000000..90b5b2fd84 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-16.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-17.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-17.png new file mode 100644 index 0000000000..a9cb9d6bb1 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-17.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-18.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-18.png new file mode 100644 index 0000000000..3fa9bf99e1 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-18.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-19.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-19.png new file mode 100644 index 0000000000..37feb53e39 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-19.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-2.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-2.png new file mode 100644 index 0000000000..6d0b8ac671 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-2.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-20.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-20.png new file mode 100644 index 0000000000..98df22cc1a Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-20.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-21.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-21.png new file mode 100644 index 0000000000..304fbbcd1d Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-21.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-22.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-22.png new file mode 100644 index 0000000000..488ce1fd45 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-22.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-23.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-23.png new file mode 100644 index 0000000000..2310eedc2e Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-23.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-24.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-24.png new file mode 100644 index 0000000000..1ec37b3684 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-24.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-25.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-25.png new file mode 100644 index 0000000000..09be9724f7 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-25.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-26.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-26.png new file mode 100644 index 0000000000..dd9eaa048f Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-26.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-27.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-27.png new file mode 100644 index 0000000000..260b8da4c9 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-27.png differ diff --git 
a/docs/admin/runai-setup/self-hosted/bcm/images/image-28.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-28.png new file mode 100644 index 0000000000..c5532cc8f7 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-28.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-29.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-29.png new file mode 100644 index 0000000000..2d853b5869 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-29.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-3.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-3.png new file mode 100644 index 0000000000..6b0cc96e02 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-3.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-30.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-30.png new file mode 100644 index 0000000000..0e1b0a76f7 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-30.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-31.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-31.png new file mode 100644 index 0000000000..98b266d05f Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-31.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-32.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-32.png new file mode 100644 index 0000000000..b1b06de326 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-32.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-33.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-33.png new file mode 100644 index 0000000000..1580c37904 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-33.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-34.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-34.png new file mode 100644 index 0000000000..cb44f573d7 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-34.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-35.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-35.png new file mode 100644 index 0000000000..07b868e98d Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-35.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-36.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-36.png new file mode 100644 index 0000000000..2d18293149 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-36.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-37.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-37.png new file mode 100644 index 0000000000..9368e5ddc8 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-37.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-38.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-38.png new file mode 100644 index 0000000000..99e85ba76b Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-38.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-4.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-4.png new file mode 100644 index 0000000000..aa0a2b9e1c Binary files /dev/null and 
b/docs/admin/runai-setup/self-hosted/bcm/images/image-4.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-40.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-40.png new file mode 100644 index 0000000000..3189e9f27d Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-40.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-5.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-5.png new file mode 100644 index 0000000000..c878c8606c Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-5.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-6.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-6.png new file mode 100644 index 0000000000..866b2ca1db Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-6.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-7.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-7.png new file mode 100644 index 0000000000..06bc9d03d2 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-7.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-8.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-8.png new file mode 100644 index 0000000000..fe6f126db5 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-8.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image-9.png b/docs/admin/runai-setup/self-hosted/bcm/images/image-9.png new file mode 100644 index 0000000000..7a3a5618db Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image-9.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/images/image.png b/docs/admin/runai-setup/self-hosted/bcm/images/image.png new file mode 100644 index 0000000000..c6c043e973 Binary files /dev/null and b/docs/admin/runai-setup/self-hosted/bcm/images/image.png differ diff --git a/docs/admin/runai-setup/self-hosted/bcm/install-cluster.md b/docs/admin/runai-setup/self-hosted/bcm/install-cluster.md new file mode 100644 index 0000000000..0f7e261df2 --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/install-cluster.md @@ -0,0 +1,80 @@ +# Install the Cluster + + +## System and Network Requirements +Before installing the NVIDIA Run:ai cluster, validate that the [system requirements](./system-requirements.md) and [network requirements](./network-requirements.md) are met. Make sure you have the [software artifacts](./preparations.md) prepared. + +Once all the requirements are met, it is highly recommended to use the NVIDIA Run:ai cluster preinstall diagnostics tool to: + +* Test the requirements above, as well as failure points related to Kubernetes, NVIDIA, storage, and networking +* Review additional installed components and analyze their relevance to a successful installation + +For more information, see [preinstall diagnostics](https://github.com/run-ai/preinstall-diagnostics).
To run the preinstall diagnostics tool, [download](https://runai.jfrog.io/ui/native/pd-cli-prod/preinstall-diagnostics-cli/) the latest version, and run: + +```bash +chmod +x ./preinstall-diagnostics- && \ +./preinstall-diagnostics- \ + --domain ${CONTROL_PLANE_FQDN} \ + --cluster-domain ${CLUSTER_FQDN} \ +# If the diagnostics image is hosted in a private registry + --image-pull-secret ${IMAGE_PULL_SECRET_NAME} \ + --image ${PRIVATE_REGISTRY_IMAGE_URL} +``` + +## Helm + +NVIDIA Run:ai requires [Helm](https://helm.sh/) 3.14 or later. To install Helm, see [Installing Helm](https://helm.sh/docs/intro/install/). + +## Permissions + +A Kubernetes user with the `cluster-admin` role is required to ensure a successful installation. For more information, see [Using RBAC authorization](https://kubernetes.io/docs/reference/access-authn-authz/rbac/). + +## Installation + +Follow the steps below to add a new cluster. + +!!! Note + When adding a cluster for the first time, the New Cluster form automatically opens when you log in to the NVIDIA Run:ai platform. Other actions are prevented until the cluster is created. + +If this is your first cluster and you have completed the New Cluster form, start at step 3. Otherwise, start at step 1. + +1. In the NVIDIA Run:ai platform, go to **Resources** +2. Click **+NEW CLUSTER** +3. Enter a unique name for your cluster +4. Choose the NVIDIA Run:ai cluster version (latest, by default) +5. Select **Same as control plane** +6. Click **Continue** + +**Installing the NVIDIA Run:ai Cluster** + +The next steps walk you through installing the NVIDIA Run:ai cluster. + +1. Follow the installation instructions and run the commands provided on your Kubernetes cluster +2. Append `--set global.customCA.enabled=true` to the Helm installation command +3. Click **DONE** + +The cluster is displayed in the table with the status **Waiting to connect**. Once installation is complete, the cluster status changes to **Connected**. + +!!! Tip + Use the `--dry-run` flag to gain an understanding of what is being installed before the actual installation. For more details, see [Understanding cluster access roles](https://docs.run.ai/v2.19/admin/config/access-roles/). + + +!!! Note + To customize the installation based on your environment, see [Customize cluster installation](../../cluster-setup/customize-cluster-install.md). + +## Troubleshooting + +If you encounter an issue with the installation, try the troubleshooting scenarios below. + +### Installation + +If the NVIDIA Run:ai cluster installation failed, check the installation logs to identify the issue. Run the following script to print the installation logs: + +``` bash +curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh +``` + +### Cluster Status + +If the NVIDIA Run:ai cluster installation completed but the cluster status did not change to **Connected**, check the cluster [troubleshooting scenarios](../../troubleshooting/troubleshooting.md#cluster-health). + diff --git a/docs/admin/runai-setup/self-hosted/bcm/install-control-plane.md b/docs/admin/runai-setup/self-hosted/bcm/install-control-plane.md new file mode 100644 index 0000000000..0bb0a3d45b --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/install-control-plane.md @@ -0,0 +1,36 @@ +# Install the Control Plane + +Installing the NVIDIA Run:ai control plane requires Internet connectivity.
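+
+The installation below pulls the control plane chart from the `runai-backend` Helm repository. If the repository has not yet been added on the installer machine, it can be added as follows (a short sketch; the repository URL is the same one referenced in the [Upgrade](upgrade.md) guide):
+
+```bash
+# Add the NVIDIA Run:ai control plane Helm repository and refresh the local chart index
+helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod
+helm repo update
+```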
+ + +## System and Network Requirements +Before installing the NVIDIA Run:ai control plane, validate that the [system requirements](./system-requirements.md) and [network requirements](./network-requirements.md) are met. Make sure you have the [software artifacts](./preparations.md) prepared. + +## Permissions + +As part of the installation, you will be required to install the NVIDIA Run:ai control plane [Helm chart](https://helm.sh/). The Helm charts require Kubernetes administrator permissions. You can review the exact objects that are created by the charts using the `--dry-run` flag on both helm charts. + +## Installation + +Run the following command. Replace `global.domain=` with the one obtained [here](./system-requirements.md#fully-qualified-domain-name-fqdn) + +```bash +helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane \ +--version " " \ +--set global.customCA.enabled=true \ +--set global.domain= +``` + +!!! Note + To install a specific version, add --version to the install command. You can find available versions by running helm search repo -l runai-backend. + +## Connect to NVIDIA Run:ai User Interface + +1. Open your browser and go to: `https://`. +2. Log in using the default credentials: + + * User: `test@run.ai` + * Password: `Abcd!234` + +You will be prompted to change the password. + diff --git a/docs/admin/runai-setup/self-hosted/bcm/network-requirements.md b/docs/admin/runai-setup/self-hosted/bcm/network-requirements.md new file mode 100644 index 0000000000..93c61a54c9 --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/network-requirements.md @@ -0,0 +1,64 @@ +# Network requirements + +The following network requirements are for the NVIDIA Run:ai components installation and usage. + +## Installation + +### Inbound rules + +| Name | Description | Source | Destination | Port | +| --------------------------- | ---------------- | ------- | -------------------------- | ---- | +| Installation via BCM | SSH Access | Installer Machine | NVIDIA Base Command Manager headnodes | 22 | + +### Outbound rules +| Name | Description | Source | Destination | Port | +| --------------------------- | ---------------- | ------- | -------------------------- | ---- | +| Container Registry | Pull NVIDIA Run:ai images | All kubernetes nodes | runai.jfrog.io | 443 | +| Helm repository | NVIDIA Run:ai Helm repository for installation | Installer machine | runai.jfrog.io | 443 | + +The NVIDIA Run:ai installation has [software requirements](system-requirements.md) that require additional components to be installed on the cluster. This article includes simple installation examples which can be used optionally and require the following cluster outbound ports to be open: + +| Name | Description | Source | Destination | Port | +| -------------------------- | ------------------------------------------ | -------------------- | --------------- | ---- | +| Kubernetes Registry | Ingress Nginx image repository | All kubernetes nodes | registry.k8s.io | 443 | +| Google Container Registry | GPU Operator, and Knative image repository | All kubernetes nodes | gcr.io | 443 | +| Red Hat Container Registry | Prometheus Operator image repository | All kubernetes nodes | quay.io | 443 | +| Docker Hub Registry | Training Operator image repository | All kubernetes nodes | docker.io | 443 | + + + +## External access + +Set out below are the domains to whitelist and ports to open for installation, upgrade, and usage of the application and its management. + + +!!! 
Note + Ensure the inbound and outbound rules are correctly applied to your firewall. + +### Inbound rules + +To allow your organization’s NVIDIA Run:ai users to interact with the cluster using the [NVIDIA Run:ai Command-line interface](../../reference/cli/runai/) or to access specific UI features, the following inbound ports need to be open: + +| Name | Description | Source | Destination | Port | +| --------------------------- | ---------------- | ------- | -------------------------- | ---- | +| NVIDIA Run:ai control plane | HTTPS entrypoint | 0.0.0.0 | NVIDIA Run:ai system nodes | 443 | +| NVIDIA Run:ai cluster | HTTPS entrypoint | RFC1918 private IP ranges (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16) | NVIDIA Run:ai system nodes | 443 | + + +### Outbound rules + +!!! Note + Outbound rules apply to the NVIDIA Run:ai cluster component only. If the NVIDIA Run:ai cluster is installed together with the NVIDIA Run:ai control plane, the NVIDIA Run:ai cluster FQDN refers to the NVIDIA Run:ai control plane FQDN. + +For the NVIDIA Run:ai cluster installation and usage, certain **outbound** ports must be open: + +| Name | Description | Source | Destination | Port | +| ------------------ | -------------------------------------------------------------------------------- | -------------------------- | -------------------------------- | ---- | +| Cluster sync | Sync NVIDIA Run:ai cluster with NVIDIA Run:ai control plane | NVIDIA Run:ai system nodes | NVIDIA Run:ai control plane FQDN | 443 | +| Metric store | Push NVIDIA Run:ai cluster metrics to NVIDIA Run:ai control plane's metric store | NVIDIA Run:ai system nodes | NVIDIA Run:ai control plane FQDN | 443 | + +## Internal network + +Ensure that all Kubernetes nodes can communicate with each other across all necessary ports. Kubernetes assumes full interconnectivity between nodes, so you must configure your network to allow this seamless communication. Specific port requirements may vary depending on your network setup. \ No newline at end of file diff --git a/docs/admin/runai-setup/self-hosted/bcm/next-steps.md b/docs/admin/runai-setup/self-hosted/bcm/next-steps.md new file mode 100644 index 0000000000..15924b6d9d --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/next-steps.md @@ -0,0 +1,11 @@ +# Next Steps + +## Restrict System Node Scheduling (Post-Installation) + +After installation, you can configure NVIDIA Run:ai to enforce stricter scheduling rules that ensure system components and workloads are assigned to the correct nodes. The following flags are set using `runaiconfig`. See [Advanced Cluster Configurations](../../../config/advanced-cluster-config.md) for more details. + +1. Set `global.NodeAffinity.RestrictRunAISystem=true`. This ensures that NVIDIA Run:ai system components are scheduled only on nodes labeled as system nodes. + +2. Set `global.nodeAffinity.restrictScheduling=true`. This prevents pure CPU workloads from being scheduled on GPU nodes. + + \ No newline at end of file diff --git a/docs/admin/runai-setup/self-hosted/bcm/preparations.md b/docs/admin/runai-setup/self-hosted/bcm/preparations.md new file mode 100644 index 0000000000..74d3824e7c --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/preparations.md @@ -0,0 +1,13 @@ +# Preparations + +You should receive a token from NVIDIA Run:ai customer support.
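+
+Optionally, export the token into a shell variable first so it is not pasted inline (a minimal sketch; the variable name `TOKEN` is an assumption that matches the `<$TOKEN>` placeholder used in the command below):
+
+```bash
+# Prompt for the registry token received from NVIDIA Run:ai customer support
+# and keep it in an environment variable (assumed name: TOKEN)
+read -s -p "NVIDIA Run:ai registry token: " TOKEN && export TOKEN && echo
+```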
The following command provides access to the NVIDIA Run:ai container registry: + +```bash +kubectl create secret docker-registry runai-reg-creds \ +--docker-server=https://runai.jfrog.io \ +--docker-username=self-hosted-image-puller-prod \ +--docker-password=<$TOKEN> \ +--docker-email=support@run.ai \ +--namespace=runai-backend +``` + diff --git a/docs/admin/runai-setup/self-hosted/bcm/system-requirements.md b/docs/admin/runai-setup/self-hosted/bcm/system-requirements.md new file mode 100644 index 0000000000..a28cdf483a --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/system-requirements.md @@ -0,0 +1,585 @@ +# System Requirements + + +## Pre-installation Checklist + +The following checklist is provided for convenience and can be seen as part of an expanded site survey for NVIDIA Run:ai deployments on SuperPOD. This information needs to be collected and validated before the NVIDIA Run:ai deployment begins. + +| **Component** | **Type** | Reference | +| --- | --- | --- | +| [Networking] FQDN name/ Reserved IP address | DNS A or CNAME record pointing to the load balancer reserved IP | [Reserved IPs and Domain Configuration](#reserved-ips-and-domain-configuration) | +| [Networking] Load Balancer IP address range | Additional IP address space (minimum 2, recommended 8) for the Kubernetes LoadBalancer (Inference, DataMover workloads) | [Reserved IPs and Domain Configuration](#reserved-ips-and-domain-configuration) | +| [SSL] Full-chain SSL certificate | <`*.p7b`, `*.der` or `*.pem` file> | [TLS/SSL Certificates](#tlsssl-certificates) | +| [SSL] SSL Private Key | Private certificate (e.g. `*.key`) | [TLS/SSL Certificates](#tlsssl-certificates) | +| [SSL] CA trust chain public certificate | X509 PEM file | [Local Certificate Authority](#local-certificate-authority) | + +## Installer Machine + +The machine running the installation must have: + +* At least 50GB of free space +* Docker installed +* [Helm](https://helm.sh/) 3.14 or later + +The configuration of BCM as well as the deployment of NVIDIA Run:ai can be performed through SSH access on the BCM headnodes: + +```bash +ssh root@ +``` + +## Hardware Requirements + +The following hardware requirements cover all components needed to deploy and operate NVIDIA Run:ai. By default, all NVIDIA Run:ai services run on all available nodes. + +### Kubernetes + +This configuration is the minimum requirement you need to deploy Kubernetes. + + +| Component | Required Capacity | +| ---------- | ----------------- | +| CPU | 2 cores | +| Memory | 16GB | +| Disk space | 100GB | + +### NVIDIA Run:ai - System Nodes + +This configuration is the minimum requirement you need to install and use NVIDIA Run:ai. + +| Component | Required Capacity | +| ---------- | ----------------- | +| CPU | 20 cores | +| Memory | 42GB | +| Disk space | 160GB | + + +To designate nodes to NVIDIA Run:ai system services, follow the instructions as described in [Label the NVIDIA Run:ai System Nodes](#label-the-nvidia-runai-system-nodes). + + +### NVIDIA Run:ai - Worker Nodes + +NVIDIA Run:ai supports NVIDIA SuperPods built on the A100, H100, H200, and B200 GPU architectures. These systems are optimized for high-performance AI workloads at scale. + +The following configuration represents the minimum hardware requirements for installing and operating NVIDIA Run:ai on worker nodes. 
Each node must meet these specifications: + +| Component | Required Capacity | +| --------- | ----------------- | +| CPU | 2 cores | +| Memory | 4GB | + + +To designate nodes to NVIDIA Run:ai workloads, follow the instructions as described in [Label the NVIDIA Run:ai Worker Nodes](#label-the-nvidia-runai-worker-nodes). + + +### Node Categories + +In BCM, a node category is a way to group nodes that share the same hardware profile and intended role. Defining node categories allows the system to assign the appropriate software image and configurations to each group during provisioning. + +Before installing NVIDIA Run:ai, make sure the necessary BCM node categories are created for: + +* NVIDIA Run:ai system nodes (for example, `runai-control-plane-spod`) +* NVIDIA Run:ai GPU worker nodes (for example, `dgx-h100-spod`) +* Optional: NVIDIA Run:ai CPU worker nodes (for example, `runai-cpu-workers`) + + +## Reserved IPs and Domain Configuration + +Before installing NVIDIA Run:ai, make sure the necessary IPs (at least 2) are reserved and the domain names are properly set up. These are critical for exposing the control plane and inference services. + +### Reserved IP Addresses + +Reserve at least two IP addresses from the same internal IP range: + +* NVIDIA Run:ai control plane – Reserve one IP address for accessing core components such as the UI, API, and workload endpoints. This IP is not used for inference workloads. + +* Inference (Knative Serving) – Reserve a second IP address specifically for serving inference workloads using Knative-based serving layer. + +All reserved IPs must be reachable within your internal network and not conflict with other internal IP allocations. + +### Fully Qualified Domain Name (FQDN) + +A Fully Qualified Domain Name (FQDN) is required to install the NVIDIA Run:ai control plane (e.g., `runai.mycorp.local`). This cannot be an IP. The domain name must be accessible inside the organization's private network. + +The FQDN must point to the control plane’s reserved IP, either: + +* As a DNS (A record) pointing directly to the IP +* Or, a CNAME alias to a host DNS record pointing to that same IP address + +### Wildcard FQDN for Inference + +For inference workloads, configure a wildcard DNS record (`*.runai-inference.mycorp.local`) that maps to the reserved inference IP address. This ensures each inference workload is accessible at a unique subdomain. + + +## TLS/SSL Certificates + +You must have a TLS certificates that is associated with the FQDN for HTTPS access. The certificate will be installed on the Kubernetes control plane nodes as well as a [Kubernetes secret](#tls-certificate) for the NVIDIA Run:ai backend and the [Kubernetes Ingress controller](#configure-kubernetes-ingress-controller). + +* The certificate CN name needs to be equal to the [FQDN](#fully-qualified-domain-name-fqdn). +* The certificate needs to include at least one Subject Alternative Name DNS entry (SAN) for the same FQDN. +* The certificate needs to include the full trust chain (signing CA public keys). + + +## Operating System + +DGX OS is supported on your SuperPod and optimized for NVIDIA infrastructure. +SR-IOV enables InfiniBand support at the host level. When used together with the [NVIDIA Network Operator](#configure-the-network-operator), it allows workloads to leverage InfiniBand networking for high-performance communication. + + +## Deploy Kubernetes: Base Command Manager + +1. From the active BCM headnode, run the following command: + ``` + cm-kubernetes-setup + ``` + +2. 
The following screen will pop up. Select **Deploy** and then click **Ok**: + + ![alt text](images/image.png) + + !!! Note + The number of entries in the above menu may vary. + +2. Select **Kubernetes v1.31** and then click **Ok**: + + ![alt text](images/image-1.png) + +3. Optional: Provide a DockerHub container registry mirror if required and then click **Ok**. Otherwise, leave blank and click **Ok** to proceed: + + ![alt text](images/image-3.png) + +4. Set the Kubernetes networks and then click **Ok**. The subnets need to be in a private address space (per RFC 1918). Use the default values and only modify if necessary or in case of conflict with other internal subnets within the network. Make sure the domain names of the networks are configured correctly and modify as required to match the “Kubernetes External FQDN” using the same domain set in the [FQDN](#fully-qualified-domain-name-fqdn) section: + + ![alt text](images/image-4.png) + +5. Select **yes** to expose the Kubernetes API servers to the cluster’s external network and then click **Ok**: + + ![alt text](images/image-5.png) + + +6. Select the internal network that will be used by the Kubernetes nodes and then click **Ok**: + + ![alt text](images/image-6.png) + +7. Select at least 3 Kubernetes master nodes and then click **Ok**: + + ![alt text](images/image-7.png) + + !!! Note + To ensure high availability and prevent a single point of failure, it is recommended to configure at least three Kubernetes master nodes in your cluster. + +8. Select both the NVIDIA Run:ai system and worker [node categories](#node-categories) to operate as the Kubernetes worker nodes and then click **Ok**: + + ![alt text](images/image-8.png) + + +9. Selecting individual Kubernetes nodes is not required. Click **Ok** to proceed: + + ![alt text](images/image-9.png) + +10. Select the Etcd nodes and then click **Ok**. Make sure to select the same three nodes as the Kubernetes master nodes (Step 8): + + ![alt text](images/image-10.png) + +11. Ignore the following message if it appears and click **Ok**: + + ![alt text](images/image-11.png) + +12. Set the ports as shown below and then click **Ok**. Do not modify the Etcd spool directory: + + ![alt text](images/image-12.png) + +13. Select **Calico** as the Kubernetes network plugin and then click **Ok**: + + ![alt text](images/image-13.png) + +14. Do not install Kyverno during the initial deployment. It can always be enabled at a later stage. Select **no** and then click **Ok**: + + ![alt text](images/image-14.png) + +### Operators + +Select the following Operators and then click **Ok**: + +![alt text](images/image-15.png) + +!!! Note + Do NOT select the Run:ai operator. + +#### NVIDIA GPU Operator + +NVIDIA Run:ai supports versions 22.9 to 25.3. + +1. Select the required NVIDIA GPU Operator version and then click **Ok**: + ![alt text](images/image-16.png) + + +2. Leave the YAML configuration file path empty and then click **Ok**: + ![alt text](images/image-18.png) + + +3. Configure the NVIDIA GPU Operator by selecting the following configuration parameters and then click **Ok**: + ![alt text](images/image-19.png) + +#### Network Operator + +1. Select Network Operator **v24.7.0** and then click **Ok**: + + ![alt text](images/image-17.png) + + +2. Create a [YAML file](files/networkoperator.txt){target=_blank} with the required Helm values. + +3. Add the path to the YAML file and then click **Ok**: + + ![alt text](images/image-40.png) + + +4. Do not add any MetalLB address pools at this point. 
Click **Ok** to proceed: + + ![alt text](images/image-20.png) + +### Kubernetes Ingress Controller + +1. Select **Ingress Controller (Nginx)** and then click **Ok**: + ![alt text](images/image-21.png) + +2. Select **yes** when asked to expose the Ingress service over port 443 and then click **Ok**: + ![alt text](images/image-22.png) + +3. Keep the Ingress HTTPS port at 30443 (default value) and then click **Ok**: + + ![alt text](images/image-23.png) + +### Permissions Manager + +Select **yes** to install the BCM Kubernetes permissions manager and then click **Ok**: + +![alt text](images/image-24.png) + +### Storage Class + + +1. Select **Local path** as the Kubernetes StorageClass and then click **Ok**: + + ![alt text](images/image-25.png) + + +2. Place the storage class on the shared storage (`/cm/shared` – keep the defaults) and then click **Ok**: + + ![alt text](images/image-26.png) + +### Save your Configuration + +Select **Save config & deploy** and then click **Ok**: + +![alt text](images/image-27.png) + +![alt text](images/image-28.png) + +### Start Deployment + +At this point the deployment will start. Halfway through the deployment, all nodes that are members of the Kubernetes cluster will be rebooted, and the installer will wait up to 60 minutes for all nodes to come back online. + + +## Configure BCM Kubernetes for NVIDIA Run:ai + +### Label the NVIDIA Run:ai System Nodes + +Label the system nodes to ensure that system services are scheduled on designated system nodes. + +```bash +cmsh +kubernetes +labelsets +add runai-control-plane +append categories runai-control-plane +append labels node-role.kubernetes.io/runai-system=true +commit +``` + +!!! Note + * [Node category names](#node-categories) are user-defined and may vary. Make sure to label the correct category. Incorrect or mixed labels may result in pods being scheduled on unintended nodes or failing to schedule altogether. + * For more information, see [System nodes](../../../config/node-roles.md#system-nodes). + * After installation, you can configure NVIDIA Run:ai to enforce stricter scheduling rules that ensure system components are assigned to the correct nodes. See [Next Steps](next-steps.md) for more details. + +### Label the NVIDIA Run:ai Worker Nodes + +1. Label the GPU worker nodes: + ```bash + cmsh + kubernetes + labelsets + add runai-gpu-worker + append categories dgx-h100-spod + append labels node-role.kubernetes.io/runai-gpu-worker=true + commit + ``` +2. Optional: Label the CPU worker nodes: + ```bash + cmsh + kubernetes + labelsets + add runai-cpu-worker + append categories runai-cpu-workers + append labels node-role.kubernetes.io/runai-cpu-worker=true + commit + ``` + +!!! Note + * [Node category names](#node-categories) are user-defined and may vary. Make sure to label the correct category. Incorrect or mixed labels may result in pods being scheduled on unintended nodes or failing to schedule altogether. + * For more information, see [Worker nodes](../../../config/node-roles.md#worker-nodes). + * After installation, you can configure NVIDIA Run:ai to enforce stricter scheduling rules that ensure workloads are assigned to the correct nodes. See [Next Steps](next-steps.md) for more details.
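+
+To confirm that the label sets were applied, list the nodes together with the relevant label columns (a quick check; the label keys are the ones defined above):
+
+```bash
+# Show which nodes carry the NVIDIA Run:ai system and worker labels
+kubectl get nodes -L node-role.kubernetes.io/runai-system,node-role.kubernetes.io/runai-gpu-worker,node-role.kubernetes.io/runai-cpu-worker
+```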
+ + +### Create the NVIDIA Run:ai Namespaces + +Create the following Kubernetes namespaces: + +```bash +kubectl create ns runai-backend +kubectl create ns runai +``` + +!!! Note + If you cannot use kubectl, load the Kubernetes Lmod module using `module load kubernetes`. + +### Expose the NVIDIA Run:ai Endpoint - MetalLB +NVIDIA Run:ai is exposed through the MetalLB load balancer/Route Advertiser. This includes the main Kubernetes Ingress for the NVIDIA Run:ai control plane and the Kourier Ingress used for Knative Serving. +Make sure a reserved range of IP addresses is available as described in [Reserved IPs and Domain Configuration](#reserved-ips-and-domain-configuration) and MetalLB is deployed as part of the [Kubernetes installation](#deploy-kubernetes-base-command-manager). + +1. Configure the Kubernetes API proxy with strict ARP validation: + + ```bash + kubectl get configmap kube-proxy -n kube-system -o yaml | \ + sed -e "s/strictARP: false/strictARP: true/" | \ + kubectl apply -f - -n kube-system + ``` + +2. Create a new appGroup application in BCM: + + ```bash + root@bcmhead1:~# cmsh + [bcmhead1]% kubernetes + [bcmhead1->kubernetes[dra]]% appgroups + [bcmhead1->kubernetes[dra]->appgroups]% use system + [bcmhead1->kubernetes[dra]->appgroups[system]]% applications + [bcmhead1->kubernetes[dra]->appgroups[system]->applications]% add ingress-metallb + [bcmhead1->kubernetes*[dra*]->appgroups*[system*]->applications*[ingress-metallb*]]% set config /root/ingress-metallb.yaml + [bcmhead1->kubernetes*[dra*]->appgroups*[system*]->applications*[ingress-metallb*]]% commit + ``` + +3. Create the [YAML configuration](files/metallb.txt){target=_blank} to define the ingress IP pool and Layer 2 advertisement. You will need to substitute the IP address with the reserved IP address. + + +### Configure Kubernetes Ingress Controller + +#### Scale up the Ingress Deployment + +For high availability, increase the number of replicas from 1 to 3: + +```bash +# cmsh +[bcmhead1->device]% kubernetes +[bcmhead1->kubernetes[dra]]% appgroups +[bcmhead1->kubernetes[dra]->appgroups]% use system +[bcmhead1->kubernetes[dra]->appgroups[system]]% applications +[bcmhead1->kubernetes[dra]->appgroups[system]->applications]% use ingress_controller +[bcmhead1->kubernetes[dra]->appgroups[system]->applications[ingress_controller]]% environment +[bcmhead1->kubernetes[dra]->appgroups[system]->applications[ingress_controller]->environment]% set replicas value 3 +[bcmhead1->kubernetes*[dra*]->appgroups*[system*]->applications*[ingress_controller*]->environment*]% commit +``` + +#### Configure the NGINX Proxy TLS Certificates + +This process sets up TLS certificates for the Run:ai control plane [FQDN](#fully-qualified-domain-name-fqdn). + +Follow these steps on the active BCM headnode to configure the NGINX Ingress controller with your signed TLS certificate: + +1. From the active BCM headnode, run the following command: + ```bash + cm-kubernetes-setup + ``` + +2. The following screen will pop up. Select **Configure Ingress** and then click **Ok**: + + ![alt text](images/image-30.png) + +2. Select the Kubernetes cluster and then click **Ok**: + + ![alt text](images/image-31.png) + +3. Select **yes** when asked to provide signed certificates and then click **Ok**: + + ![alt text](images/image-32.png) + +4. Enter the path to the private key and PEM certificate and then click **Ok**. 
See [TLS Certificate](#tls-certificate) for more details: + + ![alt text](images/image-33.png) + +#### Configure NGINX with the Reserved IP for MetalLB +Patch the ingress-nginx service. Assign the reserved control plane IP address to the ingress controller: + +```bash + kubectl -n ingress-nginx patch svc ingress-nginx-controller \ + --type='merge' \ + -p '{"spec": {"type": "LoadBalancer", "loadBalancerIP": ""}}' +``` + +### Configure the Network Operator + +The default deployment of the Network Operator installs the boilerplate services but does not initialize the SR-IOV and secondary network plugins. The following CRD resources must be created in the exact order below: + +* SR-IOV Network Policies for each NVIDIA InfiniBand NIC +* An nvIPAM IP address pool +* SR-IOV InfiniBand networks + +The above CRD YAML specs can be downloaded from the following GitLab repo: https://gitlab-master.nvidia.com/kuberpod/runai-deployment-assets. + +1. Increase the number of simultaneous updates by the Network Operator: + ```bash + kubectl patch sriovoperatorconfigs.sriovnetwork.openshift.io -n network-operator default --patch '{ "spec": { "maxParallelNodeConfiguration": 0 } }' --type='merge' + ``` + and + + ```bash + kubectl patch sriovoperatorconfigs.sriovnetwork.openshift.io -n network-operator default --patch '{ "spec": { "featureGates": { "parallelNicConfig": true } } }' --type='merge' + ``` +2. Create the SR-IOV network node policies: + ```bash + kubectl apply -f sriov-network-node-policy.yaml + ``` + + Adjust the number of Virtual Functions (`numVfs`) as needed. + +3. Create an IPAM IP Pool: + ```bash + kubectl apply -f nvipam-ip-pool.yaml + ``` + +4. Create the SR-IOV IB networks: + ```bash + kubectl apply -f sriov-ib-network.yaml + ``` + +!!! Note + The Network Operator will restart the DGX nodes if the number of Virtual Functions in the SR-IOV Network Policy file does not match the NVIDIA/Mellanox firmware configuration. + +## Certificates Setup for NVIDIA Run:ai + +### TLS Certificate + +You must have a TLS certificate that is associated with the FQDN for HTTPS access. Create a [Kubernetes Secret](https://kubernetes.io/docs/concepts/configuration/secret/) named `runai-backend-tls` in the `runai-backend` namespace and include the path to the TLS `--cert` and its corresponding private `--key` by running the following: + +```bash +# Replace /path/to/fullchain.pem and /path/to/private.pem with the actual paths +# to your TLS certificate and its private key +kubectl create secret tls runai-backend-tls -n runai-backend \ + --cert /path/to/fullchain.pem \ + --key /path/to/private.pem +``` + +### Local Certificate Authority + +A local certificate authority serves as the root certificate for organizations that cannot use a publicly trusted certificate authority when external connections or standard HTTPS authentication are required. Follow the steps below to configure the local certificate authority. + + +1. Add the public key to the `runai-backend` namespace: + ```bash + kubectl -n runai-backend create secret generic runai-ca-cert \ + --from-file=runai-ca.pem= + ``` + +2. Add the public key to the `runai` namespace: + ```bash + kubectl -n runai create secret generic runai-ca-cert \ + --from-file=runai-ca.pem= + kubectl label secret runai-ca-cert -n runai run.ai/cluster-wide=true run.ai/name=runai-ca-cert --overwrite + ``` + +3.
When installing the control plane and cluster, make sure the following flag is added to the helm command `--set global.customCA.enabled=true`. + +## Additional Software Requirements + +Additional NVIDIA Run:ai capabilities, Distributed Training and Inference require additional Kubernetes applications (frameworks) to be installed. + +### Distributed Training + +Distributed training enables training of AI models over multiple nodes. This requires installing a distributed training framework on the cluster. The following frameworks are supported: + +* [TensorFlow](https://www.tensorflow.org/) +* [PyTorch](https://pytorch.org/) +* [XGBoost](https://xgboost.readthedocs.io/) +* [MPI](https://docs.open-mpi.org/) + +All are part of the Kubeflow Training Operator. NVIDIA Run:ai supports Training Operator version 1.7. The Kubeflow Training Operator gets installed as part of the BCM Kubernetes Deployment. + +The Kubeflow Training Operator is packaged with MPI version 1.0 which is not supported by NVIDIA Run:ai. You need to separately install MPI v2beta1: + +1. Run the below to install MPI v2beta: + ```bash + kubectl create -f https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.6.0/deploy/v2beta1/mpi-operator.yaml + ``` +2. Disable MPI in the Training operator by running: + ```bash + kubectl patch deployment training-operator -n kubeflow --type='json' -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--enable-scheme=tfjob", "--enable-scheme=pytorchjob", "--enable-scheme=xgboostjob"]}]' + ``` + +3. Run: + ```bash + kubectl delete crd mpijobs.kubeflow.org + ``` + +4. Install MPI v2beta1 again: + ```bash + kubectl create -f https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.6.0/deploy/v2beta1/mpi-operator.yaml + # Ignore any errors in the above command + kubectl replace -f https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.6.0/deploy/v2beta1/mpi-operator.yaml + ``` + + +### Inference + +Inference enables serving of AI models. This requires the [Knative Serving](https://knative.dev/docs/serving/) framework to be installed on the cluster and supports Knative versions 1.10 to 1.15. + +Follow the [Installing Knative](https://knative.dev/docs/install/) instructions. After installation: + +1. configure Knative to use the NVIDIA Run:ai Scheduler and features, by running: + +```bash +kubectl patch configmap/config-autoscaler \ + --namespace knative-serving \ + --type merge \ + --patch '{"data":{"enable-scale-to-zero":"true"}}' && \ +kubectl patch configmap/config-features \ + --namespace knative-serving \ + --type merge \ + --patch '{"data":{"kubernetes.podspec-schedulername":"enabled","kubernetes.podspec-affinity":"enabled","kubernetes.podspec-tolerations":"enabled","kubernetes.podspec-volumes-emptydir":"enabled","kubernetes.podspec-securitycontext":"enabled","kubernetes.containerspec-addcapabilities":"enabled","kubernetes.podspec-persistent-volume-claim":"enabled","kubernetes.podspec-persistent-volume-write":"enabled","multi-container":"enabled","kubernetes.podspec-init-containers":"enabled"}}' +``` + + +2. Patch Knative Kourier service. 
Assign the reserved IP address and DNS for inference workloads to the Knative ingress service: + ```bash + # Replace knative.example.com with your FQDN for Inference (without the wildcard) + kubectl patch configmap/config-domain --namespace knative-serving --type merge --patch '{"data":{"":""}}' + + kubectl -n kourier-system patch svc kourier \ + --type='merge' \ + -p '{"spec": {"type": "LoadBalancer", "loadBalancerIP": ""}}' + ``` + + +### Knative Autoscaling + +NVIDIA Run:ai allows for autoscaling a deployment according to the below metrics: + +* Latency (milliseconds) +* Throughput (requests/sec) +* Concurrency (requests) + +Using a custom metric (for example, Latency) requires installing the [Kubernetes Horizontal Pod Autoscaler (HPA)](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/#install-optional-serving-extensions). Use the following command to install. Make sure to update the {VERSION} in the below command with a [supported Knative version](#inference). + +```bash +kubectl apply -f https://github.com/knative/serving/releases/download/knative-{VERSION}/serving-hpa.yaml +``` diff --git a/docs/admin/runai-setup/self-hosted/bcm/uninstall.md b/docs/admin/runai-setup/self-hosted/bcm/uninstall.md new file mode 100644 index 0000000000..060102a082 --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/uninstall.md @@ -0,0 +1,21 @@ +# Uninstall + +## Uninstall the control plane + +To delete the control plane, run: + +```bash +helm uninstall runai-backend -n runai-backend +``` + +## Uninstall the cluster + +Uninstalling NVIDIA Run:ai cluster from the Kubernetes cluster does __not__ delete existing projects, departments or workloads submitted by users. + +To uninstall the NVIDIA Run:ai cluster, run the following [helm](https://helm.sh/) command in your terminal: + +``` bash +helm uninstall runai-cluster -n runai +``` + +To delete the NVIDIA Run:ai cluster from the NVIDIA Run:ai Platform, see [Removing a cluster](../../../config/clusters.md#removing-a-cluster). diff --git a/docs/admin/runai-setup/self-hosted/bcm/upgrade.md b/docs/admin/runai-setup/self-hosted/bcm/upgrade.md new file mode 100644 index 0000000000..8953ff4bce --- /dev/null +++ b/docs/admin/runai-setup/self-hosted/bcm/upgrade.md @@ -0,0 +1,75 @@ +# Upgrade + +## Before upgrade +Before proceeding with the upgrade, it's crucial to apply the specific prerequisites associated with your current version of NVIDIA Run:ai and every version in between up to the version you are upgrading to. + +## Helm +NVIDIA Run:ai requires Helm 3.14 or later. Before you continue, validate your installed helm client version. To install or upgrade Helm, see Installing Helm. + +## Software files +```bash +helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod +helm repo update +``` + +## Upgrade control plane + +### System and network requirements +Before upgrading the NVIDIA Run:ai control plane, validate that the latest [system requirements](system-requirements.md) and [network requirements](network-requirements.md) are met, as they can change from time to time. + +### Upgrade + +```bash +helm get values runai-backend -n runai-backend > runai_control_plane_values.yaml +helm upgrade runai-backend -n runai-backend runai-backend/control-plane --version "" -f runai_control_plane_values.yaml --reset-then-reuse-values +``` + +!!! Note + To upgrade to a specific version, modify the --version flag by specifying the desired . You can find all available versions by using the helm search repo command. 
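+
+For example, to list all published versions of the control plane chart (a quick illustration; uses the repository added in [Software files](#software-files)):
+
+```bash
+# List all available NVIDIA Run:ai control plane chart versions
+helm search repo runai-backend/control-plane -l
+```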
+ + +## Upgrade cluster + +### System and network requirements + +Before upgrading the NVIDIA Run:ai cluster, validate that the latest [system requirements](system-requirements.md) and [network requirements](network-requirements.md) are met, as they can change from time to time. + +### Getting installation instructions + +Follow the setup and installation instructions below to get the installation instructions to upgrade the NVIDIA Run:ai cluster. + +#### Setup + +1. In the NVIDIA Run:ai UI, go to **Clusters** +2. Select the cluster you want to upgrade +3. Click **INSTALLATION INSTRUCTIONS** +4. Choose the NVIDIA Run:ai cluster version (latest, by default) +5. Select **Same as control plane** +6. Click **Continue** + +#### Installation instructions + +1. Follow the installation instructions and run the commands provided on your Kubernetes cluster +2. Append `--set global.customCA.enabled=true` to the Helm installation command +3. Click **DONE** + + +!!! Note + To upgrade to a specific version, modify the `--version` flag by specifying the desired ``. You can find all available versions by using the `helm search repo` command. + +### Troubleshooting + +If you encounter an issue with the cluster upgrade, use the troubleshooting scenarios below. + +#### Installation fails + +If the NVIDIA Run:ai cluster upgrade fails, check the installation logs to identify the issue. +Run the following script to print the installation logs: + +```bash +curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh +``` + +#### Cluster status + +If the NVIDIA Run:ai cluster upgrade completes, but the cluster status does not show as **Connected**, refer to [Troubleshooting scenarios](../../../config/clusters.md#troubleshooting-scenarios). \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 23de2f5fa2..99c5d02f3e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -240,6 +240,15 @@ nav: - 'Next Steps' : 'admin/runai-setup/self-hosted/ocp/next-steps.md' - 'Upgrade' : 'admin/runai-setup/self-hosted/ocp/upgrade.md' - 'Uninstall' : 'admin/runai-setup/self-hosted/ocp/uninstall.md' + - 'SuperPOD' : + - 'System Requirements' : 'admin/runai-setup/self-hosted/bcm/system-requirements.md' + - 'Preparations' : 'admin/runai-setup/self-hosted/bcm/preparations.md' + - 'Network Requirements' : 'admin/runai-setup/self-hosted/bcm/network-requirements.md' + - 'Install Control Plane' : 'admin/runai-setup/self-hosted/bcm/install-control-plane.md' + - 'Install Cluster' : 'admin/runai-setup/self-hosted/bcm/install-cluster.md' + - 'Next Steps' : 'admin/runai-setup/self-hosted/bcm/next-steps.md' + - 'Upgrade' : 'admin/runai-setup/self-hosted/bcm/upgrade.md' + - 'Uninstall' : 'admin/runai-setup/self-hosted/bcm/uninstall.md' - 'Researcher Setup' : - 'Introduction' : 'admin/researcher-setup/researcher-setup-intro.md' - 'Install the V1 CLI' : 'admin/researcher-setup/cli-install.md'