
Commit 25f5355

Merge pull request #91003 from StephenJamesSmith/TELCODOCS-2226
TELCODOCS-2226: AI Distributed workloads with RDMA 1
2 parents 978cb43 + 999364a commit 25f5355

11 files changed (+1280, -0 lines)

_topic_maps/_topic_map.yml

Lines changed: 2 additions & 0 deletions
@@ -3616,6 +3616,8 @@ Topics:
   File: amd-gpu-operator
 - Name: Intel Gaudi AI accelerators
   File: gaudi-ai-accelerator
+- Name: Remote Direct Memory Access (RDMA)
+  File: rdma-remote-direct-memory-access
 ---
 Name: Backup and restore
 Dir: backup_and_restore
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
:_mod-docs-content-type: ASSEMBLY
[id="rdma-remote-direct-memory-access"]
= NVIDIA GPUDirect Remote Direct Memory Access (RDMA)
include::_attributes/common-attributes.adoc[]
:context: rdma-remote-direct-memory-access

toc::[]

NVIDIA GPUDirect Remote Direct Memory Access (RDMA) allows one computer to directly access the memory of another computer without going through the operating system. Because RDMA bypasses kernel involvement, it frees up resources and greatly reduces the CPU overhead that is normally needed to process network communications. This is useful for distributing GPU-accelerated workloads across clusters, and because RDMA is well suited to high-bandwidth, low-latency applications, it is ideal for big data and machine learning workloads.

There are currently three configuration methods for NVIDIA GPUDirect RDMA:

Shared device:: This method allows an NVIDIA GPUDirect RDMA device to be shared among multiple pods on the {product-title} worker node where the device is exposed.

Host device:: This method provides direct physical Ethernet access on the worker node by creating an additional host network on a pod. A plugin allows the network device to be moved from the host network namespace to the network namespace on the pod.

SR-IOV legacy device:: The Single Root I/O Virtualization (SR-IOV) method can share a single network device, such as an Ethernet adapter, with multiple pods. SR-IOV segments the device, recognized on the host node as a physical function (PF), into multiple virtual functions (VFs). The VF is used like any other network device.

Each of these methods can be used over either RDMA over Converged Ethernet (RoCE) or InfiniBand infrastructure, providing a total of six configuration methods.
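
For example, with the shared device method a workload pod requests the RDMA device as an extended resource alongside a GPU. The following is a minimal sketch rather than part of this procedure: the pod name and image are placeholders, and the `rdma/rdma_shared_device_eth` resource name depends on how the RDMA shared device plugin is configured in your cluster.

[source,yaml]
----
apiVersion: v1
kind: Pod
metadata:
  name: rdma-gpu-test                               # placeholder name
spec:
  containers:
  - name: app
    image: quay.io/example/rdma-workload:latest     # placeholder image
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]                           # lets the container pin memory for RDMA
    resources:
      limits:
        nvidia.com/gpu: 1                           # GPU advertised by the GPU Operator
        rdma/rdma_shared_device_eth: 1              # RDMA resource advertised by the shared device plugin
----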

:FeatureName: Remote Direct Memory Access

include::modules/rdma-prerequisites.adoc[leveloffset=+1]

* Install the xref:../hardware_enablement/psap-node-feature-discovery-operator.adoc#installing-the-node-feature-discovery-operator_node-feature-discovery-operator[Node Feature Discovery Operator].

* Install the xref:../networking/networking_operators/sr-iov-operator/installing-sriov-operator.adoc#installing-sriov-operator[SR-IOV Operator].

* Install the link:https://docs.nvidia.com/networking/display/kubernetes2501/getting-started-openshift.html#network-operator-installation-using-openshift-oc-cli[NVIDIA Network Operator] (NVIDIA documentation).

* Install the link:https://docs.nvidia.com/datacenter/cloud-native/openshift/24.9.2/install-gpu-ocp.html[NVIDIA GPU Operator] (NVIDIA documentation).
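
As a quick check that the required Operators are installed, you can list the installed ClusterServiceVersions. This is only a sketch; the exact CSV names and namespaces vary with the Operator versions that you install.

[source,terminal]
----
$ oc get csv -A | grep -Ei 'nfd|sriov-network|nvidia'
----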

include::modules/rdma-disabling-irdma-kernel-module.adoc[leveloffset=+1]

include::modules/rdma-creating-persistent-naming-rules.adoc[leveloffset=+1]

include::modules/rdma-configuring-the-nfd-operator.adoc[leveloffset=+1]

include::modules/rdma-configuring-the-sriov-operator.adoc[leveloffset=+1]

include::modules/rdma-configuring-the-nvidia-network-operator.adoc[leveloffset=+1]

include::modules/rdma-configuring-the-gpu-operator.adoc[leveloffset=+1]
Lines changed: 323 additions & 0 deletions
@@ -0,0 +1,323 @@
// Module included in the following assemblies:
//
// * hardware_accelerators/rdma-remote-direct-memory-access.adoc

:_mod-docs-content-type: PROCEDURE
[id="rdma-configuring-the-gpu-operator_{context}"]
= Configuring the GPU Operator

The GPU Operator automates the management of the NVIDIA drivers, device plugins for GPUs, the NVIDIA Container Toolkit, and other components required for GPU provisioning.

.Prerequisites

* You have installed the GPU Operator.

.Procedure

. Check that the Operator pod is running by listing the pods in the namespace with the following command:
+
[source,terminal]
----
$ oc get pods -n nvidia-gpu-operator
----
+
.Example output
[source,terminal]
----
NAME                          READY   STATUS    RESTARTS   AGE
gpu-operator-b4cb7d74-zxpwq   1/1     Running   0          32s
----

. Create a GPU cluster policy custom resource file, for example `gpu-cluster-policy.yaml`, similar to the following example:
+
[source,yaml]
----
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
spec:
  vgpuDeviceManager:
    config:
      default: default
    enabled: true
  migManager:
    config:
      default: all-disabled
      name: default-mig-parted-config
    enabled: true
  operator:
    defaultRuntime: crio
    initContainer: {}
    runtimeClass: nvidia
    use_ocp_driver_toolkit: true
  dcgm:
    enabled: true
  gfd:
    enabled: true
  dcgmExporter:
    config:
      name: ''
    serviceMonitor:
      enabled: true
    enabled: true
  cdi:
    default: false
    enabled: false
  driver:
    licensingConfig:
      nlsEnabled: true
      configMapName: ''
    certConfig:
      name: ''
    rdma:
      enabled: false
    kernelModuleConfig:
      name: ''
    upgradePolicy:
      autoUpgrade: true
      drain:
        deleteEmptyDir: false
        enable: false
        force: false
        timeoutSeconds: 300
      maxParallelUpgrades: 1
      maxUnavailable: 25%
      podDeletion:
        deleteEmptyDir: false
        force: false
        timeoutSeconds: 300
      waitForCompletion:
        timeoutSeconds: 0
    repoConfig:
      configMapName: ''
    virtualTopology:
      config: ''
    enabled: true
    useNvidiaDriverCRD: false
    useOpenKernelModules: true
  devicePlugin:
    config:
      name: ''
      default: ''
    mps:
      root: /run/nvidia/mps
    enabled: true
  gdrcopy:
    enabled: true
  kataManager:
    config:
      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
  mig:
    strategy: single
  sandboxDevicePlugin:
    enabled: true
  validator:
    plugin:
      env:
        - name: WITH_WORKLOAD
          value: 'false'
  nodeStatusExporter:
    enabled: true
  daemonsets:
    rollingUpdate:
      maxUnavailable: '1'
    updateStrategy: RollingUpdate
  sandboxWorkloads:
    defaultWorkload: container
    enabled: false
  gds:
    enabled: true
    image: nvidia-fs
    version: 2.20.5
    repository: nvcr.io/nvidia/cloud-native
  vgpuManager:
    enabled: false
  vfioManager:
    enabled: true
  toolkit:
    installDir: /usr/local/nvidia
    enabled: true
----

. After you have generated the GPU `ClusterPolicy` custom resource file, create the resource on the cluster by running the following command:
+
[source,terminal]
----
$ oc create -f gpu-cluster-policy.yaml
----
+
.Example output
[source,terminal]
----
clusterpolicy.nvidia.com/gpu-cluster-policy created
----
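+
You can also watch the `ClusterPolicy` status while the GPU Operator deploys its components. This is a sketch that assumes the `ClusterPolicy` reports its overall state in the `status.state` field, which typically becomes `ready` after all components are running:
+
[source,terminal]
----
$ oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}{"\n"}'
----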

. Validate that the Operator is installed and running by listing the pods with the following command:
+
[source,terminal]
----
$ oc get pods -n nvidia-gpu-operator
----
+
.Example output
[source,terminal]
----
NAME                                                   READY   STATUS      RESTARTS   AGE
gpu-feature-discovery-d5ngn                            1/1     Running     0          3m20s
gpu-feature-discovery-z42rx                            1/1     Running     0          3m23s
gpu-operator-6bb4d4b4c5-njh78                          1/1     Running     0          4m35s
nvidia-container-toolkit-daemonset-bkh8l               1/1     Running     0          3m20s
nvidia-container-toolkit-daemonset-c4hzm               1/1     Running     0          3m23s
nvidia-cuda-validator-4blvg                            0/1     Completed   0          106s
nvidia-cuda-validator-tw8sl                            0/1     Completed   0          112s
nvidia-dcgm-exporter-rrw4g                             1/1     Running     0          3m20s
nvidia-dcgm-exporter-xc78t                             1/1     Running     0          3m23s
nvidia-dcgm-nvxpf                                      1/1     Running     0          3m20s
nvidia-dcgm-snj4j                                      1/1     Running     0          3m23s
nvidia-device-plugin-daemonset-fk2xz                   1/1     Running     0          3m23s
nvidia-device-plugin-daemonset-wq87j                   1/1     Running     0          3m20s
nvidia-driver-daemonset-416.94.202410211619-0-ngrjg    4/4     Running     0          3m58s
nvidia-driver-daemonset-416.94.202410211619-0-tm4x6    4/4     Running     0          3m58s
nvidia-node-status-exporter-jlzxh                      1/1     Running     0          3m57s
nvidia-node-status-exporter-zjffs                      1/1     Running     0          3m57s
nvidia-operator-validator-l49hx                        1/1     Running     0          3m20s
nvidia-operator-validator-n44nn                        1/1     Running     0          3m23s
----

. Optional: When you have verified that the pods are running, open a remote shell into the NVIDIA driver daemonset pod and confirm that the NVIDIA modules are loaded. Specifically, ensure that the `nvidia_peermem` module is loaded.
+
[source,terminal]
----
$ oc rsh -n nvidia-gpu-operator $(oc -n nvidia-gpu-operator get pod -o name -l app.kubernetes.io/component=nvidia-driver)
sh-4.4# lsmod|grep nvidia
----
+
.Example output
[source,terminal]
----
nvidia_fs             327680  0
nvidia_peermem         24576  0
nvidia_modeset       1507328  0
video                  73728  1 nvidia_modeset
nvidia_uvm           6889472  8
nvidia               8810496  43 nvidia_uvm,nvidia_peermem,nvidia_fs,gdrdrv,nvidia_modeset
ib_uverbs             217088  3 nvidia_peermem,rdma_ucm,mlx5_ib
drm                   741376  5 drm_kms_helper,drm_shmem_helper,nvidia,mgag200
----

. Optional: Run the `nvidia-smi` utility to show details about the driver and the hardware:
+
[source,terminal]
----
sh-4.4# nvidia-smi
----
+
.Example output
[source,terminal]
----
Wed Nov  6 22:03:53 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A40                     On  |   00000000:61:00.0 Off |                    0 |
|  0%   37C    P0             88W /  300W |       1MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A40                     On  |   00000000:E1:00.0 Off |                    0 |
|  0%   28C    P8             29W /  300W |       1MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
----

. Set the GPU clock to maximum from within the driver daemonset pod by using the `nvidia-smi` command:
+
[source,terminal]
----
$ oc rsh -n nvidia-gpu-operator nvidia-driver-daemonset-416.94.202410172137-0-ndhzc
sh-4.4# nvidia-smi -i 0 -lgc $(nvidia-smi -i 0 --query-supported-clocks=graphics --format=csv,noheader,nounits | sort -h | tail -n 1)
----
+
.Example output
[source,terminal]
----
GPU clocks set to "(gpuClkMin 1740, gpuClkMax 1740)" for GPU 00000000:61:00.0
All done.
----
+
[source,terminal]
----
sh-4.4# nvidia-smi -i 1 -lgc $(nvidia-smi -i 1 --query-supported-clocks=graphics --format=csv,noheader,nounits | sort -h | tail -n 1)
----
+
.Example output
[source,terminal]
----
GPU clocks set to "(gpuClkMin 1740, gpuClkMax 1740)" for GPU 00000000:E1:00.0
All done.
----
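+
Optionally, confirm the applied clock settings while you are still in the driver pod. This is a sketch; the exact fields in the report depend on the driver version:
+
[source,terminal]
----
sh-4.4# nvidia-smi -q -d CLOCK
----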

. Verify that the GPU and RDMA resources are advertised on the worker nodes by running the following command:
+
[source,terminal]
----
$ oc describe node -l node-role.kubernetes.io/worker= | grep -E 'Capacity:|Allocatable:' -A9
----
+
.Example output
[source,terminal]
----
Capacity:
  cpu:                          128
  ephemeral-storage:            1561525616Ki
  hugepages-1Gi:                0
  hugepages-2Mi:                0
  memory:                       263596712Ki
  nvidia.com/gpu:               2
  pods:                         250
  rdma/rdma_shared_device_eth:  63
  rdma/rdma_shared_device_ib:   63
Allocatable:
  cpu:                          127500m
  ephemeral-storage:            1438028263499
  hugepages-1Gi:                0
  hugepages-2Mi:                0
  memory:                       262445736Ki
  nvidia.com/gpu:               2
  pods:                         250
  rdma/rdma_shared_device_eth:  63
  rdma/rdma_shared_device_ib:   63
--
Capacity:
  cpu:                          128
  ephemeral-storage:            1561525616Ki
  hugepages-1Gi:                0
  hugepages-2Mi:                0
  memory:                       263596672Ki
  nvidia.com/gpu:               2
  pods:                         250
  rdma/rdma_shared_device_eth:  63
  rdma/rdma_shared_device_ib:   63
Allocatable:
  cpu:                          127500m
  ephemeral-storage:            1438028263499
  hugepages-1Gi:                0
  hugepages-2Mi:                0
  memory:                       262445696Ki
  nvidia.com/gpu:               2
  pods:                         250
  rdma/rdma_shared_device_eth:  63
  rdma/rdma_shared_device_ib:   63
----
