Upgrade CI instances (#4366)

q10 · facebook-github-bot · commit 23e62dc7cc59 · 2025-06-18T20:22:25.000-07:00
Summary:
X-link: facebookresearch/FBGEMM#1447
- Upgrade CI instances for GenAI and HSTU builds
Pull Request resolved: #4366
Reviewed By: gchalump
Differential Revision: D76923964
Pulled By: q10
fbshipit-source-id: a5ab076386f970f1158bfedbaba8b9134ddc33ba
diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash
@@ -241,7 +241,9 @@ install_cudnn () {
   test_network_connection || return 1
 
   # Install cuDNN manually
-  # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
+  # Based on install script in:
+  #   https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
+  #   https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_cudnn.sh
   declare -A cudnn_packages=(
     ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-${PLATFORM_NAME_LC}-8.3.2.44_cuda11.5-archive.tar.xz"
     ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-${PLATFORM_NAME_LC}-8.3.2.44_cuda11.5-archive.tar.xz"
@@ -250,6 +252,8 @@ install_cudnn () {
     ["121"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-8.9.2.26_cuda12-archive.tar.xz"
     ["124"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-8.9.2.26_cuda12-archive.tar.xz"
     ["126"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-9.5.1.17_cuda12-archive.tar.xz"
+    ["128"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-9.10.2.21_cuda12-archive.tar.xz"
+    ["129"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-9.10.2.21_cuda12-archive.tar.xz"
   )
 
   # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
diff --git a/.github/workflows/fbgemm_gpu_ci_cuda.yml b/.github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -76,13 +76,12 @@ jobs:
           { arch: x86, instance: "linux.24xlarge", build-target: "default", cuda-version: "12.9.1" },
 
           # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-          { arch: x86, instance: "linux.8xlarge.memory", build-target: "genai", cuda-version: "12.6.3" },
-          { arch: x86, instance: "linux.8xlarge.memory", build-target: "genai", cuda-version: "12.8.1" },
-          { arch: x86, instance: "linux.8xlarge.memory", build-target: "genai", cuda-version: "12.9.1" },
+          { arch: x86, instance: "linux.12xlarge.memory", build-target: "genai", cuda-version: "12.6.3" },
+          { arch: x86, instance: "linux.12xlarge.memory", build-target: "genai", cuda-version: "12.8.1" },
+          { arch: x86, instance: "linux.12xlarge.memory", build-target: "genai", cuda-version: "12.9.1" },
 
-          { arch: x86, instance: "linux.12xlarge.memory", build-target: "hstu", cuda-version: "12.6.3" },
-          { arch: x86, instance: "linux.12xlarge.memory", build-target: "hstu", cuda-version: "12.8.1" },
-          { arch: x86, instance: "linux.12xlarge.memory", build-target: "hstu", cuda-version: "12.9.1" },
+          # Since FBGEMM HSTU is not released yet, we reduce to one CUDA version to conserve CI resources
+          { arch: x86, instance: "linux.24xlarge.memory", build-target: "hstu", cuda-version: "12.9.1" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
         compiler: [ "gcc", "clang" ]
diff --git a/fbgemm_gpu/test/tbe/ssd/kv_backend_test.py b/fbgemm_gpu/test/tbe/ssd/kv_backend_test.py
@@ -34,7 +34,7 @@
 from ..common import gpu_unavailable, open_source, running_in_oss
 
 if not open_source:
-    from aiplatform.modelstore.checkpointing.utils.kv_tensor_metadata import (
+    from aiplatform.modelstore.checkpointing.utils.kv_tensor_metadata import (  # noqa F401
         generate_kvtensor_metadata,
     )
 

Original file line number	Diff line number	Diff line change
`@@ -34,7 +34,7 @@`
`34`	`34`	`from ..common import gpu_unavailable, open_source, running_in_oss`
`35`	`35`
`36`	`36`	`if not open_source:`
`37`		`- from aiplatform.modelstore.checkpointing.utils.kv_tensor_metadata import (`
	`37`	`+ from aiplatform.modelstore.checkpointing.utils.kv_tensor_metadata import ( # noqa F401`
`38`	`38`	`generate_kvtensor_metadata,`
`39`	`39`	`)`
`40`	`40`