Skip to content

Commit 23e62dc

Browse files
q10 authored and facebook-github-bot committed
Upgrade CI instances (#4366)
Summary:
X-link: facebookresearch/FBGEMM#1447
- Upgrade CI instances for GenAI and HSTU builds
Pull Request resolved: #4366
Reviewed By: gchalump
Differential Revision: D76923964
Pulled By: q10
fbshipit-source-id: a5ab076386f970f1158bfedbaba8b9134ddc33ba
1 parent 6152f34 commit 23e62dc

File tree

3 files changed

+11
-8
lines changed

3 files changed

+11
-8
lines changed

.github/scripts/utils_cuda.bash

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,9 @@ install_cudnn () {
241241
test_network_connection || return 1
242242

243243
# Install cuDNN manually
244-
# Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
244+
# Based on install script in:
245+
# https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
246+
# https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_cudnn.sh
245247
declare -A cudnn_packages=(
246248
["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-${PLATFORM_NAME_LC}-8.3.2.44_cuda11.5-archive.tar.xz"
247249
["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-${PLATFORM_NAME_LC}-8.3.2.44_cuda11.5-archive.tar.xz"
@@ -250,6 +252,8 @@ install_cudnn () {
250252
["121"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-8.9.2.26_cuda12-archive.tar.xz"
251253
["124"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-8.9.2.26_cuda12-archive.tar.xz"
252254
["126"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-9.5.1.17_cuda12-archive.tar.xz"
255+
["128"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-9.10.2.21_cuda12-archive.tar.xz"
256+
["129"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/${PLATFORM_NAME_LC}/cudnn-${PLATFORM_NAME_LC}-9.10.2.21_cuda12-archive.tar.xz"
253257
)
254258

255259
# Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]

.github/workflows/fbgemm_gpu_ci_cuda.yml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,12 @@ jobs:
7676
{ arch: x86, instance: "linux.24xlarge", build-target: "default", cuda-version: "12.9.1" },
7777

7878
# GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
79-
{ arch: x86, instance: "linux.8xlarge.memory", build-target: "genai", cuda-version: "12.6.3" },
80-
{ arch: x86, instance: "linux.8xlarge.memory", build-target: "genai", cuda-version: "12.8.1" },
81-
{ arch: x86, instance: "linux.8xlarge.memory", build-target: "genai", cuda-version: "12.9.1" },
79+
{ arch: x86, instance: "linux.12xlarge.memory", build-target: "genai", cuda-version: "12.6.3" },
80+
{ arch: x86, instance: "linux.12xlarge.memory", build-target: "genai", cuda-version: "12.8.1" },
81+
{ arch: x86, instance: "linux.12xlarge.memory", build-target: "genai", cuda-version: "12.9.1" },
8282

83-
{ arch: x86, instance: "linux.12xlarge.memory", build-target: "hstu", cuda-version: "12.6.3" },
84-
{ arch: x86, instance: "linux.12xlarge.memory", build-target: "hstu", cuda-version: "12.8.1" },
85-
{ arch: x86, instance: "linux.12xlarge.memory", build-target: "hstu", cuda-version: "12.9.1" },
83+
# Since FBGEMM HSTU is not released yet, we reduce to one CUDA version to conserve CI resources
84+
{ arch: x86, instance: "linux.24xlarge.memory", build-target: "hstu", cuda-version: "12.9.1" },
8685
]
8786
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
8887
compiler: [ "gcc", "clang" ]

fbgemm_gpu/test/tbe/ssd/kv_backend_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from ..common import gpu_unavailable, open_source, running_in_oss
3535

3636
if not open_source:
37-
from aiplatform.modelstore.checkpointing.utils.kv_tensor_metadata import (
37+
from aiplatform.modelstore.checkpointing.utils.kv_tensor_metadata import ( # noqa F401
3838
generate_kvtensor_metadata,
3939
)
4040

0 commit comments

Comments
 (0)