Skip to content

Commit d522d92

Browse files
q10 authored and facebook-github-bot committed
ROCm build script updates (pytorch#4414)
Summary:
X-link: facebookresearch/FBGEMM#1487

- ROCm build script updates

Pull Request resolved: pytorch#4414

Reviewed By: gchalump

Differential Revision: D77495791

Pulled By: q10

fbshipit-source-id: 23cf7f30d18f0dbfbb741b7d5018685346b30761
1 parent 993cd49 commit d522d92

File tree

2 files changed

+25
-35
lines changed

2 files changed

+25
-35
lines changed

.github/scripts/fbgemm_gpu_build.bash

Lines changed: 22 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -167,42 +167,31 @@ __configure_fbgemm_gpu_build_docs () {
167167
__configure_fbgemm_gpu_build_rocm () {
168168
local fbgemm_variant_targets="$1"
169169

170-
# Fetch available ROCm architectures on the machine
170+
# By default, we build for a limited number of target architectures to save on
171+
# build time. This list needs to be updated if the CI ROCm machines have
172+
# different hardware.
173+
#
174+
# Target architecture mapping and ROCm compatibility table can be found at:
175+
# https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
176+
# https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html
177+
171178
if [ "$fbgemm_variant_targets" != "" ]; then
172-
echo "[BUILD] ROCm targets have been manually provided: ${fbgemm_variant_targets}"
179+
# If targets are manually supplied, override
180+
echo "[BUILD] Using the user-supplied ROCm targets ..."
173181
local arch_list="${fbgemm_variant_targets}"
174182

183+
elif [ -n "${BUILD_FROM_NOVA+x}" ]; then
184+
# If BUILD_FROM_NOVA is set (regardless of 0 or 1 - some steps in Nova have
185+
# the value set to 0), we are building in Nova. Nova machines take much
186+
# longer time to build FBGEMM_GPU for ROCm, so we have to limit to just the
187+
# latest model.
188+
echo "[BUILD] Building in Nova environment, ignoring the provided PYTORCH_ROCM_ARCH list and limiting ROCm targets ..."
189+
local arch_list="gfx942"
190+
175191
else
176-
if which rocminfo; then
177-
# shellcheck disable=SC2155
178-
local arch_list=$(rocminfo | grep -o -m 1 'gfx.*')
179-
echo "[BUILD] Architectures list from rocminfo: ${arch_list}"
180-
181-
if [ "$arch_list" == "" ]; then
182-
# It is possible to build FBGEMM_GPU-ROCm on a machine without AMD
183-
# cards, in which case the arch_list will be empty.
184-
echo "[BUILD] rocminfo did not return anything valid!"
185-
186-
# By default, we build for a limited number of architectures to save on
187-
# build time. This list needs to be updated if the CI ROCm machines
188-
# have different hardware.
189-
#
190-
# Architecture mapping can be found at:
191-
# https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
192-
if [ -z "${BUILD_FROM_NOVA+x}" ]; then
193-
# If BUILD_FROM_NOVA is unset, then we are building from AMD host with
194-
# sufficient resources, so we can build for more architectures.
195-
local arch_list="gfx908,gfx90a,gfx942"
196-
else
197-
# If BUILD_FROM_NOVA is set (regardless of 0 or 1), we are building in
198-
# Nova. Nova machines take a longer time to build FBGEMM_GPU for
199-
# ROCm, so we limit to one architecture.
200-
local arch_list="gfx942"
201-
fi
202-
fi
203-
else
204-
echo "[BUILD] rocminfo not found in PATH!"
205-
fi
192+
# If BUILD_FROM_NOVA is unset, then we are building from a compute host with
193+
# sufficient resources, so we can build for more AMD Instinct architectures.
194+
local arch_list="gfx908,gfx90a,gfx942"
206195
fi
207196

208197
echo "[BUILD] Setting the following ROCm targets: ${arch_list}"
@@ -286,8 +275,8 @@ __configure_fbgemm_gpu_build_cuda () {
286275
echo "[BUILD] Unknown NVCC version $cuda_version_nvcc - setting TORCH_CUDA_ARCH_LIST to: ${arch_list}"
287276
fi
288277
fi
289-
echo "[BUILD] Setting the following CUDA targets: ${arch_list}"
290278

279+
echo "[BUILD] Setting the following CUDA targets: ${arch_list}"
291280
# Unset the environment-supplied TORCH_CUDA_ARCH_LIST because it will take
292281
# precedence over cmake -DTORCH_CUDA_ARCH_LIST
293282
unset TORCH_CUDA_ARCH_LIST

.github/scripts/utils_rocm.bash

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,11 @@ install_rocm_ubuntu () {
9999
echo "[INSTALL] Cleaning up ..."
100100
print_exec rm -f "${package_name}"
101101

102-
echo "[INFO] Check ROCM GPU info ..."
102+
echo "[INFO] Printing ROCM utilities info ..."
103103
# If rocm-smi is installed on a machine without GPUs, this will return error
104104
(print_exec rocminfo) || true
105-
print_exec rocm-smi
105+
(print_exec rocm-smi) || true
106+
(print_exec hipcc -v) || true
106107

107108
echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
108109
}

0 commit comments

Comments
 (0)