Skip to content

Commit f158490

Browse files
q10 authored and facebook-github-bot committed
Add Comprehensive Build Instructions and Isolate CPU and ROCm Builds (#1639)
Summary: - Remove `.post0` suffix from the autogenerated package version - Document the full FBGEMM_GPU OSS build process in a separate Markdown file - Remove installation of packages not needed for ROCm builds - Migrate CPU and ROCm jobs to run on top of Docker containers instead of bare metal instances - Update GitHub workflow configuration to cancel previous jobs for a PR if a new commit is pushed to the PR Pull Request resolved: #1639 Reviewed By: shintaro-iwasaki Differential Revision: D44076312 Pulled By: q10 fbshipit-source-id: 6b2d083022feb7421b26da2d998678e00c11f283
1 parent f388b95 commit f158490

11 files changed

+639
-86
lines changed

.github/scripts/setup_env.bash

Lines changed: 132 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,13 @@
1313
print_exec () {
1414
echo "+ $*"
1515
echo ""
16-
"$@"
16+
if "$@"; then
17+
local retcode=0
18+
else
19+
local retcode=$?
20+
fi
1721
echo ""
22+
return $retcode
1823
}
1924

2025
exec_with_retries () {
@@ -205,7 +210,7 @@ run_python_test () {
205210
echo "################################################################################"
206211
fi
207212

208-
if conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
213+
if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
209214
echo "[TEST] Python test suite PASSED: ${python_test_file}"
210215
else
211216
echo "[TEST] Python test suite FAILED: ${python_test_file}"
@@ -313,7 +318,7 @@ print_ec2_info () {
313318

314319

315320
################################################################################
316-
# Environment Setup and Install Functions
321+
# Miniconda Setup Functions
317322
################################################################################
318323

319324
setup_miniconda () {
@@ -398,6 +403,11 @@ create_conda_environment () {
398403
echo "[SETUP] Successfully created Conda environment: ${env_name}"
399404
}
400405

406+
407+
################################################################################
408+
# PyTorch Setup Functions
409+
################################################################################
410+
401411
install_pytorch_conda () {
402412
local env_name="$1"
403413
local pytorch_version="$2"
@@ -553,6 +563,28 @@ install_pytorch_pip () {
553563
echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}"
554564
}
555565

566+
567+
################################################################################
568+
# CUDA Setup Functions
569+
################################################################################
570+
571+
install_nvidia_drivers_centos () {
572+
echo "################################################################################"
573+
echo "# Install NVIDIA Drivers"
574+
echo "#"
575+
echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
576+
echo "################################################################################"
577+
echo ""
578+
579+
echo "[SETUP] Adding NVIDIA repos to yum ..."
580+
print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
581+
print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
582+
print_exec sudo yum clean expire-cache
583+
584+
echo "[SETUP] Installing NVIDIA drivers ..."
585+
install_system_packages nvidia-driver-latest-dkms
586+
}
587+
556588
install_cuda () {
557589
local env_name="$1"
558590
local cuda_version="$2"
@@ -604,6 +636,86 @@ install_cuda () {
604636
echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
605637
}
606638

639+
install_cudnn () {
640+
local env_name="$1"
641+
local install_path="$2"
642+
local cuda_version="$3"
643+
if [ "$cuda_version" == "" ]; then
644+
echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
645+
echo "Example:"
646+
echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
647+
return 1
648+
else
649+
echo "################################################################################"
650+
echo "# Install cuDNN"
651+
echo "#"
652+
echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
653+
echo "################################################################################"
654+
echo ""
655+
fi
656+
657+
# Install cuDNN manually
658+
# Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
659+
local cudnn_packages=(
660+
["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
661+
["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
662+
["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
663+
["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
664+
)
665+
666+
# Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
667+
# shellcheck disable=SC2206
668+
local cuda_version_arr=(${cuda_version//./ })
669+
# Fetch the major and minor version to concat
670+
local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"
671+
672+
# Get the URL
673+
local cudnn_url="${cudnn_packages[cuda_concat_version]}"
674+
if [ "$cudnn_url" == "" ]; then
675+
# Default to cuDNN for 11.7 if no CUDA version fits
676+
echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
677+
cudnn_url="${cudnn_packages[117]}"
678+
fi
679+
680+
# Clear the install path
681+
rm -rf "$install_path"
682+
mkdir -p "$install_path"
683+
684+
# Create temporary directory
685+
# shellcheck disable=SC2155
686+
local tmp_dir=$(mktemp -d)
687+
cd "$tmp_dir" || return 1
688+
689+
# Download cuDNN
690+
echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
691+
(exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1
692+
693+
# Unpack the tarball
694+
echo "[INSTALL] Unpacking cuDNN ..."
695+
tar -xvf cudnn.tar.xz
696+
697+
# Copy the includes and libs over to the install path
698+
echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
699+
rm -rf "${install_path:?}/include"
700+
rm -rf "${install_path:?}/lib"
701+
mv cudnn-linux-*/include "$install_path"
702+
mv cudnn-linux-*/lib "$install_path"
703+
704+
# Delete the temporary directory
705+
cd - || return 1
706+
rm -rf "$tmp_dir"
707+
708+
# Export the environment variables to the Conda environment
709+
echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
710+
print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"
711+
712+
echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
713+
}
714+
715+
################################################################################
716+
# ROCm Setup Functions
717+
################################################################################
718+
607719
install_rocm_ubuntu () {
608720
local env_name="$1"
609721
local rocm_version="$2"
@@ -652,15 +764,25 @@ install_rocm_ubuntu () {
652764
(exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1
653765

654766
echo "[INSTALL] Installing HIP-relevant packages ..."
655-
install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
656767
install_system_packages hipify-clang miopen-hip miopen-hip-dev
657768

769+
# There is no need to install these packages for ROCm
770+
# install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
771+
658772
echo "[INSTALL] Cleaning up ..."
659773
print_exec rm -f "${package_name}"
660774

775+
echo "[INFO] Check ROCM GPU info ..."
776+
print_exec rocm-smi
777+
661778
echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
662779
}
663780

781+
782+
################################################################################
783+
# Build Tools Setup Functions
784+
################################################################################
785+
664786
install_cxx_compiler () {
665787
local env_name="$1"
666788
local use_system_package_manager="$2"
@@ -759,82 +881,6 @@ install_build_tools () {
759881
echo "[INSTALL] Successfully installed all the build tools"
760882
}
761883

762-
install_cudnn () {
763-
local env_name="$1"
764-
local install_path="$2"
765-
local cuda_version="$3"
766-
if [ "$cuda_version" == "" ]; then
767-
echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
768-
echo "Example:"
769-
echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
770-
return 1
771-
else
772-
echo "################################################################################"
773-
echo "# Install cuDNN"
774-
echo "#"
775-
echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
776-
echo "################################################################################"
777-
echo ""
778-
fi
779-
780-
# Install cuDNN manually
781-
# Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
782-
local cudnn_packages=(
783-
["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
784-
["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
785-
["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
786-
["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
787-
)
788-
789-
# Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
790-
# shellcheck disable=SC2206
791-
local cuda_version_arr=(${cuda_version//./ })
792-
# Fetch the major and minor version to concat
793-
local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"
794-
795-
# Get the URL
796-
local cudnn_url="${cudnn_packages[cuda_concat_version]}"
797-
if [ "$cudnn_url" == "" ]; then
798-
# Default to cuDNN for 11.7 if no CUDA version fits
799-
echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
800-
cudnn_url="${cudnn_packages[117]}"
801-
fi
802-
803-
# Clear the install path
804-
rm -rf "$install_path"
805-
mkdir -p "$install_path"
806-
807-
# Create temporary directory
808-
# shellcheck disable=SC2155
809-
local tmp_dir=$(mktemp -d)
810-
cd "$tmp_dir" || return 1
811-
812-
# Download cuDNN
813-
echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
814-
(exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1
815-
816-
# Unpack the tarball
817-
echo "[INSTALL] Unpacking cuDNN ..."
818-
tar -xvf cudnn.tar.xz
819-
820-
# Copy the includes and libs over to the install path
821-
echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
822-
rm -rf "${install_path:?}/include"
823-
rm -rf "${install_path:?}/lib"
824-
mv cudnn-linux-*/include "$install_path"
825-
mv cudnn-linux-*/lib "$install_path"
826-
827-
# Delete the temporary directory
828-
cd - || return 1
829-
rm -rf "$tmp_dir"
830-
831-
# Export the environment variables to the Conda environment
832-
echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
833-
print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"
834-
835-
echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
836-
}
837-
838884

839885
################################################################################
840886
# Combination Functions
@@ -876,7 +922,7 @@ create_conda_pytorch_environment () {
876922

877923

878924
################################################################################
879-
# Build Functions
925+
# FBGEMM_GPU Build Functions
880926
################################################################################
881927

882928
prepare_fbgemm_gpu_build () {
@@ -895,6 +941,11 @@ prepare_fbgemm_gpu_build () {
895941
echo ""
896942
fi
897943

944+
if [[ "${GITHUB_WORKSPACE}" ]]; then
945+
# https://github.com/actions/checkout/issues/841
946+
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
947+
fi
948+
898949
echo "[BUILD] Running git submodules update ..."
899950
git submodule sync
900951
git submodule update --init --recursive

.github/workflows/fbgemm_ci.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ on:
1313
branches:
1414
- main
1515

16+
concurrency:
17+
# Cancel previous runs in the PR if a new commit is pushed
18+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
19+
cancel-in-progress: true
20+
1621
jobs:
1722
build-posix:
1823
runs-on: ${{ matrix.os }}

.github/workflows/fbgemm_gpu_ci.yml

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,17 @@ on:
1313
branches:
1414
- main
1515

16+
concurrency:
17+
# Cancel previous runs in the PR if a new commit is pushed
18+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
19+
cancel-in-progress: true
20+
1621
jobs:
1722
build_and_test_amd:
1823
runs-on: ${{ matrix.os }}
24+
container:
25+
image: ${{ matrix.container-image }}
26+
options: --user root
1927
defaults:
2028
run:
2129
shell: bash
@@ -25,11 +33,18 @@ jobs:
2533
strategy:
2634
fail-fast: false
2735
matrix:
28-
os: [ ubuntu-20.04 ]
36+
os: [ linux.12xlarge ]
37+
container-image: [ "ubuntu:20.04" ]
2938
python-version: [ "3.10" ]
3039
rocm-version: [ "5.3" ]
3140

3241
steps:
42+
- name: Setup Build Container
43+
run: |
44+
apt update -y
45+
apt install -y binutils git sudo wget
46+
git config --global --add safe.directory '*'
47+
3348
- name: Checkout the Repository
3449
uses: actions/checkout@v3
3550
with:
@@ -74,7 +89,7 @@ jobs:
7489
print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a
7590
print_exec conda run -n $BUILD_ENV python setup.py build develop
7691
77-
- name: Test FBGEMM_GPU-ROCM Nightly installation
92+
- name: Test FBGEMM_GPU-ROCM Nightly Installation
7893
timeout-minutes: 10
7994
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
8095

.github/workflows/fbgemm_gpu_lint.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ on:
1414
branches:
1515
- main
1616

17+
concurrency:
18+
# Cancel previous runs in the PR if a new commit is pushed
19+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
20+
cancel-in-progress: true
21+
1722
jobs:
1823
run_pylint:
1924
runs-on: ubuntu-latest

.github/workflows/fbgemm_nightly_build.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ on:
3030
#
3131
workflow_dispatch:
3232

33+
concurrency:
34+
# Cancel previous runs in the PR if a new commit is pushed
35+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
36+
cancel-in-progress: true
37+
3338
jobs:
3439
# Build on CPU hosts and upload to GHA
3540
build_artifact:

0 commit comments

Comments (0)