13
13
# Echo the full command line, run it, and propagate its exit code.
# The command is executed inside a compound-command guard so that a
# failure does not abort the script when `set -e` is in effect.
print_exec () {
  local rc=0
  echo "+ $*"
  echo ""
  # '|| rc=$?' captures the failure status without tripping errexit.
  "$@" || rc=$?
  echo ""
  return $rc
}
19
24
20
25
exec_with_retries () {
@@ -205,7 +210,7 @@ run_python_test () {
205
210
echo " ################################################################################"
206
211
fi
207
212
208
- if conda run -n " ${env_name} " python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning " ${python_test_file} " ; then
213
+ if print_exec conda run -n " ${env_name} " python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning " ${python_test_file} " ; then
209
214
echo " [TEST] Python test suite PASSED: ${python_test_file} "
210
215
else
211
216
echo " [TEST] Python test suite FAILED: ${python_test_file} "
@@ -313,7 +318,7 @@ print_ec2_info () {
313
318
314
319
315
320
# ###############################################################################
316
- # Environment Setup and Install Functions
321
+ # Miniconda Setup Functions
317
322
# ###############################################################################
318
323
319
324
setup_miniconda () {
@@ -398,6 +403,11 @@ create_conda_environment () {
398
403
echo " [SETUP] Successfully created Conda environment: ${env_name} "
399
404
}
400
405
406
+
407
+ # ###############################################################################
408
+ # PyTorch Setup Functions
409
+ # ###############################################################################
410
+
401
411
install_pytorch_conda () {
402
412
local env_name=" $1 "
403
413
local pytorch_version=" $2 "
@@ -553,6 +563,28 @@ install_pytorch_pip () {
553
563
echo " [INSTALL] NOTE: The installed version is: ${installed_pytorch_version} "
554
564
}
555
565
566
+
567
+ # ###############################################################################
568
+ # CUDA Setup Functions
569
+ # ###############################################################################
570
+
571
# Install the NVIDIA GPU drivers on a CentOS/RHEL 7 host.
# Registers the EPEL and NVIDIA CUDA yum repositories, then installs the
# latest DKMS-flavored driver package via install_system_packages.
# NOTE(review): the repo URLs are hard-coded for rhel7/x86_64 — this will
# not work on other distro versions or architectures.
install_nvidia_drivers_centos () {
  echo "################################################################################"
  echo "# Install NVIDIA Drivers"
  echo "#"
  echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
  echo "################################################################################"
  echo ""

  # EPEL is needed for DKMS; 'yum clean expire-cache' drops stale repo
  # metadata so the newly added NVIDIA repo is picked up immediately.
  echo "[SETUP] Adding NVIDIA repos to yum ..."
  print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
  print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
  print_exec sudo yum clean expire-cache

  # The -dkms package variant presumably rebuilds the kernel module on
  # kernel updates — confirm against the NVIDIA repo package docs.
  echo "[SETUP] Installing NVIDIA drivers ..."
  install_system_packages nvidia-driver-latest-dkms
}
587
+
556
588
install_cuda () {
557
589
local env_name=" $1 "
558
590
local cuda_version=" $2 "
@@ -604,6 +636,86 @@ install_cuda () {
604
636
echo " [INSTALL] Successfully installed CUDA ${cuda_version} "
605
637
}
606
638
639
# Download a cuDNN tarball matching CUDA_VERSION, unpack it into
# INSTALL_PATH, and export CUDNN_INCLUDE_DIR / CUDNN_LIBRARY into the
# given Conda environment.
#
# Arguments:
#   $1 - Conda environment name
#   $2 - directory to install cuDNN into (wiped and recreated)
#   $3 - CUDA version string, e.g. "11.7" or "11.7.1"
# Returns 1 on usage error or on download/unpack/cd failure.
install_cudnn () {
  local env_name="$1"
  local install_path="$2"
  local cuda_version="$3"
  if [ "$cuda_version" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
    echo "Example:"
    echo "    ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
    return 1
  else
    echo "################################################################################"
    echo "# Install cuDNN"
    echo "#"
    echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
    echo "################################################################################"
    echo ""
  fi

  # Install cuDNN manually
  # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
  #
  # FIX: declared as an associative array (-A).  The previous plain-array
  # declaration only worked by accident: ["115"] in an indexed array is
  # evaluated arithmetically, so the numeric-looking keys happened to map
  # to numeric indices.  An associative array makes the string keys explicit.
  local -A cudnn_packages=(
    ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
    ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
  )

  # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
  # shellcheck disable=SC2206
  local cuda_version_arr=(${cuda_version//./ })
  # Fetch the major and minor version to concat, e.g. 11.7.1 => "117"
  local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"

  # Get the URL; associative-array subscripts need an explicit $ on the key
  local cudnn_url="${cudnn_packages[$cuda_concat_version]}"
  if [ "$cudnn_url" == "" ]; then
    # Default to cuDNN for 11.7 if no CUDA version fits
    echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
    cudnn_url="${cudnn_packages[117]}"
  fi

  # Clear the install path
  rm -rf "$install_path"
  mkdir -p "$install_path"

  # Create temporary directory
  # shellcheck disable=SC2155
  local tmp_dir=$(mktemp -d)
  cd "$tmp_dir" || return 1

  # Download cuDNN
  echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
  (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1

  # Unpack the tarball; fail fast instead of moving files from a bad extract
  echo "[INSTALL] Unpacking cuDNN ..."
  tar -xvf cudnn.tar.xz || return 1

  # Copy the includes and libs over to the install path.
  # ':?' aborts if install_path is somehow empty, guarding the rm -rf.
  echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
  rm -rf "${install_path:?}/include"
  rm -rf "${install_path:?}/lib"
  mv cudnn-linux-*/include "$install_path"
  mv cudnn-linux-*/lib "$install_path"

  # Delete the temporary directory
  cd - || return 1
  rm -rf "$tmp_dir"

  # Export the environment variables to the Conda environment
  echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"

  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
}
714
+
715
+ # ###############################################################################
716
+ # ROCm Setup Functions
717
+ # ###############################################################################
718
+
607
719
install_rocm_ubuntu () {
608
720
local env_name=" $1 "
609
721
local rocm_version=" $2 "
@@ -652,15 +764,25 @@ install_rocm_ubuntu () {
652
764
(exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1
653
765
654
766
echo " [INSTALL] Installing HIP-relevant packages ..."
655
- install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
656
767
install_system_packages hipify-clang miopen-hip miopen-hip-dev
657
768
769
+ # There is no need to install these packages for ROCm
770
+ # install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
771
+
658
772
echo " [INSTALL] Cleaning up ..."
659
773
print_exec rm -f " ${package_name} "
660
774
775
+ echo " [INFO] Check ROCM GPU info ..."
776
+ print_exec rocm-smi
777
+
661
778
echo " [INSTALL] Successfully installed ROCm ${rocm_version} "
662
779
}
663
780
781
+
782
+ # ###############################################################################
783
+ # Build Tools Setup Functions
784
+ # ###############################################################################
785
+
664
786
install_cxx_compiler () {
665
787
local env_name=" $1 "
666
788
local use_system_package_manager=" $2 "
@@ -759,82 +881,6 @@ install_build_tools () {
759
881
echo " [INSTALL] Successfully installed all the build tools"
760
882
}
761
883
762
- install_cudnn () {
763
- local env_name=" $1 "
764
- local install_path=" $2 "
765
- local cuda_version=" $3 "
766
- if [ " $cuda_version " == " " ]; then
767
- echo " Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
768
- echo " Example:"
769
- echo " ${FUNCNAME[0]} build_env \$ (pwd)/cudnn_install 11.7"
770
- return 1
771
- else
772
- echo " ################################################################################"
773
- echo " # Install cuDNN"
774
- echo " #"
775
- echo " # [TIMESTAMP] $( date --utc +%FT%T.%3NZ) "
776
- echo " ################################################################################"
777
- echo " "
778
- fi
779
-
780
- # Install cuDNN manually
781
- # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
782
- local cudnn_packages=(
783
- [" 115" ]=" https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
784
- [" 116" ]=" https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
785
- [" 117" ]=" https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
786
- [" 118" ]=" https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
787
- )
788
-
789
- # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
790
- # shellcheck disable=SC2206
791
- local cuda_version_arr=(${cuda_version// ./ } )
792
- # Fetch the major and minor version to concat
793
- local cuda_concat_version=" ${cuda_version_arr[0]}${cuda_version_arr[1]} "
794
-
795
- # Get the URL
796
- local cudnn_url=" ${cudnn_packages[cuda_concat_version]} "
797
- if [ " $cudnn_url " == " " ]; then
798
- # Default to cuDNN for 11.7 if no CUDA version fits
799
- echo " [INSTALL] Defaulting to cuDNN for CUDA 11.7"
800
- cudnn_url=" ${cudnn_packages[117]} "
801
- fi
802
-
803
- # Clear the install path
804
- rm -rf " $install_path "
805
- mkdir -p " $install_path "
806
-
807
- # Create temporary directory
808
- # shellcheck disable=SC2155
809
- local tmp_dir=$( mktemp -d)
810
- cd " $tmp_dir " || return 1
811
-
812
- # Download cuDNN
813
- echo " [INSTALL] Downloading cuDNN to ${tmp_dir} ..."
814
- (exec_with_retries wget -q " $cudnn_url " -O cudnn.tar.xz) || return 1
815
-
816
- # Unpack the tarball
817
- echo " [INSTALL] Unpacking cuDNN ..."
818
- tar -xvf cudnn.tar.xz
819
-
820
- # Copy the includes and libs over to the install path
821
- echo " [INSTALL] Moving cuDNN files to ${install_path} ..."
822
- rm -rf " ${install_path:? } /include"
823
- rm -rf " ${install_path:? } /lib"
824
- mv cudnn-linux-* /include " $install_path "
825
- mv cudnn-linux-* /lib " $install_path "
826
-
827
- # Delete the temporary directory
828
- cd - || return 1
829
- rm -rf " $tmp_dir "
830
-
831
- # Export the environment variables to the Conda environment
832
- echo " [INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
833
- print_exec conda env config vars set -n " ${env_name} " CUDNN_INCLUDE_DIR=" ${install_path} /include" CUDNN_LIBRARY=" ${install_path} /lib"
834
-
835
- echo " [INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version} )"
836
- }
837
-
838
884
839
885
# ###############################################################################
840
886
# Combination Functions
@@ -876,7 +922,7 @@ create_conda_pytorch_environment () {
876
922
877
923
878
924
# ###############################################################################
879
- # Build Functions
925
+ # FBGEMM_GPU Build Functions
880
926
# ###############################################################################
881
927
882
928
prepare_fbgemm_gpu_build () {
@@ -895,6 +941,11 @@ prepare_fbgemm_gpu_build () {
895
941
echo " "
896
942
fi
897
943
944
+ if [[ " ${GITHUB_WORKSPACE} " ]]; then
945
+ # https://github.com/actions/checkout/issues/841
946
+ git config --global --add safe.directory " ${GITHUB_WORKSPACE} "
947
+ fi
948
+
898
949
echo " [BUILD] Running git submodules update ..."
899
950
git submodule sync
900
951
git submodule update --init --recursive
0 commit comments