Use Nova workflow to host all published wheel files at PyTorch site (#2016)

spcyppt · spcyppt · commit 54556c630fa5 · 2023-09-18T11:32:39.000-07:00
Summary: Pull Request resolved: #2016. To alleviate CUDA version mismatch issues, we aim to publish fbgemm-gpu-nightly wheels built for different CUDA versions. This diff uses the Nova workflow to build the wheels and hosts the published wheels at the PyTorch site instead. Related: #1947. Reviewed By: q10. Differential Revision: D49258503. fbshipit-source-id: a06d095b0c03df62d8cea8fb8db1b5018c9a9dd7
diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
@@ -39,7 +39,7 @@ prepare_fbgemm_gpu_build () {
   git submodule update --init --recursive
 
   echo "[BUILD] Installing other build dependencies ..."
-  (exec_with_retries conda run -n "${env_name}" python -m pip install -r requirements.txt) || return 1
+  (exec_with_retries conda run --no-capture-output -n "${env_name}" python -m pip install -r requirements.txt) || return 1
 
   (test_python_import_package "${env_name}" numpy) || return 1
   (test_python_import_package "${env_name}" skbuild) || return 1
@@ -122,7 +122,7 @@ __configure_fbgemm_gpu_build_cuda () {
   # Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI.
   echo "[BUILD] Setting CUDA build args ..."
   # shellcheck disable=SC2155
-  local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH)
+  local nvml_lib_path=$(conda run --no-capture-output -n "${env_name}" printenv NVML_LIB_PATH)
   build_args=(
     --nvml_lib_path="${nvml_lib_path}"
     -DTORCH_CUDA_ARCH_LIST="'${arch_list}'"
@@ -193,15 +193,15 @@ __build_fbgemm_gpu_common_pre_steps () {
 
   # Extract the Python tag
   # shellcheck disable=SC2207
-  python_version=($(conda run -n "${env_name}" python --version))
+  python_version=($(conda run --no-capture-output -n "${env_name}" python --version))
   # shellcheck disable=SC2206
   python_version_arr=(${python_version[1]//./ })
   python_tag="py${python_version_arr[0]}${python_version_arr[1]}"
   echo "[BUILD] Extracted Python tag: ${python_tag}"
 
   echo "[BUILD] Running pre-build cleanups ..."
   print_exec rm -rf dist
-  print_exec conda run -n "${env_name}" python setup.py clean
+  print_exec conda run --no-capture-output -n "${env_name}" python setup.py clean
 
   echo "[BUILD] Printing git status ..."
   print_exec git status
@@ -305,10 +305,23 @@ build_fbgemm_gpu_package () {
   # See https://github.com/pypa/manylinux
   local plat_name="manylinux2014_${MACHINE_NAME}"
 
+  echo "[BUILD] Checking arch_list = ${arch_list}"
+  echo "[BUILD] Checking build_args:"
+  echo "${build_args[@]}"
+
+  core=$(lscpu | grep "Core(s)" | awk '{print $NF}') && echo "core = ${core}" || echo "core not found"
+  sockets=$(lscpu | grep "Socket(s)" | awk '{print $NF}') && echo "sockets = ${sockets}" || echo "sockets not found"
+  re='^[0-9]+$'
+  run_multicore=""
+  if [[ $core =~ $re && $sockets =~ $re ]] ; then
+    n_core=$((core * sockets))
+    run_multicore=" -j ${n_core}"
+  fi
+
   # Distribute Python extensions as wheels on Linux
   echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..."
-  print_exec conda run -n "${env_name}" \
-    python setup.py bdist_wheel \
+  print_exec conda run --no-capture-output -n "${env_name}" \
+    python setup.py "${run_multicore}" bdist_wheel \
       --package_name="${package_name}" \
       --python-tag="${python_tag}" \
       --plat-name="${plat_name}" \
@@ -357,7 +370,7 @@ build_fbgemm_gpu_install () {
   # Parallelism may need to be limited to prevent the build from being
   # canceled for going over ulimits
   echo "[BUILD] Building + installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..."
-  print_exec conda run -n "${env_name}" \
+  print_exec conda run --no-capture-output -n "${env_name}" \
     python setup.py install "${build_args[@]}"
 
   # Run checks on the built libraries
@@ -401,7 +414,7 @@ build_fbgemm_gpu_develop () {
   # Parallelism may need to be limited to prevent the build from being
   # canceled for going over ulimits
   echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..."
-  print_exec conda run -n "${env_name}" \
+  print_exec conda run --no-capture-output -n "${env_name}" \
     python setup.py build develop "${build_args[@]}"
 
   # Run checks on the built libraries
diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
@@ -28,7 +28,7 @@ run_python_test () {
     echo "################################################################################"
   fi
 
-  if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
+  if print_exec conda run --no-capture-output -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
     echo "[TEST] Python test suite PASSED: ${python_test_file}"
     echo ""
   else
diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+## Nova workaround: setup.py lives in fbgemm_gpu/ rather than the repo root, so step into it when starting at the root
+FBGEMM_DIR="/__w/FBGEMM/FBGEMM"
+export FBGEMM_REPO="${FBGEMM_DIR}/${REPOSITORY}"
+working_dir="$(pwd)"
+if [[ "${working_dir}" == "${FBGEMM_REPO}" ]]; then cd fbgemm_gpu || echo "Failed to cd fbgemm_gpu from $(pwd)"; fi
+## The pre-script performs the clean and wheel build itself; this flag makes setup.py skip those steps under Nova
+export BUILD_FROM_NOVA=1
+## Override environment variables that already exist in the Nova environment
+if [[ -n "${CONDA_ENV}" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "${CONDA_RUN}"; fi
+case "${CU_VERSION}" in
+  cu118) export TORCH_CUDA_ARCH_LIST='7.0;8.0' && echo "${TORCH_CUDA_ARCH_LIST}" ;;
+  cu121) export TORCH_CUDA_ARCH_LIST='7.0;8.0;9.0' && echo "${TORCH_CUDA_ARCH_LIST}" ;;
+esac
diff --git a/.github/scripts/nova_postscript.bash b/.github/scripts/nova_postscript.bash
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+echo "Current working directory: $(pwd)"
+cd "${FBGEMM_REPO}" || echo "Failed to cd to ${FBGEMM_REPO}"
+PRELUDE="${FBGEMM_REPO}/.github/scripts/setup_env.bash"
+BUILD_ENV_NAME=base
+export GITHUB_ENV=TRUE
+
+# Load the shared CI helpers, then install the FBGEMM_GPU wheel found under fbgemm_gpu/dist
+echo "Current working directory: $(pwd)"
+# shellcheck disable=SC1091
+# shellcheck source=.github/scripts/setup_env.bash
+source "${PRELUDE}"
+
+install_fbgemm_gpu_wheel "${BUILD_ENV_NAME}" fbgemm_gpu/dist/*.whl
+
+# Run the PyTest suite; the variant argument is "cpu" for CPU builds and empty otherwise
+echo "Current working directory: $(pwd)"
+if [[ "${CU_VERSION}" == 'cpu' ]]; then
+    CPU_GPU="${CU_VERSION}"
+else
+    CPU_GPU=""
+fi
+$CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());"
+cd "${FBGEMM_REPO}/fbgemm_gpu/test" || echo "Failed to cd to fbgemm_gpu/test from $(pwd)"
+run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${CPU_GPU}"
diff --git a/.github/scripts/nova_prescript.bash b/.github/scripts/nova_prescript.bash
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+echo "Current working directory: $(pwd)"
+cd "${FBGEMM_REPO}" || echo "Failed to cd to ${FBGEMM_REPO}"
+PRELUDE="${FBGEMM_REPO}/.github/scripts/setup_env.bash"
+BUILD_ENV_NAME=base
+echo "--------------------------"
+echo "----- conda env list -----"
+conda env list
+echo "--------------------------"
+echo "PRELUDE = $PRELUDE"
+export PATH="${PATH}:/usr/sbin:/sbin"
+echo "CU_VERSION = ${CU_VERSION}"
+echo "PYTHON_VERSION = ${PYTHON_VERSION}"
+echo "python3 --version = $(python3 --version)"
+echo "ARCH = ${ARCH}"
+echo "---------------------------"
+# shellcheck disable=SC1091
+# shellcheck source=.github/scripts/setup_env.bash
+. "${PRELUDE}";
+
+## Display System Info
+print_system_info
+
+## Display GPU Info
+print_gpu_info
+
+## Install C/C++ Compilers
+install_cxx_compiler "${BUILD_ENV_NAME}"
+
+## Install Build Tools
+install_build_tools "${BUILD_ENV_NAME}"
+
+## Install cuDNN and locate libnvidia-ml.so (CUDA builds only); CPU_GPU is the build variant: "cpu" or "cuda"
+CPU_GPU=${CU_VERSION}
+if [ "${CU_VERSION}" != 'cpu' ]; then
+    ## Nova $CU_VERSION is e.g., cu118; strip the leading "cu" to get the numeric part (118)
+    cuda_version_num=$(echo "$CU_VERSION" | cut -c 3-)
+    install_cudnn "${BUILD_ENV_NAME}" "$(pwd)/build_only/cudnn" "$cuda_version_num"
+    echo "-------- Finding NVML_LIB_PATH -----------"
+    echo "NVML_LIB_PATH = ${NVML_LIB_PATH}"
+    echo "CONDA_ENV = ${CONDA_ENV}, CUDA_HOME = ${CUDA_HOME}"
+    # find exits 0 even when nothing matches, so test the captured path itself; -print -quit keeps only the first hit
+    if [[ -z "${NVML_LIB_PATH}" ]]; then echo "looking in ${CUDA_HOME}"; NVML_LIB_PATH=$(find "${CUDA_HOME}" -name libnvidia-ml.so -print -quit); if [[ -n "${NVML_LIB_PATH}" ]]; then export NVML_LIB_PATH; else echo "libnvidia-ml.so not found in ${CUDA_HOME}"; fi; fi
+    if [[ -z "${NVML_LIB_PATH}" ]]; then echo "looking in ${CONDA_ENV}"; NVML_LIB_PATH=$(find "${CONDA_ENV}" -name libnvidia-ml.so -print -quit); if [[ -n "${NVML_LIB_PATH}" ]]; then export NVML_LIB_PATH; else echo "libnvidia-ml.so not found in ${CONDA_ENV}"; fi; fi
+    echo "NVML_LIB_PATH = ${NVML_LIB_PATH}"
+    echo "------------------------------------------"
+    CPU_GPU="cuda"
+fi
+
+cd "${FBGEMM_REPO}/fbgemm_gpu" || { echo "Failed to cd to fbgemm_gpu from $(pwd)"; }
+prepare_fbgemm_gpu_build "${BUILD_ENV_NAME}"
+
+# Clear the Nova flag so that setup.py runs the build here instead of exiting early
+export BUILD_FROM_NOVA=0
+
+## Build FBGEMM_GPU Nightly
+cd "${FBGEMM_REPO}/fbgemm_gpu" || echo "Failed to cd to ${FBGEMM_REPO}/fbgemm_gpu from $(pwd)"
+if [[ ${CHANNEL} == "" ]]; then CHANNEL="nightly"; fi #set nightly by default
+echo "----------------------------------------------"
+echo "build_fbgemm_gpu_package ${BUILD_ENV_NAME} ${CHANNEL} ${CPU_GPU}"
+build_fbgemm_gpu_package "${BUILD_ENV_NAME}" "${CHANNEL}" "${CPU_GPU}"
+echo "----------------------------------------------"
+
+## Temporary workaround - copy dist/ to root repo for smoke test
+echo "Copying dist folder to root repo.."
+(cp -r "${FBGEMM_REPO}/fbgemm_gpu/dist" "${FBGEMM_REPO}") && (echo "dist folder has been copied to ${FBGEMM_REPO}") || echo "Failed to copy dist/ folder to ${FBGEMM_REPO}"
+echo "----------------------------------"
+ls -al "${FBGEMM_REPO}/dist"
+echo "----------------------------------"
diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash
@@ -77,10 +77,12 @@ free_disk_space () {
 ################################################################################
 
 print_gpu_info () {
-  echo "################################################################################"
-  echo "[INFO] Printing general display info ..."
-  install_system_packages lshw
-  print_exec sudo lshw -C display
+  if [[ "${BUILD_FROM_NOVA}" != '1' ]]; then
+    echo "################################################################################"
+    echo "[INFO] Printing general display info ..."
+    install_system_packages lshw
+    print_exec sudo lshw -C display
+  fi
 
   echo "################################################################################"
   echo "[INFO] Printing NVIDIA GPU info ..."
@@ -131,11 +133,15 @@ __print_system_info_linux () {
   echo "################################################################################"
   echo "[INFO] Print CPU info ..."
   print_exec nproc
+  print_exec lscpu
   print_exec cat /proc/cpuinfo
 
-  echo "################################################################################"
-  echo "[INFO] Print PCI info ..."
-  print_exec lspci -v
+
+  if [[ "${BUILD_FROM_NOVA}" != '1' ]]; then
+    echo "################################################################################"
+    echo "[INFO] Print PCI info ..."
+    print_exec lspci -v
+  fi
 
   echo "################################################################################"
   echo "[INFO] Print Linux distribution info ..."
diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml
@@ -0,0 +1,39 @@
+name: Build Linux Wheels
+
+on:  # Triggers: every pull request, pushes to the nightly branch, and manual dispatch
+  pull_request:
+  push:
+    branches:
+      - nightly
+  workflow_dispatch:
+
+jobs:
+  generate-matrix:  # Produces the build matrix consumed by the build job below
+    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    with:
+      package-type: wheel
+      os: linux
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      with-cuda: enable  # CUDA and CPU variants are requested; ROCm is not
+      with-rocm: disable
+      with-cpu: enable
+  build:
+    needs: generate-matrix
+    name: pytorch/FBGEMM
+    uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
+    with:
+      repository: pytorch/FBGEMM
+      ref: ""
+      pre-script: ../.github/scripts/nova_prescript.bash  # Installs build dependencies and builds the wheel; NOTE(review): "../" presumably because nova_dir.bash changes into fbgemm_gpu/ - confirm
+      post-script: ../.github/scripts/nova_postscript.bash  # Installs the built wheel and runs the test suite
+      smoke-test-script: ""
+      env-var-script: .github/scripts/nova_dir.bash  # Exports FBGEMM_REPO, BUILD_FROM_NOVA, CONDA_RUN and TORCH_CUDA_ARCH_LIST
+      package-name: fbgemm_gpu
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
+      trigger-event: ${{ github.event_name }}
+    secrets:  # Forwarded to the reusable workflow; presumably used to upload the wheels - confirm
+      AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
+      AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py
@@ -269,6 +269,19 @@ def main(argv: List[str]) -> None:
     if len(unknown) != 0 and (len(unknown) != 1 or unknown[0] != "clean"):
         print("Unknown Arguments: ", unknown)
 
+    # Skip Nova build steps since it will be done in pre-script
+    if "BUILD_FROM_NOVA" in os.environ:
+        build_from_nova = os.getenv("BUILD_FROM_NOVA")
+        print("build_from_nova", build_from_nova)
+        # Package name is the same for all variants in Nova
+        package_name = "fbgemm_gpu"
+        if str(build_from_nova) != "0":
+            # Skip build clean and build wheel steps in Nova workflow since they are done in pre-script
+            print("Build from Nova detected... exiting")
+            sys.exit(0)
+    else:
+        package_name = args.package_name
+
     if not args.cpu_only:
         set_cuda_environment_variables()
 
@@ -282,7 +295,7 @@ def main(argv: List[str]) -> None:
     FbgemmGpuInstaller.generate_version_file(package_version)
 
     setup(
-        name=args.package_name,
+        name=package_name,
         version=package_version,
         author="FBGEMM Team",
         author_email="packages@pytorch.org",