
Commit eec6fd2

q10 authored and facebook-github-bot committed
Make Tests More Stable (#1606)
Summary:
- Add support for retries in build steps that are known to fail due to occasional network connection failures
- Add support for installing ROCm tooling and testing ROCm builds in the build scripts framework
- Update the existing FBGEMM_GPU CI / build_amd_gpu job to use the build scripts framework
- Fix the annotations on tests in `jagged_tensor_ops_test.py` so that they run correctly in CPU-only mode
- Impose 10-minute timeouts on the test suites (in practice, they generally complete within 3 minutes)
- Add the ability to conditionally disable tests depending on whether they are running inside a GitHub runner
- Disable the `test_jagged_index_select_2d` test on GitHub until we figure out the root cause of it hanging whenever it runs there (regardless of GPU or CPU variant)

Pull Request resolved: #1606
Reviewed By: brad-mengchi, shintaro-iwasaki
Differential Revision: D43601032
Pulled By: q10
fbshipit-source-id: 55203151f54bf010859b0e7e1568572e064fa7cf
1 parent 85a6009 commit eec6fd2

File tree

9 files changed: +366 −179 lines changed


.github/scripts/setup_env.bash

Lines changed: 245 additions & 95 deletions
Large diffs are not rendered by default.
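
The retry support mentioned in the commit summary lives in this script, whose diff is not rendered here. The helper itself is a bash function, so the following is only a rough Python sketch of the same idea; the function name, command, and retry counts are all hypothetical:

import subprocess
import time
from typing import List

def run_with_retries(cmd: List[str], max_retries: int = 3, delay_s: float = 2.0) -> None:
    # Re-run a flaky, network-bound command a few times before giving up.
    for attempt in range(1, max_retries + 1):
        if subprocess.run(cmd).returncode == 0:
            return
        print(f"Attempt {attempt}/{max_retries} failed; retrying in {delay_s}s ...")
        time.sleep(delay_s)
    raise RuntimeError(f"Command failed after {max_retries} attempts: {' '.join(cmd)}")

# e.g. wrap a package install that occasionally fails on network hiccups
run_with_retries(["pip", "install", "hypothesis"])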

.github/workflows/fbgemm_docs.yml

Lines changed: 6 additions & 2 deletions
@@ -1,5 +1,9 @@
-# This workflow builds the fbgemm_gpu docs and deploys them to gh-pages.
-name: Generate documentation
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+name: FBGEMM Documentation
 on:
   push:
     branches:

.github/workflows/fbgemm_gpu_ci.yml

Lines changed: 54 additions & 50 deletions
@@ -14,72 +14,73 @@ on:
     - main
 
 jobs:
-  build_amd_gpu:
-    if: ${{ false }} # Disable the job for now
+  build_and_test_amd:
     runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
     strategy:
+      fail-fast: false
       matrix:
-        os: [ubuntu-20.04]
-        config: [[pip, 5.3]]
+        os: [ ubuntu-20.04 ]
+        python-version: [ "3.10" ]
+        rocm-version: [ "5.3" ]
 
     steps:
-    - name: Free space
-      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
+    - name: Checkout the Repository
+      uses: actions/checkout@v3
+      with:
+        submodules: true
 
-    - uses: actions/checkout@v3
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info
 
-    - name: Install ROCm
-      shell: bash
-      run: |
-        sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10
-        wget https://repo.radeon.com/amdgpu-install/5.3/ubuntu/focal/amdgpu-install_5.3.50300-1_all.deb
-        export DEBIAN_FRONTEND=noninteractive
-        sudo apt install -y ./amdgpu-install_5.3.50300-1_all.deb
-        amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms
-        sudo rm amdgpu-install_5.3.50300-1_all.deb
-
-    - name: Install dependencies
-      shell: bash
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y git pip python3-dev mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
-        sudo apt-get install -y hipify-clang || true
-        sudo apt-get install -y miopen-hip miopen-hip-dev
-        sudo pip install cmake scikit-build ninja jinja2 numpy hypothesis --no-input
-        sudo apt-get clean
-        # Install PyTorch (nightly) as required by fbgemm_gpu
-        sudo pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.3/
-
-    - name: Checkout submodules
-      shell: bash
-      run: |
-        cd fbgemm_gpu
-        git submodule sync
-        git submodule update --init --recursive
+    - name: Free Disk Space
+      run: . $PRELUDE; free_disk_space
 
-    - name: Build fbgemm_gpu
-      shell: bash
+    - name: Setup Miniconda
       run: |
-        sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10
-        cd fbgemm_gpu
-        # build for MI250 only to save time.
-        sudo PYTORCH_ROCM_ARCH=gfx90a python3 setup.py build develop
+        . $PRELUDE; setup_miniconda $HOME/miniconda
+        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
+        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
 
-    - name: Test fbgemm_gpu installation
-      shell: bash
+    - name: Create Conda Environment
+      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+    - name: Install ROCm
+      run: . $PRELUDE; install_rocm_ubuntu $BUILD_ENV ${{ matrix.rocm-version }}
+
+    - name: Install Build Tools
+      run: . $PRELUDE; install_build_tools $BUILD_ENV
+
+    - name: Install PyTorch-ROCm Nightly
+      run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}
+
+    - name: Prepare FBGEMM Build
+      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+    - name: Build FBGEMM_GPU-ROCM Nightly
       run: |
+        . $PRELUDE
         cd fbgemm_gpu
-        cd test
-        python3 input_combine_test.py -v
-        python3 quantize_ops_test.py -v
-        python3 sparse_ops_test.py -v
-        python3 -c "import fbgemm_gpu"
-        python3 -c "import fbgemm_gpu.split_embedding_codegen_lookup_invokers"
+
+        # Build for MI250 only to save time.
+        print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a
+        print_exec conda run -n $BUILD_ENV python setup.py build develop
+
+    - name: Test FBGEMM_GPU-ROCM Nightly installation
+      timeout-minutes: 10
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
+
 
   test_amd_gpu:
     if: ${{ false }} # Disable the job for now
     runs-on: rocm
     strategy:
+      fail-fast: false
       matrix:
         os: [ubuntu-latest]
 
@@ -125,6 +126,7 @@ jobs:
       "
      docker run $DOCKER_OPTIONS $DOCKER_IMAGE $JENKINS_REPO_DIR_DOCKER/.jenkins/rocm/build_and_test.sh $JENKINS_REPO_DIR_DOCKER
 
+
   build_and_test_cpu:
     runs-on: ${{ matrix.os }}
     defaults:
@@ -134,6 +136,7 @@ jobs:
       PRELUDE: .github/scripts/setup_env.bash
       BUILD_ENV: build_binary
     strategy:
+      fail-fast: false
       matrix:
         os: [ ubuntu-20.04, ubuntu-latest ]
         python-version: [ "3.8", "3.9", "3.10" ]
@@ -169,4 +172,5 @@ jobs:
       run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpuonly
 
     - name: Test with PyTest
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpuonly
+      timeout-minutes: 10
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu

.github/workflows/fbgemm_nightly_build.yml

Lines changed: 5 additions & 6 deletions
@@ -146,15 +146,14 @@ jobs:
       with:
         name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl
 
-    - name: Display Structure of the Downloaded Files
-      run: ls -R
-
     - name: Install FBGEMM_GPU Nightly
-      run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl
+      run: |
+        . $PRELUDE
+        ls .
+        install_fbgemm_gpu_package $BUILD_ENV *.whl
 
     - name: Test with PyTest
-      # Remove this line when we fixed all the unit tests
-      continue-on-error: true
+      timeout-minutes: 10
       run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
 
     - name: Push FBGEMM_GPU Nightly Binary to PYPI

.github/workflows/fbgemm_nightly_build_cpu.yml

Lines changed: 6 additions & 5 deletions
@@ -135,14 +135,15 @@ jobs:
       with:
         name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl
 
-    - name: Display Structure of the Downloaded Files
-      run: ls -R
-
     - name: Install FBGEMM_GPU Nightly (CPU version)
-      run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl
+      run: |
+        . $PRELUDE
+        ls .
+        install_fbgemm_gpu_package $BUILD_ENV *.whl
 
     - name: Test with PyTest
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpuonly
+      timeout-minutes: 10
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
 
     - name: Push FBGEMM_GPU Nightly (CPU version) Binary to PYPI
       if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' }}

.github/workflows/fbgemm_release_build.yml

Lines changed: 5 additions & 6 deletions
@@ -137,15 +137,14 @@ jobs:
       with:
         name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl
 
-    - name: Display Structure of the Downloaded Files
-      run: ls -R
-
     - name: Install FBGEMM_GPU
-      run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl
+      run: |
+        . $PRELUDE
+        ls .
+        install_fbgemm_gpu_package $BUILD_ENV *.whl
 
     - name: Test with PyTest
-      # Remove this line when we fixed all the unit tests
-      continue-on-error: true
+      timeout-minutes: 10
       run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
 
     - name: Push FBGEMM_GPU Binary to PYPI

.github/workflows/fbgemm_release_build_cpu.yml

Lines changed: 6 additions & 5 deletions
@@ -127,14 +127,15 @@ jobs:
       with:
         name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl
 
-    - name: Display Structure of the Downloaded Files
-      run: ls -R
-
     - name: Install FBGEMM_GPU (CPU version)
-      run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl
+      run: |
+        . $PRELUDE
+        ls .
+        install_fbgemm_gpu_package $BUILD_ENV *.whl
 
     - name: Test with PyTest
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpuonly
+      timeout-minutes: 10
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
 
     - name: Push FBGEMM_GPU (CPU version) Binary to PYPI
       if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' }}

fbgemm_gpu/test/jagged_tensor_ops_test.py

Lines changed: 33 additions & 10 deletions
@@ -20,11 +20,15 @@
     from fbgemm_gpu import open_source  # noqa: F401
 
     # pyre-ignore[21]
-    from test_utils import gpu_available, gpu_unavailable
+    from test_utils import gpu_available, gpu_unavailable, running_on_github
 except Exception:
     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-    from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable
+    from fbgemm_gpu.test.test_utils import (
+        gpu_available,
+        gpu_unavailable,
+        running_on_github,
+    )
 
 
 def lengths_to_segment_ids(lengths: torch.Tensor) -> torch.Tensor:
@@ -624,7 +628,9 @@ def _test_dense_to_jagged(
         outer_dense_size=st.integers(0, 5),
         inner_dense_size=st.integers(0, 5),
         dtype=st.sampled_from([torch.float, torch.half, torch.bfloat16]),
-        device_type=st.sampled_from(["cpu", "cuda"]),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
         precompute_total_L=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
@@ -646,13 +652,16 @@ def test_dense_to_jagged(
             precompute_total_L,
         )
 
+    @unittest.skipIf(*gpu_unavailable)
     # pyre-ignore [56]
     @given(
         num_jagged_dim=st.just(1),
         outer_dense_size=st.integers(0, 6000),
         inner_dense_size=st.sampled_from([8, 16, 23, 24, 48, 50, 64, 72, 96, 192]),
         dtype=st.just(torch.half),
-        device_type=st.just("cuda"),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
         precompute_total_L=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
@@ -676,13 +685,16 @@ def test_dense_to_jagged_opt(
 
     # (8000+1) * 8 (size of the element of LongTensor/int64_t offsets)
     # = ~62.5KB > 48KB default shared memory on V100/A100.
+    @unittest.skipIf(*gpu_unavailable)
     # pyre-ignore [56]
     @given(
         num_jagged_dim=st.just(1),
         outer_dense_size=st.just(8000),
         inner_dense_size=st.just(16),
         dtype=st.just(torch.half),
-        device_type=st.just("cuda"),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
         precompute_total_L=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=1, deadline=None)
@@ -969,7 +981,9 @@ def mul_func(*args) -> torch.Tensor:
         inner_dense_size=st.integers(0, 4),
         operation=st.sampled_from(["add", "add_jagged_output", "mul"]),
         dtype=st.sampled_from([torch.float, torch.half, torch.double, torch.bfloat16]),
-        device_type=st.sampled_from(["cpu", "cuda"]),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
     def test_jagged_elementwise_binary(
@@ -990,14 +1004,17 @@ def test_jagged_elementwise_binary(
             device_type,
         )
 
+    @unittest.skipIf(*gpu_unavailable)
     # pyre-ignore [56]
     @given(
         num_jagged_dim=st.just(1),
         outer_dense_size=st.integers(0, 8),
         inner_dense_size=st.sampled_from([16, 64, 96, 192]),
         operation=st.sampled_from(["add_jagged_output", "mul"]),
         dtype=st.just(torch.half),
-        device_type=st.just("cuda"),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=4, deadline=None)
     def test_jagged_elementwise_binary_opt(
@@ -1170,7 +1187,9 @@ def add_jagged_output_func(*args) -> torch.Tensor:
         outer_dense_size=st.integers(0, 4),
         inner_dense_size=st.integers(0, 4),
         dtype=st.sampled_from([torch.float, torch.half, torch.double, torch.bfloat16]),
-        device_type=st.sampled_from(["cpu", "cuda"]),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
     def test_jagged_dense_dense_elementwise_add_jagged_output(
@@ -1185,13 +1204,16 @@ def test_jagged_dense_dense_elementwise_add_jagged_output(
             num_jagged_dim, outer_dense_size, inner_dense_size, dtype, device_type
         )
 
+    @unittest.skipIf(*gpu_unavailable)
     # pyre-ignore [56]
     @given(
         num_jagged_dim=st.just(1),
         outer_dense_size=st.integers(0, 8),
         inner_dense_size=st.sampled_from([16, 64, 96, 192]),
         dtype=st.just(torch.half),
-        device_type=st.just("cuda"),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=4, deadline=None)
     def test_jagged_dense_dense_elementwise_add_jagged_output_opt(
@@ -1282,7 +1304,7 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_meta_backend(
         dtype=st.sampled_from([torch.float, torch.half, torch.bfloat16, torch.double]),
         device_type=st.sampled_from(["cpu", "cuda"])
         if gpu_available
-        else st.just("cuda"),
+        else st.just("cpu"),
     )
     def test_batched_dense_vec_jagged_2d_mul(
         self,
@@ -1428,6 +1450,7 @@ def jagged_index_select_2d_ref(
         new_embeddings = torch.index_select(values, 0, all_indices)
         return new_embeddings
 
+    @unittest.skipIf(*running_on_github)
     # pyre-ignore [56]
     @given(
         max_seq_length=st.integers(5, 10),
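
The recurring edit in the hunks above makes each Hypothesis `device_type` strategy GPU-aware, so that CPU-only runs never draw a "cuda" example they cannot execute. Pulled out of the diff, the pattern is a minimal conditional strategy (a self-contained sketch, assuming only `torch` and `hypothesis` are installed):

import torch
import hypothesis.strategies as st

# True only when a usable CUDA device is actually present
gpu_available: bool = torch.cuda.is_available() and torch.cuda.device_count() > 0

# Sample both devices when a GPU exists; otherwise pin the strategy to "cpu"
device_type_strategy = (
    st.sampled_from(["cpu", "cuda"]) if gpu_available else st.just("cpu")
)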

fbgemm_gpu/test/test_utils.py

Lines changed: 6 additions & 0 deletions
@@ -172,6 +172,12 @@ def fused_rowwise_nbit_quantize_dequantize_reference(
 # Used for `if` statements inside tests
 gpu_available: bool = not gpu_unavailable[0]
 
+# Used for `@unittest.skipIf` for tests that pass in internal CI, but fail on the GitHub runners
+running_on_github: Tuple[bool, str] = (
+    os.getenv("GITHUB_ENV") is not None,
+    "Test is currently known to fail or hang when run in the GitHub runners",
+)
+
 
 def cpu_and_maybe_gpu() -> st.SearchStrategy[List[torch.device]]:
     gpu_available = torch.cuda.is_available() and torch.cuda.device_count() > 0
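
`running_on_github` follows the same `(condition, reason)` convention as `gpu_unavailable`, so it can be splatted straight into `unittest.skipIf`, as the `test_jagged_index_select_2d` annotation above does. A minimal self-contained illustration; the test class and body here are hypothetical placeholders:

import os
import unittest
from typing import Tuple

running_on_github: Tuple[bool, str] = (
    os.getenv("GITHUB_ENV") is not None,
    "Test is currently known to fail or hang when run in the GitHub runners",
)

class JaggedTensorOpsTest(unittest.TestCase):
    # The * unpacks (condition, reason) into skipIf's two positional arguments
    @unittest.skipIf(*running_on_github)
    def test_jagged_index_select_2d(self) -> None:
        ...  # known to hang on GitHub runners, regardless of CPU/GPU variant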
