
Commit eec6fd2

q10 authored and facebook-github-bot committed
Make Tests More Stable (#1606)
Summary:
- Add support for retries in build steps that are known to fail due to occasional network connection failures
- Add support for installing ROCm tooling and testing ROCm builds in the build scripts framework
- Update the existing FBGEMM_GPU CI / build_amd_gpu job to use the build scripts framework
- Fix the annotations on tests in `jagged_tensor_ops_test.py` so that they run correctly in CPU-only mode
- Impose 10-minute timeouts on the test suites (in practice, they generally complete within 3 minutes)
- Add the ability to conditionally disable tests depending on whether they are running inside a GitHub runner
- Disable the `test_jagged_index_select_2d` test on GitHub until we figure out the root cause of it hanging whenever it runs there (regardless of GPU or CPU variant)

Pull Request resolved: #1606
Reviewed By: brad-mengchi, shintaro-iwasaki
Differential Revision: D43601032
Pulled By: q10
fbshipit-source-id: 55203151f54bf010859b0e7e1568572e064fa7cf
1 parent 85a6009 commit eec6fd2

File tree

9 files changed: +366 −179 lines changed


.github/scripts/setup_env.bash

Lines changed: 245 additions & 95 deletions
Large diffs are not rendered by default.
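
The retry support mentioned in the commit summary lives in this script, whose diff is not rendered here. The helper itself is a bash function, so the following is only a rough Python sketch of the same idea; the function name, command, and retry counts are all hypothetical:

import subprocess
import time
from typing import List

def run_with_retries(cmd: List[str], max_retries: int = 3, delay_s: float = 2.0) -> None:
    # Re-run a flaky, network-bound command a few times before giving up.
    for attempt in range(1, max_retries + 1):
        if subprocess.run(cmd).returncode == 0:
            return
        print(f"Attempt {attempt}/{max_retries} failed; retrying in {delay_s}s ...")
        time.sleep(delay_s)
    raise RuntimeError(f"Command failed after {max_retries} attempts: {' '.join(cmd)}")

# e.g. wrap a package install that occasionally fails on network hiccups
run_with_retries(["pip", "install", "hypothesis"])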

.github/workflows/fbgemm_docs.yml

Lines changed: 6 additions & 2 deletions
@@ -1,5 +1,9 @@
-# This workflow builds the fbgemm_gpu docs and deploys them to gh-pages.
-name: Generate documentation
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+name: FBGEMM Documentation
 on:
   push:
     branches:

.github/workflows/fbgemm_gpu_ci.yml

Lines changed: 54 additions & 50 deletions
@@ -14,72 +14,73 @@ on:
     - main
 
 jobs:
-  build_amd_gpu:
-    if: ${{ false }} # Disable the job for now
+  build_and_test_amd:
     runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
     strategy:
+      fail-fast: false
       matrix:
-        os: [ubuntu-20.04]
-        config: [[pip, 5.3]]
+        os: [ ubuntu-20.04 ]
+        python-version: [ "3.10" ]
+        rocm-version: [ "5.3" ]
 
     steps:
-    - name: Free space
-      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
+    - name: Checkout the Repository
+      uses: actions/checkout@v3
+      with:
+        submodules: true
 
-    - uses: actions/checkout@v3
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info
 
-    - name: Install ROCm
-      shell: bash
-      run: |
-        sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10
-        wget https://repo.radeon.com/amdgpu-install/5.3/ubuntu/focal/amdgpu-install_5.3.50300-1_all.deb
-        export DEBIAN_FRONTEND=noninteractive
-        sudo apt install -y ./amdgpu-install_5.3.50300-1_all.deb
-        amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms
-        sudo rm amdgpu-install_5.3.50300-1_all.deb
-
-    - name: Install dependencies
-      shell: bash
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y git pip python3-dev mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
-        sudo apt-get install -y hipify-clang || true
-        sudo apt-get install -y miopen-hip miopen-hip-dev
-        sudo pip install cmake scikit-build ninja jinja2 numpy hypothesis --no-input
-        sudo apt-get clean
-        # Install PyTorch (nightly) as required by fbgemm_gpu
-        sudo pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.3/
-
-    - name: Checkout submodules
-      shell: bash
-      run: |
-        cd fbgemm_gpu
-        git submodule sync
-        git submodule update --init --recursive
+    - name: Free Disk Space
+      run: . $PRELUDE; free_disk_space
 
-    - name: Build fbgemm_gpu
-      shell: bash
+    - name: Setup Miniconda
       run: |
-        sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10
-        cd fbgemm_gpu
-        # build for MI250 only to save time.
-        sudo PYTORCH_ROCM_ARCH=gfx90a python3 setup.py build develop
+        . $PRELUDE; setup_miniconda $HOME/miniconda
+        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
+        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
 
-    - name: Test fbgemm_gpu installation
-      shell: bash
+    - name: Create Conda Environment
+      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+    - name: Install ROCm
+      run: . $PRELUDE; install_rocm_ubuntu $BUILD_ENV ${{ matrix.rocm-version }}
+
+    - name: Install Build Tools
+      run: . $PRELUDE; install_build_tools $BUILD_ENV
+
+    - name: Install PyTorch-ROCm Nightly
+      run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}
+
+    - name: Prepare FBGEMM Build
+      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+    - name: Build FBGEMM_GPU-ROCM Nightly
       run: |
+        . $PRELUDE
         cd fbgemm_gpu
-        cd test
-        python3 input_combine_test.py -v
-        python3 quantize_ops_test.py -v
-        python3 sparse_ops_test.py -v
-        python3 -c "import fbgemm_gpu"
-        python3 -c "import fbgemm_gpu.split_embedding_codegen_lookup_invokers"
+
+        # Build for MI250 only to save time.
+        print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a
+        print_exec conda run -n $BUILD_ENV python setup.py build develop
+
+    - name: Test FBGEMM_GPU-ROCM Nightly installation
+      timeout-minutes: 10
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
+
 
   test_amd_gpu:
     if: ${{ false }} # Disable the job for now
     runs-on: rocm
     strategy:
+      fail-fast: false
       matrix:
         os: [ubuntu-latest]
 
@@ -125,6 +126,7 @@ jobs:
       "
      docker run $DOCKER_OPTIONS $DOCKER_IMAGE $JENKINS_REPO_DIR_DOCKER/.jenkins/rocm/build_and_test.sh $JENKINS_REPO_DIR_DOCKER
 
+
   build_and_test_cpu:
     runs-on: ${{ matrix.os }}
     defaults:
@@ -134,6 +136,7 @@ jobs:
       PRELUDE: .github/scripts/setup_env.bash
       BUILD_ENV: build_binary
     strategy:
+      fail-fast: false
       matrix:
         os: [ ubuntu-20.04, ubuntu-latest ]
         python-version: [ "3.8", "3.9", "3.10" ]
@@ -169,4 +172,5 @@ jobs:
       run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpuonly
 
     - name: Test with PyTest
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpuonly
+      timeout-minutes: 10
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu

.github/workflows/fbgemm_nightly_build.yml

Lines changed: 5 additions & 6 deletions
@@ -146,15 +146,14 @@ jobs:
       with:
         name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl
 
-    - name: Display Structure of the Downloaded Files
-      run: ls -R
-
     - name: Install FBGEMM_GPU Nightly
-      run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl
+      run: |
+        . $PRELUDE
+        ls .
+        install_fbgemm_gpu_package $BUILD_ENV *.whl
 
     - name: Test with PyTest
-      # Remove this line when we fixed all the unit tests
-      continue-on-error: true
+      timeout-minutes: 10
       run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
 
     - name: Push FBGEMM_GPU Nightly Binary to PYPI

.github/workflows/fbgemm_nightly_build_cpu.yml

Lines changed: 6 additions & 5 deletions
@@ -135,14 +135,15 @@ jobs:
       with:
         name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl
 
-    - name: Display Structure of the Downloaded Files
-      run: ls -R
-
     - name: Install FBGEMM_GPU Nightly (CPU version)
-      run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl
+      run: |
+        . $PRELUDE
+        ls .
+        install_fbgemm_gpu_package $BUILD_ENV *.whl
 
     - name: Test with PyTest
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpuonly
+      timeout-minutes: 10
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
 
     - name: Push FBGEMM_GPU Nightly (CPU version) Binary to PYPI
       if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' }}

.github/workflows/fbgemm_release_build.yml

Lines changed: 5 additions & 6 deletions
@@ -137,15 +137,14 @@ jobs:
       with:
         name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl
 
-    - name: Display Structure of the Downloaded Files
-      run: ls -R
-
     - name: Install FBGEMM_GPU
-      run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl
+      run: |
+        . $PRELUDE
+        ls .
+        install_fbgemm_gpu_package $BUILD_ENV *.whl
 
     - name: Test with PyTest
-      # Remove this line when we fixed all the unit tests
-      continue-on-error: true
+      timeout-minutes: 10
       run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
 
     - name: Push FBGEMM_GPU Binary to PYPI

.github/workflows/fbgemm_release_build_cpu.yml

Lines changed: 6 additions & 5 deletions
@@ -127,14 +127,15 @@ jobs:
       with:
         name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl
 
-    - name: Display Structure of the Downloaded Files
-      run: ls -R
-
     - name: Install FBGEMM_GPU (CPU version)
-      run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl
+      run: |
+        . $PRELUDE
+        ls .
+        install_fbgemm_gpu_package $BUILD_ENV *.whl
 
     - name: Test with PyTest
-      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpuonly
+      timeout-minutes: 10
+      run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
 
     - name: Push FBGEMM_GPU (CPU version) Binary to PYPI
       if: ${{ github.event_name != 'pull_request' && github.event_name != 'push' }}

fbgemm_gpu/test/jagged_tensor_ops_test.py

Lines changed: 33 additions & 10 deletions
@@ -20,11 +20,15 @@
     from fbgemm_gpu import open_source  # noqa: F401
 
     # pyre-ignore[21]
-    from test_utils import gpu_available, gpu_unavailable
+    from test_utils import gpu_available, gpu_unavailable, running_on_github
 except Exception:
     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-    from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable
+    from fbgemm_gpu.test.test_utils import (
+        gpu_available,
+        gpu_unavailable,
+        running_on_github,
+    )
 
 
 def lengths_to_segment_ids(lengths: torch.Tensor) -> torch.Tensor:
@@ -624,7 +628,9 @@ def _test_dense_to_jagged(
         outer_dense_size=st.integers(0, 5),
         inner_dense_size=st.integers(0, 5),
         dtype=st.sampled_from([torch.float, torch.half, torch.bfloat16]),
-        device_type=st.sampled_from(["cpu", "cuda"]),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
         precompute_total_L=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
@@ -646,13 +652,16 @@ def test_dense_to_jagged(
             precompute_total_L,
         )
 
+    @unittest.skipIf(*gpu_unavailable)
     # pyre-ignore [56]
     @given(
         num_jagged_dim=st.just(1),
         outer_dense_size=st.integers(0, 6000),
         inner_dense_size=st.sampled_from([8, 16, 23, 24, 48, 50, 64, 72, 96, 192]),
         dtype=st.just(torch.half),
-        device_type=st.just("cuda"),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
         precompute_total_L=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
@@ -676,13 +685,16 @@ def test_dense_to_jagged_opt(
 
     # (8000+1) * 8 (size of the element of LongTensor/int64_t offsets)
     # = ~62.5KB > 48KB default shared memory on V100/A100.
+    @unittest.skipIf(*gpu_unavailable)
     # pyre-ignore [56]
     @given(
         num_jagged_dim=st.just(1),
         outer_dense_size=st.just(8000),
         inner_dense_size=st.just(16),
         dtype=st.just(torch.half),
-        device_type=st.just("cuda"),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
         precompute_total_L=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=1, deadline=None)
@@ -969,7 +981,9 @@ def mul_func(*args) -> torch.Tensor:
         inner_dense_size=st.integers(0, 4),
         operation=st.sampled_from(["add", "add_jagged_output", "mul"]),
         dtype=st.sampled_from([torch.float, torch.half, torch.double, torch.bfloat16]),
-        device_type=st.sampled_from(["cpu", "cuda"]),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
     def test_jagged_elementwise_binary(
@@ -990,14 +1004,17 @@ def test_jagged_elementwise_binary(
             device_type,
         )
 
+    @unittest.skipIf(*gpu_unavailable)
     # pyre-ignore [56]
     @given(
         num_jagged_dim=st.just(1),
         outer_dense_size=st.integers(0, 8),
         inner_dense_size=st.sampled_from([16, 64, 96, 192]),
         operation=st.sampled_from(["add_jagged_output", "mul"]),
         dtype=st.just(torch.half),
-        device_type=st.just("cuda"),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=4, deadline=None)
     def test_jagged_elementwise_binary_opt(
@@ -1170,7 +1187,9 @@ def add_jagged_output_func(*args) -> torch.Tensor:
         outer_dense_size=st.integers(0, 4),
         inner_dense_size=st.integers(0, 4),
         dtype=st.sampled_from([torch.float, torch.half, torch.double, torch.bfloat16]),
-        device_type=st.sampled_from(["cpu", "cuda"]),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
     def test_jagged_dense_dense_elementwise_add_jagged_output(
@@ -1185,13 +1204,16 @@ def test_jagged_dense_dense_elementwise_add_jagged_output(
             num_jagged_dim, outer_dense_size, inner_dense_size, dtype, device_type
         )
 
+    @unittest.skipIf(*gpu_unavailable)
     # pyre-ignore [56]
     @given(
         num_jagged_dim=st.just(1),
         outer_dense_size=st.integers(0, 8),
         inner_dense_size=st.sampled_from([16, 64, 96, 192]),
         dtype=st.just(torch.half),
-        device_type=st.just("cuda"),
+        device_type=st.sampled_from(["cpu", "cuda"])
+        if gpu_available
+        else st.just("cpu"),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=4, deadline=None)
     def test_jagged_dense_dense_elementwise_add_jagged_output_opt(
@@ -1282,7 +1304,7 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_meta_backend(
         dtype=st.sampled_from([torch.float, torch.half, torch.bfloat16, torch.double]),
         device_type=st.sampled_from(["cpu", "cuda"])
         if gpu_available
-        else st.just("cuda"),
+        else st.just("cpu"),
     )
     def test_batched_dense_vec_jagged_2d_mul(
         self,
@@ -1428,6 +1450,7 @@ def jagged_index_select_2d_ref(
         new_embeddings = torch.index_select(values, 0, all_indices)
         return new_embeddings
 
+    @unittest.skipIf(*running_on_github)
     # pyre-ignore [56]
     @given(
         max_seq_length=st.integers(5, 10),
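
The recurring edit in the hunks above makes each Hypothesis `device_type` strategy GPU-aware, so that CPU-only runs never draw a "cuda" example they cannot execute. Pulled out of the diff, the pattern is a minimal conditional strategy (a self-contained sketch, assuming only `torch` and `hypothesis` are installed):

import torch
import hypothesis.strategies as st

# True only when a usable CUDA device is actually present
gpu_available: bool = torch.cuda.is_available() and torch.cuda.device_count() > 0

# Sample both devices when a GPU exists; otherwise pin the strategy to "cpu"
device_type_strategy = (
    st.sampled_from(["cpu", "cuda"]) if gpu_available else st.just("cpu")
)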

fbgemm_gpu/test/test_utils.py

Lines changed: 6 additions & 0 deletions
@@ -172,6 +172,12 @@ def fused_rowwise_nbit_quantize_dequantize_reference(
 # Used for `if` statements inside tests
 gpu_available: bool = not gpu_unavailable[0]
 
+# Used for `@unittest.skipIf` for tests that pass in internal CI, but fail on the GitHub runners
+running_on_github: Tuple[bool, str] = (
+    os.getenv("GITHUB_ENV") is not None,
+    "Test is currently known to fail or hang when run in the GitHub runners",
+)
+
 
 def cpu_and_maybe_gpu() -> st.SearchStrategy[List[torch.device]]:
     gpu_available = torch.cuda.is_available() and torch.cuda.device_count() > 0
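
`running_on_github` follows the same `(condition, reason)` convention as `gpu_unavailable`, so it can be splatted straight into `unittest.skipIf`, as the `test_jagged_index_select_2d` annotation above does. A minimal self-contained illustration; the test class and body here are hypothetical placeholders:

import os
import unittest
from typing import Tuple

running_on_github: Tuple[bool, str] = (
    os.getenv("GITHUB_ENV") is not None,
    "Test is currently known to fail or hang when run in the GitHub runners",
)

class JaggedTensorOpsTest(unittest.TestCase):
    # The * unpacks (condition, reason) into skipIf's two positional arguments
    @unittest.skipIf(*running_on_github)
    def test_jagged_index_select_2d(self) -> None:
        ...  # known to hang on GitHub runners, regardless of CPU/GPU variant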
