Commit 940a285

q10 authored and facebook-github-bot committed
ROCm fixes for CI (#4345)
Summary:
X-link: facebookresearch/FBGEMM#1430

- ROCm fixes for CI

Pull Request resolved: #4345

Reviewed By: spcyppt

Differential Revision: D76792282

Pulled By: q10

fbshipit-source-id: 41de3351c6496dfe5ae90c0395cce94a2896040a
1 parent 62f3c82 commit 940a285

9 files changed (+83 −17 lines)

.github/scripts/fbgemm_gpu_benchmarks.bash

Lines changed: 30 additions & 0 deletions
@@ -8,11 +8,41 @@

 # shellcheck disable=SC1091,SC2128
 . "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash"
+. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_test.bash"

 ################################################################################
 # FBGEMM_GPU Test Helper Functions
 ################################################################################

+setup_fbgemm_gpu_bench () {
+  env_name="$1"
+
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+
+  # shellcheck disable=SC2086
+  fbgemm_build_variant=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__variant__)")
+  echo "[BENCH] Determined FBGEMM_GPU variant from installation: ${fbgemm_build_variant}"
+
+  if [ "$fbgemm_build_variant" == "rocm" ]; then
+    echo "[BENCH] Configuring for ROCm-based benchmarking ..."
+    __configure_fbgemm_gpu_test_rocm
+  fi
+
+  if [[ $MACHINE_NAME == 'aarch64' ]]; then
+    # NOTE: Setting KMP_DUPLICATE_LIB_OK silences the error about multiple
+    # OpenMP being linked when FBGEMM_GPU is compiled under Clang on aarch64
+    # machines:
+    # https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial
+    echo "[TEST] Platform is aarch64; will set KMP_DUPLICATE_LIB_OK ..."
+    # shellcheck disable=SC2086
+    print_exec conda env config vars set ${env_prefix} KMP_DUPLICATE_LIB_OK=1
+  fi
+
+  # shellcheck disable=SC2086
+  print_exec conda env config vars set ${env_prefix} TORCH_SHOW_CPP_STACKTRACES=1
+}
+
 run_tbe_microbench () {
   local env_name="$1"
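
For context, a minimal sketch of how the new helper is meant to be driven, assuming the CI helper scripts are sourced the same way the benchmark workflow below sources them; the environment name "my_env" is only a placeholder:

# Sketch only: source the benchmark helpers, then configure and run.
# "my_env" stands in for an existing Conda environment with FBGEMM_GPU installed.
. .github/scripts/fbgemm_gpu_benchmarks.bash

setup_fbgemm_gpu_bench my_env   # sets ROCm/aarch64 env vars based on the installed variant
run_tbe_microbench my_env       # then runs the TBE microbenchmark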

.github/scripts/fbgemm_gpu_test.bash

Lines changed: 9 additions & 0 deletions
@@ -111,11 +111,20 @@ __configure_fbgemm_gpu_test_rocm () {

   # shellcheck disable=SC2086
   print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
+  # Disabled by default; enable for debugging
   # shellcheck disable=SC2086
   print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
   # shellcheck disable=SC2086
   print_exec conda env config vars set ${env_prefix} FBGEMM_TBE_ROCM_INFERENCE_PACKED_BAGS=1

+  # AMD GPUs need to be explicitly made visible to PyTorch for use
+  # shellcheck disable=SC2155,SC2126
+  local num_gpus=$(rocm-smi --showproductname | grep GUID | wc -l)
+  # shellcheck disable=SC2155
+  local gpu_indices=$(seq 0 $((num_gpus - 1)) | paste -sd, -)
+  # shellcheck disable=SC2086
+  print_exec conda env config vars set ${env_prefix} HIP_VISIBLE_DEVICES="${gpu_indices}"
+
   # Starting from MI250 AMD GPUs support per process XNACK mode change
   # shellcheck disable=SC2155
   local rocm_version=$(awk -F'[.-]' '{print $1 * 10000 + $2 * 100 + $3}' /opt/rocm/.info/version-dev)
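
The device-visibility logic above can also be exercised by hand. The sketch below makes the same assumption the script does, namely that `rocm-smi --showproductname` prints one GUID line per GPU:

# Count GPUs the way the CI script does and expose all of them to PyTorch
num_gpus=$(rocm-smi --showproductname | grep -c GUID)
export HIP_VISIBLE_DEVICES=$(seq 0 $((num_gpus - 1)) | paste -sd, -)

# Sanity check: the ROCm build of PyTorch should now report the same count
python -c "import torch; print(torch.cuda.device_count())"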

.github/scripts/utils_pytorch.bash

Lines changed: 3 additions & 0 deletions
@@ -25,10 +25,13 @@ __verify_pytorch_gpu_integration () {
   local torch_version_cuda=$(conda run ${env_prefix} python -c "import torch; print(torch.version.cuda)")
   # shellcheck disable=SC2086,SC2155
   local torch_version_hip=$(conda run ${env_prefix} python -c "import torch; print(torch.version.hip)")
+  # shellcheck disable=SC2086,SC2155
+  local torch_device_compatibility=$(conda run ${env_prefix} python -c "import torch; print(torch.cuda.get_device_capability())")

   echo ""
   echo "################################################################################"
   echo "[CHECK] torch.cuda.is_available(): ${torch_cuda_available}"
+  echo "[CHECK] torch.cuda.get_device_capability(): ${torch_device_compatibility}"
   echo "[CHECK] torch.version.cuda: ${torch_version_cuda}"
   echo "[CHECK] torch.version.hip: ${torch_version_hip}"
   echo "################################################################################"

.github/scripts/utils_system.bash

Lines changed: 12 additions & 12 deletions
@@ -170,18 +170,18 @@ print_gpu_info () {
       return 1
     fi
   else
-    local smi_programs=( rocminfo rocm-smi )
-
-    for smi_program in "${smi_programs[@]}"; do
-      # shellcheck disable=SC2086
-      if which $smi_program; then
-        # If the program is installed on a machine without GPUs, invoking it will return error
-        # shellcheck disable=SC2086
-        (print_exec $smi_program) || true
-      else
-        echo "[CHECK] $smi_program not found"
-      fi
-    done
+    if which rocm-smi; then
+      # If the program is installed on a machine without GPUs, invoking it will return error
+      (print_exec rocm-smi --showproductname) || true
+    else
+      echo "[CHECK] rocm-smi not found"
+    fi
+
+    if which rocminfo; then
+      (print_exec rocminfo) || true
+    else
+      echo "[CHECK] rocminfo not found"
+    fi
   fi
 }
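
The `(command) || true` wrapping carried over from the old loop is what lets these probes run on GPU-less machines where the SMI tools are installed but exit non-zero. A minimal illustration of the pattern under `set -e`:

set -e
(rocm-smi --showproductname) || true   # a failing SMI query is tolerated
echo "[CHECK] still running"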

.github/workflows/fbgemm_gpu_benchmark_rocm.yml

Lines changed: 3 additions & 0 deletions
@@ -189,6 +189,9 @@ jobs:
       - name: Install FBGEMM_GPU Wheel
         run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

+      - name: Set Up for FBGEMM_GPU Benchmark
+        run: . $PRELUDE; setup_fbgemm_gpu_bench $BUILD_ENV
+
       - name: Run FBGEMM_GPU Benchmark
         timeout-minutes: 40
         run: . $PRELUDE; run_tbe_microbench $BUILD_ENV

fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst

Lines changed: 10 additions & 2 deletions
@@ -47,7 +47,8 @@ environment:

   # !! Run inside the Conda environment !!

-  # Enable for running in CPU-only mode (when on a GPU-capable machine)
+  # Specify the specific CUDA devices to run the tests on
+  # Alternatively, set to -1 for running in CPU-only mode (when on a GPU-capable machine)
   export CUDA_VISIBLE_DEVICES=-1

   # Enable for debugging failed kernel executions
@@ -73,7 +74,14 @@ For ROCm machines, testing against a ROCm GPU needs to be enabled with
   cd test

   export FBGEMM_TEST_WITH_ROCM=1
-  # Enable for debugging failed kernel executions
+
+  # Specify the specific HIP devices to run the tests on
+  #
+  # NOTE: This is necessary if PyTorch is unable to see the devices that
+  # `rocm-smi --showproductname` can see
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+
+  # Enable for debugging kernel executions
   export HIP_LAUNCH_BLOCKING=1

   python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning split_table_batched_embeddings_test.py
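
One way to tell whether `HIP_VISIBLE_DEVICES` needs to be set at all (a sketch, not part of the docs change itself): compare the device count that `rocm-smi` reports with the count PyTorch sees, and export the variable as shown above only if the two disagree:

# GPUs visible to ROCm tooling (one GUID line per device)
rocm-smi --showproductname | grep -c GUID

# GPUs visible to the ROCm build of PyTorch
python -c "import torch; print(torch.cuda.device_count())"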

fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py

Lines changed: 8 additions & 2 deletions
@@ -44,13 +44,19 @@ def disable_bufferops(_unused: bool):


 @functools.lru_cache
-def supports_float8_fnuz() -> bool:
+def supports_float8_fnuz(throw_on_hip_incompatibility: bool = True) -> bool:
     if torch.version.hip:
         device_capability = torch.cuda.get_device_capability()

         if device_capability < (9, 4):
             gpu_arch = torch.cuda.get_device_properties("cuda").gcnArchName
-            raise RuntimeError(f"Unsupported GPU arch: {gpu_arch} for FP8")
+            msg = f"Unsupported GPU arch: {gpu_arch} for FP8"
+            if throw_on_hip_incompatibility:
+                raise RuntimeError(msg)
+            else:
+                logging.error(msg)
+                return False
+
         elif device_capability == (9, 4):
             return True
5662

fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py

Lines changed: 4 additions & 1 deletion
@@ -41,7 +41,9 @@

 running_on_github: bool = os.getenv("GITHUB_ENV") is not None

-if torch.version.hip and supports_float8_fnuz():
+if torch.cuda.is_available() and supports_float8_fnuz(
+    throw_on_hip_incompatibility=(not running_on_github)
+):
     # Supported FP8 format is different on NV and AMD.
     fp8_e4m3: torch.dtype = torch.float8_e4m3fnuz
     fp8_e5m2: torch.dtype = torch.float8_e5m2fnuz
@@ -635,6 +637,7 @@ def test_quantize_int4_bf16_matmul(
         zq_ref = (x @ w.T).to(torch.bfloat16)
         torch.testing.assert_close(zq, zq_ref, atol=1.0e-1, rtol=8.0e-2)

+    @unittest.skipIf(running_on_github, "Test is currently unreliable on GitHub OSS CI")
     @unittest.skipIf(
         not torch.version.cuda and torch.version.hip < "6.2",
         "Skip on AMD with < RoCM 6.2",

fbgemm_gpu/test/quantize/comm_codec_test.py

Lines changed: 4 additions & 0 deletions
@@ -18,6 +18,10 @@


 class QuantizedCommCodecTest(unittest.TestCase):
+    @unittest.skipIf(
+        not torch.cuda.is_available(),
+        "Skip when no GPU is available",
+    )
     @settings(deadline=8000)
     # pyre-ignore
     @given(
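
With the new guard in place, a CPU-only machine should report the case as skipped rather than failed; a sketch of the expected invocation, run from the repository root:

# "-rs" asks pytest to print the skip reason emitted by the new skipIf guard
python -m pytest -v -rs fbgemm_gpu/test/quantize/comm_codec_test.py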
