
Commit ed361ff
[Reland] ROCm CI (Infra + Skips) (#1581)
This PR skips the unit tests that currently fail on ROCm and adds the infra changes needed to enable ROCm CI. **NOTE:** This PR enables ROCm CI testing for torchao _only for pushes to the main branch_. The ROCm tests should start showing up here once this PR is merged: https://hud.pytorch.org/hud/pytorch/ao/main/1?per_page=50&name_filter=regression

Torchao PRs can also trigger the ROCm CI runs using the `ciflow/rocm` PR label (#1749). Enabling ROCm CI testing on *all* torchao PRs will be done in a follow-up PR.

This pull request introduces the `skip_if_rocm` decorator across various test files to skip tests that are not yet supported on ROCm. The changes ensure that tests are conditionally skipped when ROCm is detected, improving the test suite's compatibility across environments.

# Key changes include:

### Cherry-pick ROCm CI infra changes from #999

### Configure the workflow to trigger ROCm CI only on pushes to the main branch, or on PRs carrying the `ciflow/rocm` label

### Introduction of the `skip_if_rocm` decorator

* Added the `skip_if_rocm` import in multiple test files to conditionally skip tests not supported on ROCm: `test/dtypes/test_affine_quantized.py`, `test/dtypes/test_floatx.py`, `test/float8/test_base.py`, `test/hqq/test_hqq_affine.py`, `test/integration/test_integration.py`, `test/kernel/test_galore_downproj.py`, `test/prototype/test_awq.py`, `test/prototype/test_low_bit_optim.py`, `test/prototype/test_splitk.py`, `test/quantization/test_galore_quant.py`, `test/quantization/test_marlin_qqq.py`, `test/sparsity/test_marlin.py`, `test/test_ops.py`, `test/test_s8s4_linear_cutlass.py`, `torchao/utils.py`

### Application of the `skip_if_rocm` decorator

* Applied `@skip_if_rocm("ROCm development in progress")` to multiple test functions to skip them when running on ROCm: `test/dtypes/test_affine_quantized.py`, `test/dtypes/test_floatx.py`, `test/float8/test_base.py`, `test/hqq/test_hqq_affine.py`, `test/integration/test_integration.py`, `test/kernel/test_galore_downproj.py`, `test/prototype/test_awq.py`, `test/prototype/test_low_bit_optim.py`, `test/prototype/test_splitk.py`, `test/quantization/test_galore_quant.py`, `test/quantization/test_marlin_qqq.py`, `test/sparsity/test_marlin.py`

### Module-level skips for ROCm

* Added module-level skips in `test/test_ops.py` and `test/test_s8s4_linear_cutlass.py` so that every test in those modules is skipped when ROCm is detected.
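The decorator itself lives in `torchao/utils.py`, whose hunk is not shown in this extract. A minimal sketch of the pattern, assuming only the behavior the tests rely on (the actual torchao implementation may differ in its details):

```python
import functools

import pytest
import torch


def skip_if_rocm(message=None):
    """Skip the decorated test when running on a ROCm build of PyTorch.

    torch.version.hip is a version string on ROCm builds and None on
    CUDA/CPU builds, so it doubles as a ROCm detector.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if torch.version.hip is not None:
                pytest.skip(message or "Skipping the test in ROCm")
            return func(*args, **kwargs)

        return wrapper

    return decorator
```

Applied as `@skip_if_rocm("ROCm enablement in progress")`, this skips the test at call time on ROCm and is a no-op everywhere else.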
1 parent 878ec7a commit ed361ff

25 files changed: +153 -13 lines changed
Lines changed: 49 additions & 0 deletions

```yaml
name: Run Regression Tests on ROCm

on:
  push:
    branches:
      - main
    tags:
      - ciflow/rocm/*

concurrency:
  group: regression_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
  test-nightly:
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: ROCM Nightly
            runs-on: linux.rocm.gpu.torchao
            torch-spec: '--pre torch==2.7.0.dev20250122 --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
            gpu-arch-type: "rocm"
            gpu-arch-version: "6.3"

    permissions:
      id-token: write
      contents: read
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 120
      no-sudo: ${{ matrix.gpu-arch-type == 'rocm' }}
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        conda create -n venv python=3.9 -y
        conda activate venv
        python -m pip install --upgrade pip
        pip install ${{ matrix.torch-spec }}
        pip install -r dev-requirements.txt
        pip install .
        export CONDA=$(dirname $(dirname $(which conda)))
        export LD_LIBRARY_PATH=$CONDA/lib/:$LD_LIBRARY_PATH
        pytest test --verbose -s
```
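For context on the `tags: ciflow/rocm/*` trigger above: applying the `ciflow/rocm` label to a PR causes PyTorch's CI bot to push a tag matching that pattern, which is presumably how the label mentioned in the commit message maps onto this workflow. The same trigger can be exercised by hand; a sketch, with `1234` standing in for a hypothetical PR number:

```sh
# Pushing any tag that matches ciflow/rocm/* fires the workflow's `tags` trigger.
git tag ciflow/rocm/1234
git push origin ciflow/rocm/1234
```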

test/dtypes/test_affine_quantized.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -25,6 +25,7 @@
     TORCH_VERSION_AT_LEAST_2_6,
     is_fbcode,
     is_sm_at_least_89,
+    skip_if_rocm,
 )

 is_cusparselt_available = (
@@ -104,6 +105,7 @@ def test_tensor_core_layout_transpose(self):
         "apply_quant",
         get_quantization_functions(is_cusparselt_available, True, "cuda", True),
     )
+    @skip_if_rocm("ROCm enablement in progress")
     def test_weights_only(self, apply_quant):
         linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
         if isinstance(apply_quant, AOBaseConfig):
@@ -196,6 +198,7 @@ def apply_uint6_weight_only_quant(linear):
         "apply_quant", get_quantization_functions(is_cusparselt_available, True)
     )
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_print_quantized_module(self, apply_quant):
         linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
         if isinstance(apply_quant, AOBaseConfig):
@@ -213,6 +216,7 @@ class TestAffineQuantizedBasic(TestCase):

     @common_utils.parametrize("device", COMMON_DEVICES)
     @common_utils.parametrize("dtype", COMMON_DTYPES)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_flatten_unflatten(self, device, dtype):
         if device == "cuda" and dtype == torch.bfloat16 and is_fbcode():
             raise unittest.SkipTest("TODO: Failing for cuda + bfloat16 in fbcode")
```

test/dtypes/test_affine_quantized_tensor_parallel.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -1,5 +1,6 @@
 import unittest

+import pytest
 import torch
 from torch.distributed._tensor import DeviceMesh, DTensor, Replicate, Shard
 from torch.testing._internal import common_utils
@@ -27,6 +28,9 @@
 except ModuleNotFoundError:
     has_gemlite = False

+if torch.version.hip is not None:
+    pytest.skip("Skipping the test in ROCm", allow_module_level=True)
+

 class TestAffineQuantizedTensorParallel(DTensorTestBase):
     """Basic test case for tensor subclasses"""
```

test/dtypes/test_floatx.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -27,7 +27,7 @@
     fpx_weight_only,
     quantize_,
 )
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, is_fbcode
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, is_fbcode, skip_if_rocm

 _DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
 _Floatx_DTYPES = [(3, 2), (2, 2)]
@@ -109,6 +109,7 @@ def test_to_copy_device(self, ebits, mbits):
     @parametrize("bias", [False, True])
     @parametrize("dtype", [torch.half, torch.bfloat16])
     @unittest.skipIf(is_fbcode(), reason="broken in fbcode")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_fpx_weight_only(self, ebits, mbits, bias, dtype):
         N, OC, IC = 4, 256, 64
         device = "cuda"
```

test/dtypes/test_nf4.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -33,6 +33,7 @@
     nf4_weight_only,
     to_nf4,
 )
+from torchao.utils import skip_if_rocm

 bnb_available = False

@@ -111,6 +112,7 @@ def test_backward_dtype_match(self, dtype: torch.dtype):

     @unittest.skipIf(not bnb_available, "Need bnb availble")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
     def test_reconstruction_qlora_vs_bnb(self, dtype: torch.dtype):
         # From https://github.com/drisspg/transformer_nuggets/blob/f05afad68ad9086d342268f46a7f344617a02314/test/test_qlora.py#L65C1-L81C47
@@ -133,6 +135,7 @@ def test_reconstruction_qlora_vs_bnb(self, dtype: torch.dtype):

     @unittest.skipIf(not bnb_available, "Need bnb availble")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
     def test_nf4_bnb_linear(self, dtype: torch.dtype):
         """
```

test/dtypes/test_uint4.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -28,7 +28,7 @@
 from torchao.quantization.quant_api import (
     _replace_with_custom_fn_if_matches_filter,
 )
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, skip_if_rocm


 def _apply_weight_only_uint4_quant(model):
@@ -92,6 +92,7 @@ def test_basic_tensor_ops(self):
         # only test locally
         # print("x:", x[0])

+    @skip_if_rocm("ROCm enablement in progress")
     def test_gpu_quant(self):
         for x_shape in [[2, 4], [5, 5, 5, 4], [1, 4, 4]]:
             x = torch.randn(*x_shape)
@@ -104,6 +105,7 @@ def test_gpu_quant(self):
         # make sure it runs
         opt(x)

+    @skip_if_rocm("ROCm enablement in progress")
     def test_pt2e_quant(self):
         from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
             QuantizationConfig,
```

test/float8/test_base.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -18,6 +18,7 @@
     TORCH_VERSION_AT_LEAST_2_5,
     is_sm_at_least_89,
     is_sm_at_least_90,
+    skip_if_rocm,
 )

 if not TORCH_VERSION_AT_LEAST_2_5:
@@ -426,6 +427,7 @@ def test_linear_from_config_params(
    @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
    @pytest.mark.parametrize("linear_bias", [True, False])
    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    @skip_if_rocm("ROCm enablement in progress")
    def test_linear_from_recipe(
        self,
        recipe_name,
```

test/float8/test_float8_utils.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -4,7 +4,7 @@
 import torch

 from torchao.float8.float8_utils import _round_scale_down_to_power_of_2
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, skip_if_rocm

 if not TORCH_VERSION_AT_LEAST_2_5:
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
@@ -30,6 +30,7 @@
         # ("largest subnormal number", [2**-126 * (1 - 2**-23), 1.1754943508222875e-38]),
     ],
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_round_scale_down_to_power_of_2_valid_inputs(
     test_case: dict,
 ):
```

test/float8/test_fsdp2/test_fsdp2.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -43,6 +43,9 @@
 if not is_sm_at_least_89():
     pytest.skip("Unsupported CUDA device capability version", allow_module_level=True)

+if torch.version.hip is not None:
+    pytest.skip("ROCm enablement in progress", allow_module_level=True)
+

 class TestFloat8Common:
     def broadcast_module(self, module: nn.Module) -> None:
```

test/hqq/test_hqq_affine.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -11,6 +11,7 @@
 )
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_3,
+    skip_if_rocm,
 )

 cuda_available = torch.cuda.is_available()
@@ -109,6 +110,7 @@ def test_hqq_plain_5bit(self):
             ref_dot_product_error=0.000704,
         )

+    @skip_if_rocm("ROCm enablement in progress")
     def test_hqq_plain_4bit(self):
         self._test_hqq(
             dtype=torch.uint4,
```