
Commit 56fb1b7

Additional regression tests for cpu-only (#84)
1 parent 62c7871 commit 56fb1b7

6 files changed (+137, -77 lines)

6 files changed

+137
-77
lines changed

.github/workflows/regression_test.yml

Lines changed: 52 additions & 0 deletions
@@ -53,6 +53,58 @@ jobs:
           pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
 
 
+      - name: Install package
+        run: |
+          pip install .
+
+      - name: Run tests
+        run: |
+          pytest test
+
+  test-cpu:
+    runs-on: 32-core-ubuntu
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.9
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -r dev-requirements.txt
+          pip install torch --index-url https://download.pytorch.org/whl/cpu
+
+
+      - name: Install package
+        run: |
+          pip install .
+
+      - name: Run tests
+        run: |
+          pytest test
+
+  test-nightly-cpu:
+    runs-on: 32-core-ubuntu
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.9
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -r dev-requirements.txt
+          pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+
+
       - name: Install package
         run: |
           pip install .

test/dtypes/test_nf4.py

Lines changed: 2 additions & 0 deletions
@@ -184,6 +184,7 @@ def test_to_bfloat16(self):
         assert type(inpt_tensor_nf4.to(torch.bfloat16)) == torch.Tensor
         assert inpt_tensor_nf4.to(torch.bfloat16).dtype == torch.bfloat16
 
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_smoketest_linear(self):
         a = torch.randn(32, 32, dtype=torch.bfloat16, device='cuda')
         a_nf4 = torchao.dtypes.to_nf4(a, 16, 2)
@@ -192,6 +193,7 @@ def test_smoketest_linear(self):
         out2 = torch.nn.functional.linear(inp, a_nf4)
 
     @unittest.skipIf(torch.__version__.split('+')[0] == '2.2.1', "Broken on stable.")
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_smoketest_linear_compile(self):
         a = torch.randn(32, 32, dtype=torch.bfloat16, device='cuda')
         a_nf4 = torchao.dtypes.to_nf4(a, 16, 2)

test/kernel/test_autotuner.py

Lines changed: 8 additions & 10 deletions
@@ -27,41 +27,39 @@ def tearDown(self):
 
     @parameterized.expand(
         [
-            ("cuda", torch.bfloat16),
             ("cuda", torch.bfloat16),
             # TODO: ("cpu", torch.bfloat16),
             ("cuda", torch.float16),
-            ("cuda", torch.float16),
             # TODO: ("cpu", torch.float16),
         ]
     )
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_int_mm(self, device, dtype):
-        from torchao.kernel import intmm_triton
+        from torchao.kernel import intmm
 
         dtype = torch.bfloat16
         m, k, n = (128, 64, 16)
         x = torch.randn(m, k, dtype=dtype, device=device)
         w = torch.randn(n, k, dtype=dtype, device=device).t()
         x_int = x.to(dtype=torch.int8)
         w_int = w.to(dtype=torch.int8)
-        out32_1 = intmm_triton.safe_int_mm(x_int, w_int)
+        out32_1 = intmm.safe_int_mm(x_int, w_int)
         assert out32_1.dtype == torch.int32
-        out32_2 = intmm_triton.int_matmul(x_int, w_int)
+        out32_2 = intmm.int_matmul(x_int, w_int)
         assert out32_2.dtype == out32_1.dtype
         torch.testing.assert_allclose(out32_1, out32_2)
 
     @parameterized.expand(
         [
-            ("cuda", torch.bfloat16),
             ("cuda", torch.bfloat16),
             # TODO: ("cpu", torch.bfloat16),
             ("cuda", torch.float16),
-            ("cuda", torch.float16),
             # TODO: ("cpu", torch.float16),
         ]
     )
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_int_scaled_mm(self, device, dtype):
-        from torchao.kernel import intmm_triton
+        from torchao.kernel import intmm
 
         dtype = torch.bfloat16
         m, k, n = (128, 64, 16)
@@ -70,9 +68,9 @@ def test_int_scaled_mm(self, device, dtype):
         w = torch.randn(n, k, dtype=dtype, device=device).t()
         x_int = x.to(dtype=torch.int8)
         w_int = w.to(dtype=torch.int8)
-        out32_1 = intmm_triton.safe_int_mm(x_int, w_int) * scales
+        out32_1 = intmm.safe_int_mm(x_int, w_int) * scales
         assert out32_1.dtype == torch.bfloat16
-        out32_2 = intmm_triton.int_scaled_matmul(x_int, w_int, scales)
+        out32_2 = intmm.int_scaled_matmul(x_int, w_int, scales)
         assert out32_2.dtype == out32_1.dtype
         torch.testing.assert_allclose(out32_1, out32_2)
 
torchao/kernel/intmm.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+import itertools
+import os
+import torch
+
+from torch._dynamo import is_compiling as dynamo_is_compiling
+from torch._higher_order_ops.out_dtype import out_dtype
+
+try:
+    from torchao.kernel import intmm_triton
+except ImportError:
+    intmm_triton = None
+
+AUTOTUNER_ENABLE = bool(int(os.getenv("TORCHAO_AUTOTUNER_ENABLE", 0)))
+
+def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
+    # torch.compile path
+    if dynamo_is_compiling() or "FakeTensor" in input.__repr__():
+        return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
+
+    # error checking for cublas path
+    assert (
+        mat2.device == input.device
+    ), f"need both tensors to be on the same device but got {mat2.device} and {input.device}"
+    device_cpu = "cpu" in [mat2.device.type, input.device.type]
+    # with input.shape = [i,j] and mat2.shape = [j,k]
+    i_is_strictly_greater_than_16 = input.shape[0] > 16
+    j_is_nonzero_multiple_of_8 = (input.shape[1] % 8 == 0) and (input.shape[1] > 0)
+    k_is_nonzero_multiple_of_8 = (mat2.shape[1] % 8 == 0) and (mat2.shape[1] > 0)
+    bad_dimensions_for_cublas = not (
+        i_is_strictly_greater_than_16
+        and j_is_nonzero_multiple_of_8
+        and k_is_nonzero_multiple_of_8
+    )
+
+    if device_cpu or bad_dimensions_for_cublas:
+        # fallback path
+        return torch.matmul(input.cpu().to(torch.int32), mat2.cpu().to(torch.int32)).to(
+            input.device.type
+        )
+
+    # cublas paths
+    if not mat2.is_contiguous():  # silently gives incorrect result without this
+        mat2 = mat2.contiguous()
+    if (not input.is_contiguous()) and (
+        input.shape[0] % 8 != 0
+    ):  # gives cryptic error without this
+        input = (
+            input.contiguous()
+        )  # (it seems the transpose makes cublas check the above j constraint on i)
+    return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
+
+
+def int_matmul(a, b):
+    if intmm_triton is not None and AUTOTUNER_ENABLE:
+        return torch.ops.torchao.int_matmul(a, b)
+    return safe_int_mm(a, b)
+
+
+def int_scaled_matmul(a, b, scales1):
+    assert a.is_contiguous(), "Matrix A must be contiguous"
+    assert b.transpose(0, 1).is_contiguous(), "Matrix B must be transpose contiguous"
+    M, K = a.shape
+    K, N = b.shape
+    assert M == scales1.size(0)
+    assert 1 == scales1.size(1)
+    assert scales1.is_contiguous()
+    assert scales1.dtype == torch.bfloat16
+    scales1 = scales1.expand((M, N))
+    assert scales1.dim() == 2
+    if intmm_triton is not None and AUTOTUNER_ENABLE:
+        return torch.ops.torchao.int_scaled_matmul(a, b, scales1)
+
+    c = safe_int_mm(a, b)
+    return c * scales1
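The new module dispatches to the Triton ops only when they imported successfully and TORCHAO_AUTOTUNER_ENABLE is set; otherwise everything funnels through safe_int_mm, which has an explicit CPU fallback. A hedged usage sketch of int_scaled_matmul on a CPU-only box, assuming the autotuner flag is unset (shapes and the per-row bfloat16 scale column are illustrative, chosen to satisfy the asserts above):

import torch
from torchao.kernel import intmm

a = torch.randint(-128, 127, (32, 64), dtype=torch.int8)      # contiguous int8 activations
b = torch.randint(-128, 127, (16, 64), dtype=torch.int8).t()  # (64, 16), transpose-contiguous weights
scales = torch.rand(32, 1).to(torch.bfloat16)                 # one bfloat16 scale per output row

out = intmm.int_scaled_matmul(a, b, scales)  # int32 matmul via the CPU fallback, then scaled
print(out.shape, out.dtype)                  # torch.Size([32, 16]) torch.bfloat16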

torchao/kernel/intmm_triton.py

Lines changed: 0 additions & 66 deletions
@@ -5,13 +5,9 @@
 
 import triton
 import triton.language as tl
-from torch._dynamo import is_compiling as dynamo_is_compiling
-from torch._higher_order_ops.out_dtype import out_dtype
 
 from torchao.kernel.autotuner import get_best_config_fn
 
-AUTOTUNER_ENABLE = bool(int(os.getenv("TORCHAO_AUTOTUNER_ENABLE", 0)))
-
 int8_powers_of_two = [32, 64, 128, 256]
 int8_mm_kernel_configs = sum(
     [
@@ -338,50 +334,6 @@ def int_matmul_cuda(a, b):
     return int_matmul_kernel(a, b, c, best_config)
 
 
-def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
-    # torch.compile path
-    if dynamo_is_compiling() or "FakeTensor" in input.__repr__():
-        return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
-
-    # error checking for cublas path
-    assert (
-        mat2.device == input.device
-    ), f"need both tensors to be on the same device but got {mat2.device} and {input.device}"
-    device_cpu = "cpu" in [mat2.device.type, input.device.type]
-    # with input.shape = [i,j] and mat2.shape = [j,k]
-    i_is_strictly_greater_than_16 = input.shape[0] > 16
-    j_is_nonzero_multiple_of_8 = (input.shape[1] % 8 == 0) and (input.shape[1] > 0)
-    k_is_nonzero_multiple_of_8 = (mat2.shape[1] % 8 == 0) and (mat2.shape[1] > 0)
-    bad_dimensions_for_cublas = not (
-        i_is_strictly_greater_than_16
-        and j_is_nonzero_multiple_of_8
-        and k_is_nonzero_multiple_of_8
-    )
-
-    if device_cpu or bad_dimensions_for_cublas:
-        # fallback path
-        return torch.matmul(input.cpu().to(torch.int32), mat2.cpu().to(torch.int32)).to(
-            input.device.type
-        )
-
-    # cublas paths
-    if not mat2.is_contiguous():  # silently gives incorrect result without this
-        mat2 = mat2.contiguous()
-    if (not input.is_contiguous()) and (
-        input.shape[0] % 8 != 0
-    ):  # gives cryptic error without this
-        input = (
-            input.contiguous()
-        )  # (it seems the transpose makes cublas check the above j constraint on i)
-    return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
-
-
-def int_matmul(a, b):
-    if AUTOTUNER_ENABLE:
-        return torch.ops.torchao.int_matmul(a, b)
-    return safe_int_mm(a, b)
-
-
 @torch.library.impl(lib, "int_scaled_matmul", "Meta")
 def int_scaled_matmul_meta(a, b, scales1):
     M, K = a.shape
@@ -404,21 +356,3 @@ def int_scaled_matmul_cuda(a, b, scales1):
         int_scaled_matmul_kernel, [a, b, scales1, c], int8_mm_kernel_configs
     )
     return int_scaled_matmul_kernel(a, b, scales1, c, best_config)
-
-
-def int_scaled_matmul(a, b, scales1):
-    assert a.is_contiguous(), "Matrix A must be contiguous"
-    assert b.transpose(0, 1).is_contiguous(), "Matrix B must be transpose contiguous"
-    M, K = a.shape
-    K, N = b.shape
-    assert M == scales1.size(0)
-    assert 1 == scales1.size(1)
-    assert scales1.is_contiguous()
-    assert scales1.dtype == torch.bfloat16
-    scales1 = scales1.expand((M, N))
-    assert scales1.dim() == 2
-    if AUTOTUNER_ENABLE:
-        return torch.ops.torchao.int_scaled_matmul(a, b, scales1)
-
-    c = safe_int_mm(a, b)
-    return c * scales1

torchao/quantization/quant_primitives.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib
 from torch.library import impl
 
-from torchao.kernel.intmm_triton import int_scaled_matmul
+from torchao.kernel.intmm import int_scaled_matmul
 from .utils import TORCH_VERSION_AFTER_2_4
 
