 import os

 import torch

-from torch._dynamo import is_compiling as dynamo_is_compiling
-from torch._higher_order_ops.out_dtype import out_dtype
+from torchao.quantization.utils import TORCH_VERSION_AFTER_2_2

 try:
-    from torchao.kernel import intmm_triton
+    # Only works for torch 2.2 or newer.
+    if TORCH_VERSION_AFTER_2_2:
+        from torchao.kernel import intmm_triton
+    else:
+        intmm_triton = None
 except ImportError:
+    # On CPU-only builds intmm_triton might not be available.
     intmm_triton = None

 AUTOTUNER_ENABLE = bool(int(os.getenv("TORCHAO_AUTOTUNER_ENABLE", 0)))

-def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
-    # torch.compile path
-    if dynamo_is_compiling() or "FakeTensor" in input.__repr__():
-        return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
-
-    # error checking for cublas path
-    assert (
-        mat2.device == input.device
-    ), f"need both tensors to be on the same device but got {mat2.device} and {input.device}"
-    device_cpu = "cpu" in [mat2.device.type, input.device.type]
-    # with input.shape = [i,j] and mat2.shape = [j,k]
-    i_is_strictly_greater_than_16 = input.shape[0] > 16
-    j_is_nonzero_multiple_of_8 = (input.shape[1] % 8 == 0) and (input.shape[1] > 0)
-    k_is_nonzero_multiple_of_8 = (mat2.shape[1] % 8 == 0) and (mat2.shape[1] > 0)
-    bad_dimensions_for_cublas = not (
-        i_is_strictly_greater_than_16
-        and j_is_nonzero_multiple_of_8
-        and k_is_nonzero_multiple_of_8
-    )
-
-    if device_cpu or bad_dimensions_for_cublas:
-        # fallback path
-        return torch.matmul(input.cpu().to(torch.int32), mat2.cpu().to(torch.int32)).to(
-            input.device.type
+# torch._int_mm doesn't exist before 2.2.
+if TORCH_VERSION_AFTER_2_2:
+    from torch._dynamo import is_compiling as dynamo_is_compiling
+    from torch._higher_order_ops.out_dtype import out_dtype
+
+    def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
+        # torch.compile path
+        if dynamo_is_compiling() or "FakeTensor" in input.__repr__():
+            return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
+
+        # error checking for cublas path
+        assert (
+            mat2.device == input.device
+        ), f"need both tensors to be on the same device but got {mat2.device} and {input.device}"
+        device_cpu = "cpu" in [mat2.device.type, input.device.type]
+        # with input.shape = [i,j] and mat2.shape = [j,k]
+        i_is_strictly_greater_than_16 = input.shape[0] > 16
+        j_is_nonzero_multiple_of_8 = (input.shape[1] % 8 == 0) and (input.shape[1] > 0)
+        k_is_nonzero_multiple_of_8 = (mat2.shape[1] % 8 == 0) and (mat2.shape[1] > 0)
+        bad_dimensions_for_cublas = not (
+            i_is_strictly_greater_than_16
+            and j_is_nonzero_multiple_of_8
+            and k_is_nonzero_multiple_of_8
         )
-
-    # cublas paths
-    if not mat2.is_contiguous():  # silently gives incorrect result without this
-        mat2 = mat2.contiguous()
-    if (not input.is_contiguous()) and (
-        input.shape[0] % 8 != 0
-    ):  # gives cryptic error without this
-        input = (
-            input.contiguous()
-        )  # (it seems the transpose makes cublas check the above j constraint on i)
-    return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
+
+        if device_cpu or bad_dimensions_for_cublas:
+            # fallback path
+            return torch.matmul(input.cpu().to(torch.int32), mat2.cpu().to(torch.int32)).to(
+                input.device.type
+            )
+
+        # cublas paths
+        if not mat2.is_contiguous():  # silently gives incorrect result without this
+            mat2 = mat2.contiguous()
+        if (not input.is_contiguous()) and (
+            input.shape[0] % 8 != 0
+        ):  # gives cryptic error without this
+            input = (
+                input.contiguous()
+            )  # (it seems the transpose makes cublas check the above j constraint on i)
+        return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
+else:
+    def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
+        # We can improve on this by writing Triton code that works for older
+        # versions of Triton that ship with torch 2.1 or 2.0.
+        return torch.matmul(input.to(torch.float32), mat2.to(torch.float32)).to(torch.int32)


 def int_matmul(a, b):
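
For reference, a minimal usage sketch of the two-path safe_int_mm after this change. It assumes the diff applies to torchao/kernel/intmm.py, so the import path below is an assumption rather than something shown in the diff; on every path the result is an int32 tensor.

import torch
from torchao.kernel.intmm import safe_int_mm  # assumed module path

a = torch.randint(-128, 128, (32, 64), dtype=torch.int8)
b = torch.randint(-128, 128, (64, 32), dtype=torch.int8)

c = safe_int_mm(a, b)
assert c.dtype == torch.int32

# Exactness check: with j = 64 every dot product stays far below 2**24,
# so even the pre-2.2 float32 fallback loses no precision on int8 inputs.
ref = torch.matmul(a.to(torch.int32), b.to(torch.int32))
assert torch.equal(c, ref)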
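The cuBLAS eligibility check in the torch 2.2+ branch reduces to a small shape predicate for an [i, j] @ [j, k] multiply. The helper below is hypothetical (it is not part of the diff) and only restates those conditions:

def cublas_int_mm_shapes_ok(i: int, j: int, k: int) -> bool:
    # Mirrors the checks in safe_int_mm: i strictly greater than 16,
    # j and k nonzero multiples of 8.
    return i > 16 and j > 0 and j % 8 == 0 and k > 0 and k % 8 == 0

assert cublas_int_mm_shapes_ok(32, 64, 32)
assert not cublas_int_mm_shapes_ok(16, 64, 32)  # i == 16 is not strictly greater
assert not cublas_int_mm_shapes_ok(32, 60, 32)  # j is not a multiple of 8

Shapes that fail this predicate, like CPU tensors, take the int32-on-CPU fallback path instead of erroring.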