
Commit 0babda5

Merge pull request #43 from pytorch-labs/nf4to1
to_nf4 and support for _to_copy
2 parents ebde5e6 + 40a95c5 commit 0babda5

File tree: 5 files changed, +46 -17 lines


test/dtypes/test_uint4.py

Lines changed: 2 additions & 2 deletions

@@ -18,7 +18,7 @@
     compute_error,
 )
 from torchao.quantization.quant_api import (
-    replace_with_custom_fn_if_matches_filter,
+    _replace_with_custom_fn_if_matches_filter,
 )
 from torch.ao.quantization.observer import ObserverBase
 from torch import nn
@@ -36,7 +36,7 @@ def fn(mod):
         mod.weight = torch.nn.Parameter(PerChannelSymmetricWeightUInt4Tensor.from_float(mod.weight), requires_grad=False)
         return mod

-    replace_with_custom_fn_if_matches_filter(
+    _replace_with_custom_fn_if_matches_filter(
         model,
         lambda mod: fn(mod),
         lambda mod, fqn: isinstance(mod, torch.nn.Linear),

test/modules/test_nf4_linear.py

Lines changed: 20 additions & 13 deletions

@@ -4,7 +4,7 @@
 import torch
 from torch import nn
 from torch.testing._internal.common_utils import TestCase
-from torchao.dtypes.nf4tensor import linear_nf4, NF4Tensor
+from torchao.dtypes.nf4tensor import linear_nf4, NF4Tensor, to_nf4
 import torch.nn.functional as F
 import io
 from collections import OrderedDict
@@ -48,7 +48,7 @@ class TestNF4Linear(TestCase):
     class TestMod(nn.Module):
         def __init__(self, tensor, block_size, scaler_block_size):
             super().__init__()
-            self.param = torch.nn.Parameter(NF4Tensor.from_tensor(tensor, block_size, scaler_block_size))
+            self.param = torch.nn.Parameter(to_nf4(tensor, block_size, scaler_block_size))

     def save_state_dict_to_buffer(self, state_dict: OrderedDict):
         buffer = io.BytesIO()
@@ -57,9 +57,7 @@ def save_state_dict_to_buffer(self, state_dict: OrderedDict):
         return buffer

     def test_register_nf4_as_param(self):
-        nf4_tensor = NF4Tensor.from_tensor(
-            inpt_tensor=torch.randn(512, 512, dtype=torch.bfloat16)
-        )
+        nf4_tensor = to_nf4(torch.randn(512, 512, dtype=torch.bfloat16))

         # Would raise if nn.Parameter registration fails, such as no detach()
         # impl when calling __torch_dispatch__
@@ -69,18 +67,14 @@ def test_register_nf4_as_param(self):
     def test_output_bf16(self):
         # Test to ensure W4 A16 produces A16
         inp = torch.randn(2, 512, dtype=torch.bfloat16, requires_grad=True)
-        nf4_tensor = NF4Tensor.from_tensor(
-            inpt_tensor=torch.randn(512, 512, dtype=torch.bfloat16)
-        )
+        nf4_tensor = to_nf4(torch.randn(512, 512, dtype=torch.bfloat16))
         out = linear_nf4(input=inp, weight=nf4_tensor)
         assert out.dtype == torch.bfloat16

     def test_backward_bf16(self):
         # Test to ensure backward pass gives activation a bf16 gradient and no gradient
         # to the linear's weight, as it is frozen.
-        nf4_tensor = NF4Tensor.from_tensor(
-            inpt_tensor=torch.randn(512, 512, dtype=torch.bfloat16)
-        )
+        nf4_tensor = to_nf4(torch.randn(512, 512, dtype=torch.bfloat16))
         inp = torch.randn(2, 512, dtype=torch.bfloat16, requires_grad=True)
         linear_nf4(inp, nf4_tensor).sum().backward()
         assert inp.grad is not None and inp.grad.dtype == torch.bfloat16
@@ -94,7 +88,7 @@ def test_reconstruction_qlora_vs_bnb(self):
         device = "cuda"
         embed_dim = 512
         input_weight = _build_input_weight(embed_dim, device)
-        nf4_weight = NF4Tensor.from_tensor(input_weight)
+        nf4_weight = to_nf4(input_weight)
         bnb_linear = _build_bnb_linear(input_weight, device)
         bnb_reconstruction = bnb_linear(
             torch.eye(embed_dim, embed_dim, dtype=torch.bfloat16, device=device)
@@ -118,7 +112,7 @@ def test_nf4_bnb_linear(self):
         dim = 512
         device = "cuda"
         input_weight = _build_input_weight(dim, device)
-        nf4_weight = NF4Tensor.from_tensor(input_weight)
+        nf4_weight = to_nf4(input_weight)
         bnb_linear = _build_bnb_linear(input_weight, device)

         inp = torch.randn(2, 512, dtype=torch.bfloat16, device="cuda")
@@ -170,5 +164,18 @@ def test_load_from_nf4_diff_meta(self):
         assert other_mod.param.block_size == 64
         assert other_mod.param.scaler_block_size == 1

+    def test_to_copy(self):
+        inpt_tensor = torch.rand(128, device='cpu')
+        inpt_tensor_nf4 = to_nf4(inpt_tensor, 32, 2)
+        inpt_tensor_bfloat16 = inpt_tensor_nf4.to(torch.bfloat16)
+        torch.testing.assert_allclose(inpt_tensor, inpt_tensor_bfloat16, atol=0.13, rtol=0.13)
+
+        if torch.cuda.is_available():
+            inpt_tensor = torch.rand(128, device='cuda')
+            inpt_tensor_nf4 = to_nf4(inpt_tensor, 32, 2)
+            inpt_tensor_bfloat16 = inpt_tensor_nf4.to(torch.bfloat16)
+            torch.testing.assert_allclose(inpt_tensor, inpt_tensor_bfloat16, atol=0.13, rtol=0.13)
+
+
 if __name__ == "__main__":
     unittest.main()
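
As context for the test changes above: the suite now builds its NF4 weights through the new to_nf4 helper instead of calling NF4Tensor.from_tensor directly, since from_tensor's block sizes are no longer defaulted. A minimal sketch of the before/after pattern, assuming torch and this branch of torchao are installed:

    import torch
    from torchao.dtypes.nf4tensor import NF4Tensor, linear_nf4, to_nf4

    weight = torch.randn(512, 512, dtype=torch.bfloat16)

    # Old pattern: the classmethod, whose block sizes are now required arguments.
    nf4_old = NF4Tensor.from_tensor(weight, 64, 256)

    # New pattern: to_nf4 casts to bfloat16 and fills in the 64/256 defaults.
    nf4_new = to_nf4(weight)

    # Either weight works with linear_nf4; W4 A16 keeps the bf16 activation dtype.
    inp = torch.randn(2, 512, dtype=torch.bfloat16, requires_grad=True)
    out = linear_nf4(input=inp, weight=nf4_new)
    assert out.dtype == torch.bfloat16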

torchao/__init__.py

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+from . import dtypes
+
+__all__ = [
+    "dtypes"
+]

torchao/dtypes/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -1,5 +1,8 @@
+from .nf4tensor import NF4Tensor, to_nf4
 from .uint4 import UInt4Tensor

 __all__ = [
+    "NF4Tensor",
+    "to_nf4",
     "UInt4Tensor"
 ]
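
With both __init__.py changes above, the NF4 pieces are re-exported at the package level, so (assuming the package is installed from this branch) imports along these lines should resolve:

    # Package-level re-exports added in this commit.
    from torchao import dtypes
    from torchao.dtypes import NF4Tensor, UInt4Tensor, to_nf4

    assert "NF4Tensor" in dtypes.__all__ and "to_nf4" in dtypes.__all__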

torchao/dtypes/nf4tensor.py

Lines changed: 16 additions & 2 deletions

@@ -36,6 +36,10 @@ def decorator(func):
 def noop_detach(func, *args, **kwargs):
     return args[0][0]

+@implements([torch.ops.aten._to_copy.default])
+def _to_copy(func, *args, **kwargs):
+    return args[0][0].get_original_weight().to(args[1]['dtype'])
+

 @implements(
     [
@@ -164,8 +168,8 @@ def __init__(
     def from_tensor(
         cls,
         inpt_tensor: torch.Tensor,
-        block_size: int = 64,
-        scaler_block_size: int = 256,
+        block_size: int,
+        scaler_block_size: int,
     ):
         assert inpt_tensor.dtype == torch.bfloat16
         assert (
@@ -452,6 +456,10 @@ def __tensor_unflatten__(inner_tensors: Dict, metadata, outer_size, outer_stride
             inner_tensors["nf4"],
         )

+
+    def __str__(self):
+        return self.to(torch.float32).__str__()
+
     @classmethod
     def __torch_dispatch__(cls, func, types, args, kwargs=None):
         """TODO we are not supporting torch dispatch at the moment
@@ -501,3 +509,9 @@ def linear_nf4(input: torch.Tensor, weight: NF4Tensor) -> torch.Tensor:
         weight: NF4Tensor weight
     """
     return LinearNF4.apply(input, weight)
+
+
+def to_nf4(tensor,
+           block_size: int = 64,
+           scaler_block_size: int = 256):
+    tensor1 = tensor.to(torch.bfloat16)
+    return NF4Tensor.from_tensor(tensor1, block_size, scaler_block_size)
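
Taken together, the two additions in this file give a quantize/dequantize round trip: to_nf4 casts its input to bfloat16 and quantizes it, while the new aten._to_copy override lets .to(dtype) dequantize through get_original_weight(). A small sketch mirroring the new test_to_copy, with the block sizes and tolerances taken from that test:

    import torch
    from torchao.dtypes import to_nf4

    inpt_tensor = torch.rand(128, device="cpu")

    # Quantize with block_size=32 and scaler_block_size=2, as in the test.
    nf4 = to_nf4(inpt_tensor, 32, 2)

    # .to(dtype) dispatches to the _to_copy override, which reconstructs the
    # weight via get_original_weight() and casts it to the requested dtype.
    restored = nf4.to(torch.bfloat16)

    # NF4 is lossy, so the comparison uses the loose tolerances from the test.
    torch.testing.assert_allclose(inpt_tensor, restored, atol=0.13, rtol=0.13)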
