Commit 145b1df

Merge remote-tracking branch 'origin/main' into nf4to1
2 parents 9f3c6ae + ebde5e6

2 files changed: 85 additions, 1 deletion

test/modules/test_nf4_linear.py

Lines changed: 49 additions & 1 deletion
@@ -6,7 +6,8 @@
 from torch.testing._internal.common_utils import TestCase
 from torchao.dtypes.nf4tensor import linear_nf4, NF4Tensor
 import torch.nn.functional as F
-
+import io
+from collections import OrderedDict

 bnb_available = False

@@ -44,6 +45,16 @@ def _build_bnb_linear(input_weight, device):


 class TestNF4Linear(TestCase):
+    class TestMod(nn.Module):
+        def __init__(self, tensor, block_size, scaler_block_size):
+            super().__init__()
+            self.param = torch.nn.Parameter(NF4Tensor.from_tensor(tensor, block_size, scaler_block_size))
+
+    def save_state_dict_to_buffer(self, state_dict: OrderedDict):
+        buffer = io.BytesIO()
+        torch.save(state_dict, buffer)
+        buffer.seek(0)
+        return buffer

     def test_register_nf4_as_param(self):
         nf4_tensor = NF4Tensor.from_tensor(
@@ -121,6 +132,43 @@ def test_nf4_bnb_linear(self):
         assert err_native < 0.5 * dim
         assert err_bnb < 0.5 * dim

+    @unittest.skipIf(not torch.cuda.is_available(), "Need cuda for test")
+    def test_load_from_bfloat16(self):
+        """Tests loading to and from different module state dicts"""
+        inpt_tensor = torch.rand(64, device='cuda', dtype=torch.bfloat16)
+        base_mod = self.TestMod(inpt_tensor, 32, 2)
+
+        bf16_dummy_dict = {"param": inpt_tensor}
+        base_mod.load_state_dict(bf16_dummy_dict)
+
+        assert base_mod.param.block_size == 32
+        assert base_mod.param.scaler_block_size == 2
+
+    @unittest.skipIf(not torch.cuda.is_available(), "Need cuda for test")
+    def test_load_from_nf4_same_meta(self):
+        """Tests loading to and from different module state dicts"""
+        inpt_tensor = torch.rand(64, device='cuda', dtype=torch.bfloat16)
+        base_mod = self.TestMod(inpt_tensor, 32, 2)
+        state_dict = base_mod.state_dict()
+        saved_state_dict = self.save_state_dict_to_buffer(state_dict)
+
+        other_mod = self.TestMod(inpt_tensor, 32, 2)
+        other_mod.load_state_dict(torch.load(saved_state_dict))
+        assert other_mod.param.block_size == 32
+        assert other_mod.param.scaler_block_size == 2
+
+    @unittest.skipIf(not torch.cuda.is_available(), "Need cuda for test")
+    def test_load_from_nf4_diff_meta(self):
+        """Tests loading to and from different module state dicts"""
+        inpt_tensor = torch.rand(128, device='cuda', dtype=torch.bfloat16)
+        base_mod = self.TestMod(inpt_tensor, 32, 2)
+        state_dict = base_mod.state_dict()
+        saved_state_dict = self.save_state_dict_to_buffer(state_dict)
+
+        other_mod = self.TestMod(inpt_tensor, 64, 1)
+        other_mod.load_state_dict(torch.load(saved_state_dict))
+        assert other_mod.param.block_size == 64
+        assert other_mod.param.scaler_block_size == 1

 if __name__ == "__main__":
     unittest.main()
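For orientation (not part of the commit): the three new tests round-trip an NF4-quantized parameter through a module state dict, both from a plain bfloat16 tensor and from NF4Tensors with matching or mismatching quantization metadata. A minimal sketch of that flow, under the same assumptions as the tests (CUDA available, the NF4Tensor API shown above); the module name here is illustrative, not from the commit:

import io
import torch
import torch.nn as nn
from torchao.dtypes.nf4tensor import NF4Tensor

class NF4Holder(nn.Module):  # illustrative stand-in for the TestMod defined above
    def __init__(self, tensor, block_size, scaler_block_size):
        super().__init__()
        # Wrapping the NF4Tensor in a Parameter is what lets load_state_dict
        # copy into it later.
        self.param = nn.Parameter(NF4Tensor.from_tensor(tensor, block_size, scaler_block_size))

weight = torch.rand(128, device="cuda", dtype=torch.bfloat16)
src = NF4Holder(weight, 32, 2)
dst = NF4Holder(weight, 64, 1)

# Save the NF4 state dict to an in-memory buffer, then load it into a module
# whose parameter was quantized with different block sizes (mirrors the tests'
# save_state_dict_to_buffer / torch.load usage).
buffer = io.BytesIO()
torch.save(src.state_dict(), buffer)
buffer.seek(0)
dst.load_state_dict(torch.load(buffer))

# The destination parameter keeps its own quantization metadata.
assert dst.param.block_size == 64 and dst.param.scaler_block_size == 1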

torchao/dtypes/nf4tensor.py

Lines changed: 36 additions & 0 deletions
@@ -13,6 +13,14 @@
 NF4_OPS_TABLE: Dict[Any, Any] = {}


+def same_metadata(a: "NF4Tensor", b: "NF4Tensor"):
+    both_nf4 = isinstance(a, NF4Tensor) and isinstance(b, NF4Tensor)
+    return (
+        both_nf4 and
+        a.block_size == b.block_size
+        and a.scaler_block_size == b.scaler_block_size
+        and a.n_blocks == b.n_blocks
+    )

 def implements(aten_ops):
     """Use this decorator to implement a function for an aten op in __torch_dispatch__"""
@@ -33,6 +41,34 @@ def _to_copy(func, *args, **kwargs):
     return args[0][0].get_original_weight().to(args[1]['dtype'])


+@implements(
+    [
+        aten.copy_.default,
+    ]
+)
+def copy_(func, *args, **kwargs):
+    original: NF4Tensor = args[0][0]
+    copy_in: torch.Tensor = args[0][1]
+
+    # Base case: metadata matches, so copy the quantized inner tensors directly
+    if same_metadata(original, copy_in):
+        original_tensors = original.__tensor_flatten__()[0]
+        for tensor_name in original_tensors:
+            getattr(original, tensor_name).copy_(getattr(copy_in, tensor_name))
+        return
+
+    # Source is not an NF4Tensor: quantize it with the destination's metadata, then copy
+    if not isinstance(copy_in, NF4Tensor):
+        copy_in_nf4 = NF4Tensor.from_tensor(copy_in, original.block_size, original.scaler_block_size)
+        return original.copy_(copy_in_nf4)
+
+    # Source is an NF4Tensor with different metadata: dequantize and requantize to match
+    full_precision = copy_in.get_original_weight()
+    same_meta_nf4 = NF4Tensor.from_tensor(
+        full_precision, original.block_size, original.scaler_block_size
+    )
+    return original.copy_(same_meta_nf4)
+
 @dataclass
 class SubclassTensorArgs:
     original_shape: torch.Size
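For context on how the new op is exercised: when load_state_dict copies a saved tensor into an NF4-backed Parameter, the copy dispatches to aten.copy_.default and lands in the handler above. A minimal sketch of the three branches, assuming the NF4Tensor API shown in this diff and a CUDA device (the variable names are illustrative, not from the commit):

import torch
from torchao.dtypes.nf4tensor import NF4Tensor

values = torch.rand(128, device="cuda", dtype=torch.bfloat16)

dst = NF4Tensor.from_tensor(values, 32, 2)   # destination: block_size=32, scaler_block_size=2
same = NF4Tensor.from_tensor(values, 32, 2)  # same metadata as dst
diff = NF4Tensor.from_tensor(values, 64, 1)  # different metadata from dst

dst.copy_(same)  # base case: inner tensors copied directly
dst.copy_(diff)  # metadata mismatch: dequantized, then requantized to dst's layout
dst.copy_(torch.rand(128, device="cuda", dtype=torch.bfloat16))  # non-NF4 source: quantized first

# In every branch the destination keeps its own quantization metadata.
assert dst.block_size == 32 and dst.scaler_block_size == 2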
