
Commit 5a887f4

construct on execution device, cache on offload device
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent fbaf47a

File tree: 5 files changed, +37 -15 lines

src/compressed_tensors/transform/factory/hadamard.py (13 additions, 4 deletions)

@@ -22,7 +22,7 @@
     apply_transform_weight,
     get_matrix_size,
 )
-from compressed_tensors.utils import get_offloaded_device
+from compressed_tensors.utils import get_execution_device, get_offloaded_device
 from compressed_tensors.utils.helpers import ParameterizedDefaultDict
 from torch import Tensor, device, dtype
 from torch.nn import Linear, Module, Parameter
@@ -41,6 +41,7 @@ class HadamardFactory(TransformFactory):
     def __init__(self, name: str, scheme: TransformScheme, seed: Optional[int] = None):
         super().__init__(name, scheme, seed)
         self.weights = ParameterizedDefaultDict(self._create_weight)
+        self._exec_device = torch.device("cpu")

     def create_transform(self, module: Module, args: TransformArgs):
         """
@@ -54,12 +55,20 @@ def create_transform(self, module: Module, args: TransformArgs):
         size = get_matrix_size(module, args.location)
         dtype = module.weight.dtype
         device = get_offloaded_device(module)
+        exec_device = get_execution_device(module)

-        weight = self.weights[size, dtype, device]
+        weight = self.weights.get(size, dtype, device, construct_device=exec_device)
         return HadamardTransform(weight, args)

-    def _create_weight(self, size: int, dtype: dtype, device: device) -> Parameter:
-        data = deterministic_hadamard_matrix(size, dtype=dtype)
+    def _create_weight(
+        self,
+        size: int,
+        dtype: dtype,
+        device: device,
+        construct_device: device,
+    ) -> Parameter:
+        # construct on execution device, cache on offload device
+        data = deterministic_hadamard_matrix(size, dtype, construct_device)
         data = data.to(device=device)
         return Parameter(data, requires_grad=self.scheme.requires_grad)
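The factory now builds the matrix once on the fast execution device, then moves it to the offload device where it is cached; RandomHadamardFactory below follows the same shape. A minimal standalone sketch of the pattern, with torch.eye standing in for the Hadamard construction and build_and_offload as a hypothetical helper name:

import torch
from torch.nn import Parameter

def build_and_offload(
    size: int,
    dtype: torch.dtype,
    device: torch.device,  # offload device, where the weight is cached
    construct_device: torch.device,  # execution device, where it is built
) -> Parameter:
    # Build on the (typically faster) execution device...
    data = torch.eye(size, dtype=dtype, device=construct_device)
    # ...then move to the offload device so it does not sit in accelerator memory
    data = data.to(device=device)
    return Parameter(data, requires_grad=False)

exec_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weight = build_and_offload(64, torch.float32, torch.device("cpu"), exec_device)
print(weight.device)  # cpu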

src/compressed_tensors/transform/factory/random_hadamard.py (9 additions, 2 deletions)

@@ -28,7 +28,14 @@ class RandomHadamardFactory(HadamardFactory):
     :param seed: random seed used to transform weight randomization
     """

-    def _create_weight(self, size: int, dtype: dtype, device: device) -> Parameter:
-        data = random_hadamard_matrix(size, dtype=dtype, gen=self.generator)
+    def _create_weight(
+        self,
+        size: int,
+        dtype: dtype,
+        device: device,
+        construct_device: device,
+    ) -> Parameter:
+        # construct on execution device, cache on offload device
+        data = random_hadamard_matrix(size, dtype, construct_device, self.generator)
         data = data.to(device=device)
         return Parameter(data, requires_grad=self.scheme.requires_grad)

src/compressed_tensors/transform/utils/hadamard.py (5 additions, 4 deletions)

@@ -71,16 +71,17 @@ def random_hadamard_matrix(
     See https://cornell-relaxml.github.io/quip-sharp/ ,
     Section "Randomized Hadamard Transformation"

+    Improves upon deterministic_hadamard_matrix
+    in that this supports non powers of 2 and random seeds
+
     Adapted from https://github.com/facebookresearch/SpinQuant/blob/main/utils/hadamard_utils.py  # noqa: E501

     :param size: The dimension of the hadamard matrix
     :param gen: Optional generator for random values
     :return: randomly generated hadamard matrix
     """
-    # Benefits: support other shapes / non powers of 2, support randomization
-    Q = torch.randint(
-        low=0, high=2, size=(size,), generator=gen, dtype=dtype, device=device
-    )
+    Q = torch.randint(low=0, high=2, size=(size,), generator=gen, dtype=dtype)  # cpu
+    Q = Q.to(device=device)
    Q = Q * 2 - 1
    Q = torch.diag(Q)
    return _matmul_hadU(Q) / math.sqrt(size)
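The randint call is deliberately kept on the CPU: gen is typically a CPU torch.Generator, which cannot be handed to a sampling call that targets a CUDA device, and sampling on CPU also makes a given seed yield the same signs regardless of the final device. A small sketch of the same sample-then-move step:

import torch

# Seeded sampling happens on the CPU generator; moving afterwards keeps the
# result identical no matter which device the matrix ultimately lives on.
gen = torch.Generator().manual_seed(42)
Q = torch.randint(low=0, high=2, size=(8,), generator=gen, dtype=torch.float32)

# Moving after sampling sidesteps the generator/device mismatch that
# passing device="cuda" directly would cause with a CPU generator.
device = "cuda" if torch.cuda.is_available() else "cpu"
Q = Q.to(device=device)
print(Q * 2 - 1)  # random signs in {-1, +1}, reproducible per seed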

src/compressed_tensors/utils/helpers.py (8 additions, 3 deletions)

@@ -373,11 +373,16 @@ class ParameterizedDefaultDict(dict):

     def __init__(self, default_factory: Callable[[Any], Any]):
         self.default_factory = default_factory
+        self._kwargs = {}

-    def __missing__(self, key):
+    def __missing__(self, key: Any) -> Any:
         if isinstance(key, tuple):
-            value = self.default_factory(*key)
+            value = self.default_factory(*key, **self._kwargs)
         else:
-            value = self.default_factory(key)
+            value = self.default_factory(key, **self._kwargs)
         self[key] = value
         return value
+
+    def get(self, *args, **kwargs) -> Any:
+        with patch_attr(self, "_kwargs", kwargs):
+            return self[args]
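ParameterizedDefaultDict.get threads keyword arguments to the factory through a temporarily patched attribute, while only the positional arguments form the cache key; construct_device therefore influences only the first construction of each (size, dtype, device) entry. Note also that this get shadows dict.get's usual (key, default) signature. A self-contained sketch of the mechanism, using a stand-in for compressed_tensors' patch_attr whose exact signature is assumed here:

from contextlib import contextmanager
from typing import Any, Callable

@contextmanager
def patch_attr(obj: Any, name: str, value: Any):
    # Stand-in for compressed_tensors' patch_attr: temporarily set an
    # attribute, restoring the previous value on exit.
    old = getattr(obj, name)
    setattr(obj, name, value)
    try:
        yield
    finally:
        setattr(obj, name, old)

class ParameterizedDefaultDict(dict):
    def __init__(self, default_factory: Callable[..., Any]):
        self.default_factory = default_factory
        self._kwargs = {}

    def __missing__(self, key: Any) -> Any:
        # kwargs reach the factory on a cache miss but are not part of the key
        if isinstance(key, tuple):
            value = self.default_factory(*key, **self._kwargs)
        else:
            value = self.default_factory(key, **self._kwargs)
        self[key] = value
        return value

    def get(self, *args: Any, **kwargs: Any) -> Any:
        with patch_attr(self, "_kwargs", kwargs):
            return self[args]

cache = ParameterizedDefaultDict(lambda a, b, scale=1: (a + b) * scale)
print(cache.get(2, 3, scale=10))  # 50: factory runs with scale=10
print(cache.get(2, 3, scale=99))  # still 50: key (2, 3) is already cached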

tests/test_transform/utils/test_hadamard.py (2 additions, 2 deletions)

@@ -44,8 +44,8 @@
 @pytest.mark.parametrize("size", _sizes_to_test)
 def test_random_hadamard_matrix_compliant(size):
     # (H / sqrt(n))(H.T / sqrt(n)) == I
-    had_matrix = random_hadamard_matrix(size, device="cuda")
-    product = had_matrix @ had_matrix.T
+    matrix = random_hadamard_matrix(size, device="cuda")
+    product = matrix @ matrix.T
     eye = torch.eye(size, dtype=product.dtype, device="cuda")
     assert torch.allclose(product, eye, atol=_atol)
