add device option

kylesayrs · kylesayrs · commit fbaf47aa785b · 2025-06-11T17:12:50.000-04:00
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
diff --git a/src/compressed_tensors/transform/factory/hadamard.py b/src/compressed_tensors/transform/factory/hadamard.py
@@ -60,7 +60,7 @@ def create_transform(self, module: Module, args: TransformArgs):
 
     def _create_weight(self, size: int, dtype: dtype, device: device) -> Parameter:
         data = deterministic_hadamard_matrix(size, dtype=dtype)
-        data = data.to(dtype=dtype, device=device)
+        data = data.to(device=device)
         return Parameter(data, requires_grad=self.scheme.requires_grad)
 
 
diff --git a/src/compressed_tensors/transform/factory/random_hadamard.py b/src/compressed_tensors/transform/factory/random_hadamard.py
@@ -30,5 +30,5 @@ class RandomHadamardFactory(HadamardFactory):
 
     def _create_weight(self, size: int, dtype: dtype, device: device) -> Parameter:
         data = random_hadamard_matrix(size, dtype=dtype, gen=self.generator)
-        data = data.to(dtype=dtype, device=device)
+        data = data.to(device=device)
         return Parameter(data, requires_grad=self.scheme.requires_grad)
diff --git a/src/compressed_tensors/transform/utils/hadamard.py b/src/compressed_tensors/transform/utils/hadamard.py
@@ -31,7 +31,9 @@
 
 
 def deterministic_hadamard_matrix(
-    size: int, dtype: torch.dtype = torch.bfloat16
+    size: int,
+    dtype: torch.dtype = torch.bfloat16,
+    device: torch.device = torch.device("cpu"),
 ) -> torch.Tensor:
     """
     Construct an n-by-n Hadamard matrix, using Sylvester's construction.
@@ -49,7 +51,7 @@ def deterministic_hadamard_matrix(
     if size != 2**log2:
         raise ValueError("Cannot construct deterministic hadamard of size != 2^n")
 
-    H = torch.tensor([[1]], dtype=dtype)
+    H = torch.tensor([[1]], dtype=dtype, device=device)
 
     # Sylvester's construction
     for _ in range(0, log2):
@@ -61,6 +63,7 @@ def deterministic_hadamard_matrix(
 def random_hadamard_matrix(
     size: int,
     dtype: torch.dtype = torch.bfloat16,
+    device: torch.device = torch.device("cpu"),
     gen: Optional[torch.Generator] = None,
 ) -> torch.Tensor:
     """
@@ -75,7 +78,9 @@ def random_hadamard_matrix(
     :return: randomly generated hadamard matrix
     """
     # Benefits: support other shapes / non powers of 2, support randomization
-    Q = torch.randint(low=0, high=2, size=(size,), generator=gen, dtype=dtype)
+    Q = torch.randint(
+        low=0, high=2, size=(size,), generator=gen, dtype=dtype, device=device
+    )
     Q = Q * 2 - 1
     Q = torch.diag(Q)
     return _matmul_hadU(Q) / math.sqrt(size)
@@ -86,16 +91,18 @@ def is_pow2(n: int) -> bool:
 
 
 def _get_known_divisor(
-    n: int, dtype: torch.dtype, file_path: str = REPO_PATH
+    n: int,
+    dtype: torch.dtype,
+    device: torch.device = torch.device("cpu"),
+    file_path: str = REPO_PATH,
 ) -> Optional[torch.Tensor]:
     """
     Fetch a known hadamard matrix from the given file path. The returned matrix will
     be of of size `k` such that `n / k` is a power of two. Return None if no such
     matrix exists.
 
     Note: This function reopens the safetensors file every time it is called.
-    This is inefficient, but inconsequential because hadamards are typically
-    cached by size through the factory that produced them. This is also simpler
+    This is technically inefficient, but the runtime cost is small and this is simpler
     than forcing callers to manage the file open context
 
     :param n: size of known hadamard matrix
@@ -105,17 +112,18 @@ def _get_known_divisor(
         divisors = sorted([int(key) for key in file.keys()], reverse=True)
         for divisor in divisors:
             if n % divisor == 0 and is_pow2(n // divisor):
-                return file.get_tensor(str(divisor)).to(dtype=dtype)
+                return file.get_tensor(str(divisor)).to(dtype=dtype, device=device)
 
     return None
 
 
 def _matmul_hadU(X: torch.Tensor) -> torch.Tensor:
-    size = X.shape[-1]
+    size = X.size(0)
     dtype = X.dtype
+    device = X.device
 
     # Check if we have the determined hadamard matrix
-    hadK = _get_known_divisor(size, dtype)
+    hadK = _get_known_divisor(size, dtype, device=device)
     if hadK is None:
         raise ValueError(f"Cannot construct random hadamard matrix of size {size}")
     K = hadK.size(0)
@@ -130,6 +138,7 @@ def _matmul_hadU(X: torch.Tensor) -> torch.Tensor:
         output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
         output = output.view(input.shape[0], input.shape[1], -1)
         (input, output) = (output, input)
+    assert input.shape[1] == K
     del output
 
     # Do not explicitly repeat - OOM
diff --git a/tests/test_transform/utils/test_hadamard.py b/tests/test_transform/utils/test_hadamard.py
@@ -32,19 +32,22 @@
     3584,  # qwen_2_5_vl
     3840,  # qwen_2_5_vl vision qkv
     4096,  # llama3
+    7168,  # deepseek_v3
     14336,  # llama3 intermediate
+    18432,  # deepseek_v3 intermediate
     18944,  # qwen_2_5_vl intermediate
 ]
+_atol = 1e-1  # bfloat16 is low precision for large matrices
 
 
 @requires_gpu
 @pytest.mark.parametrize("size", _sizes_to_test)
 def test_random_hadamard_matrix_compliant(size):
     # (H / sqrt(n))(H.T / sqrt(n)) == I
-    with torch.device("cuda"):
-        had_matrix = random_hadamard_matrix(size)
-        product = torch.round(had_matrix @ had_matrix.T)
-        assert torch.allclose(product, torch.eye(size, dtype=product.dtype), atol=1e-5)
+    had_matrix = random_hadamard_matrix(size, device="cuda")
+    product = had_matrix @ had_matrix.T
+    eye = torch.eye(size, dtype=product.dtype, device="cuda")
+    assert torch.allclose(product, eye, atol=_atol)
 
 
 def test_random_hadamard_generator():
@@ -75,13 +78,13 @@ def test_random_hadamard_generator():
 @requires_gpu
 @pytest.mark.parametrize("size", _sizes_to_test)
 def test_deterministic_hadamard_compliant(size):
-    with torch.device("cuda"):
-        if not is_pow2(size):
-            with pytest.raises(ValueError):
-                had_matrix = deterministic_hadamard_matrix(size)
-            return
+    if not is_pow2(size):
+        with pytest.raises(ValueError):
+            matrix = deterministic_hadamard_matrix(size, device="cuda")
+        return
 
-        # (H / sqrt(n))(H.T / sqrt(n)) == I
-        had_matrix = deterministic_hadamard_matrix(size)
-        product = had_matrix @ had_matrix.T
-        assert torch.allclose(product, torch.eye(size, dtype=product.dtype), atol=1e-5)
+    # (H / sqrt(n))(H.T / sqrt(n)) == I
+    matrix = deterministic_hadamard_matrix(size, device="cuda")
+    product = matrix @ matrix.T
+    eye = torch.eye(size, dtype=product.dtype, device="cuda")
+    assert torch.allclose(product, eye, atol=_atol)