Commit 6ee452e

implement head dim
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 0f851d1 commit 6ee452e

File tree

6 files changed: +166 additions, −32 deletions

  src/compressed_tensors/transform/factory/hadamard.py
  src/compressed_tensors/transform/factory/matrix_multiply.py
  src/compressed_tensors/transform/transform_scheme.py
  src/compressed_tensors/transform/utils/matrix.py
  tests/test_transform/conftest.py
  tests/test_transform/factory/test_correctness.py

src/compressed_tensors/transform/factory/hadamard.py
Lines changed: 3 additions & 6 deletions

@@ -52,16 +52,15 @@ def create_transform(self, module: Module, args: TransformArgs):
         :param args: defines how the transform will be applied to the module
         """
         assert isinstance(module, Linear)
-        num_heads = self.scheme.num_heads
-        size = get_matrix_size(module, args.location, num_heads)
+        size = get_matrix_size(module, args.location, self.scheme.head_dim)
         dtype = module.weight.dtype
         device = get_offloaded_device(module)
         exec_device = get_execution_device(module)

         factory_kwargs = {"construct_device": exec_device}
         weight = self.weights.get(size, dtype, device, factory_kwargs=factory_kwargs)
         perm = self.perms[weight] if self.scheme.randomize else None
-        return HadamardTransform(weight, perm, args, num_heads)
+        return HadamardTransform(weight, perm, args)

     def _create_weight(
         self,
@@ -86,13 +85,11 @@ def __init__(
         weight: Parameter,
         perm: Optional[Parameter],
         args: TransformArgs,
-        num_heads: Optional[int],
     ):
         super().__init__()
         self.weight = weight
         self.perm = perm
         self.args = args
-        self.num_heads = num_heads

     def forward(self, value: Tensor) -> Tensor:
         weight = self.weight
@@ -103,4 +100,4 @@ def forward(self, value: Tensor) -> Tensor:
         if self.args.inverse:
             weight = weight.T

-        return apply_transform_weight(weight, value, self.args.location, self.num_heads)
+        return apply_transform_weight(weight, value, self.args.location)
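A note on why the unchanged `weight.T` branch still serves as the inverse here: a Hadamard weight normalized by 1/sqrt(size) is orthogonal, so its transpose is its inverse. A minimal sketch in plain PyTorch (Sylvester construction; this is not the factory's own weight-creation code):

# Minimal sketch: a Hadamard matrix built by the Sylvester recursion and scaled by
# 1/sqrt(size) is orthogonal, which is why the transform can use weight.T as its inverse.
import torch

def sylvester_hadamard(size: int) -> torch.Tensor:
    # size is assumed to be a power of two for this construction
    h = torch.ones(1, 1)
    while h.shape[0] < size:
        h = torch.cat([torch.cat([h, h], dim=1), torch.cat([h, -h], dim=1)], dim=0)
    return h / size ** 0.5

weight = sylvester_hadamard(8)
assert torch.allclose(weight @ weight.T, torch.eye(8), atol=1e-6)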

src/compressed_tensors/transform/factory/matrix_multiply.py
Lines changed: 5 additions & 11 deletions

@@ -51,16 +51,15 @@ def create_transform(self, module: Module, args: TransformArgs):
         :param args: defines how the transform will be applied to the module
         """
         assert isinstance(module, Linear)
-        num_heads = self.scheme.num_heads
-        size = get_matrix_size(module, args.location, num_heads)
+        size = get_matrix_size(module, args.location, self.scheme.head_dim)
         dtype = module.weight.dtype
         device = get_offloaded_device(module)

         weight = self.weights[size, dtype, device]
         if args.inverse:
             weight = self.inverses[weight]

-        return RandomMatrixTransform(weight, args, num_heads)
+        return RandomMatrixTransform(weight, args)

     def _create_weight(self, size: int, dtype: dtype, device: device) -> Parameter:
         # TODO: verify that weight is invertible (has non-zero determinant)
@@ -75,22 +74,17 @@ def _create_inverse(self, weight: Parameter) -> Parameter:


 class RandomMatrixTransform(TransformBase):
-    def __init__(self, weight: Tensor, args: TransformArgs, num_heads: Optional[int]):
+    def __init__(self, weight: Tensor, args: TransformArgs):
         super().__init__()
         self.weight = weight  # is an inverse if args.inverse
         self.args = args
-        self.num_heads = num_heads

     def forward(self, value: Tensor) -> Parameter:
-        return apply_transform_weight(
-            self.weight, value, self.args.location, self.num_heads
-        )
+        return apply_transform_weight(self.weight, value, self.args.location)

     def right_inverse(self, value: Tensor) -> Tensor:
         inverse = high_precision_invert(self.weight)
-        return apply_transform_weight(
-            inverse, value, self.args.location, self.num_heads
-        )
+        return apply_transform_weight(inverse, value, self.args.location)


 def high_precision_invert(weight: Tensor) -> Tensor:
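The `forward` / `right_inverse` pair above should cancel each other. A minimal round-trip sketch in plain PyTorch, with the inverse taken in float64 in the spirit of `high_precision_invert` (the tensor sizes and stand-in weight are illustrative only):

# Round-trip sketch: apply a random square transform at an "input"-style location,
# then undo it with a high-precision inverse; the result should match the original.
import torch

torch.manual_seed(0)
weight = torch.randn(16, 16)          # stand-in for a random transform weight
value = torch.randn(4, 16)            # stand-in activations

transformed = value @ weight          # forward: multiply by the transform weight
inverse = torch.linalg.inv(weight.to(torch.float64)).to(weight.dtype)
recovered = transformed @ inverse     # right_inverse: multiply by the inverse

assert torch.allclose(recovered, value, atol=1e-3)  # loose tolerance for float32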

src/compressed_tensors/transform/transform_scheme.py
Lines changed: 1 addition & 1 deletion

@@ -40,4 +40,4 @@ class TransformScheme(BaseModel):
     apply: List[TransformArgs] = Field(default_factory=list)
     randomize: bool = Field(default=False)
     requires_grad: bool = Field(default=False)
-    num_heads: Optional[int] = Field(default=None)
+    head_dim: Optional[int] = Field(default=None)
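For reference, the new `head_dim` field is exercised by the tests added in this commit. A hedged usage sketch, assuming these names are exported from `compressed_tensors.transform` as the test files import them; the `v_proj`/`o_proj` targets and `head_dim=128` are illustrative:

# Usage sketch: a scheme with head_dim set builds a head_dim x head_dim transform
# that is applied per head, instead of one full-width matrix per module.
from compressed_tensors.transform import (
    TransformArgs,
    TransformConfig,
    TransformScheme,
    apply_transform_config,
)

config = TransformConfig(
    config_groups={
        "": TransformScheme(
            type="hadamard",
            head_dim=128,  # per-head transform size, replacing the old num_heads field
            apply=[
                TransformArgs(targets="v_proj", location="weight_output"),
                TransformArgs(targets="o_proj", location="weight_input", inverse=True),
            ],
        )
    }
)
# apply_transform_config(model, config)  # fuse into an existing torch model, as in the tests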

src/compressed_tensors/transform/utils/matrix.py
Lines changed: 26 additions & 14 deletions

@@ -24,38 +24,36 @@
 def get_matrix_size(
     module: torch.nn.Module,
     location: TransformLocation,
-    num_heads: Optional[int] = None,
+    head_dim: Optional[int] = None,
 ) -> int:
     """
     Determine the size of a matrix given its location on the module

     :param module: module that matrix will be applied to
     :param location: location on module
+    :param head_dim: size of the transform matrix when it is applied per attention head
     :return: size of matrix
     """
     assert isinstance(module, torch.nn.Linear)

-    if location in ("input", TransformLocation.WEIGHT_INPUT):
+    if location in (TransformLocation.INPUT, TransformLocation.WEIGHT_INPUT):
         size = module.in_features
     else:
         size = module.out_features

-    if num_heads is not None:
-        assert size % num_heads == 0
-        size = size // num_heads
+    if head_dim is not None:
+        assert size % head_dim == 0
+        return head_dim

-    return size
+    else:
+        return size


 def apply_transform_weight(
     weight: torch.Tensor,
     value: torch.Tensor,
     location: TransformLocation,
-    num_heads: Optional[int] = None,
 ) -> torch.Tensor:
-    if num_heads is not None:
-        weight = weight.repeat((num_heads, num_heads))
-
     return apply_transform_weight_linear(weight, value, location)


@@ -99,17 +97,31 @@ def apply_transform_weight_linear(
     :param location: determines how weight should be applied
     :return: value after transform weight has been applied
     """
+    value_shape = value.shape
+    weight_size = weight.shape[0]
+    assert weight.shape[0] == weight.shape[1]
+
     if location == TransformLocation.INPUT:
-        return value @ weight
+        num_heads = value_shape[1] // weight_size
+        value = value.reshape(value_shape[0], num_heads, weight_size)
+        ret = value @ weight

     elif location == TransformLocation.WEIGHT_INPUT:
-        return value @ weight.T
+        num_heads = value_shape[1] // weight_size
+        value = value.reshape(value_shape[0], num_heads, weight_size)
+        ret = value @ weight.T

     elif location == TransformLocation.WEIGHT_OUTPUT:
-        return weight.T @ value
+        num_heads = value_shape[0] // weight_size
+        value = value.reshape(num_heads, weight_size, value_shape[1])
+        ret = weight.T @ value

     elif location == TransformLocation.OUTPUT:
-        return value @ weight
+        num_heads = value_shape[1] // weight_size
+        value = value.reshape(value_shape[0], num_heads, weight_size)
+        ret = value @ weight

     else:
         raise NotImplementedError(f"{location} has not been implemented yet")
+
+    return ret.reshape(value_shape)
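The reshape above is what makes a single `head_dim x head_dim` weight act per head. A minimal numerical sketch in plain PyTorch showing that the per-head matmul equals multiplying the flat value by a block-diagonal matrix that repeats the weight once per head (shapes are illustrative):

# Per-head application via reshape == block-diagonal application on the flat value.
import torch

torch.manual_seed(0)
batch, num_heads, head_dim = 3, 4, 8
weight = torch.randn(head_dim, head_dim)
value = torch.randn(batch, num_heads * head_dim)

# per-head application, as in the INPUT / OUTPUT branches above
per_head = (value.reshape(batch, num_heads, head_dim) @ weight).reshape(batch, -1)

# equivalent full-width application with a block-diagonal weight
flat = value @ torch.block_diag(*([weight] * num_heads))

assert torch.allclose(per_head, flat, atol=1e-5)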

tests/test_transform/conftest.py
Lines changed: 56 additions & 0 deletions

@@ -33,6 +33,62 @@ def forward(self, x):
         return x


+class MockAttention(torch.nn.Module):
+    def __init__(
+        self, hidden_size: int, num_attention_heads: int, num_key_value_heads: int
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        self.num_key_value_groups = num_attention_heads // num_key_value_heads
+        self.head_dim = hidden_size // num_attention_heads
+        self.scaling = self.head_dim**-0.5
+
+        self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)
+        self.k_proj = torch.nn.Linear(
+            hidden_size, num_key_value_heads * self.head_dim, bias=False
+        )
+        self.v_proj = torch.nn.Linear(
+            hidden_size, num_key_value_heads * self.head_dim, bias=False
+        )
+        self.o_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_len, hidden_size = hidden_states.shape
+        hidden_shape = (batch_size, seq_len, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        key_states = self.repeat_kv(key_states, self.num_key_value_groups)
+        value_states = self.repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = (
+            torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
+        )
+
+        attn_weights = torch.nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        attn_output = attn_output.reshape((batch_size, seq_len, -1)).contiguous()
+
+        return self.o_proj(attn_output)
+
+    def repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+        batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+        if n_rep == 1:
+            return hidden_states
+        hidden_states = hidden_states[:, :, None, :, :].expand(
+            batch, num_key_value_heads, n_rep, slen, head_dim
+        )
+        return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
 @pytest.fixture(scope="function")
 def model_apply():
     model = TransformableModel(2, 4, 8, 16, 32, 64)
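A quick shape-check sketch for the fixture above, with illustrative sizes; it only verifies that `MockAttention` maps `(batch, seq_len, hidden_size)` back to the same shape, with grouped key/value heads expanded by `repeat_kv`:

# Smoke test for MockAttention (hypothetical sizes, not part of the commit).
import torch
from tests.test_transform.conftest import MockAttention

attention = MockAttention(hidden_size=64, num_attention_heads=8, num_key_value_heads=2)
hidden_states = torch.rand(2, 5, 64)
assert attention(hidden_states).shape == (2, 5, 64)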

tests/test_transform/factory/test_correctness.py
Lines changed: 75 additions & 0 deletions

@@ -22,6 +22,7 @@
     apply_transform_config,
 )
 from compressed_tensors.utils import offloaded_dispatch
+from tests.test_transform.conftest import MockAttention
 from tests.testing_utils import requires_accelerate, requires_gpu


@@ -87,3 +88,77 @@ def test_correctness_model(type, randomized, model_apply, offload=False):
 @pytest.mark.parametrize("randomized", (True, False))
 def test_correctness_model_offload(type, randomized, model_apply):
     test_correctness_model(type, randomized, model_apply, offload=True)
+
+
+@pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
+@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("head_dim", (16, 32))
+def test_correctness_heads(type, randomized, head_dim, offload=False):
+    hidden_size = 64
+
+    model = torch.nn.ModuleDict(
+        {
+            "v_proj": torch.nn.Linear(hidden_size, hidden_size, bias=False),
+            "o_proj": torch.nn.Linear(hidden_size, hidden_size, bias=False),
+        }
+    )
+
+    input = torch.rand(17, 5, hidden_size)
+    true_output = model.o_proj(model.v_proj(input))
+
+    config = TransformConfig(
+        config_groups={
+            "": TransformScheme(
+                type=type,
+                randomized=randomized,
+                head_dim=head_dim,
+                apply=[
+                    TransformArgs(targets="v_proj", location="weight_output"),
+                    TransformArgs(
+                        targets="o_proj", location="weight_input", inverse=True
+                    ),
+                ],
+            )
+        }
+    )
+    apply_transform_config(model, config)
+
+    output = model.o_proj(model.v_proj(input))
+    assert torch.allclose(true_output, output, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
+@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("head_dim", (8,))  # (8, 16))
+def test_correctness_attention_heads(type, randomized, head_dim, offload=False):
+    hidden_size = 4096
+    num_attention_heads = 32
+
+    attention = MockAttention(
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        num_key_value_heads=head_dim,
+    )
+
+    input = torch.rand(17, 5, hidden_size)
+    true_output = attention(input)
+
+    config = TransformConfig(
+        config_groups={
+            "": TransformScheme(
+                type=type,
+                randomized=randomized,
+                head_dim=head_dim,
+                apply=[
+                    TransformArgs(targets="v_proj", location="weight_output"),
+                    TransformArgs(
+                        targets="o_proj", location="weight_input", inverse=True
+                    ),
+                ],
+            )
+        }
+    )
+    apply_transform_config(attention, config)
+
+    output = attention(input)
+    assert torch.allclose(true_output, output, atol=1e-5, rtol=0.0)
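The invariance these tests rely on can be checked directly: fusing a per-head transform into `v_proj`'s output weight and its inverse into `o_proj`'s input weight leaves `o_proj(v_proj(x))` unchanged. A minimal sketch in plain PyTorch (float64, illustrative sizes; the fusion directions follow the `weight_output`/`weight_input` branches of `apply_transform_weight_linear` above):

# Fused v_proj/o_proj invariance with a per-head (block-diagonal) transform.
import torch

torch.manual_seed(0)
hidden_size, head_dim = 64, 16
num_heads = hidden_size // head_dim

x = torch.randn(3, hidden_size, dtype=torch.float64)
w_v = torch.randn(hidden_size, hidden_size, dtype=torch.float64)  # v_proj weight
w_o = torch.randn(hidden_size, hidden_size, dtype=torch.float64)  # o_proj weight

# the same head_dim x head_dim transform for every head == one block-diagonal matrix
t = torch.block_diag(*([torch.randn(head_dim, head_dim, dtype=torch.float64)] * num_heads))
t_inv = torch.linalg.inv(t)

baseline = (x @ w_v.T) @ w_o.T     # no transforms applied
fused_v = t.T @ w_v                # weight_output fusion: x @ fused_v.T == (x @ w_v.T) @ t
fused_o = w_o @ t_inv.T            # inverse weight_input fusion: z @ fused_o.T == z @ t_inv @ w_o.T
transformed = (x @ fused_v.T) @ fused_o.T

assert torch.allclose(baseline, transformed, atol=1e-6)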
