Commit 180226b
[Transform] Implement multi-headed transforms (#383)
* add utilities
* add tests
* add additional tests
* add utils and tests
* Implement transform factories
* add permutations
* add delete_offload_module
* key inverses by weight
* fix tests
* standardize random hadamard
* prepend input hooks
* apply sqrt division first
* use divided hadamards
* fix typo
* add random option
* use random seeds, rename matrix multiply
* add deterministic generation to random matrix
* fix perm math
* update docstrings
* update docstrings
* cleanup
* cleanup 2
* make seed optional
* remove iterable check and missing return value
* Remove unrelated changes
* simplify code
* implement apply, use in tests
* use hadamards database file
* try manifest
* try setup, update hadamards list
* fix setup
* add docstrings, cleanup
* fix setup, thank you @dbarbuzzi
* remove numpy, add tests
* solidify dtype, add gpu tests
* fix docstring
* add device option
* construct on execution device, cache on offload device
* save construction device changes for later
* construct on execution device, cache on offload device
* cite nja sloane
* remove dreg
* put on device via safe_open
* nits and docstrings
* update docstring
* Merge
* merge with construct: construct in float32
* construct with same dtype, constructing on fp32 found no difference
* remove unnecessary imports
* bugfixes (#375)
* use factory_kwargs
* add frozen dict to deps
* fix style
* merge
* use delete_offload_module
* add docstrign
* use parametrize
* remove random from tests
* implement num_heads
* implement head dim
* add more tests
* clean up reshaping
* code cleanup and simplification
* undo dtype changes
* simplify tests
* rename function
* add docstring
* refactor lambdas to _multihead_matmul function
* multihead_matmul bugfix
* support embeddings (#385)
* more unit test parameterizations

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
Co-authored-by: Brian Dellabetta <bdellabe@redhat.com>
1 parent ecbe770 commit 180226b

8 files changed: +363 additions, -119 deletions

src/compressed_tensors/transform/factory/base.py

Lines changed: 1 addition & 3 deletions

@@ -117,10 +117,8 @@ def input_hook(_, args):
             TransformLocation.WEIGHT_INPUT,
             TransformLocation.WEIGHT_OUTPUT,
         ):
-            assert isinstance(module, torch.nn.Linear)
-            assert module.bias is None
-
             # fuse transform into weight
+            assert hasattr(module, "weight")
             with torch.no_grad(), align_module_device(module):
                 update_offload_parameter(module, "weight", transform(module.weight))
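
A quick sketch (not from the commit) of why fusing into `module.weight` is sufficient here: pairing an orthogonal input-side transform with its inverse fused into the weight at WEIGHT_INPUT leaves a bias-free Linear unchanged. It uses `apply_transform_weight` from the new `transform/utils/matrix.py` shown later in this diff; the sizes are arbitrary.

import torch
from compressed_tensors.transform import TransformLocation
from compressed_tensors.transform.utils.matrix import apply_transform_weight

torch.manual_seed(0)
linear = torch.nn.Linear(8, 4, bias=False)
x = torch.randn(2, 8)

# any orthogonal V works; QR of a random matrix gives one
V, _ = torch.linalg.qr(torch.randn(8, 8))

# transform the activation on the input side, fuse the inverse (V.T) into the weight
x_t = apply_transform_weight(V, x, TransformLocation.INPUT, torch.nn.Linear)
W_t = apply_transform_weight(V.T, linear.weight, TransformLocation.WEIGHT_INPUT, torch.nn.Linear)

# y = x @ W.T is unchanged by the paired transforms
assert torch.allclose(x @ linear.weight.T, x_t @ W_t.T, atol=1e-5)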

src/compressed_tensors/transform/factory/hadamard.py

Lines changed: 15 additions & 8 deletions

@@ -19,9 +19,9 @@
 from compressed_tensors.transform import TransformArgs, TransformScheme
 from compressed_tensors.transform.factory.base import TransformBase, TransformFactory
 from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
-from compressed_tensors.transform.utils.utils import (
+from compressed_tensors.transform.utils.matrix import (
     apply_transform_weight,
-    get_matrix_size,
+    get_transform_size,
 )
 from compressed_tensors.utils import get_execution_device, get_offloaded_device
 from compressed_tensors.utils.helpers import ParameterizedDefaultDict
@@ -52,16 +52,16 @@ def create_transform(self, module: Module, args: TransformArgs):
         :param module: parent module that transform will be applied to
         :param args: defines how the transform will be applied to the module
         """
-        assert isinstance(module, Linear)
-        size = get_matrix_size(module, args.location)
+        assert hasattr(module, "weight")
+        size = get_transform_size(module, args.location, self.scheme.head_dim)
         dtype = module.weight.dtype
         device = get_offloaded_device(module)
         exec_device = get_execution_device(module)
 
         factory_kwargs = {"construct_device": exec_device}
         weight = self.weights.get(size, dtype, device, factory_kwargs=factory_kwargs)
         perm = self.perms[weight] if self.scheme.randomize else None
-        return HadamardTransform(weight, perm, args)
+        return HadamardTransform(weight, perm, args, type(module))
 
     def _create_weight(
         self,
@@ -82,12 +82,17 @@ def _create_permutation(self, weight: Parameter) -> Parameter:
 
 class HadamardTransform(TransformBase):
     def __init__(
-        self, weight: Parameter, perm: Union[Parameter, None], args: TransformArgs
+        self,
+        weight: Parameter,
+        perm: Optional[Parameter],
+        args: TransformArgs,
+        module_type: type[torch.nn.Module],
     ):
         super().__init__()
         self.weight = weight
         self.perm = perm
         self.args = args
+        self.module_type = module_type
         self._scale = math.sqrt(weight.size(0))
 
     def forward(self, value: Tensor) -> Tensor:
@@ -98,5 +103,7 @@ def forward(self, value: Tensor) -> Tensor:
 
         if self.args.inverse:
             weight = weight.T
-
-        return apply_transform_weight(weight, value, self.args.location) / self._scale
+
+        return apply_transform_weight(
+            weight, value, self.args.location, self.module_type
+        ) / self._scale
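
The division by `self._scale` reflects that an unnormalized Hadamard matrix H of size n satisfies H @ H.T = n * I, so H / sqrt(n) is orthonormal and its inverse is its transpose. A minimal sketch (Sylvester construction assumed here for brevity, not the library's `deterministic_hadamard_matrix`):

import torch

# unnormalized Hadamard: H @ H.T = n * I
H2 = torch.tensor([[1.0, 1.0], [1.0, -1.0]])
H4 = torch.kron(H2, H2)          # Sylvester construction, n = 4
Hn = H4 / (4 ** 0.5)             # same normalization as `/ self._scale`

assert torch.allclose(Hn @ Hn.T, torch.eye(4))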

src/compressed_tensors/transform/factory/matrix_multiply.py

Lines changed: 18 additions & 8 deletions

@@ -17,9 +17,9 @@
 import torch
 from compressed_tensors.transform import TransformArgs, TransformScheme
 from compressed_tensors.transform.factory.base import TransformBase, TransformFactory
-from compressed_tensors.transform.utils.utils import (
+from compressed_tensors.transform.utils.matrix import (
     apply_transform_weight,
-    get_matrix_size,
+    get_transform_size,
 )
 from compressed_tensors.utils import get_offloaded_device
 from compressed_tensors.utils.helpers import ParameterizedDefaultDict
@@ -50,16 +50,16 @@ def create_transform(self, module: Module, args: TransformArgs):
         :param module: parent module that transform will be applied to
         :param args: defines how the transform will be applied to the module
         """
-        assert isinstance(module, Linear)
-        size = get_matrix_size(module, args.location)
+        assert hasattr(module, "weight")
+        size = get_transform_size(module, args.location, self.scheme.head_dim)
         dtype = module.weight.dtype
         device = get_offloaded_device(module)
 
         weight = self.weights[size, dtype, device]
         if args.inverse:
             weight = self.inverses[weight]
 
-        return RandomMatrixTransform(weight, args)
+        return RandomMatrixTransform(weight, args, type(module))
 
     def _create_weight(self, size: int, dtype: dtype, device: device) -> Parameter:
         # TODO: verify that weight is invertible (has non-zero determinant)
@@ -74,17 +74,27 @@ def _create_inverse(self, weight: Parameter) -> Parameter:
 
 
 class RandomMatrixTransform(TransformBase):
-    def __init__(self, weight: Tensor, args: TransformArgs):
+    def __init__(
+        self,
+        weight: Tensor,
+        args: TransformArgs,
+        module_type: type[torch.nn.Module],
+    ):
         super().__init__()
         self.weight = weight  # is an inverse if args.inverse
         self.args = args
+        self.module_type = module_type
 
     def forward(self, value: Tensor) -> Parameter:
-        return apply_transform_weight(self.weight, value, self.args.location)
+        return apply_transform_weight(
+            self.weight, value, self.args.location, self.module_type
+        )
 
     def right_inverse(self, value: Tensor) -> Tensor:
         inverse = high_precision_invert(self.weight)
-        return apply_transform_weight(inverse, value, self.args.location)
+        return apply_transform_weight(
+            inverse, value, self.args.location, self.module_type
+        )
 
 
 def high_precision_invert(weight: Tensor) -> Tensor:
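
A hedged sketch of the `forward` / `right_inverse` round trip, with `torch.linalg.inv` standing in for `high_precision_invert` (whose body is not part of this diff); the sizes are arbitrary:

import torch
from compressed_tensors.transform import TransformLocation
from compressed_tensors.transform.utils.matrix import apply_transform_weight

torch.manual_seed(0)
R = torch.randn(8, 8, dtype=torch.float64)   # random (almost surely invertible) transform
x = torch.randn(2, 8, dtype=torch.float64)

# forward applies R at the INPUT location, i.e. x @ R
y = apply_transform_weight(R, x, TransformLocation.INPUT, torch.nn.Linear)
# applying R^-1 at the same location recovers the original value
x_back = apply_transform_weight(torch.linalg.inv(R), y, TransformLocation.INPUT, torch.nn.Linear)

assert torch.allclose(x, x_back)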

src/compressed_tensors/transform/transform_scheme.py

Lines changed: 2 additions & 1 deletion

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List
+from typing import List, Optional
 
 from compressed_tensors.transform import TransformArgs
 from pydantic import BaseModel, Field
@@ -40,3 +40,4 @@ class TransformScheme(BaseModel):
     apply: List[TransformArgs] = Field(default_factory=list)
     randomize: bool = Field(default=False)
     requires_grad: bool = Field(default=False)
+    head_dim: Optional[int] = Field(default=None)
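
The new `head_dim` field is consumed by `get_transform_size` in the `transform/utils/matrix.py` module added below: when set, each head gets its own (head_dim x head_dim) transform instead of one transform spanning the full feature dimension. A minimal sketch of the effect, with 4096 and 128 chosen purely for illustration:

import torch
from compressed_tensors.transform import TransformLocation
from compressed_tensors.transform.utils.matrix import get_transform_size

linear = torch.nn.Linear(4096, 4096, bias=False)

# without head_dim, the transform spans the whole input dimension
full = get_transform_size(linear, TransformLocation.WEIGHT_INPUT)
# with head_dim, the transform is sized per head (head_dim must divide 4096)
per_head = get_transform_size(linear, TransformLocation.WEIGHT_INPUT, head_dim=128)

assert (full, per_head) == (4096, 128)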
src/compressed_tensors/transform/utils/matrix.py (new file)

Lines changed: 179 additions & 0 deletions

@@ -0,0 +1,179 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Tuple
+
+import torch
+from compressed_tensors.transform import TransformLocation
+
+
+__all__ = ["get_transform_size", "apply_transform_weight"]
+
+
+def get_transform_size(
+    module: torch.nn.Module,
+    location: TransformLocation,
+    head_dim: Optional[int] = None,
+) -> int:
+    """
+    Determine the size of a transform matrix given its location on the module
+
+    :param module: module that matrix will be applied to
+    :param location: location on module
+    :param head_dim: size of head when transform is applied to mha
+    :return: size of matrix
+    """
+    if isinstance(module, torch.nn.Linear):
+        if location in (TransformLocation.INPUT, TransformLocation.WEIGHT_INPUT):
+            size = module.in_features
+        else:
+            size = module.out_features
+    elif isinstance(module, torch.nn.Embedding):
+        if location in (TransformLocation.INPUT, TransformLocation.WEIGHT_INPUT):
+            size = module.num_embeddings
+        else:
+            size = module.embedding_dim
+    else:
+        raise NotImplementedError(f"Transforms on {type(module)} are not supported")
+
+    if head_dim is not None:
+        if size % head_dim != 0:
+            raise ValueError(
+                f"{head_dim} must divide {size} for {type(module)} at {location}"
+            )
+
+        size = head_dim
+
+    return size
+
+
+def apply_transform_weight(
+    transform_weight: torch.Tensor,
+    value: torch.Tensor,
+    location: TransformLocation,
+    module_type: type[torch.nn.Module],
+) -> torch.Tensor:
+    """
+    Using the transform location, apply the transform_weight to the
+    given value wrt linear weights. For more info on input and output transforms,
+    see `TransformLocation`
+
+    The following explains how weights should be applied to values according to location
+
+    let  x  be input activation
+         W  be weight,
+         yh, xh, Wh  be transformed output, input, weight
+
+    note that
+        y = (x W.T)    // torch.nn.Linear
+
+    Choose values for yh, xh, and Wh which incorporate matrix transforms
+
+    let  V, Vi  be transform matrices on input side
+         U, Ui  be transform matrices on output side
+
+    pick  xh = (x V)
+          Wh = (U.T W Vi.T)
+          yh = (y U)
+
+    The following shows that `yh = (xh) (Wh).T` for the chosen values of yh, xh, and Wh
+
+    (xh) (Wh).T = (x V) (U.T W Vi.T).T
+                = (x V) (Vi W.T U)    // transpose matrix product identity
+                = (x W.T) U
+                = y U
+                = yh
+
+    :param transform_weight: transform weight to apply
+    :param value: value to apply transform_weight to
+    :param location: determines how weight should be applied
+    :param module_type: result of type(module), passed in to determine application of
+        weight transform
+    :return: value after transform_weight has been applied
+    """
+
+    assert transform_weight.shape[0] == transform_weight.shape[1]
+
+    if module_type == torch.nn.Linear:
+        if location == TransformLocation.INPUT:
+            return _multihead_matmul(value, transform_weight)
+
+        elif location == TransformLocation.WEIGHT_INPUT:
+            # equivalent to (transform_weight @ value.T).T
+            return _multihead_matmul(value, transform_weight.T)
+
+        elif location == TransformLocation.WEIGHT_OUTPUT:
+            # equivalent to (value.T @ transform_weight).T
+            return _multihead_matmul(transform_weight.T, value)

+        elif location == TransformLocation.OUTPUT:
+            return _multihead_matmul(value, transform_weight)
+
+    # similar derivation to torch.nn.Linear, but `y = (x W)`
+    elif module_type == torch.nn.Embedding:
+        if location == TransformLocation.INPUT:
+            return _multihead_matmul(value, transform_weight)
+
+        elif location == TransformLocation.WEIGHT_INPUT:
+            return _multihead_matmul(
+                transform_weight,
+                value,
+            )
+
+        elif location == TransformLocation.WEIGHT_OUTPUT:
+            return _multihead_matmul(value, transform_weight)
+
+        elif location == TransformLocation.OUTPUT:
+            return _multihead_matmul(value, transform_weight)
+
+    raise NotImplementedError(
+        f"Applying transforms to {module_type} {location} is not supported"
+    )
+
+
+def _multihead_matmul(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+    """
+    Performs A @ B for last two dims of two matrices A and B that possibly
+    have different shapes, as is the case in multi-headed dimension. If
+    shapes are different, this is equivalent to converting the last two dims
+    of the smaller matrix into a block-diagonal matrix with the same shape as
+    the last two dims of the larger matrix.
+
+    E.g. if A is half the size of B, this function will perform
+        [[A    ]    @   B
+         [    A]]
+
+    If B is a third of the size of A, this function will perform
+        A   @   [[B      ]
+                 [   B   ]
+                 [      B]]
+
+    This function will error out if the shapes are not evenly divisible
+
+    :param A: left-hand tensor
+    :param B: right-hand tensor
+    :return: result
+    """
+    if A.shape[-1] > B.shape[-2]:
+        head_dim = B.shape[-2]
+        num_heads = A.shape[-1] // head_dim
+        A = A.unflatten(-1, (num_heads, head_dim))
+        return (A @ B).flatten(-2, -1)
+    elif A.shape[-1] < B.shape[-2]:
+        head_dim = A.shape[-1]
+        num_heads = B.shape[-2] // head_dim
+        B = B.unflatten(-2, (num_heads, head_dim))
+        return (A @ B).flatten(-3, -2)
+    else:
+        return A @ B
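
To make the block-diagonal equivalence in `_multihead_matmul`'s docstring concrete, a small check (not part of the commit; it imports the private helper only for illustration) comparing per-head multiplication against an explicit `torch.block_diag` construction when A is wider than B:

import torch
from compressed_tensors.transform.utils.matrix import _multihead_matmul

torch.manual_seed(0)
head_dim, num_heads = 4, 3
A = torch.randn(2, num_heads * head_dim)      # e.g. an activation of width 12
B = torch.randn(head_dim, head_dim)           # one per-head transform

# reference: A @ block_diag(B, B, B), i.e. B applied to each 4-wide chunk of A
reference = A @ torch.block_diag(*([B] * num_heads))

assert torch.allclose(_multihead_matmul(A, B), reference, atol=1e-6)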
