[Transform] Norm fusing utilities (#1637)

kylesayrs · web-flow · commit 2b00d044547b · 2025-07-15T15:27:46.000-04:00
## Purpose ##
* Provide utilities for fusing norms and embeddings for
SpinQuantModifier

## Changes ##
* Implement `center_embeddings` and `fuse_norm_linears`
* `center_embeddings` doesn't seem to have an effect (theoretically it
shouldn't, and it makes the implementation less resilient), but it is
part of the SpinQuant paper's implementation. We implement the utility
here and can decide later not to use it

 ## Testing ##
* Add `test_center_embeddings` and `test_fuse_norm_linears`

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
diff --git a/src/llmcompressor/modeling/fuse.py b/src/llmcompressor/modeling/fuse.py
@@ -0,0 +1,60 @@
+from typing import Iterable
+
+import torch
+from compressed_tensors import (
+    align_module_device,
+    get_execution_device,
+    update_offload_parameter,
+)
+
+__all__ = ["center_embeddings", "fuse_norm_linears"]
+
+
+PRECISION = torch.float64
+
+
+def center_embeddings(embedding: torch.nn.Module):
+    """
+    Shift each embedding to have a mean of zero
+
+    :param embedding: embedding module containing embeddings to center
+    """
+    if not hasattr(embedding, "weight"):
+        raise ValueError(f"Cannot fuse norm of type {type(embedding)}")
+
+    with align_module_device(embedding):
+        weight_dtype = embedding.weight.dtype
+        weight = embedding.weight.to(PRECISION)
+        new_weight = weight - weight.mean(dim=-1, keepdim=True)
+        new_weight = new_weight.to(weight_dtype)
+
+    update_offload_parameter(embedding, "weight", new_weight)
+
+
+def fuse_norm_linears(norm: torch.nn.Module, linears: Iterable[torch.nn.Linear]):
+    """
+    Fuse the scaling operation of norm layer into subsequent linear layers.
+    This useful for ensuring transform invariance between norm and linear layers.
+
+    Note that unitary transforms (rotation) commute with normalization, but not scaling
+
+    :param norm: norm layer whose weight will be fused into subsequent linears
+    :param linears: linear layers which directly follow the norm layer
+    """
+    if not hasattr(norm, "weight"):
+        raise ValueError(f"Cannot fuse norm of type {type(norm)}")
+
+    for linear in linears:
+        # NOTE: spinquant does this op in float64
+        exec_device = get_execution_device(norm)
+        with align_module_device(norm, exec_device), align_module_device(
+            linear, exec_device
+        ):
+            weight_dtype = linear.weight.dtype
+            new_weight = linear.weight.to(PRECISION) * norm.weight.to(PRECISION)
+            new_weight = new_weight.to(weight_dtype)
+
+        update_offload_parameter(linear, "weight", new_weight)
+
+    new_norm_weight = torch.ones_like(norm.weight, device="cpu")
+    update_offload_parameter(norm, "weight", new_norm_weight)
diff --git a/tests/llmcompressor/modeling/test_fuse.py b/tests/llmcompressor/modeling/test_fuse.py
@@ -0,0 +1,32 @@
+import pytest
+import torch
+
+from llmcompressor.modeling.fuse import center_embeddings, fuse_norm_linears
+
+
+@pytest.mark.unit
+def test_center_embeddings():
+    embedding = torch.nn.Embedding(10, 10)
+    center_embeddings(embedding)
+
+    assert torch.allclose(
+        embedding.weight.mean(dim=1), torch.zeros(embedding.num_embeddings), atol=1e-5
+    )
+
+
+@pytest.mark.unit
+def test_fuse_norm_linears():
+    norm = torch.nn.LayerNorm((5,))
+    norm.weight.data = torch.rand(norm.weight.shape)
+    linears = [
+        torch.nn.Linear(5, 5),
+        torch.nn.Linear(5, 5),
+    ]
+
+    input = torch.rand((1, 5), requires_grad=False)
+    true_output = torch.stack([linear(norm(input)) for linear in linears])
+
+    fuse_norm_linears(norm, linears)
+    output = torch.stack([linear(norm(input)) for linear in linears])
+
+    assert torch.allclose(true_output, output)