Commit 801af03
[sparse] marlin fixes (#2305)
* [sparse] marlin fixes

  Summary: This PR updates sparse-marlin to stop using CPU tensors and makes it compatible with Int4WeightOnly.

  Test Plan:

  ```
  pytest test/sparsity/test_marlin.py
  ```

* ruff check
1 parent 152a8e3 commit 801af03
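For context, sparse-marlin is reached through the int4 weight-only quantization path. A minimal usage sketch, assuming torchao's public API around this commit (`quantize_`, `int4_weight_only`, `MarlinSparseLayout`; the layout is expected to handle 2:4 pruning of the weight during packing):

```python
# Minimal usage sketch (API names taken from the torchao repo; treat as an
# assumption about the public surface at this commit, not a guaranteed recipe).
import torch
from torchao.quantization import quantize_, int4_weight_only
from torchao.dtypes import MarlinSparseLayout

model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).half().cuda()

# Route int4 weight-only quantization through the sparse-marlin layout; with
# this fix the packed tensors stay on their original device instead of
# bouncing through CPU.
quantize_(model, int4_weight_only(layout=MarlinSparseLayout()))

out = model(torch.randn(16, 1024, dtype=torch.float16, device="cuda"))
```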

File tree

2 files changed (+15 −16 lines)


torchao/dtypes/uintx/marlin_sparse_layout.py

Lines changed: 11 additions & 10 deletions
@@ -130,7 +130,7 @@ def __new__(
         cls,
         int_data: torch.Tensor,
         scale: torch.Tensor,
-        zero_point: torch.Tensor,
+        zero: torch.Tensor,
         meta: torch.Tensor,
         _layout: Layout,
         original_shape: torch.Size,
@@ -151,16 +151,17 @@ def __init__(
         self,
         int_data: torch.Tensor,
         scale: torch.Tensor,
-        zero_point: torch.Tensor,
+        zero: torch.Tensor,
         meta: torch.Tensor,
         _layout: Layout,
         original_shape: torch.Size,
         group_size: int,
         num_bits: int,
     ):
         self.int_data = int_data
+        self.scale_and_zero = None
         self.scale = scale
-        self.zero_point = zero_point
+        self.zero = zero
         self.meta = meta
         self._layout = _layout
         self.original_shape = original_shape
@@ -181,7 +182,7 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
         )
 
     def __tensor_flatten__(self):
-        return ["int_data", "scale", "zero_point", "meta"], [
+        return ["int_data", "scale", "zero", "meta"], [
             self._layout,
             self.original_shape,
             self.group_size,
@@ -194,13 +195,13 @@ def __tensor_unflatten__(
     ):
         int_data = tensor_data_dict["int_data"]
         scale = tensor_data_dict["scale"]
-        zero_point = tensor_data_dict["zero_point"]
+        zero = tensor_data_dict["zero"]
         meta = tensor_data_dict["meta"]
         _layout, original_shape, group_size, num_bits = tensor_attributes
         return cls(
             int_data,
             scale,
-            zero_point,
+            zero,
             meta,
             _layout,
             original_shape,
@@ -223,14 +224,14 @@ def get_plain(self):
         )
         int_data_expanded_t = int_data_expanded.t()
         scales_expanded_t = scales_expanded.t()
-        return int_data_expanded_t, scales_expanded_t, self.zero_point
+        return int_data_expanded_t, scales_expanded_t, self.zero
 
     @classmethod
     def from_plain(
         cls,
         int_data: torch.Tensor,
         scale: torch.Tensor,
-        zero_point: torch.Tensor,
+        zero: torch.Tensor,
         _layout: Layout,
     ):
         from torchao.sparsity.marlin import (
@@ -291,7 +292,7 @@ def from_plain(
         return cls(
             marlin_24_q_w_comp,
             marlin_24_s,
-            zero_point,
+            zero,
             meta,
             _layout,
             q_w_24.shape,
@@ -305,6 +306,6 @@ def get_layout(self) -> Layout:
     def _apply_fn_to_data(self, fn):
         self.int_data = fn(self.int_data)
         self.scale = fn(self.scale)
-        self.zero_point = fn(self.zero_point)
+        self.zero = fn(self.zero)
         self.meta = fn(self.meta)
         return self
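The rename from `zero_point` to `zero` matters because `__tensor_flatten__` must return the exact attribute names that `__tensor_unflatten__` (and torch.compile's subclass tracing) will look up on the instance. A minimal sketch of that flatten/unflatten contract, using a hypothetical `PackedTensor` stand-in rather than the real marlin tensor impl; `_make_wrapper_subclass` is PyTorch's (private but widely used) constructor for wrapper subclasses:

```python
# Sketch of the __tensor_flatten__ / __tensor_unflatten__ contract.
# PackedTensor is a hypothetical example class, not the torchao implementation.
import torch

class PackedTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, int_data, scale, zero):
        # Wrapper subclass: outer tensor carries shape/dtype metadata only.
        return torch.Tensor._make_wrapper_subclass(
            cls, int_data.shape, dtype=scale.dtype
        )

    def __init__(self, int_data, scale, zero):
        self.int_data = int_data
        self.scale = scale
        self.zero = zero

    def __tensor_flatten__(self):
        # The strings here must match the attribute names set in __init__;
        # unflattening fetches each inner tensor from the instance by name.
        return ["int_data", "scale", "zero"], []

    @classmethod
    def __tensor_unflatten__(cls, tensor_data_dict, tensor_attributes,
                             outer_size, outer_stride):
        return cls(
            tensor_data_dict["int_data"],
            tensor_data_dict["scale"],
            tensor_data_dict["zero"],
        )
```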

torchao/sparsity/marlin/__init__.py

Lines changed: 4 additions & 6 deletions
@@ -226,11 +226,10 @@ def _to_marlin_weights(
 
     # Pack
     pack_factor = utils.get_pack_factor(num_bits)
-    orig_device = q_w.device
 
     # Original implementation uses numpy + uint32 but we need to use int64 because torch.uint32
     # does not support rshift_cpu.
-    q_w = q_w.cpu().to(torch.int64)
+    q_w = q_w.to(torch.int64)
     q_packed = torch.zeros(
         (q_w.shape[0], q_w.shape[1] // pack_factor),
         dtype=torch.int64,
@@ -239,7 +238,7 @@ def _to_marlin_weights(
     for i in range(pack_factor):
         q_packed |= q_w[:, i::pack_factor] << (num_bits * i)
 
-    q_packed = q_packed.to(orig_device, dtype=torch.int32)
+    q_packed = q_packed.to(dtype=torch.int32)
     return q_packed
 
 
@@ -259,12 +258,11 @@ def _from_marlin_weights(
     perm_24, _, _ = utils.get_reverse_perms_24(num_bits)
 
     pack_factor = utils.get_pack_factor(num_bits)
-    orig_device = q_packed.device
 
     # Unpack from marlin format.
     # Original implementation uses numpy + uint32 but we need to use int64 because torch.uint32
     # does not support rshift_cpu.
-    q_packed = q_packed.cpu().to(torch.int64)
+    q_packed = q_packed.to(torch.int64)
     q_w_unpacked = torch.zeros(
         (q_packed.shape[0], q_packed.shape[1] * pack_factor),
         dtype=torch.int64,
@@ -275,7 +273,7 @@ def _from_marlin_weights(
         (1 << num_bits) - 1
     )
 
-    q_w_unpacked = q_w_unpacked.to(orig_device, dtype=torch.int32)
+    q_w_unpacked = q_w_unpacked.to(dtype=torch.int32)
 
     q_w_comp = utils.reverse_marlin_permute_weights(
         q_w_unpacked, size_k, size_n, perm_24
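The packing loop above interleaves `pack_factor` low-bit values into each 32-bit word, where `pack_factor` is presumably `32 // num_bits` for uint32 storage (the real value comes from `utils.get_pack_factor`). A self-contained round-trip sketch of that bit-packing scheme, with the marlin permutations omitted:

```python
# Round-trip sketch of the bit-packing used above (permutations omitted).
import torch

num_bits = 4
pack_factor = 32 // num_bits  # assumption: matches utils.get_pack_factor

# Random 4-bit values; int64 sidesteps torch.uint32's missing rshift support.
q_w = torch.randint(0, 1 << num_bits, (8, 64), dtype=torch.int64)

# Pack: strided columns i, i + pack_factor, ... share one output word.
q_packed = torch.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), dtype=torch.int64)
for i in range(pack_factor):
    q_packed |= q_w[:, i::pack_factor] << (num_bits * i)
q_packed = q_packed.to(dtype=torch.int32)  # final storage, as in the diff

# Unpack: shift each word right and mask off num_bits at a time. Sign
# extension from the int32 -> int64 cast is harmless because of the mask.
packed64 = q_packed.to(torch.int64)
q_unpacked = torch.zeros_like(q_w)
for i in range(pack_factor):
    q_unpacked[:, i::pack_factor] = (packed64 >> (num_bits * i)) & ((1 << num_bits) - 1)

assert torch.equal(q_unpacked, q_w)  # lossless round trip
```

Since every tensor here is created on the current device and no `.cpu()` hop is needed, the same loop works unchanged on CUDA tensors, which is the point of dropping `orig_device` in this commit.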
