Optimize sparse 2:4 compression performance (3.69x speedup)

rahul-tuli · rahul-tuli · commit 5172c0cfee96 · 2025-06-15T22:58:49.000-04:00
- Implement GPU-accelerated bit packing in pack_bitmasks()
- Remove unnecessary CPU transfers in sparse compression pipeline
- Optimize topk operation with sorted=False parameter

Achieves 3.69x speedup (22.57s → 6.12s) for 8B parameter models by keeping operations on GPU and eliminating device transfers.
diff --git a/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py b/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py
@@ -90,13 +90,15 @@ def from_dense(
         :return: instantiated compressed tensor
         """
         shape = list(tensor.shape)
+        # Keep tensor on its original device for faster processing
         compressed, bitmask = sparse24_bitmask_compress(
-            tensor.cpu(), sparsity_structure=sparsity_structure
+            tensor, sparsity_structure=sparsity_structure
         )
+        # Move to CPU only at the end if needed for storage
         return Sparse24BitMaskTensor(
             shape=shape,
-            compressed=compressed,
-            bitmask=bitmask,
+            compressed=compressed.cpu() if compressed.is_cuda else compressed,
+            bitmask=bitmask.cpu() if bitmask.is_cuda else bitmask,
         )
 
     @staticmethod
@@ -233,10 +235,12 @@ def get_24_bytemasks(tensor):
 
     reshaped_tensor = tensor.view(-1, 4)
     abs_tensor = reshaped_tensor.abs()
-    topk_indices = abs_tensor.topk(2, dim=1).indices
+    # Use largest=True, sorted=False for better performance
+    topk_indices = abs_tensor.topk(2, dim=1, largest=True, sorted=False).indices
     mask = torch.zeros_like(reshaped_tensor, dtype=torch.bool)
     mask.scatter_(1, topk_indices, True)
     mask = mask.view(original_shape)
-    tensor = tensor.view(original_dtype)
+    if tensor.dtype == torch.int8:
+        tensor = tensor.view(original_dtype)
 
     return mask
diff --git a/src/compressed_tensors/utils/helpers.py b/src/compressed_tensors/utils/helpers.py
@@ -301,10 +301,46 @@ def pack_bitmasks(bytemasks: torch.Tensor) -> torch.Tensor:
     :param bytemasks: mask tensor where each byte corresponds to a weight
     :return: mask tensor where each bit corresounds to a weight
     """
-    packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little")
-    packed_bits_torch = torch.from_numpy(packed_bits_numpy)
-
-    return packed_bits_torch
+    # Try PyTorch-based implementation first to avoid CPU transfer
+    try:
+        device = bytemasks.device
+        dtype = bytemasks.dtype
+        
+        # Ensure input is boolean or can be treated as boolean
+        if dtype != torch.bool:
+            bytemasks = bytemasks.bool()
+        
+        rows, cols = bytemasks.shape
+        packed_cols = (cols + 7) // 8  # ceil(cols/8)
+        
+        # Convert boolean mask to uint8
+        bytemasks_uint8 = bytemasks.to(torch.uint8)
+        
+        # Pad to multiple of 8 if needed
+        if cols % 8 != 0:
+            padding = 8 - (cols % 8)
+            bytemasks_uint8 = torch.nn.functional.pad(bytemasks_uint8, (0, padding))
+        
+        # Reshape to group by 8 bits
+        reshaped = bytemasks_uint8.view(rows, packed_cols, 8)
+        
+        # Pack bits (little endian) - use bitwise operations
+        packed = torch.zeros(rows, packed_cols, dtype=torch.uint8, device=device)
+        for i in range(8):
+            packed |= reshaped[:, :, i] << i
+        
+        return packed
+        
+    except Exception:
+        # Fallback to NumPy implementation for compatibility
+        # Move to CPU if needed
+        if bytemasks.is_cuda:
+            bytemasks = bytemasks.cpu()
+        
+        packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little")
+        packed_bits_torch = torch.from_numpy(packed_bits_numpy)
+        
+        return packed_bits_torch
 
 
 def unpack_bitmasks(