
Commit 09b7ed4

mgoin and shanjiaz authored
Support DeepSeekV3-style block FP8 quantization (#372)
* Support DeepSeekV3-style block FP8 quantization
  Signed-off-by: mgoin <michael@neuralmagic.com>
* Remove math
  Signed-off-by: mgoin <michael@neuralmagic.com>
* Enforce divisible shapes
  Signed-off-by: mgoin <michael@neuralmagic.com>
* Format
  Signed-off-by: mgoin <michael@neuralmagic.com>
* Remove validation
  Signed-off-by: mgoin <michael@neuralmagic.com>
* Fix string
  Signed-off-by: mgoin <michael@neuralmagic.com>
* fix failed tests
  Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
* address review comments
  Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
* new line
  Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

---------

Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
Co-authored-by: shanjiaz <zsjwpianpian@gmail.com>
1 parent 180226b commit 09b7ed4

File tree

8 files changed: +234 additions, -18 deletions

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 68 additions & 5 deletions
@@ -111,11 +111,18 @@ def dequantize(
         elif scale.ndim == 2:
             if scale.shape[1] == 1:
                 args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL)
-            else:
+            # Scale height matches input or is 1 -> group quantization across columns
+            #
+            # Example 1: scale.shape[0] == 1
+            #   x_q: (4, 8), scale: (1, 4) -> 2 columns per group
+            #
+            # Example 2: scale.shape[0] == x_q.shape[0]
+            #   x_q: (4, 8), scale: (4, 4) -> 2 elements per group (per row)
+            elif (scale.shape[0] == 1) or (scale.shape[0] == x_q.shape[0]):
                 group_size = int(x_q.shape[1] / scale.shape[1])
-                args = QuantizationArgs(
-                    strategy=QuantizationStrategy.GROUP, group_size=group_size
-                )
+                args = QuantizationArgs(strategy=QuantizationStrategy.GROUP, group_size=group_size)
+            else:
+                args = QuantizationArgs(strategy=QuantizationStrategy.BLOCK, block_structure=scale.shape)
         else:
             raise ValueError(
                 f"Could not infer a quantization strategy from scale with {scale.ndim} "
@@ -189,7 +196,63 @@ def _process_quantization(
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size
 
-    if args.strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+    # blockwise FP8: quantize per 2D block, supports block_structure for static block quant
+    if args.strategy == QuantizationStrategy.BLOCK:
+        original_shape = x.shape
+        rows, cols = x.shape[-2], x.shape[-1]
+        block_height, block_width = args.block_structure
+
+        # Ensure exact division (tensor dimensions must be divisible by block size)
+        if rows % block_height != 0:
+            raise ValueError(
+                f"Tensor height {rows} is not divisible by block_height {block_height}. "
+                f"Block quantization requires exact division."
+            )
+        if cols % block_width != 0:
+            raise ValueError(
+                f"Tensor width {cols} is not divisible by block_width {block_width}. "
+                f"Block quantization requires exact division."
+            )
+
+        # reshape into blocks and transpose to make each block contiguous
+        num_rows_blocks = rows // block_height
+        num_cols_blocks = cols // block_width
+        x_blocks = x.reshape(
+            num_rows_blocks,
+            block_height,
+            num_cols_blocks,
+            block_width,
+        ).transpose(1, 2)
+
+        # expand scale/zero_point for blocks
+        sb = scale.unsqueeze(-1).unsqueeze(-1)
+        zb = zero_point.unsqueeze(-1).unsqueeze(-1) if zero_point is not None else None
+        if do_quantize:
+            # quantize blocks
+            x_blocks = _quantize(
+                x=x_blocks,
+                scale=sb,
+                zero_point=zb,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
+                dtype=dtype,
+                global_scale=global_scale,
+            )
+        if do_dequantize:
+            # dequantize blocks
+            x_blocks = _dequantize(
+                x_q=x_blocks,
+                scale=sb,
+                zero_point=zb,
+                global_scale=global_scale,
+            )
+        # restore original shape
+        output = x_blocks.transpose(1, 2).reshape(original_shape)
+    elif args.strategy in (
+        QuantizationStrategy.GROUP,
+        QuantizationStrategy.TENSOR_GROUP,
+    ):
         n_dims = x.shape
         if len(n_dims) > 2:
             x = x.squeeze(0)
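
For intuition, the BLOCK path above amounts to the following self-contained round-trip. This is a hedged sketch, not the library's _quantize/_dequantize: symmetric FP8 is approximated by round-and-clamp to ±448 (the float8_e4m3fn max), and fake_quant_block is a hypothetical name:

import torch

def fake_quant_block(x: torch.Tensor, scale: torch.Tensor, bh: int, bw: int) -> torch.Tensor:
    rows, cols = x.shape
    assert rows % bh == 0 and cols % bw == 0, "dims must divide evenly into blocks"
    # (rows, cols) -> (num_row_blocks, num_col_blocks, bh, bw)
    blocks = x.reshape(rows // bh, bh, cols // bw, bw).transpose(1, 2)
    sb = scale.unsqueeze(-1).unsqueeze(-1)  # broadcast one scale per block
    q = torch.clamp(torch.round(blocks / sb), -448, 448)  # stand-in for the FP8 cast
    return (q * sb).transpose(1, 2).reshape(rows, cols)  # dequantize, restore shape

x = torch.randn(256, 512)
bh = bw = 128
# per-block absmax scales, shape (num_row_blocks, num_col_blocks) = (2, 4)
scale = x.reshape(256 // bh, bh, 512 // bw, bw).transpose(1, 2).abs().amax(dim=(-2, -1)) / 448
out = fake_quant_block(x, scale, bh, bw)
assert out.shape == x.shape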

src/compressed_tensors/quantization/quant_args.py

Lines changed: 31 additions & 8 deletions
@@ -14,7 +14,7 @@
 
 import warnings
 from enum import Enum
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 from compressed_tensors.utils import Aliasable
@@ -153,8 +153,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     :param symmetric: whether or not quantization scale is symmetric about zero-point
     :param strategy: string id determining the scope of scale/zero-point to apply
     :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block strategy, must be
-        of the format "2x4", "8x16", etc.
+    :param block_structure: 2d block structure to use for the block strategy; must be
+        a list of two ints [rows, cols] like [128, 128].
     :param dynamic: set True to perform dynamic quantization - values will not be
         calibrated during calibration phase, instead during inference new quantization
         ranges will be observed with every sample. Defaults to False for static
@@ -169,7 +169,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     symmetric: bool = True
     group_size: Optional[int] = None
     strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[str] = None
+    block_structure: Optional[List[int]] = None
     dynamic: Union[DynamicType, bool] = False
     actorder: Union[ActivationOrdering, bool, None] = None
     observer: Optional[str] = Field(
@@ -207,6 +207,28 @@ def validate_group(cls, value) -> Union[int, None]:
 
         return value
 
+    @field_validator("block_structure", mode="before")
+    def validate_block_structure(cls, value) -> Optional[List[int]]:
+        if value is None:
+            return value
+        # For backward compatibility, allow string format "2x4", "8x16", etc.
+        if isinstance(value, str):
+            try:
+                return [int(x) for x in value.split("x")]
+            except Exception:
+                raise ValueError(
+                    f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
+                )
+        if isinstance(value, (list, tuple)):
+            if len(value) != 2 or not all(isinstance(v, int) for v in value):
+                raise ValueError(
+                    f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
+                )
+            return list(value)
+        raise ValueError(
+            f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
+        )
+
     @field_validator("strategy", mode="before")
     def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
         if isinstance(value, str):
@@ -277,14 +299,15 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
 
         # infer observer w.r.t. dynamic
        if dynamic:
-            if strategy not in (
+            supported_strategies = (
                 QuantizationStrategy.TOKEN,
                 QuantizationStrategy.TENSOR,
                 QuantizationStrategy.TENSOR_GROUP,
-            ):
+                QuantizationStrategy.GROUP,
+            )
+            if strategy not in supported_strategies:
                 raise ValueError(
-                    f"One of {(QuantizationStrategy.TOKEN, QuantizationStrategy.TENSOR, QuantizationStrategy.TENSOR_GROUP)} "
-                    "must be used for dynamic quantization",
+                    f"One of {supported_strategies} must be used for dynamic quantization"
                 )
 
         if (
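
Given the backward-compatibility branch in the validator, the legacy "RxC" string and the new [rows, cols] list should normalize to the same canonical value. A small usage sketch, assuming QuantizationArgs remains importable from compressed_tensors.quantization as in the existing tests:

from compressed_tensors.quantization import QuantizationArgs

legacy = QuantizationArgs(
    num_bits=8, type="float", strategy="block", block_structure="128x128"
)
modern = QuantizationArgs(
    num_bits=8, type="float", strategy="block", block_structure=[128, 128]
)
# both spellings resolve to the canonical list form
assert legacy.block_structure == modern.block_structure == [128, 128]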

src/compressed_tensors/quantization/quant_scheme.py

Lines changed: 41 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 from copy import deepcopy
 from typing import Any, Dict, List, Optional
 
@@ -52,6 +53,7 @@ class QuantizationScheme(BaseModel):
     def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
         inputs = model.input_activations
         outputs = model.output_activations
+        weights = model.weights
 
         if inputs is not None:
             if inputs.actorder is not None:
@@ -61,6 +63,21 @@ def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
             if outputs.actorder is not None:
                 raise ValueError("Cannot apply actorder to output activations")
 
+        if (
+            inputs and weights
+            and weights.strategy == QuantizationStrategy.GROUP
+            and inputs.strategy == QuantizationStrategy.GROUP
+            and weights.group_size != inputs.group_size
+        ):
+            warnings.warn(
+                "Using GROUP strategy for both weights and input_activations "
+                f"with different group sizes ({weights.group_size} vs {inputs.group_size}) "
+                "may complicate fused kernel implementations. Consider using "
+                "TENSOR_GROUP strategy for both or matching group sizes.",
+                UserWarning,
+                stacklevel=2
+            )
+
         return model
 
 
@@ -243,6 +260,29 @@ def is_preset_scheme(name: str) -> bool:
     ),
 )
 
+# Block-wise FP8 (deepseekv3-style quantization):
+# static 128x128 per-block weights and
+# dynamic per-token-group activations
+FP8_BLOCK = dict(
+    weights=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.BLOCK,
+        symmetric=True,
+        dynamic=False,
+        block_structure=[128, 128],
+    ),
+    input_activations=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.GROUP,
+        symmetric=True,
+        dynamic=True,
+        observer=None,
+        group_size=128,
+    ),
+)
+
 PRESET_SCHEMES = {
     # Unquantized (no-op)
     "UNQUANTIZED": UNQUANTIZED,
@@ -257,6 +297,7 @@ def is_preset_scheme(name: str) -> bool:
     # Float weight and activation schemes
     "FP8": FP8,
     "FP8_DYNAMIC": FP8_DYNAMIC,
+    "FP8_BLOCK": FP8_BLOCK,
     "NVFP4A16": NVFP4A16,
     "NVFP4": NVFP4,
 }
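
Once registered in PRESET_SCHEMES, the new preset should resolve by name. A hedged sketch, assuming preset_name_to_scheme keeps its existing (name, targets) signature; the "Linear" target is illustrative:

from compressed_tensors.quantization import preset_name_to_scheme

scheme = preset_name_to_scheme("FP8_BLOCK", targets=["Linear"])
# static 128x128 blocks on weights, dynamic 128-wide groups on activations
assert scheme.weights.block_structure == [128, 128]
assert scheme.input_activations.dynamic is True
assert scheme.input_activations.group_size == 128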

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 11 additions & 2 deletions
@@ -171,7 +171,10 @@ def compute_dynamic_scales_and_zp(
         reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim)
     elif args.strategy == QuantizationStrategy.TENSOR:
         reduce_dims = None
-    elif args.strategy == QuantizationStrategy.TENSOR_GROUP:
+    elif args.strategy in (
+        QuantizationStrategy.TENSOR_GROUP,
+        QuantizationStrategy.GROUP,
+    ):
         if len(value.shape) > 2:
             value = value.squeeze(0)
 
@@ -187,9 +190,15 @@ def compute_dynamic_scales_and_zp(
             ),
         )
     else:
+        supported_strategies = (
+            QuantizationStrategy.TOKEN,
+            QuantizationStrategy.TENSOR,
+            QuantizationStrategy.TENSOR_GROUP,
+            QuantizationStrategy.GROUP,
+        )
         raise ValueError(
             "Dynamic quantization is only supported for ",
-            f"{QuantizationStrategy.TOKEN, QuantizationStrategy.TENSOR, QuantizationStrategy.TENSOR_GROUP}",
+            f"{supported_strategies}",
         )
 
     if not reduce_dims:
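
The GROUP branch above lets dynamic quantization compute one scale per token per column group. A standalone sketch of that reduction in plain torch (symmetric FP8 assumed, max magnitude mapped to 448; dynamic_group_scales is a made-up name, not the library call):

import torch

def dynamic_group_scales(value: torch.Tensor, group_size: int = 128) -> torch.Tensor:
    tokens, hidden = value.shape
    assert hidden % group_size == 0, "hidden dim must divide into groups"
    # (tokens, hidden) -> (tokens, num_groups, group_size), absmax per group
    groups = value.reshape(tokens, hidden // group_size, group_size)
    return (groups.abs().amax(dim=-1) / 448.0).clamp(min=1e-12)

acts = torch.randn(16, 512)
scales = dynamic_group_scales(acts)
assert scales.shape == (16, 4)  # one scale per (token, group)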

tests/test_examples/test_bitmask_compression_ipynb.py

Lines changed: 3 additions & 1 deletion
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import nbformat
 import pytest
+
+
+nbformat = pytest.importorskip("nbformat")
 from nbconvert.preprocessors import ExecutePreprocessor
 

tests/test_quantization/lifecycle/test_forward.py

Lines changed: 50 additions & 0 deletions
@@ -13,9 +13,12 @@
 # limitations under the License.
 
 
+import math
+
 import pytest
 import torch
 from compressed_tensors.quantization.lifecycle.forward import (
+    _process_quantization,
     dequantize,
     forward_quantize,
     quantize,
@@ -29,6 +32,7 @@
     QuantizationStrategy,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.quantization.utils.helpers import calculate_range
 from torch.nn import Linear
 
 
@@ -203,3 +207,49 @@ def test_dequantize(num_bits, type, strategy, group_size, scale, zero_point, g_idx):
         dtype=None,
         g_idx=g_idx,
     )
+
+
+def test_process_quantization_block_static():
+    """
+    Static block quantization (QuantizationStrategy.BLOCK) should split a 2D tensor
+    into blocks, quantize each block, and reassemble without changing shape.
+    """
+    rows, cols = 8, 8
+    bh, bw = 2, 4
+    x = torch.randn(rows, cols)
+    args = QuantizationArgs(
+        num_bits=8,
+        type="float",
+        strategy=QuantizationStrategy.BLOCK,
+        symmetric=True,
+        dynamic=False,
+        block_structure=[bh, bw],
+    )
+    num_rb = math.ceil(rows / bh)
+    num_cb = math.ceil(cols / bw)
+    scale = torch.rand(num_rb, num_cb) + 0.1
+    zp = torch.zeros_like(scale)
+    q_min, q_max = calculate_range(args, x.device)
+    out = _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zp,
+        args=args,
+        do_quantize=True,
+        do_dequantize=False,
+        dtype=None,
+        global_scale=None,
+    )
+    assert out.shape == x.shape
+    # full fake-quantize roundtrip
+    out2 = _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zp,
+        args=args,
+        do_quantize=True,
+        do_dequantize=True,
+        dtype=None,
+        global_scale=None,
+    )
+    assert out2.shape == x.shape

tests/test_quantization/test_quant_args.py

Lines changed: 2 additions & 1 deletion
@@ -59,7 +59,8 @@ def test_block():
 
     block = QuantizationArgs(**kwargs)
     assert block.strategy == QuantizationStrategy.BLOCK
-    assert block.block_structure == kwargs["block_structure"]
+    assert block.block_structure == [2, 4]
+    assert block.block_structure != kwargs["block_structure"]  # "2x4" != [2, 4]
 
 
 def test_infer_strategy():
