
Commit 332d4cb

[Feature][Quantization] MXFP4 support for MOE models (#17888)
Signed-off-by: Felix Marty <felmarty@amd.com>
Signed-off-by: Bowen Bao <bowenbao@amd.com>
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
Co-authored-by: Bowen Bao <bowenbao@amd.com>
1 parent bf03ff3 commit 332d4cb

File tree

15 files changed, +875 -106 lines changed


docs/features/quantization/quark.md

Lines changed: 25 additions & 0 deletions
````diff
@@ -229,3 +229,28 @@ python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
                           --model_export hf_format \
                           --tasks gsm8k
 ```
+
+## Using MXFP4 models
+
+vLLM supports loading MXFP4 models quantized offline through AMD Quark, compliant with the [Open Compute Project (OCP) specification](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
+
+The scheme currently only supports dynamic quantization for activations.
+
+Example usage, after installing the latest AMD Quark release:
+
+```bash
+vllm serve fxmarty/qwen_1.5-moe-a2.7b-mxfp4 --tensor-parallel-size 1
+```
+
+On devices that do not support MXFP4 operations natively (e.g. AMD Instinct MI325, MI300 and MI250), the MXFP4 matrix multiplication can be simulated by dequantizing the weights from MXFP4 to half precision on the fly with a fused kernel. This is useful, for example, to evaluate MXFP4 models with vLLM, or to benefit from the ~4x memory savings compared to float16 and bfloat16.
+
+To generate models quantized offline with the MXFP4 data type, the easiest approach is to use AMD Quark's [quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html), for example:
+
+```bash
+python quantize_quark.py --model_dir Qwen/Qwen1.5-MoE-A2.7B-Chat \
+                         --quant_scheme w_mxfp4_a_mxfp4_sym \
+                         --output_dir qwen_1.5-moe-a2.7b-mxfp4 \
+                         --skip_evaluation \
+                         --model_export hf_format \
+                         --group_size 32
+```
````
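For context, MXFP4 stores 4-bit (e2m1) elements with one shared 8-bit (e8m0) scale per block of 32 values, i.e. 4 + 8/32 = 4.25 bits per element versus 16, which is where the ~4x saving comes from. The served checkpoint above can also be evaluated offline through vLLM's Python API; the snippet below is a minimal sketch, not part of the committed docs, and simply reuses the `fxmarty/qwen_1.5-moe-a2.7b-mxfp4` model id from the example:

```python
# Minimal offline-inference sketch (not from the committed docs), reusing the
# MXFP4 checkpoint referenced in the documentation above.
from vllm import LLM, SamplingParams

llm = LLM(model="fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tensor_parallel_size=1)
sampling_params = SamplingParams(temperature=0.0, max_tokens=32)

outputs = llm.generate(["Today I am in the French Alps and"], sampling_params)
print(outputs[0].outputs[0].text)
```

On GPUs without native MXFP4 support, this path exercises the on-the-fly dequantization described above.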

tests/kernels/moe/test_moe.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -174,6 +174,7 @@ def test_fused_moe(
                     use_int8_w8a8=False,
                     use_int8_w8a16=False,
                     use_int4_w4a16=False,
+                    use_mxfp4_w4a4=False,
                     per_act_token_quant=False,
                     block_shape=None)
```

tests/kernels/moe/test_mxfp4_moe.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import importlib
import importlib.metadata
from dataclasses import dataclass

import pytest
import torch
from packaging import version

QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
    "quark") is not None and version.parse(
        importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')


@dataclass
class ModelCase:
    model_id: str
    tp: int


@pytest.mark.parametrize('model_case', [
    ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
    ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
    ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1)
])
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
                    reason="amd-quark>=0.9 is not available")
def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
    if torch.cuda.device_count() < model_case.tp:
        pytest.skip(f"This test requires >={model_case.tp} gpus, got only "
                    f"{torch.cuda.device_count()}")

    with vllm_runner(model_case.model_id,
                     tensor_parallel_size=model_case.tp,
                     load_format="dummy") as llm:

        # TODO: llm.apply_model(check_model) currently relies on V0 internals.
        # Re-enable this later.
        # def check_model(model):
        #     layer = model.model.layers[0]

        #     qkv_proj = layer.self_attn.qkv_proj

        #     assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
        #     assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)

        #     assert isinstance(layer.mlp.experts.quant_method,
        #                       QuarkW4A4MXFp4MoEMethod)

        # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
        #     llm.apply_model(check_model)

        output = llm.generate_greedy("Today I am in the French Alps and",
                                     max_tokens=20)
        assert output
```

tests/quantization/reference_mxfp4.py

Lines changed: 287 additions & 0 deletions
@@ -0,0 +1,287 @@
```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch

BFLOAT16_EXP_BIAS = 127
BFLOAT16_MANTISSA_BITS = 7
BFLOAT16_EXP_BITS = 8

FLOAT16_EXP_BIAS = 15
FLOAT16_MANTISSA_BITS = 10
FLOAT16_EXP_BITS = 5

FLOAT8_E8M0_MAX_EXP = 127
FLOAT4_EXP_BIAS = 1
FLOAT4_MANTISSA_BITS = 1

FLOAT16_VAL_TO_ADD = (1 << (FLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1))
FLOAT16_SIGN_EXPONENT_MASK = ((
    (1 << (FLOAT16_EXP_BITS + 1)) - 1) << FLOAT16_MANTISSA_BITS)

BFLOAT16_VAL_TO_ADD = (1 <<
                       (BFLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1))
BFLOAT16_SIGN_EXPONENT_MASK = ((
    (1 << (BFLOAT16_EXP_BITS + 1)) - 1) << BFLOAT16_MANTISSA_BITS)


def e8m0_to_half(scale, half_dtype: torch.dtype):
    assert scale.dtype == torch.uint8

    scale_exp = scale.to(torch.int16) - 127

    # This can be implemented with bitwise operations in a proper kernel.
    scale_half = 2.0**(scale_exp.to(torch.float))

    return scale_half.to(half_dtype)


def upcast_fp4_to_fp16_or_bf16(val, float_dtype: torch.dtype,
                               half_exp_bias: int, half_mantissa_bits: int):
    assert val.dtype == torch.uint8

    unpacked = torch.zeros(*val.shape[:-1],
                           val.shape[-1] * 2,
                           dtype=torch.uint8,
                           device=val.device)
    unpacked[..., 1::2] = (val >> 4) & 0x0F  # Extract high 4 bits.
    unpacked[..., ::2] = val & 0x0F  # Extract low 4 bits.

    # Takes one float4 values represented as b0000xxxx,
    # and converts it to the corresponding float16 value.

    sign = unpacked >> 3

    exp = (unpacked >> 1) & 3
    new_mantissa = unpacked & 1

    # if exp == 0 and new_mantissa == 0:
    #     new_exp = 0
    # else:
    #     new_exp = exp - FLOAT4_EXP_BIAS + FLOAT16_EXP_BIAS

    # int8_t works with float16, but may overflow with bfloat16.
    new_exp = exp - FLOAT4_EXP_BIAS + half_exp_bias

    # Cast b0000 to 0. in fp16/bf16.
    new_exp = new_exp * torch.logical_or(exp > 0, new_mantissa > 0)

    # Cast b0001 to 0.5 in fp16/bf16.
    new_mantissa = torch.logical_and(new_mantissa, exp > 0)

    new_mantissa = new_mantissa.to(torch.int32)
    new_exp = new_exp.to(torch.int32)
    sign = sign.to(torch.int32)

    qdq_val = (sign << 15) + (new_exp << half_mantissa_bits) + (
        new_mantissa << (half_mantissa_bits - 1))

    assert qdq_val.max() <= 65535
    assert qdq_val.min() >= 0
    qdq_val = qdq_val.to(torch.uint16)

    result = qdq_val.view(float_dtype)

    return result


def dq_mxfp4_torch(x: torch.Tensor, scale: torch.Tensor,
                   float_dtype: torch.dtype) -> torch.Tensor:
    assert x.dtype == torch.uint8
    assert scale.dtype == torch.uint8

    if float_dtype == torch.float16:
        half_exp_bias = FLOAT16_EXP_BIAS
        half_mantissa_bits = FLOAT16_MANTISSA_BITS
    elif float_dtype == torch.bfloat16:
        half_exp_bias = BFLOAT16_EXP_BIAS
        half_mantissa_bits = BFLOAT16_MANTISSA_BITS

    scale_half = e8m0_to_half(scale, half_dtype=float_dtype)

    x_half = upcast_fp4_to_fp16_or_bf16(x,
                                        float_dtype=float_dtype,
                                        half_exp_bias=half_exp_bias,
                                        half_mantissa_bits=half_mantissa_bits)

    x_half = x_half.reshape(*x_half.shape[:-1], -1, 32)
    x_half = x_half * scale_half[..., None]
    x_half = x_half.reshape(*x_half.shape[:-2], -1)

    return x_half


def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
                         half_exp_bias: int):
    # Casts an fp16/bf16 input to the restricted values of float4_e2m1,
    # that is to say [0., 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, -0.0,
    # -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0].

    float_type = val.dtype

    # "rshift_cuda" not implemented for 'UInt16'
    val_view = val.view(torch.int16)  #.to(torch.int32)

    exp = val_view >> half_mantissa_bits
    exp = exp & ((1 << half_exp_bits) - 1)

    exp = exp.view(torch.uint16).to(torch.int32)

    sign = (val_view >> (half_mantissa_bits + half_exp_bits)) & 1

    mantissa_last = (val_view >> (half_mantissa_bits - 1)) & 1

    exp_unbias = exp - half_exp_bias
    new_exp = exp_unbias + FLOAT4_EXP_BIAS

    exp_shift = (new_exp <= 0) * (1 - new_exp)

    # Typically 9.
    # Take the min to prevent overflow on `uint16_t half`. This is the case for
    # very small values, correctly mapped to `round_close`.
    tail_bits = half_mantissa_bits - FLOAT4_MANTISSA_BITS + exp_shift
    tail_bits[tail_bits >= 16] = 16

    mantissa_plus_one = val_view & ((1 << (half_mantissa_bits + 1)) - 1)

    half = 1 << (tail_bits - 1)

    tail = mantissa_plus_one & ((1 << tail_bits) - 1)

    round_close = (tail < half)  # round towards 0
    round_away = (tail > half)  # round away from 0
    tie = tail == half

    new_mantissa_close = torch.zeros(val.shape,
                                     device=val.device,
                                     dtype=torch.bool)
    new_exp_close = torch.zeros(val.shape,
                                device=val.device,
                                dtype=torch.uint16)

    new_mantissa_away = torch.zeros(val.shape,
                                    device=val.device,
                                    dtype=torch.bool)
    new_exp_away = torch.zeros(val.shape,
                               device=val.device,
                               dtype=torch.uint16)

    new_exp_tie = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)

    # 1. round down
    # if new_exp == 0:  # case [0.5, 0.749999]
    #     new_mantissa = 0
    # elif new_exp < 0:  # case [0, 0.24999]
    #     new_mantissa = 0
    # else:
    #     new_mantissa = mantissa_last

    new_mantissa_close = (new_exp > 0) * mantissa_last
    new_exp_close = exp

    # # 2. round up
    # if new_exp <= 0:  # case [0.250001, 0.499999] and [0.75001, 0.99999]
    #     new_mantissa = 0
    #     new_exp += 1
    # elif mantissa_last == 0:
    #     new_mantissa = 1
    # else:
    #     new_mantissa = 0
    #     new_exp += 1

    new_mantissa_away = torch.logical_and(new_exp > 0, mantissa_last == 0)
    new_exp_away = exp + torch.logical_or(new_exp <= 0, mantissa_last == 1)

    # # 3. tie
    # 0.25 -> 0. (handled by `exp > (half_exp_bias - 2)`)
    # 0.75 -> 1.
    # 1.25 -> 1.
    # 1.75 -> 2.
    # 2.5 -> 2.
    # 3.5 -> 4.
    # 5. -> 4.
    new_exp_tie = (exp > (half_exp_bias - 2)) * (exp + (mantissa_last == 1))

    # Gather round up, round down and tie.
    new_exp = round_away * new_exp_away \
        + round_close * new_exp_close \
        + tie * new_exp_tie

    new_mantissa = round_away * new_mantissa_away \
        + round_close * new_mantissa_close

    # if new_exp > 3:
    #     new_mantissa = 1
    new_mantissa = new_mantissa + (new_exp >
                                   (2 + half_exp_bias)) * (new_mantissa == 0)

    # Clamp the exponent to acceptable values.
    new_exp = (new_exp >= (half_exp_bias - 2)) * torch.clamp(
        new_exp, half_exp_bias - 2, half_exp_bias + 2)

    sign = sign.to(torch.int32)
    new_mantissa = new_mantissa.to(torch.int32)

    qdq_val = (sign << 15) + (new_exp << half_mantissa_bits) + (
        new_mantissa << (half_mantissa_bits - 1))

    assert qdq_val.max() <= 65535
    assert qdq_val.min() >= 0
    assert qdq_val.dtype == torch.int32
    qdq_val = qdq_val.to(torch.uint16)

    result = qdq_val.view(float_type)
    return result


def qdq_mxfp4_torch(x: torch.Tensor,
                    scale_calculation_mode: str = "even") -> torch.Tensor:
    half_dtype = x.dtype

    if half_dtype == torch.float16:
        half_mantissa_bits = FLOAT16_MANTISSA_BITS
        half_exp_bits = FLOAT16_EXP_BITS
        half_exp_bias = FLOAT16_EXP_BIAS
        val_to_add = FLOAT16_VAL_TO_ADD
        sign_exponent_mask = FLOAT16_SIGN_EXPONENT_MASK
    elif half_dtype == torch.bfloat16:
        half_mantissa_bits = BFLOAT16_MANTISSA_BITS
        half_exp_bits = BFLOAT16_EXP_BITS
        half_exp_bias = BFLOAT16_EXP_BIAS
        val_to_add = BFLOAT16_VAL_TO_ADD
        sign_exponent_mask = BFLOAT16_SIGN_EXPONENT_MASK
    else:
        raise ValueError("not implemented")

    x = x.reshape(*x.shape[:-1], -1, 32)

    block_max = torch.max(torch.abs(x), dim=-1).values

    block_max = block_max.view(torch.uint16).to(torch.int32)

    block_max_uint = torch.bitwise_and(block_max + val_to_add,
                                       sign_exponent_mask)

    assert block_max_uint.max() <= 65535
    assert block_max_uint.min() >= 0
    assert block_max_uint.dtype == torch.int32
    block_max_uint = block_max_uint.to(torch.uint16)

    block_max = block_max_uint.view(half_dtype)

    scale_exp = FLOAT8_E8M0_MAX_EXP + torch.floor(torch.log2(block_max)).to(
        torch.int32) - 2

    scale_exp = torch.clamp(scale_exp, 0, 2 * FLOAT8_E8M0_MAX_EXP)

    scale = 2.0**(scale_exp - FLOAT8_E8M0_MAX_EXP)
    scale = scale.to(half_dtype)

    x = x / scale[..., None]

    x_fp4 = fp16_to_fp4_simulate(x,
                                 half_exp_bits=half_exp_bits,
                                 half_mantissa_bits=half_mantissa_bits,
                                 half_exp_bias=half_exp_bias)

    x_fp4 = x_fp4 * scale[..., None]
    return x_fp4.reshape(*x_fp4.shape[:-2], -1)
```
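The following usage sketch is not part of the commit; it assumes the script is run from the vLLM repository root and that the installed PyTorch supports the `torch.uint16` views used by the reference (recent PyTorch releases do). It round-trips a tensor through the simulated MXFP4 quantize-dequantize and dequantizes a hand-packed block to show the representable e2m1 values:

```python
# Usage sketch (not part of the commit) for the MXFP4 reference helpers above.
import torch

from tests.quantization.reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch

# Quantize-dequantize a bfloat16 tensor. The last dimension must be a multiple
# of 32, since one e8m0 scale is shared per block of 32 elements.
x = torch.randn(4, 128, dtype=torch.bfloat16)
x_qdq = qdq_mxfp4_torch(x)
print("max abs error:", (x - x_qdq).abs().max().item())

# Dequantize hand-packed FP4 codes: each uint8 holds two e2m1 nibbles (low
# nibble first), and an e8m0 scale byte of 127 corresponds to 2**0 = 1.0, so
# this prints each of the 16 representable e2m1 values twice.
codes = torch.arange(16, dtype=torch.uint8)
packed = (codes << 4) | codes  # byte i unpacks to [i, i]
scale = torch.tensor([127], dtype=torch.uint8)
print(dq_mxfp4_torch(packed, scale, torch.bfloat16))
```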
