merge qwen3 scripts to the general ones #1454

Merged 1 commit on Jun 20, 2025
13 changes: 13 additions & 0 deletions scripts/README.md
@@ -117,6 +117,11 @@ Running vLLM with FP8 precision can be achieved using Intel(R) Neural Compressor

- #### Run vLLM with FP8 using INC
To run vLLM with FP8 precision using INC, pass `-d fp8` and specify the path to your bfloat16 or float16 model with `-w <model_path>`. The model will be quantized to FP8 using calibration data obtained from the [FP8 Calibration Procedure](https://github.com/HabanaAI/vllm-hpu-extension/blob/v1.21.0/calibration/README.md). An example invocation is sketched after the note below.
> For the Qwen3 MoE models, a custom build of INC must be installed:
```bash
pip install git+https://github.com/intel/neural-compressor.git@qwen-fp8
```
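
As a rough sketch, an FP8 run could then be launched as shown below. The script name is an assumption (`start_gaudi_vllm_server.sh` is the launcher referenced later in this README); only the `-d fp8` and `-w <model_path>` flags are taken from the text above.
```bash
# Hypothetical example: assumes start_gaudi_vllm_server.sh accepts the
# -d (data type) and -w (model path) flags described above.
./start_gaudi_vllm_server.sh -d fp8 -w /models/Qwen2.5-72B-Instruct
```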

#### 1. Copy open_orca_gpt4_tokenized_llama.calibration_1000.pkl to vllm-hpu-extension/calibration folder
```bash
gzip -dk Gaudi-fp8-calibration/open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz
```
@@ -131,6 +136,14 @@ MODEL=/models/Qwen2.5-72B-Instruct
HPU_SIZE=2
./calibrate_model.sh -m $MODEL -d open_orca_gpt4_tokenized_llama.calibration_1000.pkl -o quantization -t $HPU_SIZE
```
For Qwen3-235B-A22B, calibration requires 8 HPUs to load the original bfloat16 weights, while FP8 inference can then run on 4 HPUs, so the measurements must be unified as follows:
```bash
bash calibrate_model.sh -m /models/Qwen3-235B-A22B -d open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz -o Qwen3-235B-A22B -t 8 -b 256 -r 4 -u
```
Where:
- `-t 8` runs the calibration on 8 HPUs.
- `-r 4` unifies the measurements for 4 HPUs.
- `-u` indicates the model has MoE weights.

#### 3. Make the Quantization folder
Create a quantization folder at the same level as start_gaudi_vllm_server.sh.
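A minimal sketch of this step is shown below; it assumes the calibration output directory from the previous step (`-o Qwen3-235B-A22B`) and the `quantization/<model_name>/maxabs_quant_g2.json` path that `benchmark_throughput.sh` reads via `QUANT_CONFIG`. The exact file names inside the output directory may differ.
```bash
# Hypothetical layout: copy the calibration output next to the server script
# so that quantization/<model_name>/maxabs_quant_g2.json can be found.
mkdir -p quantization
cp -r Qwen3-235B-A22B quantization/
```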
3 changes: 2 additions & 1 deletion scripts/benchmark_throughput.sh
@@ -173,9 +173,10 @@ case "$dtype" in
echo Running with dtype="$dtype"
export QUANT_CONFIG=quantization/${model_name}/maxabs_quant_g2.json
export PT_HPU_WEIGHT_SHARING=0
export VLLM_DISABLE_MARK_SCALES_AS_CONST=true
QUANT_FLAGS=(--quantization inc --kv-cache-dtype fp8_inc)
if [ "${model_name}" == "Qwen3-235B-A22B" ] || [ "${model_name}" == "Qwen3-30B-A3B" ]; then
QUANT_FLAGS=(--quantization inc --weights-load-device cpu)
QUANT_FLAGS=(--quantization inc --weights-load-device cpu)
fi
dtype="bfloat16"
;;
@@ -1,29 +1,31 @@
# SPDX-License-Identifier: Apache-2.0
from glob import glob

import torch
from safetensors import safe_open
from safetensors.torch import save_file
from glob import glob
import os

# input_path = "/models/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
# output_path = "/models/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic-G2"
input_path = "/models/Llama-4-Maverick-17B-128E-Instruct-FP8"
output_path = "/models/Llama-4-Maverick-17B-128E-Instruct-FP8-G2"

weight_factor = (
    torch.finfo(torch.float8_e4m3fnuz).max / torch.finfo(torch.float8_e4m3fn).max
)
weight_factor = (torch.finfo(torch.float8_e4m3fnuz).max /
                 torch.finfo(torch.float8_e4m3fn).max)
scale_factor = 1.0 / weight_factor
scale_inv_factor = weight_factor

for safetensors_path in glob(f"{input_path}/*.safetensors"):
    tensors = {}
    print(f"processing {safetensors_path}")
    with safe_open(safetensors_path, framework="pt", device="cpu") as tensor_file:
        for k in tensor_file.keys():
    with safe_open(safetensors_path, framework="pt",
                   device="cpu") as tensor_file:
        for k in tensor_file.keys():  # noqa:SIM118
            tensor = tensor_file.get_tensor(k)
            # print(f'{k}:{tensor.dtype}')
            if tensor.dtype == torch.float8_e4m3fn:
                tensor = (tensor.float() * weight_factor).to(torch.float8_e4m3fn)
                tensor = (tensor.float() * weight_factor).to(
                    torch.float8_e4m3fn)
            elif k.endswith("_scale"):
                tensor = tensor.float() * scale_factor
            else:
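For reference, a quick standalone check (not part of this PR) of the conversion factors used by the script above; torch reports 240.0 as the maximum of `float8_e4m3fnuz` and 448.0 for `float8_e4m3fn`:
```bash
# Standalone check of the FP8 ranges behind weight_factor and scale_factor.
python3 -c "import torch; print(torch.finfo(torch.float8_e4m3fnuz).max, torch.finfo(torch.float8_e4m3fn).max)"
# 240.0 448.0  ->  weight_factor ~= 0.536, scale_factor ~= 1.867
```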
194 changes: 0 additions & 194 deletions scripts/qwen3/01-benchmark-online-30B-fp8.sh

This file was deleted.

79 changes: 0 additions & 79 deletions scripts/qwen3/Quant_QWen3-FP8.md

This file was deleted.

16 changes: 0 additions & 16 deletions scripts/qwen3/inc_measure_g2_235B_A22B.json

This file was deleted.

16 changes: 0 additions & 16 deletions scripts/qwen3/inc_measure_g3_30B_A3B.json

This file was deleted.
