diff --git a/scripts/README.md b/scripts/README.md
index 4b081f7dcbc..65d24fff7e7 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -117,6 +117,11 @@ Running vLLM with FP8 precision can be achieved using [Intel(R) Neural Compresso
 - #### Run vLLM with FP8 using INC
 To run vLLM with FP8 precision using INC, pass `-d fp8` and specify the path to your bfloat16 or float16 model with `-w `. The model will be quantized to FP8 using calibration data obtained from the [FP8 Calibration Procedure](https://github.com/HabanaAI/vllm-hpu-extension/blob/v1.21.0/calibration/README.md).

+> For the Qwen3 MoE models, a custom INC build must be installed:
+```bash
+pip install git+https://github.com/intel/neural-compressor.git@qwen-fp8
+```
+
 #### 1. Copy open_orca_gpt4_tokenized_llama.calibration_1000.pkl to vllm-hpu-extension/calibration folder
 ```bash
 gzip -dk Gaudi-fp8-calibration/open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz
@@ -131,6 +136,14 @@ MODEL=/models/Qwen2.5-72B-Instruct
 HPU_SIZE=2
 ./calibrate_model.sh -m $MODEL -d open_orca_gpt4_tokenized_llama.calibration_1000.pkl -o quantization -t $HPU_SIZE
 ```
+For Qwen3-235B-A22B, the calibration process needs 8 HPUs to load the original bfloat16 weights, while FP8 inference can then run on 4 HPUs, so the measurements must be unified as follows:
+```bash
+bash calibrate_model.sh -m /models/Qwen3-235B-A22B -d open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz -o Qwen3-235B-A22B -t 8 -b 256 -r 4 -u
+```
+where:
+ - `-t 8` runs the calibration on 8 HPUs.
+ - `-r 4` unifies the measurements for running on 4 HPUs.
+ - `-u` indicates the model has MoE weights.

 #### 3. Make the Quantization folder
 Create a quantization folder at the same level as start_gaudi_vllm_server.sh.
diff --git a/scripts/benchmark_throughput.sh b/scripts/benchmark_throughput.sh
index fbedfc67ab0..a3a985bbe57 100644
--- a/scripts/benchmark_throughput.sh
+++ b/scripts/benchmark_throughput.sh
@@ -173,9 +173,10 @@ case "$dtype" in
         echo Running with dtype="$dtype"
         export QUANT_CONFIG=quantization/${model_name}/maxabs_quant_g2.json
         export PT_HPU_WEIGHT_SHARING=0
+        export VLLM_DISABLE_MARK_SCALES_AS_CONST=true
         QUANT_FLAGS=(--quantization inc --kv-cache-dtype fp8_inc)
         if [ "${model_name}" == "Qwen3-235B-A22B" ] || [ "${model_name}" == "Qwen3-30B-A3B" ]; then
-        QUANT_FLAGS=(--quantization inc --weights-load-device cpu)
+            QUANT_FLAGS=(--quantization inc --weights-load-device cpu)
         fi
         dtype="bfloat16"
         ;;
diff --git a/scripts/llama4/convert_for_g2.py b/scripts/quantization/convert_fp8_weights_for_gaudi2.py
similarity index 71%
rename from scripts/llama4/convert_for_g2.py
rename to scripts/quantization/convert_fp8_weights_for_gaudi2.py
index f973a59ba63..e94af81e5e4 100644
--- a/scripts/llama4/convert_for_g2.py
+++ b/scripts/quantization/convert_fp8_weights_for_gaudi2.py
@@ -1,29 +1,31 @@
+# SPDX-License-Identifier: Apache-2.0
+from glob import glob
+
 import torch
 from safetensors import safe_open
 from safetensors.torch import save_file
-from glob import glob
-import os

 # input_path = "/models/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
 # output_path = "/models/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic-G2"
 input_path = "/models/Llama-4-Maverick-17B-128E-Instruct-FP8"
 output_path = "/models/Llama-4-Maverick-17B-128E-Instruct-FP8-G2"
-weight_factor = (
-    torch.finfo(torch.float8_e4m3fnuz).max / torch.finfo(torch.float8_e4m3fn).max
-)
+weight_factor = (torch.finfo(torch.float8_e4m3fnuz).max /
+                 torch.finfo(torch.float8_e4m3fn).max)
 scale_factor = 1.0 / weight_factor
 scale_inv_factor = 
weight_factor for safetensors_path in glob(f"{input_path}/*.safetensors"): tensors = {} print(f"processing {safetensors_path}") - with safe_open(safetensors_path, framework="pt", device="cpu") as tensor_file: - for k in tensor_file.keys(): + with safe_open(safetensors_path, framework="pt", + device="cpu") as tensor_file: + for k in tensor_file.keys(): # noqa:SIM118 tensor = tensor_file.get_tensor(k) # print(f'{k}:{tensor.dtype}') if tensor.dtype == torch.float8_e4m3fn: - tensor = (tensor.float() * weight_factor).to(torch.float8_e4m3fn) + tensor = (tensor.float() * weight_factor).to( + torch.float8_e4m3fn) elif k.endswith("_scale"): tensor = tensor.float() * scale_factor else: diff --git a/scripts/qwen3/01-benchmark-online-30B-fp8.sh b/scripts/qwen3/01-benchmark-online-30B-fp8.sh deleted file mode 100644 index a6b66d147d7..00000000000 --- a/scripts/qwen3/01-benchmark-online-30B-fp8.sh +++ /dev/null @@ -1,194 +0,0 @@ -#!/bin/bash -######################################################### -# vLLM Benchmark Script for Qwen3 -# -# This script runs a vLLM server with specific configurations -# and benchmarks it using the sonnet dataset. -######################################################### - -#=========================================================== -# CONFIGURATION PARAMETERS -#=========================================================== -export QUANT_CONFIG="inc_quant_g3_30B_A3B.json" - -if [ $# -gt 0 ] && [ "$1" == "--model_path" ]; then - model=$2 -else - model="/mnt/weka/llm/Qwen3/Qwen3-30B-A3B/" -fi - -if [ $# -eq 4 ] && [ "$3" == "--tp_size" ]; then - tp_size=$4 -else - tp_size=1 -fi - -model_name=$(basename ${model}) - -# Model Configuration -tokenizer=$model - -# Hardware Configuration -moe_n_slice=1 # MoE groups -gpu_utils=0.95 # GPU memory utilization - -# Request Configuration -max_model_len=9216 # Max model len -request_rate="inf" # Request rate (inf = unlimited) -multi_step=1 # Number of scheduler steps - - -#=========================================================== -# START the LOOP -#=========================================================== - -tp_parallel=${tp_size} # Tensor parallelism size -req_in_out_list=(256_1024_1024 96_5120_1024 32_10240_1024) - -for req_in_out in "${req_in_out_list[@]}"; do - # Token Length Configuration - bs=$(echo "$req_in_out" | awk -F'_' '{ print $1 }') - in_len=$(echo "$req_in_out" | awk -F'_' '{ print $2 }') - out_len=$(echo "$req_in_out" | awk -F'_' '{ print $3 }') - - num_prompts=$((bs * 5)) - # Expert parallelism size - ep_size=${tp_parallel} - - #=========================================================== - # DERIVED PARAMETERS - #=========================================================== - - # Calculate and align total length - # Calculate aligned lengths for buckets - in_len_aligned=$(((in_len + 127) / 128 * 128)) - prompt_seq_max=$((in_len * 1125 / 1000)) - prompt_seq_max=$(((prompt_seq_max + 127) / 128 * 128)) - - total_len=$((prompt_seq_max + out_len)) - if [ $((total_len % 128)) -ne 0 ]; then - echo 'Rounding up total length to multiple of 128' - total_len=$(((total_len / 128 + 1) * 128)) - fi - - total_len_aligned=$(((total_len + 127) / 128 * 128)) - - decode_total_len=$((total_len + 128)) - decode_total_len_aligned=$(((decode_total_len + 127) / 128 * 128)) - - # Calculate bucket sizes - VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len_aligned * bs / 128)) - VLLM_DECODE_BLOCK_BUCKET_MIN=$(((VLLM_DECODE_BLOCK_BUCKET_MIN + 127) / 128 * 128)) - VLLM_DECODE_BLOCK_BUCKET_MAX=$((decode_total_len_aligned * bs / 128)) - 
VLLM_DECODE_BLOCK_BUCKET_MAX=$(((VLLM_DECODE_BLOCK_BUCKET_MAX + 127) / 128 * 128)) - - #=========================================================== - # LOG CONFIGURATION - #=========================================================== - - # Create a descriptive log name based on parameters - log_name="${model_name}-gaudi3-tp${tp_parallel}-ep${ep_size}-moe${moe_n_slice}-ms${multi_step}_np${num_prompts}_rr${request_rate}_bs${bs}_i${in_len}_o${out_len}_len${total_len}" - - # Create log directory - mkdir -p benchmark_logs - - #=========================================================== - # START vLLM SERVER - #=========================================================== - - echo "Starting vLLM server with the following configuration:" - echo "- Model: ${model_name}" - echo "- Tensor Parallel Size: ${tp_parallel}" - echo "- Expert Parallel Size: ${ep_size}" - echo "- Batch Size: ${bs}" - echo "- Input Length: ${in_len}" - echo "- Output Length: ${out_len}" - echo "- Total Length: ${total_len}" - - VLLM_DMOE_FORCE_LOOP=1 \ - VLLM_DYNAMIC_MOE_MIN_TOKENS=256 \ - PT_HPU_LAZY_MODE=1 \ - VLLM_PROMPT_BS_BUCKET_MIN=1 \ - VLLM_PROMPT_BS_BUCKET_MAX=8 \ - VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len_aligned} \ - VLLM_PROMPT_SEQ_BUCKET_MAX=${prompt_seq_max} \ - VLLM_DECODE_BS_BUCKET_MIN=${bs} \ - VLLM_DECODE_BS_BUCKET_MAX=${bs} \ - VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \ - VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \ - VLLM_DECODE_BLOCK_BUCKET_STEP=128 \ - VLLM_DELAYED_SAMPLING=true \ - HABANA_VISIBLE_DEVICES="ALL" \ - VLLM_EP_SIZE=${ep_size} \ - PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ - PT_HPU_WEIGHT_SHARING=0 \ - python3 -m vllm.entrypoints.openai.api_server \ - --port 18080 \ - --model ${model} \ - --load-format safetensors \ - --config-format hf \ - --tensor-parallel-size ${tp_parallel} \ - --max-num-seqs ${bs} \ - --disable-log-requests \ - --dtype bfloat16 \ - --use-v2-block-manager \ - --use-padding-aware-scheduling \ - --num_scheduler_steps ${multi_step} \ - --max-model-len $((total_len_aligned)) \ - --max-num-batched-tokens $((total_len_aligned * 4)) \ - --distributed_executor_backend ray \ - --gpu_memory_utilization ${gpu_utils} \ - --quantization inc \ - # --enable-expert-parallel \ - 2>&1 | tee benchmark_logs/${log_name}_serving.log & - pid=$(($!-1)) - # --trust-remote-code false --enforce-eager \ - - # Wait for server to start - n=0 - ready=false - until [[ "$n" -ge 1000 ]] || [[ $ready == true ]]; do - n=$((n+1)) - if grep -q "Started server process" benchmark_logs/${log_name}_serving.log; then - break - fi - sleep 5s - done - sleep 10s - echo "Server started with PID: ${pid}" - - #=========================================================== - # RUN BENCHMARK - #=========================================================== - - echo "Starting benchmark with Sonnet dataset" - max_concurrency_client=${bs} - start_time=$(date +%s) - - python3 ../benchmarks/benchmark_serving.py \ - --backend vllm \ - --model ${model} \ - --tokenizer ${tokenizer} \ - --dataset-name sonnet \ - --dataset-path ../../benchmarks/sonnet.txt \ - --request-rate ${request_rate} \ - --percentile-metrics ttft,tpot,itl,e2el \ - --ignore-eos \ - --num-prompts ${num_prompts} \ - --port 18080 \ - --sonnet-input-len ${in_len} \ - --sonnet-output-len ${out_len} \ - --sonnet-prefix-len 100 \ - --max-concurrency ${max_concurrency_client} \ - --save-result 2>&1 | tee benchmark_logs/${log_name}_benchmark.log - - end_time=$(date +%s) - echo "Benchmark completed in $((end_time - start_time)) seconds" - - # Clean 
up - echo "Stopping vLLM server" - kill ${pid} - echo "Script execution completed" - sleep 10 -done - diff --git a/scripts/qwen3/Quant_QWen3-FP8.md b/scripts/qwen3/Quant_QWen3-FP8.md deleted file mode 100644 index ac69fe2bc42..00000000000 --- a/scripts/qwen3/Quant_QWen3-FP8.md +++ /dev/null @@ -1,79 +0,0 @@ - -## 0. Prerequisites - -- Driver: 1.20.1 (how to update Gaudi driver: https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html) -- Firmware: 1.20.1 (how to update Gaudi firmware: https://docs.habana.ai/en/latest/Installation_Guide/Firmware_Upgrade.html#system-unboxing-main) -- Docker: vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - -## 1. Installation - -- VLLM -```bash -git clone recursive https://github.com/vllm-fork.git -b aice/v1.20.1 - -cd vllm-fork -pip install -e . -``` -- INC -```bash -pip install git+https://github.com/intel/neural-compressor.git@qwen-fp8 - - -- VLLM-HPU-EXT -```bash -git clone https://github.com/vllm-hpu-extension-fork.git -b aice/v1.20.1 -cd vllm-hpu-extension -pip install -e . -``` - -## 2. FP8 - - -- Calibration - -```bash -cd vllm-fork/scripts/qwen3 -pip install datasets -export OFFICIAL_MODEL=/path/to/qwen/model -bash ./run_qwen.sh calib ${OFFICIAL_MODEL} -``` - -``` -- Online Serving - -```bash -cd vllm-fork/scripts-fp8 -bash 01-benchmark-online-30B-fp8.sh --model_path --tp_size -ex. -Qwen3-30B-A3B: -bash 01-benchmark-online-30B-fp8.sh --model_path /workspace/HF_models/Qwen3-30B-A3B --tp_size 8 - -Qwen3-235B-A22B: -unzip nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP.zip -bash single_2k_qwen3-235b.sh -notes: change the model_path in single_2k_qwen3-235b.sh to your local path. -``` - -Please refer to https://github.com/HabanaAI/vllm-fork/tree/dev/qwen3/scripts for other benchmarks. - - -## 3. Others - -set EP size: -``` -export VLLM_EP_SIZE= -ex. VLLM_EP_SIZE=8 -``` -if num_tokens exceed the VLLM_DYNAMIC_MOE_MIN_TOKENS,dynamic MoE is used since its performance is -better than static MoE in this case. -``` -export VLLM_DYNAMIC_MOE_MIN_TOKENS= -``` -default value is 256 - -if the number of expert on a single card is smaller than VLLM_DYNAMIC_MOE_MIN_EXPERTS_SINGLEHPU, dynamic MoE -is used since its performance is better than static MoE in this case. 
-``` -export VLLM_DYNAMIC_MOE_MIN_EXPERTS_SINGLEHPU= -``` -default value 32 diff --git a/scripts/qwen3/inc_measure_g2_235B_A22B.json b/scripts/qwen3/inc_measure_g2_235B_A22B.json deleted file mode 100644 index 5f91d52821f..00000000000 --- a/scripts/qwen3/inc_measure_g2_235B_A22B.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "whitelist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": ["lm_head"] - }, - "quantize_weight": false, - "dump_stats_path": "./nc_workspace_measure_kvache_g2_235B_A22B/inc_measure_output" - -} diff --git a/scripts/qwen3/inc_measure_g3_30B_A3B.json b/scripts/qwen3/inc_measure_g3_30B_A3B.json deleted file mode 100644 index e631825bf54..00000000000 --- a/scripts/qwen3/inc_measure_g3_30B_A3B.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "whitelist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": ["lm_head"] - }, - "quantize_weight": false, - "dump_stats_path": "./scripts/nc_workspace_measure_kvache_g3_30B_A3B/inc_measure_output" - -} diff --git a/scripts/qwen3/inc_measure_g3_32B.json b/scripts/qwen3/inc_measure_g3_32B.json deleted file mode 100644 index 11200f3bedd..00000000000 --- a/scripts/qwen3/inc_measure_g3_32B.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "whitelist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": ["lm_head"] - }, - "quantize_weight": false, - "dump_stats_path": "./scripts/nc_workspace_measure_kvache_g3_32B/inc_measure_output" -} diff --git a/scripts/qwen3/inc_measure_v2.json b/scripts/qwen3/inc_measure_v2.json deleted file mode 100644 index 833629e131c..00000000000 --- a/scripts/qwen3/inc_measure_v2.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "whitelist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": ["lm_head"] - }, - "quantize_weight": false, - "dump_stats_path": "./scripts/nc_workspace_measure_kvache_v2/inc_measure_output" -} diff --git a/scripts/qwen3/inc_quant_g2_235B_A22B_4card.json b/scripts/qwen3/inc_quant_g2_235B_A22B_4card.json deleted file mode 100644 index 957f36e49d8..00000000000 --- a/scripts/qwen3/inc_quant_g2_235B_A22B_4card.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "scale_format": "scalar", - "allowlist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": [ - "lm_head", - "mlp\\.gate\\b", - "k_cache", - "v_cache", - "matmul_av", - "matmul_qk", - "batch2block_matmul", - "block2batch_matmul", - "fused_scaled_dot_product_attention", - "softmax" - ] - }, - "dump_stats_path": "nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP/inc_measure_output" -} diff --git a/scripts/qwen3/inc_quant_g3_30B_A3B.json b/scripts/qwen3/inc_quant_g3_30B_A3B.json deleted file mode 100644 index 86cdeb33ed3..00000000000 --- a/scripts/qwen3/inc_quant_g3_30B_A3B.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "scale_format": "scalar", - "allowlist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": [ - "lm_head", - "mlp\\.gate\\b", - "k_cache", - "v_cache", - "matmul_av", - "matmul_qk", - "batch2block_matmul", - "block2batch_matmul", - "fused_scaled_dot_product_attention", - "softmax" - ] - }, 
- "dump_stats_path": "scripts/nc_workspace_measure_kvache_g3_30B_A3B/inc_measure_output" -} diff --git a/scripts/qwen3/inc_quant_g3_32B.json b/scripts/qwen3/inc_quant_g3_32B.json deleted file mode 100644 index 7b3eba3e648..00000000000 --- a/scripts/qwen3/inc_quant_g3_32B.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "scale_format": "scalar", - "allowlist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": [ - "lm_head", - "mlp\\.gate\\b", - "k_cache", - "v_cache", - "matmul_av", - "matmul_qk", - "batch2block_matmul", - "block2batch_matmul", - "fused_scaled_dot_product_attention", - "softmax" - ] - }, - "dump_stats_path": "nc_workspace_measure_kvache_g3_32B/inc_measure_output" -} diff --git a/scripts/qwen3/inc_quant_v2.json b/scripts/qwen3/inc_quant_v2.json deleted file mode 100644 index d6356d2b9e5..00000000000 --- a/scripts/qwen3/inc_quant_v2.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "scale_format": "scalar", - "allowlist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": [ - "lm_head", - "mlp\\.gate\\b" - ] - }, - "dump_stats_path": "./scripts/nc_workspace_measure_kvache_v2/inc_measure_output" -} diff --git a/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards.zip b/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards.zip deleted file mode 100644 index bcff0b84794..00000000000 Binary files a/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards.zip and /dev/null differ diff --git a/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP.zip b/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP.zip deleted file mode 100644 index 711bd9ef3e5..00000000000 Binary files a/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP.zip and /dev/null differ diff --git a/scripts/qwen3/run_example_tp_qwen.py b/scripts/qwen3/run_example_tp_qwen.py deleted file mode 100644 index 6dbb3622659..00000000000 --- a/scripts/qwen3/run_example_tp_qwen.py +++ /dev/null @@ -1,382 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -import argparse -import os -import random -import time -from typing import List, Tuple - -import datasets -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm import LLM, SamplingParams - -# get file location -file_path = os.path.abspath(__file__) -dataset_path = os.path.join(os.path.dirname(file_path), "../benchmarks") - -model_path = "/mnt/disk5/Qwen3-30B-A3B-250425" - -# Parse the command-line arguments. 
-parser = argparse.ArgumentParser() -parser.add_argument("--model", - type=str, - default=model_path, - help="The model path.") -parser.add_argument("--tokenizer", - type=str, - default=model_path, - help="The model path.") -parser.add_argument("--tp_size", - type=int, - default=8, - help="The number of threads.") -parser.add_argument("--ep_size", - type=int, - default=8, - help="The number of threads.") -parser.add_argument("--dataset", type=str, default=None, help="The dataset.") -parser.add_argument("--isl", - type=int, - default=1024, - help="input sequence length.") -parser.add_argument("--osl", - type=int, - default=1024, - help="output sequence length.") -parser.add_argument("--nprompts", - type=int, - default=4, - help="The number of prompts.") -parser.add_argument("--max_num_seqs", - type=int, - default=None, - help="The max number of sequences.") -parser.add_argument("--max_model_len", - type=int, - default=16384, - help="The max model length.") -parser.add_argument("--random", - action="store_true", - help="Randomly sample prompts.") -parser.add_argument("--fp8_kv_cache", - action="store_true", - help="Use fp8 for kv cache.") -parser.add_argument("--inc", action="store_true", help="Use inc.") -parser.add_argument("--dummy", action="store_true", help="Use dummy weights.") -parser.add_argument("--enforce_eager", - action="store_true", - help="Enforce eager") -args = parser.parse_args() - -os.environ["VLLM_SKIP_WARMUP"] = "true" -os.environ["HABANA_VISIBLE_DEVICES"] = "ALL" -os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true" -os.environ["VLLM_MOE_N_SLICE"] = "1" #if args.ep_size > 1 else "4" -os.environ["VLLM_EP_SIZE"] = f"{args.ep_size}" -os.environ["PT_HPU_WEIGHT_SHARING"] = "0" - - -def sample_sonnet_requests( - dataset_path: str, - num_requests: int, - input_len: int, - prefix_len: int, - tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, str, int, int, None]]: - assert ( - input_len > prefix_len - ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." - - # Load the dataset. - with open(dataset_path, encoding='utf-8') as f: - poem_lines = f.readlines() - - # Tokenize the poem lines. - poem_token_ids = tokenizer(poem_lines).input_ids - average_poem_len = sum( - len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids) - - # Base prefix for all requests. - base_prompt = "Pick as many lines as you can from these poem lines:\n" - base_message = [{ - "role": "user", - "content": base_prompt, - }] - base_prompt_formatted = tokenizer.apply_chat_template( - base_message, add_generation_prompt=True, tokenize=False) - base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids) - - assert ( - input_len > base_prompt_offset - ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}." - num_input_lines = round( - (input_len - base_prompt_offset) / average_poem_len) - - # First approximately `prefix_len` number of tokens in the - # prompt are fixed poem lines. - assert ( - prefix_len > base_prompt_offset - ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}." - - num_prefix_lines = round( - (prefix_len - base_prompt_offset) / average_poem_len) - prefix_lines = poem_lines[:num_prefix_lines] - - # Sample the rest of lines per request. 
- sampled_requests: List = [] - for _ in range(num_requests): - num_lines_needed = num_input_lines - num_prefix_lines - sampled_lines = "".join(prefix_lines + - random.choices(poem_lines, k=num_lines_needed)) - - prompt = f"{base_prompt}{sampled_lines}" - message = [ - { - "role": "user", - "content": prompt, - }, - ] - prompt_formatted = tokenizer.apply_chat_template( - message, add_generation_prompt=True, tokenize=False) - sampled_requests.append(prompt_formatted) - - return sampled_requests, None - - -def sample_gsm8k_requests(num_requests: int, - tokenizer: PreTrainedTokenizerBase, - do_random: bool = False) -> List[Tuple[str, str]]: - # Load the dataset from huggingface. - dataset = datasets.load_dataset("openai/gsm8k", "main") - prompts = dataset["train"]["question"] - expected_responses = dataset["train"]["answer"] - few_shots = 5 - base_prompt = [ - f"Question: {prompts[i]}\nAnswer: {expected_responses[i]}\n" - for i in range(few_shots) - ] - base_prompt = "\n".join(base_prompt) - base_prompt = f"{base_prompt}\n" - - # Sample the requests. - sampled_requests: List = [] - sampled_response: List = [] - for j in range(num_requests): - i = random.choice(range(len( - prompts[few_shots:]))) if do_random else j + few_shots - prompt = f"{base_prompt}Question: {prompts[i]}\nAnswer: " - # message = [ - # { - # "role": "user", - # "content": prompt, - # }, - # ] - # prompt = tokenizer.apply_chat_template( - # message, add_generation_prompt=True, tokenize=False) - expected_response = expected_responses[i] - sampled_requests.append(prompt) - sampled_response.append(expected_response) - - return sampled_requests, sampled_response - - -def dump_logprob(logprobs, file_name): - import json - converted_logprobs = [] - for token_dict in logprobs: - converted_dict = {} - for token_id, lp in token_dict.items(): - converted_dict[token_id] = { - 'logprob': lp.logprob, - 'rank': lp.rank, - 'decoded_token': lp.decoded_token - } - converted_logprobs.append(converted_dict) - - # Write to JSON file - with open(file_name, 'w') as f: - json.dump(converted_logprobs, f, indent=2) - print(f"save logprobs to {file_name}.") - - -if __name__ == "__main__": - - # Sample prompts. - - if args.dataset == "sonnet": - # Sample sonnet requests. - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) - prompts, gt = sample_sonnet_requests( - dataset_path=f"{dataset_path}/sonnet.txt", - num_requests=args.nprompts, - input_len=args.isl, - prefix_len=200, - tokenizer=tokenizer, - ) - elif args.dataset == "gsm8k": - # Sample GSM8K requests. 
- args.osl = 128 - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) - prompts, gt = sample_gsm8k_requests( - num_requests=args.nprompts, - tokenizer=tokenizer, - do_random=args.random, - ) - elif args.dataset == "pile": - - def reset_seed(seed=42): - import random - - import numpy as np - import torch - - torch.manual_seed(seed) - np.random.seed(seed) - random.seed(seed) - - def get_prompt_token_ids(model_path, prompts, max_length=1024): - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(model_path) - prompt_token_ids = [] - for prompt in prompts: - tokens = tokenizer( - prompt, - return_tensors="pt", - truncation=True, - max_length=max_length, - ) - if len(tokens.input_ids[0]) < max_length: - continue - prompt_token_ids.append( - [x.item() for x in tokens.input_ids[0]]) - return prompt_token_ids - - def get_pile_prompts(model_name, num_samples=512): - import transformers - from datasets import load_dataset - from tqdm import tqdm - - least_tokens = 1024 - seed = 42 - - reset_seed(seed) - - dataset = load_dataset("NeelNanda/pile-10k", split="train") - dataset = dataset.shuffle(seed=seed) - - tokenizer = transformers.AutoTokenizer.from_pretrained( - model_name, trust_remote_code=True) - num_sample = 0 - samples_lst = [] - for data in tqdm(dataset): - prompt = data["text"] - tokens = tokenizer(prompt, return_tensors="pt") - if len(tokens.input_ids[0]) < least_tokens: - continue - num_sample += 1 - samples_lst.append(prompt) - if num_sample >= num_samples: - break - return samples_lst - - least_tokens = args.isl - num_samples = args.nprompts - prompts = get_pile_prompts(args.model, num_samples) - prompt_token_ids = get_prompt_token_ids(args.model, prompts, - least_tokens) - print(f"Got {len(prompts)} prompts, length of first prompt: \ - {len(prompt_token_ids[0])}.") - gt = None - else: - prompts = [ - "Hello, my name is", - "0.999 compares to 0.9 is ", - "The capital of France is", - "The future of AI is", - ] - if args.nprompts > 4: - prompts += random.choices(prompts, k=args.nprompts - 4) - elif args.nprompts < 4: - prompts = prompts[:args.nprompts] - gt = None - # Create a sampling params object. - sampling_params = SamplingParams( - temperature=0, - max_tokens=args.osl, - ignore_eos=True, - # logprobs=10, - ) - model = args.model - param = {} - if args.inc: - param["quantization"] = "inc" - if args.fp8_kv_cache: - param["kv_cache_dtype"] = "fp8_inc" - if args.max_num_seqs is not None: - param["max_num_seqs"] = args.max_num_seqs - if args.enforce_eager: - param["enforce_eager"] = True - if args.ep_size > 1: - param["enable_expert_parallel"] = True - # os.environ["VLLM_EP_SIZE"] = str(args.ep_size) - if args.dummy: - param["load_format"] = "dummy" - if args.tp_size == 1: - llm = LLM(model=model, - tokenizer=args.tokenizer, - trust_remote_code=True, - dtype="bfloat16", - max_model_len=args.max_model_len, - gpu_memory_utilization=0.8, - **param) - else: - llm = LLM(model=model, - tokenizer=args.tokenizer, - tensor_parallel_size=args.tp_size, - distributed_executor_backend='mp', - trust_remote_code=True, - max_model_len=args.max_model_len, - dtype="bfloat16", - gpu_memory_utilization=0.8, - **param) - - # Generate texts from the prompts. - # The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. 
- start = time.perf_counter() - if args.dataset == "pile": - outputs = llm.generate(prompts=None, - sampling_params=sampling_params, - prompt_token_ids=prompt_token_ids) - else: - outputs = llm.generate(prompts, sampling_params) - end = time.perf_counter() - # Print the outputs. - print(f"e2e took {end - start} seconds") - for output_i in range(len(outputs)): - output = outputs[output_i] - gt_i = None if gt is None else gt[output_i] - prompt = output.prompt - generated_text = output.outputs[0].text - gen_token_id = output.outputs[0].token_ids - prompt_logprobs = output.outputs[0].logprobs - num_hidden_layers = int(os.environ.get("VLLM_NUM_LAYERS", "61")) - OFFICIAL_FP8_MODEL = os.environ.get("OFFICIAL_FP8_MODEL", "0") - # replace "/" with "_" - _model_path = OFFICIAL_FP8_MODEL.replace("/", "_") - time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - file_name = f"prompt_logprobs.layer \ - {num_hidden_layers}_{_model_path}_{time_str}.json" - - # dump_logprob(prompt_logprobs, file_name) - print("====================================") - print(f"Prompt: {prompt!r}") - print(f"Generated text: {generated_text!r}") - print(f"Generated token: {gen_token_id!r}") - print(f"Ground truth: {gt_i!r}") - # print(f"Prompt logprobs: {prompt_logprobs!r}") - print("====================================") - if os.getenv("QUANT_CONFIG", None) is not None: - llm.llm_engine.model_executor.shutdown() - del llm diff --git a/scripts/qwen3/run_qwen.sh b/scripts/qwen3/run_qwen.sh deleted file mode 100644 index eb2bb47f3fe..00000000000 --- a/scripts/qwen3/run_qwen.sh +++ /dev/null @@ -1,69 +0,0 @@ -pkill -9 python - -export PT_HPU_LAZY_MODE=1 -export GRAPH_VISUALIZATION=1 -export VLLM_LOGGING_LEVEL=DEBUG -export VLLM_DISABLE_MARK_SCALES_AS_CONST=1 - - -# export ENABLE_EXPERIMENTAL_FLAGS=1 -# export PRINT_FILE_AND_LINE=1 -# export LOG_LEVEL_PASS_MANAGER=1 -# export LOG_LEVEL_ALL=1 HABANA_LOGS=.habana_logs-515 - -export OFFICIAL_MODEL="/mnt/disk5/Qwen3-30B-A3B-250425" - -############################# -# Qwen -############################# -# FIXME: (Yi) Enable the static MoE path -export VLLM_DYNAMIC_MOE_MIN_TOKENS=0 - -#!/bin/bash - -set -e - -MODE=$1 # First argument -MODEL=$2 # Second argument (model name) -TOKENIZER=$3 # Third argument (tokenizer name) - -if [ -z "$TOKENIZER" ]; then - TOKENIZER=$MODEL -fi - -if [ -z "$MODE" ] || [ -z "$MODEL" ] || [ -z "$TOKENIZER" ]; then - echo "Usage: $0 {bf16|calib|quant|eval} " - exit 1 -fi - -COMMON_ARGS="--model $MODEL --tokenizer $TOKENIZER --osl 32 --max_model_len 2048 --max_num_seqs 1 --tp_size 8 --ep_size 8" - -model_name=$(basename ${MODEL}) -if [ ${model_name} == "Qwen3-30B-A3B-250425" ]; then - quant_file_path="inc_measure_g3_30B_A3B.json" -elif [ ${model_name} == "Qwen3-30B-A3B" ]; then - quant_file_path="inc_measure_g3_30B_A3B.json" -elif [ ${model_name} == "Qwen3-32B-250426" ]; then - quant_file_path="inc_measure_g3_32B.json" -elif [ ${model_name} == "Qwen3-235B-A22B" ]; then - COMMON_ARGS="--model $MODEL --tokenizer $TOKENIZER --osl 32 --max_model_len 8192 --max_num_seqs 1 --tp_size 8 --ep_size 8" - quant_file_path="inc_measure_g2_235B_A22B.json" -else - echo "Unknown model name: ${model_name}" - exit 1 -fi - - -if [ "$MODE" == "bf16" ]; then - python run_example_tp_qwen.py $COMMON_ARGS - -elif [ "$MODE" == "calib" ]; then - QUANT_CONFIG=${quant_file_path} \ - python run_example_tp_qwen.py $COMMON_ARGS --inc --dataset pile --nprompts 512 - -else - echo "Unknown mode: $MODE" - echo "Valid modes are: bf16, calib" - exit 1 -fi - diff --git 
a/scripts/qwen3/single_2k_qwen3-235b.sh b/scripts/qwen3/single_2k_qwen3-235b.sh deleted file mode 100644 index e76e8203e18..00000000000 --- a/scripts/qwen3/single_2k_qwen3-235b.sh +++ /dev/null @@ -1,109 +0,0 @@ -#! /bin/bash - -# set -x - -BASH_DIR=$(dirname "${BASH_SOURCE[0]}") -source "$BASH_DIR"/utils.sh - -ray stop --force - -export QUANT_CONFIG=inc_quant_g2_235B_A22B_4card.json - -# DO NOT change unless you fully undersand its purpose -export HABANA_VISIBLE_DEVICES="ALL" -export PT_HPU_ENABLE_LAZY_COLLECTIVES="true" -export VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" -export RAY_IGNORE_UNHANDLED_ERRORS="1" -export PT_HPU_WEIGHT_SHARING=0 -export HABANA_VISIBLE_MODULES="0,1,2,3" -export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1 - -#export VLLM_MOE_N_SLICE=8 -export VLLM_EP_SIZE=4 -export VLLM_DELAYED_SAMPLING="true" - -block_size=128 -# DO NOT change ends... - -# memory footprint tunning params -export VLLM_GPU_MEMORY_UTILIZATION=0.9 -export VLLM_GRAPH_RESERVED_MEM=0.4 -export VLLM_GRAPH_PROMPT_RATIO=0 -export VLLM_DISABLE_MARK_SCALES_AS_CONST=true -# params -model_path=/root/.cache/huggingface/hub/models--Qwen--Qwen3-235B-A22B/snapshots/Qwen3-235B-A22B/ -max_model_len=8192 -max_num_batched_tokens=25536 -max_num_seqs=256 -input_min=1 -input_max=2048 -output_max=2048 - -unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX -unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX -unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX -unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX - - -export PT_HPU_RECIPE_CACHE_CONFIG=/data/8k_cache,false,8192 - -#set_bucketing - - - -# !!!!!!!!!!!!!!!!!!!! set bucketing !!!!!!!!!!!!! -prompt_bs_min=1 -prompt_bs_step=$(( $max_num_seqs > 32 ? 32 : $max_num_seqs )) -prompt_bs_max=$(( $max_num_seqs > 64 ? 64 : $max_num_seqs )) -export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:-$prompt_bs_min} -export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:-$prompt_bs_step} -export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:-$prompt_bs_max} - -prompt_seq_step=128 -prompt_seq_min=2048 -prompt_seq_max=2048 -export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:-$prompt_seq_min} -export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:-$prompt_seq_step} -export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:-$prompt_seq_max} - -decode_bs_min=1 -decode_bs_step=$(( $max_num_seqs > 32 ? 32 : $max_num_seqs )) -decode_bs_max=$max_num_seqs -export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:-$decode_bs_min} -export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-$decode_bs_step} -export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:-$decode_bs_max} - -decode_block_min=128 -decode_block_step=128 -block_size=128 -decode_block_max=$(( ((max_num_seqs * max_model_len / block_size) > 128) ? 
(max_num_seqs * max_model_len / block_size) : 128 )) -export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:-$decode_block_min} -export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:-$decode_block_step} -export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:-$decode_block_max} - -set_env -set_numactl - -echo " environments are reseted " - -env | grep VLLM - - -python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8688 \ - --block-size 128 \ - --model $model_path \ - --device hpu \ - --dtype bfloat16 \ - --tensor-parallel-size 4 \ - --trust-remote-code \ - --max-model-len $max_model_len \ - --max-num-seqs $max_num_seqs \ - --max-num-batched-tokens $max_num_batched_tokens \ - --disable-log-requests \ - --use-padding-aware-scheduling \ - --use-v2-block-manager \ - --distributed_executor_backend ray \ - --gpu_memory_utilization 0.9 \ - --quantization=inc \ - --weights-load-device cpu \ - --enable-expert-parallel diff --git a/scripts/qwen3/utils.sh b/scripts/qwen3/utils.sh deleted file mode 100644 index a910cb8cc17..00000000000 --- a/scripts/qwen3/utils.sh +++ /dev/null @@ -1,142 +0,0 @@ -#! /bin/bash - -# set -x - -# set up commen environment variables for vllm -set_env(){ - # pytorch bridge - export PT_HPU_WEIGHT_SHARING=${PT_HPU_WEIGHT_SHARING:-"0"} - export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE:-"1"} - - # memory usage tuning - export VLLM_GPU_MEMORY_UTILIZATION=${VLLM_GPU_MEMORY_UTILIZATION:-"0.9"} - export VLLM_GRAPH_RESERVED_MEM=${VLLM_GRAPH_RESERVED_MEM:-"0.2"} - export VLLM_GRAPH_PROMPT_RATIO=${VLLM_GRAPH_PROMPT_RATIO:-"0.8"} - export VLLM_MAX_SEQ_LEN_TO_CAPTURE=${VLLM_MAX_SEQ_LEN_TO_CAPTURE:-"8192"} - - # performance tuning - export VLLM_DELAYED_SAMPLING=${VLLM_DELAYED_SAMPLING:-"true"} - export VLLM_ZERO_PADDING=${VLLM_ZERO_PADDING:-"true"} - - # MoE sepcific - export VLLM_EP_SIZE=${VLLM_EP_SIZE:-"${num_hpu}"} - export VLLM_DYNAMIC_MOE_MIN_TOKENS=${VLLM_DYNAMIC_MOE_MIN_TOKENS:-"256"} - export VLLM_DYNAMIC_MOE_MIN_EXPERTS_SINGLEHPU=${VLLM_DYNAMIC_MOE_MIN_EXPERTS_SINGLEHPU:-"32"} - - # profiler - export VLLM_PROFILER_ENABLED=${VLLM_PROFILER_ENABLED:-"false"} - export VLLM_ENGINE_PROFILER_ENABLED=${VLLM_ENGINE_PROFILER_ENABLED:-"false"} - export VLLM_ENGINE_PROFILER_WARMUP_STEPS=${VLLM_ENGINE_PROFILER_WARMUP_STEPS:-"0"} - export VLLM_ENGINE_PROFILER_STEPS=${VLLM_ENGINE_PROFILER_STEPS:-"1"} - export VLLM_ENGINE_PROFILER_REPEAT=${VLLM_ENGINE_PROFILER_REPEAT:-"1"} - - # network - default_host_ip=$( hostname -I | awk '{print $1}' ) - default_ifname=$( ip -br addr show to ${default_host_ip} | awk '{print $1}' ) - export VLLM_HOST_IP=${VLLM_HOST_IP:-"${default_host_ip}"} - export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-"${default_ifname}"} - export HCCL_SOCKET_IFNAME=${HCCL_SOCKET_IFNAME:-"${default_ifname}"} - - # misc - export VLLM_WORKER_MULTIPROC_METHOD=${VLLM_WORKER_MULTIPROC_METHOD:-"spawn"} - export TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM:-"true"} - export RAY_IGNORE_UNHANDLED_ERRORS=${RAY_IGNORE_UNHANDLED_ERRORS:-"1"} - export VLLM_RAY_DISABLE_LOG_TO_DRIVER=${VLLM_RAY_DISABLE_LOG_TO_DRIVER:-"1"} -} - -# set up numactl for the specified module IDs -set_numactl(){ - if [ "$module_ids" != "None" ]; then - # Check if module_ids is a comma-separated list of integers - if [[ $module_ids =~ ^[0-9]+(,[0-9]+)*$ ]]; then - IFS="," read -r -a MODULES <<< "$module_ids" - else - echo "The specified module IDs should be a comma-separated list of integers instead of $module_ids." 
- return - fi - else - echo no modules specified, skip numactl - return - fi - - HL_TOPO="hl-smi topo -c -N" - NODE_MEM=($( echo -e "$($HL_TOPO | grep "^[$(IFS="|" ; echo "${MODULES[*]}")]" | awk '{print $4}' | uniq)" )) - NODE_CPUS=($( echo -e "$($HL_TOPO | grep "^[$(IFS="|" ; echo "${MODULES[*]}")]" | awk '{print $2}' | uniq | sed 's/,//g')" )) - - if [ "${#NODE_MEM[@]}" -gt 1 ] || [ "${#NODE_CPUS[@]}" -gt 1 ];then - echo "The specified modules are not on the same NUMA node, skip numactl" - return - fi - NUM_HPU_PER_NODE=$($HL_TOPO | grep -c "${NODE_CPUS[0]}") - - CPUS_LOW=$(echo "${NODE_CPUS[0]}" | cut -d '-' -f 1) - CPUS_UP=$(echo "${NODE_CPUS[0]}" | cut -d '-' -f 2) - NUM_CPU_PER_HPU=$(echo "($CPUS_UP-$CPUS_LOW+1)/$NUM_HPU_PER_NODE" | bc) - - CORES=() - for MODULE in "${MODULES[@]}"; do - MODULE_IDX=$(echo "$MODULE % $NUM_HPU_PER_NODE" | bc) - CORE_LOW=$(echo "$CPUS_LOW + ($NUM_CPU_PER_HPU * $MODULE_IDX)" | bc) - CORE_UP=$(echo "$CORE_LOW + $NUM_CPU_PER_HPU - 1" | bc) - CORES+=("$CORE_LOW-$CORE_UP") - done - CORES_STR=$(IFS="," ; echo "${CORES[*]}") - - NUMA_CTL="numactl -C $CORES_STR -m ${NODE_MEM[0]}" - MODULES_STR=$(IFS=',' ; echo "${MODULES[@]}") - echo "using '$NUMA_CTL' for module #.$MODULES_STR" -} - -# set up bucketing based on input/output range and max_num_batched_tokens -set_bucketing(){ - max_num_batched_tokens=${max_num_batched_tokens:-8192} - max_num_seqs=${max_num_seqs:-128} - input_min=${input_min:-1024} - input_max=${input_max:-1024} - output_max=${output_max:-2048} - block_size=${block_size:-128} - - prompt_bs_step=1 - prompt_bs_min=1 - prompt_bs_max=$(( $max_num_batched_tokens / $input_min )) - # prompt_bs_max = min(prompt_bs_max, max_num_seqs) - prompt_bs_max=$(( $prompt_bs_max > $max_num_seqs ? $max_num_seqs : $prompt_bs_max )) - # prompt_bs_max = CEILING.MATH(prompt_bs_max, prompt_bs_step) - prompt_bs_max=$(( ($prompt_bs_max + $prompt_bs_step - 1) / $prompt_bs_step * $prompt_bs_step )) - export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:-$prompt_bs_min} - export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:-$prompt_bs_step} - export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:-$prompt_bs_max} - - prompt_seq_step=$block_size - # prompt_seq_min = CEILING.MATH(input_min, prompt_seq_step) - prompt_seq_min=$(( ($input_min + $prompt_seq_step -1) / $prompt_seq_step * $prompt_seq_step )) - # prompt_seq_max = CEILING.MATH(input_max, prompt_seq_step) + prompt_seq_step - prompt_seq_max=$(( (($input_max + $prompt_seq_step -1) / $prompt_seq_step + 1) * $prompt_seq_step )) - export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:-$prompt_seq_min} - export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:-$prompt_seq_step} - export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:-$prompt_seq_max} - - # decode_bs_step = ROUNDUP(max_num_seqs / 16, 0) - decode_bs_step=$(( ($max_num_seqs + 15) / 16 )) - # decode_bs_step = min(decode_bs_step, 16) - decode_bs_step=$(( $decode_bs_step > 16 ? 
16 : $decode_bs_step )) - decode_bs_min=1 - # decode_bs_max = CEILING.MATH(max_num_seqs, decode_bs_step) - decode_bs_max=$(( ($max_num_seqs + $decode_bs_step -1) / $decode_bs_step * $decode_bs_step )) - export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:-$decode_bs_min} - export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-$decode_bs_step} - export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:-$decode_bs_max} - - decode_block_step=$block_size - # decode_block_min = ROUNDUP(input_min / block_size, 0) - decode_block_min=$(( ($input_min + $block_size - 1) / $block_size )) - # decode_block_min = CEILING.MATH(decode_block_min, decode_block_step) - decode_block_min=$(( ($decode_block_min + $decode_block_step -1) / $decode_block_step * $decode_block_step )) - # decode_block_max = (ROUNDUP((input_max + output_max) / block_size, 0) + 1) * decode_bs_max - decode_block_max=$(( (($input_max + $output_max + $block_size -1) / $block_size + 1) * $decode_bs_max)) - # decode_block_max = (CEILING.MATH(decode_block_max, decode_block_step) - decode_block_max=$(( ($decode_block_max + $decode_block_step -1) / $decode_block_step * $decode_block_step )) - export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:-$decode_block_min} - export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:-$decode_block_step} - export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:-$decode_block_max} -} diff --git a/scripts/start_gaudi_vllm_server.sh b/scripts/start_gaudi_vllm_server.sh index bdbb74025f6..8566954a497 100644 --- a/scripts/start_gaudi_vllm_server.sh +++ b/scripts/start_gaudi_vllm_server.sh @@ -151,10 +151,11 @@ case "$dtype" in echo Running with dtype="$dtype" export QUANT_CONFIG=quantization/${model_name}/maxabs_quant_g2.json export PT_HPU_WEIGHT_SHARING=0 + export VLLM_DISABLE_MARK_SCALES_AS_CONST=true QUANT_FLAGS=(--quantization inc --kv-cache-dtype fp8_inc) - if [ "${model_name}" == "Qwen3-235B-A22B" ] || [ "${model_name}" == "Qwen3-30B-A3B" ]; then - QUANT_FLAGS=(--quantization inc --weights-load-device cpu) - fi + if [ "${model_name}" == "Qwen3-235B-A22B" ] || [ "${model_name}" == "Qwen3-30B-A3B" ]; then + QUANT_FLAGS=(--quantization inc --weights-load-device cpu) + fi dtype="bfloat16" ;; "awq")