diff --git a/scripts/README.md b/scripts/README.md
index 4b081f7dcbc..65d24fff7e7 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -117,6 +117,11 @@ Running vLLM with FP8 precision can be achieved using [Intel(R) Neural Compresso
 - #### Run vLLM with FP8 using INC
 To run vLLM with FP8 precision using INC, pass `-d fp8` and specify the path to your bfloat16 or float16 model with `-w `. The model will be quantized to FP8 using calibration data obtained from the [FP8 Calibration Procedure](https://github.com/HabanaAI/vllm-hpu-extension/blob/v1.21.0/calibration/README.md).

+> For the Qwen3 MoE models, a custom INC build must be installed:
+```bash
+pip install git+https://github.com/intel/neural-compressor.git@qwen-fp8
+```
+
 #### 1. Copy open_orca_gpt4_tokenized_llama.calibration_1000.pkl to vllm-hpu-extension/calibration folder
 ```bash
 gzip -dk Gaudi-fp8-calibration/open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz
@@ -131,6 +136,14 @@ MODEL=/models/Qwen2.5-72B-Instruct
 HPU_SIZE=2
 ./calibrate_model.sh -m $MODEL -d open_orca_gpt4_tokenized_llama.calibration_1000.pkl -o quantization -t $HPU_SIZE
 ```
+For Qwen3-235B-A22B, the calibration process needs 8 HPUs to load the original bfloat16 weights, while FP8 inference can then run on 4 HPUs, so the measurements must be unified as follows:
+```bash
+bash calibrate_model.sh -m /models/Qwen3-235B-A22B -d open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz -o Qwen3-235B-A22B -t 8 -b 256 -r 4 -u
+```
+where:
+ - `-t 8` runs the calibration on 8 HPUs.
+ - `-r 4` unifies the measurements for running on 4 HPUs.
+ - `-u` indicates the model has MoE weights.

 #### 3. Make the Quantization folder
 Create a quantization folder at the same level as start_gaudi_vllm_server.sh.
diff --git a/scripts/benchmark_throughput.sh b/scripts/benchmark_throughput.sh
index fbedfc67ab0..a3a985bbe57 100644
--- a/scripts/benchmark_throughput.sh
+++ b/scripts/benchmark_throughput.sh
@@ -173,9 +173,10 @@ case "$dtype" in
         echo Running with dtype="$dtype"
         export QUANT_CONFIG=quantization/${model_name}/maxabs_quant_g2.json
         export PT_HPU_WEIGHT_SHARING=0
+        export VLLM_DISABLE_MARK_SCALES_AS_CONST=true
         QUANT_FLAGS=(--quantization inc --kv-cache-dtype fp8_inc)
         if [ "${model_name}" == "Qwen3-235B-A22B" ] || [ "${model_name}" == "Qwen3-30B-A3B" ]; then
-        QUANT_FLAGS=(--quantization inc --weights-load-device cpu)
+            QUANT_FLAGS=(--quantization inc --weights-load-device cpu)
         fi
         dtype="bfloat16"
         ;;
diff --git a/scripts/llama4/convert_for_g2.py b/scripts/quantization/convert_fp8_weights_for_gaudi2.py
similarity index 71%
rename from scripts/llama4/convert_for_g2.py
rename to scripts/quantization/convert_fp8_weights_for_gaudi2.py
index f973a59ba63..e94af81e5e4 100644
--- a/scripts/llama4/convert_for_g2.py
+++ b/scripts/quantization/convert_fp8_weights_for_gaudi2.py
@@ -1,29 +1,31 @@
+# SPDX-License-Identifier: Apache-2.0
+from glob import glob
+
 import torch
 from safetensors import safe_open
 from safetensors.torch import save_file
-from glob import glob
-import os

 # input_path = "/models/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
 # output_path = "/models/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic-G2"
 input_path = "/models/Llama-4-Maverick-17B-128E-Instruct-FP8"
 output_path = "/models/Llama-4-Maverick-17B-128E-Instruct-FP8-G2"
-weight_factor = (
-    torch.finfo(torch.float8_e4m3fnuz).max / torch.finfo(torch.float8_e4m3fn).max
-)
+weight_factor = (torch.finfo(torch.float8_e4m3fnuz).max /
+                 torch.finfo(torch.float8_e4m3fn).max)
 scale_factor = 1.0 / weight_factor
 scale_inv_factor = 
weight_factor for safetensors_path in glob(f"{input_path}/*.safetensors"): tensors = {} print(f"processing {safetensors_path}") - with safe_open(safetensors_path, framework="pt", device="cpu") as tensor_file: - for k in tensor_file.keys(): + with safe_open(safetensors_path, framework="pt", + device="cpu") as tensor_file: + for k in tensor_file.keys(): # noqa:SIM118 tensor = tensor_file.get_tensor(k) # print(f'{k}:{tensor.dtype}') if tensor.dtype == torch.float8_e4m3fn: - tensor = (tensor.float() * weight_factor).to(torch.float8_e4m3fn) + tensor = (tensor.float() * weight_factor).to( + torch.float8_e4m3fn) elif k.endswith("_scale"): tensor = tensor.float() * scale_factor else: diff --git a/scripts/qwen3/01-benchmark-online-30B-fp8.sh b/scripts/qwen3/01-benchmark-online-30B-fp8.sh deleted file mode 100644 index a6b66d147d7..00000000000 --- a/scripts/qwen3/01-benchmark-online-30B-fp8.sh +++ /dev/null @@ -1,194 +0,0 @@ -#!/bin/bash -######################################################### -# vLLM Benchmark Script for Qwen3 -# -# This script runs a vLLM server with specific configurations -# and benchmarks it using the sonnet dataset. -######################################################### - -#=========================================================== -# CONFIGURATION PARAMETERS -#=========================================================== -export QUANT_CONFIG="inc_quant_g3_30B_A3B.json" - -if [ $# -gt 0 ] && [ "$1" == "--model_path" ]; then - model=$2 -else - model="/mnt/weka/llm/Qwen3/Qwen3-30B-A3B/" -fi - -if [ $# -eq 4 ] && [ "$3" == "--tp_size" ]; then - tp_size=$4 -else - tp_size=1 -fi - -model_name=$(basename ${model}) - -# Model Configuration -tokenizer=$model - -# Hardware Configuration -moe_n_slice=1 # MoE groups -gpu_utils=0.95 # GPU memory utilization - -# Request Configuration -max_model_len=9216 # Max model len -request_rate="inf" # Request rate (inf = unlimited) -multi_step=1 # Number of scheduler steps - - -#=========================================================== -# START the LOOP -#=========================================================== - -tp_parallel=${tp_size} # Tensor parallelism size -req_in_out_list=(256_1024_1024 96_5120_1024 32_10240_1024) - -for req_in_out in "${req_in_out_list[@]}"; do - # Token Length Configuration - bs=$(echo "$req_in_out" | awk -F'_' '{ print $1 }') - in_len=$(echo "$req_in_out" | awk -F'_' '{ print $2 }') - out_len=$(echo "$req_in_out" | awk -F'_' '{ print $3 }') - - num_prompts=$((bs * 5)) - # Expert parallelism size - ep_size=${tp_parallel} - - #=========================================================== - # DERIVED PARAMETERS - #=========================================================== - - # Calculate and align total length - # Calculate aligned lengths for buckets - in_len_aligned=$(((in_len + 127) / 128 * 128)) - prompt_seq_max=$((in_len * 1125 / 1000)) - prompt_seq_max=$(((prompt_seq_max + 127) / 128 * 128)) - - total_len=$((prompt_seq_max + out_len)) - if [ $((total_len % 128)) -ne 0 ]; then - echo 'Rounding up total length to multiple of 128' - total_len=$(((total_len / 128 + 1) * 128)) - fi - - total_len_aligned=$(((total_len + 127) / 128 * 128)) - - decode_total_len=$((total_len + 128)) - decode_total_len_aligned=$(((decode_total_len + 127) / 128 * 128)) - - # Calculate bucket sizes - VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len_aligned * bs / 128)) - VLLM_DECODE_BLOCK_BUCKET_MIN=$(((VLLM_DECODE_BLOCK_BUCKET_MIN + 127) / 128 * 128)) - VLLM_DECODE_BLOCK_BUCKET_MAX=$((decode_total_len_aligned * bs / 128)) - 
VLLM_DECODE_BLOCK_BUCKET_MAX=$(((VLLM_DECODE_BLOCK_BUCKET_MAX + 127) / 128 * 128)) - - #=========================================================== - # LOG CONFIGURATION - #=========================================================== - - # Create a descriptive log name based on parameters - log_name="${model_name}-gaudi3-tp${tp_parallel}-ep${ep_size}-moe${moe_n_slice}-ms${multi_step}_np${num_prompts}_rr${request_rate}_bs${bs}_i${in_len}_o${out_len}_len${total_len}" - - # Create log directory - mkdir -p benchmark_logs - - #=========================================================== - # START vLLM SERVER - #=========================================================== - - echo "Starting vLLM server with the following configuration:" - echo "- Model: ${model_name}" - echo "- Tensor Parallel Size: ${tp_parallel}" - echo "- Expert Parallel Size: ${ep_size}" - echo "- Batch Size: ${bs}" - echo "- Input Length: ${in_len}" - echo "- Output Length: ${out_len}" - echo "- Total Length: ${total_len}" - - VLLM_DMOE_FORCE_LOOP=1 \ - VLLM_DYNAMIC_MOE_MIN_TOKENS=256 \ - PT_HPU_LAZY_MODE=1 \ - VLLM_PROMPT_BS_BUCKET_MIN=1 \ - VLLM_PROMPT_BS_BUCKET_MAX=8 \ - VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len_aligned} \ - VLLM_PROMPT_SEQ_BUCKET_MAX=${prompt_seq_max} \ - VLLM_DECODE_BS_BUCKET_MIN=${bs} \ - VLLM_DECODE_BS_BUCKET_MAX=${bs} \ - VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \ - VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \ - VLLM_DECODE_BLOCK_BUCKET_STEP=128 \ - VLLM_DELAYED_SAMPLING=true \ - HABANA_VISIBLE_DEVICES="ALL" \ - VLLM_EP_SIZE=${ep_size} \ - PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ - PT_HPU_WEIGHT_SHARING=0 \ - python3 -m vllm.entrypoints.openai.api_server \ - --port 18080 \ - --model ${model} \ - --load-format safetensors \ - --config-format hf \ - --tensor-parallel-size ${tp_parallel} \ - --max-num-seqs ${bs} \ - --disable-log-requests \ - --dtype bfloat16 \ - --use-v2-block-manager \ - --use-padding-aware-scheduling \ - --num_scheduler_steps ${multi_step} \ - --max-model-len $((total_len_aligned)) \ - --max-num-batched-tokens $((total_len_aligned * 4)) \ - --distributed_executor_backend ray \ - --gpu_memory_utilization ${gpu_utils} \ - --quantization inc \ - # --enable-expert-parallel \ - 2>&1 | tee benchmark_logs/${log_name}_serving.log & - pid=$(($!-1)) - # --trust-remote-code false --enforce-eager \ - - # Wait for server to start - n=0 - ready=false - until [[ "$n" -ge 1000 ]] || [[ $ready == true ]]; do - n=$((n+1)) - if grep -q "Started server process" benchmark_logs/${log_name}_serving.log; then - break - fi - sleep 5s - done - sleep 10s - echo "Server started with PID: ${pid}" - - #=========================================================== - # RUN BENCHMARK - #=========================================================== - - echo "Starting benchmark with Sonnet dataset" - max_concurrency_client=${bs} - start_time=$(date +%s) - - python3 ../benchmarks/benchmark_serving.py \ - --backend vllm \ - --model ${model} \ - --tokenizer ${tokenizer} \ - --dataset-name sonnet \ - --dataset-path ../../benchmarks/sonnet.txt \ - --request-rate ${request_rate} \ - --percentile-metrics ttft,tpot,itl,e2el \ - --ignore-eos \ - --num-prompts ${num_prompts} \ - --port 18080 \ - --sonnet-input-len ${in_len} \ - --sonnet-output-len ${out_len} \ - --sonnet-prefix-len 100 \ - --max-concurrency ${max_concurrency_client} \ - --save-result 2>&1 | tee benchmark_logs/${log_name}_benchmark.log - - end_time=$(date +%s) - echo "Benchmark completed in $((end_time - start_time)) seconds" - - # Clean 
up - echo "Stopping vLLM server" - kill ${pid} - echo "Script execution completed" - sleep 10 -done - diff --git a/scripts/qwen3/Quant_QWen3-FP8.md b/scripts/qwen3/Quant_QWen3-FP8.md deleted file mode 100644 index ac69fe2bc42..00000000000 --- a/scripts/qwen3/Quant_QWen3-FP8.md +++ /dev/null @@ -1,79 +0,0 @@ - -## 0. Prerequisites - -- Driver: 1.20.1 (how to update Gaudi driver: https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html) -- Firmware: 1.20.1 (how to update Gaudi firmware: https://docs.habana.ai/en/latest/Installation_Guide/Firmware_Upgrade.html#system-unboxing-main) -- Docker: vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - -## 1. Installation - -- VLLM -```bash -git clone recursive https://github.com/vllm-fork.git -b aice/v1.20.1 - -cd vllm-fork -pip install -e . -``` -- INC -```bash -pip install git+https://github.com/intel/neural-compressor.git@qwen-fp8 - - -- VLLM-HPU-EXT -```bash -git clone https://github.com/vllm-hpu-extension-fork.git -b aice/v1.20.1 -cd vllm-hpu-extension -pip install -e . -``` - -## 2. FP8 - - -- Calibration - -```bash -cd vllm-fork/scripts/qwen3 -pip install datasets -export OFFICIAL_MODEL=/path/to/qwen/model -bash ./run_qwen.sh calib ${OFFICIAL_MODEL} -``` - -``` -- Online Serving - -```bash -cd vllm-fork/scripts-fp8 -bash 01-benchmark-online-30B-fp8.sh --model_path --tp_size -ex. -Qwen3-30B-A3B: -bash 01-benchmark-online-30B-fp8.sh --model_path /workspace/HF_models/Qwen3-30B-A3B --tp_size 8 - -Qwen3-235B-A22B: -unzip nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP.zip -bash single_2k_qwen3-235b.sh -notes: change the model_path in single_2k_qwen3-235b.sh to your local path. -``` - -Please refer to https://github.com/HabanaAI/vllm-fork/tree/dev/qwen3/scripts for other benchmarks. - - -## 3. Others - -set EP size: -``` -export VLLM_EP_SIZE= -ex. VLLM_EP_SIZE=8 -``` -if num_tokens exceed the VLLM_DYNAMIC_MOE_MIN_TOKENS,dynamic MoE is used since its performance is -better than static MoE in this case. -``` -export VLLM_DYNAMIC_MOE_MIN_TOKENS= -``` -default value is 256 - -if the number of expert on a single card is smaller than VLLM_DYNAMIC_MOE_MIN_EXPERTS_SINGLEHPU, dynamic MoE -is used since its performance is better than static MoE in this case. 
-``` -export VLLM_DYNAMIC_MOE_MIN_EXPERTS_SINGLEHPU= -``` -default value 32 diff --git a/scripts/qwen3/inc_measure_g2_235B_A22B.json b/scripts/qwen3/inc_measure_g2_235B_A22B.json deleted file mode 100644 index 5f91d52821f..00000000000 --- a/scripts/qwen3/inc_measure_g2_235B_A22B.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "whitelist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": ["lm_head"] - }, - "quantize_weight": false, - "dump_stats_path": "./nc_workspace_measure_kvache_g2_235B_A22B/inc_measure_output" - -} diff --git a/scripts/qwen3/inc_measure_g3_30B_A3B.json b/scripts/qwen3/inc_measure_g3_30B_A3B.json deleted file mode 100644 index e631825bf54..00000000000 --- a/scripts/qwen3/inc_measure_g3_30B_A3B.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "whitelist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": ["lm_head"] - }, - "quantize_weight": false, - "dump_stats_path": "./scripts/nc_workspace_measure_kvache_g3_30B_A3B/inc_measure_output" - -} diff --git a/scripts/qwen3/inc_measure_g3_32B.json b/scripts/qwen3/inc_measure_g3_32B.json deleted file mode 100644 index 11200f3bedd..00000000000 --- a/scripts/qwen3/inc_measure_g3_32B.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "whitelist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": ["lm_head"] - }, - "quantize_weight": false, - "dump_stats_path": "./scripts/nc_workspace_measure_kvache_g3_32B/inc_measure_output" -} diff --git a/scripts/qwen3/inc_measure_v2.json b/scripts/qwen3/inc_measure_v2.json deleted file mode 100644 index 833629e131c..00000000000 --- a/scripts/qwen3/inc_measure_v2.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "whitelist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": ["lm_head"] - }, - "quantize_weight": false, - "dump_stats_path": "./scripts/nc_workspace_measure_kvache_v2/inc_measure_output" -} diff --git a/scripts/qwen3/inc_quant_g2_235B_A22B_4card.json b/scripts/qwen3/inc_quant_g2_235B_A22B_4card.json deleted file mode 100644 index 957f36e49d8..00000000000 --- a/scripts/qwen3/inc_quant_g2_235B_A22B_4card.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "scale_format": "scalar", - "allowlist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": [ - "lm_head", - "mlp\\.gate\\b", - "k_cache", - "v_cache", - "matmul_av", - "matmul_qk", - "batch2block_matmul", - "block2batch_matmul", - "fused_scaled_dot_product_attention", - "softmax" - ] - }, - "dump_stats_path": "nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP/inc_measure_output" -} diff --git a/scripts/qwen3/inc_quant_g3_30B_A3B.json b/scripts/qwen3/inc_quant_g3_30B_A3B.json deleted file mode 100644 index 86cdeb33ed3..00000000000 --- a/scripts/qwen3/inc_quant_g3_30B_A3B.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "scale_format": "scalar", - "allowlist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": [ - "lm_head", - "mlp\\.gate\\b", - "k_cache", - "v_cache", - "matmul_av", - "matmul_qk", - "batch2block_matmul", - "block2batch_matmul", - "fused_scaled_dot_product_attention", - "softmax" - ] - }, 
- "dump_stats_path": "scripts/nc_workspace_measure_kvache_g3_30B_A3B/inc_measure_output" -} diff --git a/scripts/qwen3/inc_quant_g3_32B.json b/scripts/qwen3/inc_quant_g3_32B.json deleted file mode 100644 index 7b3eba3e648..00000000000 --- a/scripts/qwen3/inc_quant_g3_32B.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "scale_format": "scalar", - "allowlist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": [ - "lm_head", - "mlp\\.gate\\b", - "k_cache", - "v_cache", - "matmul_av", - "matmul_qk", - "batch2block_matmul", - "block2batch_matmul", - "fused_scaled_dot_product_attention", - "softmax" - ] - }, - "dump_stats_path": "nc_workspace_measure_kvache_g3_32B/inc_measure_output" -} diff --git a/scripts/qwen3/inc_quant_v2.json b/scripts/qwen3/inc_quant_v2.json deleted file mode 100644 index d6356d2b9e5..00000000000 --- a/scripts/qwen3/inc_quant_v2.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "scale_format": "scalar", - "allowlist": { - "types": [], - "names": [] - }, - "blocklist": { - "types": [], - "names": [ - "lm_head", - "mlp\\.gate\\b" - ] - }, - "dump_stats_path": "./scripts/nc_workspace_measure_kvache_v2/inc_measure_output" -} diff --git a/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards.zip b/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards.zip deleted file mode 100644 index bcff0b84794..00000000000 Binary files a/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards.zip and /dev/null differ diff --git a/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP.zip b/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP.zip deleted file mode 100644 index 711bd9ef3e5..00000000000 Binary files a/scripts/qwen3/nc_workspace_measure_kvache_g2_235B_A22B_4cards_EP.zip and /dev/null differ diff --git a/scripts/qwen3/run_example_tp_qwen.py b/scripts/qwen3/run_example_tp_qwen.py deleted file mode 100644 index 6dbb3622659..00000000000 --- a/scripts/qwen3/run_example_tp_qwen.py +++ /dev/null @@ -1,382 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -import argparse -import os -import random -import time -from typing import List, Tuple - -import datasets -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm import LLM, SamplingParams - -# get file location -file_path = os.path.abspath(__file__) -dataset_path = os.path.join(os.path.dirname(file_path), "../benchmarks") - -model_path = "/mnt/disk5/Qwen3-30B-A3B-250425" - -# Parse the command-line arguments. 
-parser = argparse.ArgumentParser() -parser.add_argument("--model", - type=str, - default=model_path, - help="The model path.") -parser.add_argument("--tokenizer", - type=str, - default=model_path, - help="The model path.") -parser.add_argument("--tp_size", - type=int, - default=8, - help="The number of threads.") -parser.add_argument("--ep_size", - type=int, - default=8, - help="The number of threads.") -parser.add_argument("--dataset", type=str, default=None, help="The dataset.") -parser.add_argument("--isl", - type=int, - default=1024, - help="input sequence length.") -parser.add_argument("--osl", - type=int, - default=1024, - help="output sequence length.") -parser.add_argument("--nprompts", - type=int, - default=4, - help="The number of prompts.") -parser.add_argument("--max_num_seqs", - type=int, - default=None, - help="The max number of sequences.") -parser.add_argument("--max_model_len", - type=int, - default=16384, - help="The max model length.") -parser.add_argument("--random", - action="store_true", - help="Randomly sample prompts.") -parser.add_argument("--fp8_kv_cache", - action="store_true", - help="Use fp8 for kv cache.") -parser.add_argument("--inc", action="store_true", help="Use inc.") -parser.add_argument("--dummy", action="store_true", help="Use dummy weights.") -parser.add_argument("--enforce_eager", - action="store_true", - help="Enforce eager") -args = parser.parse_args() - -os.environ["VLLM_SKIP_WARMUP"] = "true" -os.environ["HABANA_VISIBLE_DEVICES"] = "ALL" -os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true" -os.environ["VLLM_MOE_N_SLICE"] = "1" #if args.ep_size > 1 else "4" -os.environ["VLLM_EP_SIZE"] = f"{args.ep_size}" -os.environ["PT_HPU_WEIGHT_SHARING"] = "0" - - -def sample_sonnet_requests( - dataset_path: str, - num_requests: int, - input_len: int, - prefix_len: int, - tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, str, int, int, None]]: - assert ( - input_len > prefix_len - ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." - - # Load the dataset. - with open(dataset_path, encoding='utf-8') as f: - poem_lines = f.readlines() - - # Tokenize the poem lines. - poem_token_ids = tokenizer(poem_lines).input_ids - average_poem_len = sum( - len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids) - - # Base prefix for all requests. - base_prompt = "Pick as many lines as you can from these poem lines:\n" - base_message = [{ - "role": "user", - "content": base_prompt, - }] - base_prompt_formatted = tokenizer.apply_chat_template( - base_message, add_generation_prompt=True, tokenize=False) - base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids) - - assert ( - input_len > base_prompt_offset - ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}." - num_input_lines = round( - (input_len - base_prompt_offset) / average_poem_len) - - # First approximately `prefix_len` number of tokens in the - # prompt are fixed poem lines. - assert ( - prefix_len > base_prompt_offset - ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}." - - num_prefix_lines = round( - (prefix_len - base_prompt_offset) / average_poem_len) - prefix_lines = poem_lines[:num_prefix_lines] - - # Sample the rest of lines per request. 
- sampled_requests: List = [] - for _ in range(num_requests): - num_lines_needed = num_input_lines - num_prefix_lines - sampled_lines = "".join(prefix_lines + - random.choices(poem_lines, k=num_lines_needed)) - - prompt = f"{base_prompt}{sampled_lines}" - message = [ - { - "role": "user", - "content": prompt, - }, - ] - prompt_formatted = tokenizer.apply_chat_template( - message, add_generation_prompt=True, tokenize=False) - sampled_requests.append(prompt_formatted) - - return sampled_requests, None - - -def sample_gsm8k_requests(num_requests: int, - tokenizer: PreTrainedTokenizerBase, - do_random: bool = False) -> List[Tuple[str, str]]: - # Load the dataset from huggingface. - dataset = datasets.load_dataset("openai/gsm8k", "main") - prompts = dataset["train"]["question"] - expected_responses = dataset["train"]["answer"] - few_shots = 5 - base_prompt = [ - f"Question: {prompts[i]}\nAnswer: {expected_responses[i]}\n" - for i in range(few_shots) - ] - base_prompt = "\n".join(base_prompt) - base_prompt = f"{base_prompt}\n" - - # Sample the requests. - sampled_requests: List = [] - sampled_response: List = [] - for j in range(num_requests): - i = random.choice(range(len( - prompts[few_shots:]))) if do_random else j + few_shots - prompt = f"{base_prompt}Question: {prompts[i]}\nAnswer: " - # message = [ - # { - # "role": "user", - # "content": prompt, - # }, - # ] - # prompt = tokenizer.apply_chat_template( - # message, add_generation_prompt=True, tokenize=False) - expected_response = expected_responses[i] - sampled_requests.append(prompt) - sampled_response.append(expected_response) - - return sampled_requests, sampled_response - - -def dump_logprob(logprobs, file_name): - import json - converted_logprobs = [] - for token_dict in logprobs: - converted_dict = {} - for token_id, lp in token_dict.items(): - converted_dict[token_id] = { - 'logprob': lp.logprob, - 'rank': lp.rank, - 'decoded_token': lp.decoded_token - } - converted_logprobs.append(converted_dict) - - # Write to JSON file - with open(file_name, 'w') as f: - json.dump(converted_logprobs, f, indent=2) - print(f"save logprobs to {file_name}.") - - -if __name__ == "__main__": - - # Sample prompts. - - if args.dataset == "sonnet": - # Sample sonnet requests. - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) - prompts, gt = sample_sonnet_requests( - dataset_path=f"{dataset_path}/sonnet.txt", - num_requests=args.nprompts, - input_len=args.isl, - prefix_len=200, - tokenizer=tokenizer, - ) - elif args.dataset == "gsm8k": - # Sample GSM8K requests. 
- args.osl = 128 - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) - prompts, gt = sample_gsm8k_requests( - num_requests=args.nprompts, - tokenizer=tokenizer, - do_random=args.random, - ) - elif args.dataset == "pile": - - def reset_seed(seed=42): - import random - - import numpy as np - import torch - - torch.manual_seed(seed) - np.random.seed(seed) - random.seed(seed) - - def get_prompt_token_ids(model_path, prompts, max_length=1024): - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(model_path) - prompt_token_ids = [] - for prompt in prompts: - tokens = tokenizer( - prompt, - return_tensors="pt", - truncation=True, - max_length=max_length, - ) - if len(tokens.input_ids[0]) < max_length: - continue - prompt_token_ids.append( - [x.item() for x in tokens.input_ids[0]]) - return prompt_token_ids - - def get_pile_prompts(model_name, num_samples=512): - import transformers - from datasets import load_dataset - from tqdm import tqdm - - least_tokens = 1024 - seed = 42 - - reset_seed(seed) - - dataset = load_dataset("NeelNanda/pile-10k", split="train") - dataset = dataset.shuffle(seed=seed) - - tokenizer = transformers.AutoTokenizer.from_pretrained( - model_name, trust_remote_code=True) - num_sample = 0 - samples_lst = [] - for data in tqdm(dataset): - prompt = data["text"] - tokens = tokenizer(prompt, return_tensors="pt") - if len(tokens.input_ids[0]) < least_tokens: - continue - num_sample += 1 - samples_lst.append(prompt) - if num_sample >= num_samples: - break - return samples_lst - - least_tokens = args.isl - num_samples = args.nprompts - prompts = get_pile_prompts(args.model, num_samples) - prompt_token_ids = get_prompt_token_ids(args.model, prompts, - least_tokens) - print(f"Got {len(prompts)} prompts, length of first prompt: \ - {len(prompt_token_ids[0])}.") - gt = None - else: - prompts = [ - "Hello, my name is", - "0.999 compares to 0.9 is ", - "The capital of France is", - "The future of AI is", - ] - if args.nprompts > 4: - prompts += random.choices(prompts, k=args.nprompts - 4) - elif args.nprompts < 4: - prompts = prompts[:args.nprompts] - gt = None - # Create a sampling params object. - sampling_params = SamplingParams( - temperature=0, - max_tokens=args.osl, - ignore_eos=True, - # logprobs=10, - ) - model = args.model - param = {} - if args.inc: - param["quantization"] = "inc" - if args.fp8_kv_cache: - param["kv_cache_dtype"] = "fp8_inc" - if args.max_num_seqs is not None: - param["max_num_seqs"] = args.max_num_seqs - if args.enforce_eager: - param["enforce_eager"] = True - if args.ep_size > 1: - param["enable_expert_parallel"] = True - # os.environ["VLLM_EP_SIZE"] = str(args.ep_size) - if args.dummy: - param["load_format"] = "dummy" - if args.tp_size == 1: - llm = LLM(model=model, - tokenizer=args.tokenizer, - trust_remote_code=True, - dtype="bfloat16", - max_model_len=args.max_model_len, - gpu_memory_utilization=0.8, - **param) - else: - llm = LLM(model=model, - tokenizer=args.tokenizer, - tensor_parallel_size=args.tp_size, - distributed_executor_backend='mp', - trust_remote_code=True, - max_model_len=args.max_model_len, - dtype="bfloat16", - gpu_memory_utilization=0.8, - **param) - - # Generate texts from the prompts. - # The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. 
- start = time.perf_counter() - if args.dataset == "pile": - outputs = llm.generate(prompts=None, - sampling_params=sampling_params, - prompt_token_ids=prompt_token_ids) - else: - outputs = llm.generate(prompts, sampling_params) - end = time.perf_counter() - # Print the outputs. - print(f"e2e took {end - start} seconds") - for output_i in range(len(outputs)): - output = outputs[output_i] - gt_i = None if gt is None else gt[output_i] - prompt = output.prompt - generated_text = output.outputs[0].text - gen_token_id = output.outputs[0].token_ids - prompt_logprobs = output.outputs[0].logprobs - num_hidden_layers = int(os.environ.get("VLLM_NUM_LAYERS", "61")) - OFFICIAL_FP8_MODEL = os.environ.get("OFFICIAL_FP8_MODEL", "0") - # replace "/" with "_" - _model_path = OFFICIAL_FP8_MODEL.replace("/", "_") - time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - file_name = f"prompt_logprobs.layer \ - {num_hidden_layers}_{_model_path}_{time_str}.json" - - # dump_logprob(prompt_logprobs, file_name) - print("====================================") - print(f"Prompt: {prompt!r}") - print(f"Generated text: {generated_text!r}") - print(f"Generated token: {gen_token_id!r}") - print(f"Ground truth: {gt_i!r}") - # print(f"Prompt logprobs: {prompt_logprobs!r}") - print("====================================") - if os.getenv("QUANT_CONFIG", None) is not None: - llm.llm_engine.model_executor.shutdown() - del llm diff --git a/scripts/qwen3/run_qwen.sh b/scripts/qwen3/run_qwen.sh deleted file mode 100644 index eb2bb47f3fe..00000000000 --- a/scripts/qwen3/run_qwen.sh +++ /dev/null @@ -1,69 +0,0 @@ -pkill -9 python - -export PT_HPU_LAZY_MODE=1 -export GRAPH_VISUALIZATION=1 -export VLLM_LOGGING_LEVEL=DEBUG -export VLLM_DISABLE_MARK_SCALES_AS_CONST=1 - - -# export ENABLE_EXPERIMENTAL_FLAGS=1 -# export PRINT_FILE_AND_LINE=1 -# export LOG_LEVEL_PASS_MANAGER=1 -# export LOG_LEVEL_ALL=1 HABANA_LOGS=.habana_logs-515 - -export OFFICIAL_MODEL="/mnt/disk5/Qwen3-30B-A3B-250425" - -############################# -# Qwen -############################# -# FIXME: (Yi) Enable the static MoE path -export VLLM_DYNAMIC_MOE_MIN_TOKENS=0 - -#!/bin/bash - -set -e - -MODE=$1 # First argument -MODEL=$2 # Second argument (model name) -TOKENIZER=$3 # Third argument (tokenizer name) - -if [ -z "$TOKENIZER" ]; then - TOKENIZER=$MODEL -fi - -if [ -z "$MODE" ] || [ -z "$MODEL" ] || [ -z "$TOKENIZER" ]; then - echo "Usage: $0 {bf16|calib|quant|eval} " - exit 1 -fi - -COMMON_ARGS="--model $MODEL --tokenizer $TOKENIZER --osl 32 --max_model_len 2048 --max_num_seqs 1 --tp_size 8 --ep_size 8" - -model_name=$(basename ${MODEL}) -if [ ${model_name} == "Qwen3-30B-A3B-250425" ]; then - quant_file_path="inc_measure_g3_30B_A3B.json" -elif [ ${model_name} == "Qwen3-30B-A3B" ]; then - quant_file_path="inc_measure_g3_30B_A3B.json" -elif [ ${model_name} == "Qwen3-32B-250426" ]; then - quant_file_path="inc_measure_g3_32B.json" -elif [ ${model_name} == "Qwen3-235B-A22B" ]; then - COMMON_ARGS="--model $MODEL --tokenizer $TOKENIZER --osl 32 --max_model_len 8192 --max_num_seqs 1 --tp_size 8 --ep_size 8" - quant_file_path="inc_measure_g2_235B_A22B.json" -else - echo "Unknown model name: ${model_name}" - exit 1 -fi - - -if [ "$MODE" == "bf16" ]; then - python run_example_tp_qwen.py $COMMON_ARGS - -elif [ "$MODE" == "calib" ]; then - QUANT_CONFIG=${quant_file_path} \ - python run_example_tp_qwen.py $COMMON_ARGS --inc --dataset pile --nprompts 512 - -else - echo "Unknown mode: $MODE" - echo "Valid modes are: bf16, calib" - exit 1 -fi - diff --git 
a/scripts/qwen3/single_2k_qwen3-235b.sh b/scripts/qwen3/single_2k_qwen3-235b.sh deleted file mode 100644 index e76e8203e18..00000000000 --- a/scripts/qwen3/single_2k_qwen3-235b.sh +++ /dev/null @@ -1,109 +0,0 @@ -#! /bin/bash - -# set -x - -BASH_DIR=$(dirname "${BASH_SOURCE[0]}") -source "$BASH_DIR"/utils.sh - -ray stop --force - -export QUANT_CONFIG=inc_quant_g2_235B_A22B_4card.json - -# DO NOT change unless you fully undersand its purpose -export HABANA_VISIBLE_DEVICES="ALL" -export PT_HPU_ENABLE_LAZY_COLLECTIVES="true" -export VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" -export RAY_IGNORE_UNHANDLED_ERRORS="1" -export PT_HPU_WEIGHT_SHARING=0 -export HABANA_VISIBLE_MODULES="0,1,2,3" -export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1 - -#export VLLM_MOE_N_SLICE=8 -export VLLM_EP_SIZE=4 -export VLLM_DELAYED_SAMPLING="true" - -block_size=128 -# DO NOT change ends... - -# memory footprint tunning params -export VLLM_GPU_MEMORY_UTILIZATION=0.9 -export VLLM_GRAPH_RESERVED_MEM=0.4 -export VLLM_GRAPH_PROMPT_RATIO=0 -export VLLM_DISABLE_MARK_SCALES_AS_CONST=true -# params -model_path=/root/.cache/huggingface/hub/models--Qwen--Qwen3-235B-A22B/snapshots/Qwen3-235B-A22B/ -max_model_len=8192 -max_num_batched_tokens=25536 -max_num_seqs=256 -input_min=1 -input_max=2048 -output_max=2048 - -unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX -unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX -unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX -unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX - - -export PT_HPU_RECIPE_CACHE_CONFIG=/data/8k_cache,false,8192 - -#set_bucketing - - - -# !!!!!!!!!!!!!!!!!!!! set bucketing !!!!!!!!!!!!! -prompt_bs_min=1 -prompt_bs_step=$(( $max_num_seqs > 32 ? 32 : $max_num_seqs )) -prompt_bs_max=$(( $max_num_seqs > 64 ? 64 : $max_num_seqs )) -export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:-$prompt_bs_min} -export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:-$prompt_bs_step} -export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:-$prompt_bs_max} - -prompt_seq_step=128 -prompt_seq_min=2048 -prompt_seq_max=2048 -export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:-$prompt_seq_min} -export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:-$prompt_seq_step} -export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:-$prompt_seq_max} - -decode_bs_min=1 -decode_bs_step=$(( $max_num_seqs > 32 ? 32 : $max_num_seqs )) -decode_bs_max=$max_num_seqs -export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:-$decode_bs_min} -export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-$decode_bs_step} -export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:-$decode_bs_max} - -decode_block_min=128 -decode_block_step=128 -block_size=128 -decode_block_max=$(( ((max_num_seqs * max_model_len / block_size) > 128) ? 
(max_num_seqs * max_model_len / block_size) : 128 )) -export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:-$decode_block_min} -export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:-$decode_block_step} -export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:-$decode_block_max} - -set_env -set_numactl - -echo " environments are reseted " - -env | grep VLLM - - -python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8688 \ - --block-size 128 \ - --model $model_path \ - --device hpu \ - --dtype bfloat16 \ - --tensor-parallel-size 4 \ - --trust-remote-code \ - --max-model-len $max_model_len \ - --max-num-seqs $max_num_seqs \ - --max-num-batched-tokens $max_num_batched_tokens \ - --disable-log-requests \ - --use-padding-aware-scheduling \ - --use-v2-block-manager \ - --distributed_executor_backend ray \ - --gpu_memory_utilization 0.9 \ - --quantization=inc \ - --weights-load-device cpu \ - --enable-expert-parallel diff --git a/scripts/qwen3/utils.sh b/scripts/qwen3/utils.sh deleted file mode 100644 index a910cb8cc17..00000000000 --- a/scripts/qwen3/utils.sh +++ /dev/null @@ -1,142 +0,0 @@ -#! /bin/bash - -# set -x - -# set up commen environment variables for vllm -set_env(){ - # pytorch bridge - export PT_HPU_WEIGHT_SHARING=${PT_HPU_WEIGHT_SHARING:-"0"} - export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE:-"1"} - - # memory usage tuning - export VLLM_GPU_MEMORY_UTILIZATION=${VLLM_GPU_MEMORY_UTILIZATION:-"0.9"} - export VLLM_GRAPH_RESERVED_MEM=${VLLM_GRAPH_RESERVED_MEM:-"0.2"} - export VLLM_GRAPH_PROMPT_RATIO=${VLLM_GRAPH_PROMPT_RATIO:-"0.8"} - export VLLM_MAX_SEQ_LEN_TO_CAPTURE=${VLLM_MAX_SEQ_LEN_TO_CAPTURE:-"8192"} - - # performance tuning - export VLLM_DELAYED_SAMPLING=${VLLM_DELAYED_SAMPLING:-"true"} - export VLLM_ZERO_PADDING=${VLLM_ZERO_PADDING:-"true"} - - # MoE sepcific - export VLLM_EP_SIZE=${VLLM_EP_SIZE:-"${num_hpu}"} - export VLLM_DYNAMIC_MOE_MIN_TOKENS=${VLLM_DYNAMIC_MOE_MIN_TOKENS:-"256"} - export VLLM_DYNAMIC_MOE_MIN_EXPERTS_SINGLEHPU=${VLLM_DYNAMIC_MOE_MIN_EXPERTS_SINGLEHPU:-"32"} - - # profiler - export VLLM_PROFILER_ENABLED=${VLLM_PROFILER_ENABLED:-"false"} - export VLLM_ENGINE_PROFILER_ENABLED=${VLLM_ENGINE_PROFILER_ENABLED:-"false"} - export VLLM_ENGINE_PROFILER_WARMUP_STEPS=${VLLM_ENGINE_PROFILER_WARMUP_STEPS:-"0"} - export VLLM_ENGINE_PROFILER_STEPS=${VLLM_ENGINE_PROFILER_STEPS:-"1"} - export VLLM_ENGINE_PROFILER_REPEAT=${VLLM_ENGINE_PROFILER_REPEAT:-"1"} - - # network - default_host_ip=$( hostname -I | awk '{print $1}' ) - default_ifname=$( ip -br addr show to ${default_host_ip} | awk '{print $1}' ) - export VLLM_HOST_IP=${VLLM_HOST_IP:-"${default_host_ip}"} - export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-"${default_ifname}"} - export HCCL_SOCKET_IFNAME=${HCCL_SOCKET_IFNAME:-"${default_ifname}"} - - # misc - export VLLM_WORKER_MULTIPROC_METHOD=${VLLM_WORKER_MULTIPROC_METHOD:-"spawn"} - export TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM:-"true"} - export RAY_IGNORE_UNHANDLED_ERRORS=${RAY_IGNORE_UNHANDLED_ERRORS:-"1"} - export VLLM_RAY_DISABLE_LOG_TO_DRIVER=${VLLM_RAY_DISABLE_LOG_TO_DRIVER:-"1"} -} - -# set up numactl for the specified module IDs -set_numactl(){ - if [ "$module_ids" != "None" ]; then - # Check if module_ids is a comma-separated list of integers - if [[ $module_ids =~ ^[0-9]+(,[0-9]+)*$ ]]; then - IFS="," read -r -a MODULES <<< "$module_ids" - else - echo "The specified module IDs should be a comma-separated list of integers instead of $module_ids." 
- return - fi - else - echo no modules specified, skip numactl - return - fi - - HL_TOPO="hl-smi topo -c -N" - NODE_MEM=($( echo -e "$($HL_TOPO | grep "^[$(IFS="|" ; echo "${MODULES[*]}")]" | awk '{print $4}' | uniq)" )) - NODE_CPUS=($( echo -e "$($HL_TOPO | grep "^[$(IFS="|" ; echo "${MODULES[*]}")]" | awk '{print $2}' | uniq | sed 's/,//g')" )) - - if [ "${#NODE_MEM[@]}" -gt 1 ] || [ "${#NODE_CPUS[@]}" -gt 1 ];then - echo "The specified modules are not on the same NUMA node, skip numactl" - return - fi - NUM_HPU_PER_NODE=$($HL_TOPO | grep -c "${NODE_CPUS[0]}") - - CPUS_LOW=$(echo "${NODE_CPUS[0]}" | cut -d '-' -f 1) - CPUS_UP=$(echo "${NODE_CPUS[0]}" | cut -d '-' -f 2) - NUM_CPU_PER_HPU=$(echo "($CPUS_UP-$CPUS_LOW+1)/$NUM_HPU_PER_NODE" | bc) - - CORES=() - for MODULE in "${MODULES[@]}"; do - MODULE_IDX=$(echo "$MODULE % $NUM_HPU_PER_NODE" | bc) - CORE_LOW=$(echo "$CPUS_LOW + ($NUM_CPU_PER_HPU * $MODULE_IDX)" | bc) - CORE_UP=$(echo "$CORE_LOW + $NUM_CPU_PER_HPU - 1" | bc) - CORES+=("$CORE_LOW-$CORE_UP") - done - CORES_STR=$(IFS="," ; echo "${CORES[*]}") - - NUMA_CTL="numactl -C $CORES_STR -m ${NODE_MEM[0]}" - MODULES_STR=$(IFS=',' ; echo "${MODULES[@]}") - echo "using '$NUMA_CTL' for module #.$MODULES_STR" -} - -# set up bucketing based on input/output range and max_num_batched_tokens -set_bucketing(){ - max_num_batched_tokens=${max_num_batched_tokens:-8192} - max_num_seqs=${max_num_seqs:-128} - input_min=${input_min:-1024} - input_max=${input_max:-1024} - output_max=${output_max:-2048} - block_size=${block_size:-128} - - prompt_bs_step=1 - prompt_bs_min=1 - prompt_bs_max=$(( $max_num_batched_tokens / $input_min )) - # prompt_bs_max = min(prompt_bs_max, max_num_seqs) - prompt_bs_max=$(( $prompt_bs_max > $max_num_seqs ? $max_num_seqs : $prompt_bs_max )) - # prompt_bs_max = CEILING.MATH(prompt_bs_max, prompt_bs_step) - prompt_bs_max=$(( ($prompt_bs_max + $prompt_bs_step - 1) / $prompt_bs_step * $prompt_bs_step )) - export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:-$prompt_bs_min} - export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:-$prompt_bs_step} - export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:-$prompt_bs_max} - - prompt_seq_step=$block_size - # prompt_seq_min = CEILING.MATH(input_min, prompt_seq_step) - prompt_seq_min=$(( ($input_min + $prompt_seq_step -1) / $prompt_seq_step * $prompt_seq_step )) - # prompt_seq_max = CEILING.MATH(input_max, prompt_seq_step) + prompt_seq_step - prompt_seq_max=$(( (($input_max + $prompt_seq_step -1) / $prompt_seq_step + 1) * $prompt_seq_step )) - export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:-$prompt_seq_min} - export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:-$prompt_seq_step} - export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:-$prompt_seq_max} - - # decode_bs_step = ROUNDUP(max_num_seqs / 16, 0) - decode_bs_step=$(( ($max_num_seqs + 15) / 16 )) - # decode_bs_step = min(decode_bs_step, 16) - decode_bs_step=$(( $decode_bs_step > 16 ? 
16 : $decode_bs_step )) - decode_bs_min=1 - # decode_bs_max = CEILING.MATH(max_num_seqs, decode_bs_step) - decode_bs_max=$(( ($max_num_seqs + $decode_bs_step -1) / $decode_bs_step * $decode_bs_step )) - export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:-$decode_bs_min} - export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-$decode_bs_step} - export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:-$decode_bs_max} - - decode_block_step=$block_size - # decode_block_min = ROUNDUP(input_min / block_size, 0) - decode_block_min=$(( ($input_min + $block_size - 1) / $block_size )) - # decode_block_min = CEILING.MATH(decode_block_min, decode_block_step) - decode_block_min=$(( ($decode_block_min + $decode_block_step -1) / $decode_block_step * $decode_block_step )) - # decode_block_max = (ROUNDUP((input_max + output_max) / block_size, 0) + 1) * decode_bs_max - decode_block_max=$(( (($input_max + $output_max + $block_size -1) / $block_size + 1) * $decode_bs_max)) - # decode_block_max = (CEILING.MATH(decode_block_max, decode_block_step) - decode_block_max=$(( ($decode_block_max + $decode_block_step -1) / $decode_block_step * $decode_block_step )) - export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:-$decode_block_min} - export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:-$decode_block_step} - export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:-$decode_block_max} -} diff --git a/scripts/start_gaudi_vllm_server.sh b/scripts/start_gaudi_vllm_server.sh index bdbb74025f6..8566954a497 100644 --- a/scripts/start_gaudi_vllm_server.sh +++ b/scripts/start_gaudi_vllm_server.sh @@ -151,10 +151,11 @@ case "$dtype" in echo Running with dtype="$dtype" export QUANT_CONFIG=quantization/${model_name}/maxabs_quant_g2.json export PT_HPU_WEIGHT_SHARING=0 + export VLLM_DISABLE_MARK_SCALES_AS_CONST=true QUANT_FLAGS=(--quantization inc --kv-cache-dtype fp8_inc) - if [ "${model_name}" == "Qwen3-235B-A22B" ] || [ "${model_name}" == "Qwen3-30B-A3B" ]; then - QUANT_FLAGS=(--quantization inc --weights-load-device cpu) - fi + if [ "${model_name}" == "Qwen3-235B-A22B" ] || [ "${model_name}" == "Qwen3-30B-A3B" ]; then + QUANT_FLAGS=(--quantization inc --weights-load-device cpu) + fi dtype="bfloat16" ;; "awq")