docker vllm: add unit tests #1406

Open · wants to merge 5 commits into base: habana_main
Changes from 2 commits
3 changes: 3 additions & 0 deletions .cd/requirements_tests.txt
@@ -0,0 +1,3 @@
pytest
pandas
pyyaml
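
These three packages are the only dependencies the new tests pull in; presumably they are installed with `pip install -r .cd/requirements_tests.txt` before running `pytest .cd/tests`.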
18 changes: 18 additions & 0 deletions .cd/tests/test_entrypoint.py
@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
from entrypoint import Entrypoint


def test_generate_server_script(tmp_path):
    entry = Entrypoint()
    template = tmp_path / "template.sh"
    output = tmp_path / "output.sh"
    # Prepare a simple template
    template.write_text("#@VARS\nrun")
    entry.generate_server_script(str(template), str(output), {
        "FOO": "bar",
        "X": 1
    })
    content = output.read_text()
    assert "export FOO=bar" in content
    assert "export X=1" in content
    assert "run" in content
53 changes: 53 additions & 0 deletions .cd/tests/test_vllm_autocalc.py
@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
from vllm_autocalc import VarsGenerator


@pytest.fixture
def minimal_config(tmp_path):
    # Prepare minimal config files for VarsGenerator
    defaults = tmp_path / "defaults.yaml"
    varlist_conf = tmp_path / "varlist_conf.yaml"
    model_def_settings = tmp_path / "settings_vllm.csv"

    defaults.write_text("defaults:\n"
                        " DEVICE_NAME: TEST_DEVICE\n"
                        " HPU_MEM: {GAUDI2: 96}\n"
                        " DTYPE: bfloat16\n")
    varlist_conf.write_text("output_vars:\n"
                            " - DEVICE_NAME\n"
                            " - DTYPE\n"
                            "user_variable:\n"
                            " - DEVICE_NAME\n")
    model_def_settings.write_text("MODEL,PARAM1\nTEST_MODEL,123\n")

    return {
        "defaults_path": str(defaults),
        "varlist_conf_path": str(varlist_conf),
        "model_def_settings_path": str(model_def_settings)
    }


def test_build_context(monkeypatch, minimal_config):
    monkeypatch.setenv("MODEL", "TEST_MODEL")
    vg = VarsGenerator(**minimal_config)
    assert vg.context["DEVICE_NAME"] == "TEST_DEVICE"
    assert vg.context["DTYPE"] == "bfloat16"
    assert vg.context["MODEL"] == "TEST_MODEL"


def test_overwrite_params(monkeypatch, minimal_config):
    monkeypatch.setenv("MODEL", "TEST_MODEL")
    monkeypatch.setenv("DEVICE_NAME", "OVERRIDE_DEVICE")
    vg = VarsGenerator(**minimal_config)
    vg.overwrite_params()
    assert vg.context["DEVICE_NAME"] == "OVERRIDE_DEVICE"


def test_return_dict(monkeypatch, minimal_config):
    monkeypatch.setenv("MODEL", "TEST_MODEL")
    vg = VarsGenerator(**minimal_config)
    result = vg.return_dict()
    assert "DEVICE_NAME" in result
    assert "DTYPE" in result
    assert "MODEL" not in result  # Not in output_vars
228 changes: 228 additions & 0 deletions .cd/tests/test_vllm_autocalc_rules.py
@@ -0,0 +1,228 @@
# SPDX-License-Identifier: Apache-2.0
import math

import vllm_autocalc_rules as rules


def test_calc_TENSOR_PARALLEL_SIZE():
    ctx = {'TENSOR_PARALLEL_SIZE': 4}
    assert rules.calc_TENSOR_PARALLEL_SIZE(ctx) == 4


def test_calc_MAX_MODEL_LEN():
    ctx = {'MAX_MODEL_LEN': 1024}
    assert rules.calc_MAX_MODEL_LEN(ctx) == 1024


def test_calc_PT_HPU_ENABLE_LAZY_COLLECTIVES():
    ctx = {'TENSOR_PARALLEL_SIZE': 2}
    assert rules.calc_PT_HPU_ENABLE_LAZY_COLLECTIVES(ctx) is True


def test_calc_MODEL_MEM_FROM_CONFIG():
    ctx = {'MODEL_MEM_FROM_CONFIG': "123.5"}
    assert rules.calc_MODEL_MEM_FROM_CONFIG(ctx) == 123.5


def test_calc_DEVICE_HPU_MEM():
    ctx = {'HPU_MEM': {'GAUDI2': 96}, 'DEVICE_NAME': 'GAUDI2'}
    assert rules.calc_DEVICE_HPU_MEM(ctx) == 96


def test_calc_TOTAL_GPU_MEM():
    ctx = {'DEVICE_HPU_MEM': 96, 'TENSOR_PARALLEL_SIZE': 4}
    assert rules.calc_TOTAL_GPU_MEM(ctx) == 384


def test_calc_MODEL_MEM_IN_GB():
    ctx = {
        'MODEL_MEM_FROM_CONFIG': 2 * 1024**3,
        'QUANT_DTYPE': 1,
        'MODEL_DTYPE': 1
    }
    assert rules.calc_MODEL_MEM_IN_GB(ctx) == 2.0


def test_calc_USABLE_MEM():
    ctx = {
        'TOTAL_GPU_MEM': 384,
        'TENSOR_PARALLEL_SIZE': 4,
        'UNAVAILABLE_MEM_ABS': 10,
        'MODEL_MEM_IN_GB': 8,
        'PROFILER_MEM_OVERHEAD': 2
    }
    expected = ((384 / 4) - 10 - (8 / 4) - 2)
    assert rules.calc_USABLE_MEM(ctx) == expected
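
The expected value above encodes a per-card memory budget: per-card total, minus a fixed unavailable chunk, minus the per-card share of model weights, minus profiler overhead. A sketch of the implied rule, with names mirroring the tests rather than the actual `.cd/vllm_autocalc_rules.py`:

```python
# Hypothetical reconstruction from the test's expected formula.
def calc_USABLE_MEM(ctx):
    # usable = per-card total - fixed overhead - per-card weights - profiler
    return ((ctx['TOTAL_GPU_MEM'] / ctx['TENSOR_PARALLEL_SIZE'])
            - ctx['UNAVAILABLE_MEM_ABS']
            - (ctx['MODEL_MEM_IN_GB'] / ctx['TENSOR_PARALLEL_SIZE'])
            - ctx['PROFILER_MEM_OVERHEAD'])
```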


def test_calc_GPU_MEMORY_UTIL_TEMP():
    ctx = {'GPU_FREE_MEM_TARGET': 10, 'USABLE_MEM': 100}
    assert rules.calc_GPU_MEMORY_UTIL_TEMP(ctx) == 0.9


def test_calc_GPU_MEM_UTILIZATION():
    ctx = {'GPU_MEMORY_UTIL_TEMP': 0.987}
    assert rules.calc_GPU_MEM_UTILIZATION(ctx) == math.floor(0.987 * 100) / 100


def test_calc_KV_CACHE_PER_SEQ():
    ctx = {
        'MAX_MODEL_LEN': 128,
        'NUM_HIDDEN_LAYERS': 2,
        'HIDDEN_SIZE': 4,
        'NUM_KEY_VALUE_HEADS': 2,
        'CACHE_DTYPE_BYTES': 2,
        'NUM_ATTENTION_HEADS': 2
    }
    expected = ((2 * 128 * 2 * 4 * 2 * 2) / 2) / (1024 * 1024 * 1024)
    assert rules.calc_KV_CACHE_PER_SEQ(ctx) == expected


def test_calc_EST_MAX_NUM_SEQS():
    ctx = {
        'TENSOR_PARALLEL_SIZE': 4,
        'USABLE_MEM': 100,
        'GPU_MEM_UTILIZATION': 0.9,
        'KV_CACHE_PER_SEQ': 0.5
    }
    expected = (4 * 100 * 0.9) / 0.5
    assert rules.calc_EST_MAX_NUM_SEQS(ctx) == expected


def test_calc_EST_HPU_BLOCKS():
    ctx = {'MAX_MODEL_LEN': 128, 'EST_MAX_NUM_SEQS': 32, 'BLOCK_SIZE': 16}
    expected = (128 * 32) / 16
    assert rules.calc_EST_HPU_BLOCKS(ctx) == expected


def test_calc_DECODE_BS_RAMP_GRAPHS():
    ctx = {'VLLM_DECODE_BS_BUCKET_STEP': 16, 'VLLM_DECODE_BS_BUCKET_MIN': 2}
    expected = 1 + int(math.log(16 / 2, 2))
    assert rules.calc_DECODE_BS_RAMP_GRAPHS(ctx) == expected


def test_calc_DECODE_BS_STEP_GRAPHS():
    ctx = {'EST_MAX_NUM_SEQS': 64, 'VLLM_DECODE_BS_BUCKET_STEP': 8}
    expected = max(0, int(1 + (64 - 8) / 8))
    assert rules.calc_DECODE_BS_STEP_GRAPHS(ctx) == expected


def test_calc_DECODE_BLOCK_RAMP_GRAPHS():
    ctx = {
        'VLLM_DECODE_BLOCK_BUCKET_STEP': 16,
        'VLLM_DECODE_BLOCK_BUCKET_MIN': 2
    }
    expected = 1 + int(math.log(16 / 2, 2))
    assert rules.calc_DECODE_BLOCK_RAMP_GRAPHS(ctx) == expected


def test_calc_DECODE_BLOCK_STEP_GRAPHS():
    ctx = {'EST_HPU_BLOCKS': 64, 'VLLM_DECODE_BLOCK_BUCKET_STEP': 8}
    expected = max(0, int(1 + (64 - 8) / 8))
    assert rules.calc_DECODE_BLOCK_STEP_GRAPHS(ctx) == expected


def test_calc_NUM_DECODE_GRAPHS():
    ctx = {
        'DECODE_BS_RAMP_GRAPHS': 2,
        'DECODE_BS_STEP_GRAPHS': 3,
        'DECODE_BLOCK_RAMP_GRAPHS': 4,
        'DECODE_BLOCK_STEP_GRAPHS': 5
    }
    expected = (2 + 3) * (4 + 5)
    assert rules.calc_NUM_DECODE_GRAPHS(ctx) == expected
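
The ramp/step tests above all share one counting scheme: exponential "ramp" buckets from the bucket minimum up to the step size, then linear buckets of width step up to the estimate, with the decode total being the product of the batch-size and block counts. A hedged sketch of that scheme (the helper name is ours, not the module's):

```python
import math

# Hypothetical helper matching the expected values in the tests above.
def count_bucket_graphs(bucket_min, bucket_step, est_max):
    # 1 + log2(step/min) exponential ramp buckets from min to step
    ramp = 1 + int(math.log(bucket_step / bucket_min, 2))
    # then linear buckets of width step from step up to the estimate
    linear = max(0, int(1 + (est_max - bucket_step) / bucket_step))
    return ramp + linear
```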


def test_calc_PROMPT_BS_RAMP_GRAPHS():
    ctx = {
        'MAX_NUM_PREFILL_SEQS': 16,
        'VLLM_PROMPT_BS_BUCKET_STEP': 8,
        'VLLM_PROMPT_BS_BUCKET_MIN': 2
    }
    expected = 1 + int(math.log(min(16, 8) / 2, 2))
    assert rules.calc_PROMPT_BS_RAMP_GRAPHS(ctx) == expected


def test_calc_PROMPT_BS_STEP_GRAPHS():
    ctx = {'MAX_NUM_PREFILL_SEQS': 32, 'VLLM_PROMPT_BS_BUCKET_STEP': 8}
    expected = max(0, int(1 + (32 - 8) / 8))
    assert rules.calc_PROMPT_BS_STEP_GRAPHS(ctx) == expected


def test_calc_PROMPT_SEQ_RAMP_GRAPHS():
    ctx = {'VLLM_PROMPT_SEQ_BUCKET_STEP': 16, 'VLLM_PROMPT_SEQ_BUCKET_MIN': 2}
    expected = 1 + int(math.log(16 / 2, 2))
    assert rules.calc_PROMPT_SEQ_RAMP_GRAPHS(ctx) == expected


def test_calc_PROMPT_SEQ_STEP_GRAPHS():
    ctx = {'MAX_MODEL_LEN': 64, 'VLLM_PROMPT_SEQ_BUCKET_STEP': 8}
    expected = int(1 + (64 - 8) / 8)
    assert rules.calc_PROMPT_SEQ_STEP_GRAPHS(ctx) == expected


def test_calc_EST_NUM_PROMPT_GRAPHS():
    ctx = {
        'PROMPT_BS_RAMP_GRAPHS': 2,
        'PROMPT_BS_STEP_GRAPHS': 3,
        'PROMPT_SEQ_RAMP_GRAPHS': 4,
        'PROMPT_SEQ_STEP_GRAPHS': 5
    }
    expected = ((2 + 3) * (4 + 5)) / 2
    assert rules.calc_EST_NUM_PROMPT_GRAPHS(ctx) == expected


def test_calc_EST_GRAPH_PROMPT_RATIO():
    ctx = {'EST_NUM_PROMPT_GRAPHS': 10, 'NUM_DECODE_GRAPHS': 30}
    expected = math.ceil(10 / (10 + 30) * 100) / 100
    assert rules.calc_EST_GRAPH_PROMPT_RATIO(ctx) == expected


def test_calc_VLLM_GRAPH_PROMPT_RATIO():
    ctx = {'EST_GRAPH_PROMPT_RATIO': 0.5}
    expected = math.ceil(min(max(0.5, 0.1), 0.9) * 10) / 10
    assert rules.calc_VLLM_GRAPH_PROMPT_RATIO(ctx) == expected


def test_calc_DECODE_GRAPH_TARGET_GB():
    ctx = {'NUM_DECODE_GRAPHS': 10, 'APPROX_MEM_PER_GRAPH_MB': 512}
    expected = math.ceil(10 * 512 / 1024 * 10) / 10
    assert rules.calc_DECODE_GRAPH_TARGET_GB(ctx) == expected


def test_calc_EST_GRAPH_RESERVE_MEM():
    ctx = {
        'DECODE_GRAPH_TARGET_GB': 5,
        'USABLE_MEM': 10,
        'GPU_MEM_UTILIZATION': 0.8,
        'VLLM_GRAPH_PROMPT_RATIO': 0.2
    }
    expected = math.ceil(5 / (10 * 0.8 * (1 - 0.2)) * 100) / 100
    assert rules.calc_EST_GRAPH_RESERVE_MEM(ctx) == expected


def test_calc_VLLM_GRAPH_RESERVED_MEM():
    ctx = {'EST_GRAPH_RESERVE_MEM': 0.3}
    expected = min(max(0.3, 0.01), 0.5)
    assert rules.calc_VLLM_GRAPH_RESERVED_MEM(ctx) == expected


def test_calc_KV_CACHE_MEM():
    ctx = {
        'USABLE_MEM': 10,
        'GPU_MEM_UTILIZATION': 0.8,
        'VLLM_GRAPH_RESERVED_MEM': 0.2
    }
    expected = 10 * 0.8 * (1 - 0.2)
    assert rules.calc_KV_CACHE_MEM(ctx) == expected


def test_calc_VLLM_DECODE_BLOCK_BUCKET_MAX():
    ctx = {'MAX_NUM_SEQS': 16, 'MAX_MODEL_LEN': 128}
    expected = max(128, math.ceil((16 * 128) / 128))
    assert rules.calc_VLLM_DECODE_BLOCK_BUCKET_MAX(ctx) == expected


def test_calc_VLLM_PROMPT_SEQ_BUCKET_MAX():
    ctx = {'MAX_MODEL_LEN': 4096}
    assert rules.calc_VLLM_PROMPT_SEQ_BUCKET_MAX(ctx) == 4096
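
The tail of this file fixes the memory hand-off to vLLM: the graph reserve is clamped to [0.01, 0.5], and whatever survives utilization and graph reservation becomes KV-cache memory. A sketch of the implied final rule, reconstructed only from the expected value in test_calc_KV_CACHE_MEM:

```python
# Hypothetical reconstruction; the actual rule module is authoritative.
def calc_KV_CACHE_MEM(ctx):
    # KV cache gets the utilizable memory minus the graph reservation
    return (ctx['USABLE_MEM'] * ctx['GPU_MEM_UTILIZATION']
            * (1 - ctx['VLLM_GRAPH_RESERVED_MEM']))
```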
70 changes: 70 additions & 0 deletions .cd/tests/test_vllm_autocalc_rules_max_num_seqs.py
@@ -0,0 +1,70 @@
# SPDX-License-Identifier: Apache-2.0
import math

import pytest
import vllm_autocalc_rules as rules


def test_calc_MAX_NUM_SEQS_user_provided():
    ctx = {'MAX_NUM_SEQS': 5}
    assert rules.calc_MAX_NUM_SEQS(ctx) == 5

    ctx = {'MAX_NUM_SEQS': 0}
    assert rules.calc_MAX_NUM_SEQS(ctx) == 1


def test_calc_MAX_NUM_SEQS_fp8():
    ctx = {
        'MAX_NUM_SEQS': None,
        'TENSOR_PARALLEL_SIZE': 2,
        'KV_CACHE_MEM': 64,
        'KV_CACHE_PER_SEQ': 2,
        'DTYPE': 'fp8',
        'VLLM_DECODE_BS_BUCKET_STEP': 8,
        'MODEL': 'test'
    }
    val = (2 * 64 / 2)
    expected = max(1, math.floor(val / 8)) * 8
    assert rules.calc_MAX_NUM_SEQS(ctx) == expected


def test_calc_MAX_NUM_SEQS_non_fp8():
    ctx = {
        'MAX_NUM_SEQS': None,
        'TENSOR_PARALLEL_SIZE': 2,
        'KV_CACHE_MEM': 64,
        'KV_CACHE_PER_SEQ': 2,
        'DTYPE': 'bfloat16',
        'VLLM_DECODE_BS_BUCKET_STEP': 8,
        'MODEL': 'test'
    }
    val = (2 * 64 / 2)
    expected = math.ceil(val / 8) * 8
    assert rules.calc_MAX_NUM_SEQS(ctx) == expected


def test_calc_MAX_NUM_SEQS_vision_instruct_limit():
    ctx = {
        'MAX_NUM_SEQS': None,
        'TENSOR_PARALLEL_SIZE': 2,
        'KV_CACHE_MEM': 2048,
        'KV_CACHE_PER_SEQ': 2,
        'DTYPE': 'bfloat16',
        'VLLM_DECODE_BS_BUCKET_STEP': 8,
        'MODEL': 'meta-llama/Llama-3.2-11B-Vision-Instruct'
    }
    assert rules.calc_MAX_NUM_SEQS(ctx) == 128


def test_calc_MAX_NUM_SEQS_not_enough_memory():
    ctx = {
        'MAX_NUM_SEQS': None,
        'TENSOR_PARALLEL_SIZE': 2,
        'KV_CACHE_MEM': 0,
        'KV_CACHE_PER_SEQ': 2,
        'DTYPE': 'bfloat16',
        'VLLM_DECODE_BS_BUCKET_STEP': 8,
        'MODEL': 'test'
    }
    with pytest.raises(ValueError):
        rules.calc_MAX_NUM_SEQS(ctx)
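
Read together, these five cases determine calc_MAX_NUM_SEQS almost completely. A hypothetical reconstruction from the tests alone (the rule in `.cd/vllm_autocalc_rules.py` is authoritative; the Vision-Instruct cap and the error condition are inferred, not quoted):

```python
import math

# Hypothetical reconstruction from the test cases above.
def calc_MAX_NUM_SEQS(ctx):
    if ctx.get('MAX_NUM_SEQS') is not None:
        return max(1, ctx['MAX_NUM_SEQS'])  # user-provided, floored at 1
    step = ctx['VLLM_DECODE_BS_BUCKET_STEP']
    val = (ctx['TENSOR_PARALLEL_SIZE'] * ctx['KV_CACHE_MEM']
           / ctx['KV_CACHE_PER_SEQ'])
    if ctx['DTYPE'] == 'fp8':
        # fp8 rounds down to the bucket step (never below one bucket)
        result = max(1, math.floor(val / step)) * step
    else:
        result = math.ceil(val / step) * step
    if result < 1:
        raise ValueError("not enough memory for KV cache")
    if 'Vision-Instruct' in ctx['MODEL']:
        result = min(result, 128)  # tests pin the cap at 128
    return result
```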