forked from vllm-project/vllm
-
Notifications
You must be signed in to change notification settings - Fork 109
docker vllm: add unit tests #1406
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
tthaddey
wants to merge
5
commits into
habana_main
Choose a base branch
from
tthaddey/tests_for_vllm_docker
base: habana_main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 2 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
c715d0c
docker vllm: add unit tests
tthaddey 659d9dc
Potential fix for code scanning alert no. 54: Workflow does not conta…
tthaddey 6073360
Merge branch 'habana_main' into tthaddey/tests_for_vllm_docker
tthaddey 26fc053
Merge branch 'habana_main' into tthaddey/tests_for_vllm_docker
tthaddey dc4e579
Merge branch 'habana_main' into tthaddey/tests_for_vllm_docker
tthaddey File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pytest | ||
pandas | ||
pyyaml |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# SPDX-License-Identifier: Apache-2.0
from entrypoint import Entrypoint


def test_generate_server_script(tmp_path):
    """The generator expands the #@VARS marker into `export` lines."""
    ep = Entrypoint()
    template_path = tmp_path / "template.sh"
    result_path = tmp_path / "output.sh"
    # Minimal template: the variable marker followed by a command line.
    template_path.write_text("#@VARS\nrun")
    ep.generate_server_script(str(template_path), str(result_path),
                              {"FOO": "bar", "X": 1})
    rendered = result_path.read_text()
    # Every variable becomes an export, and the command line survives.
    for fragment in ("export FOO=bar", "export X=1", "run"):
        assert fragment in rendered
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# SPDX-License-Identifier: Apache-2.0
import pytest
from vllm_autocalc import VarsGenerator


@pytest.fixture
def minimal_config(tmp_path):
    """Write the three smallest config files VarsGenerator accepts."""
    defaults_file = tmp_path / "defaults.yaml"
    varlist_file = tmp_path / "varlist_conf.yaml"
    settings_file = tmp_path / "settings_vllm.csv"

    defaults_file.write_text(
        "defaults:\n"
        " DEVICE_NAME: TEST_DEVICE\n"
        " HPU_MEM: {GAUDI2: 96}\n"
        " DTYPE: bfloat16\n")
    varlist_file.write_text(
        "output_vars:\n"
        " - DEVICE_NAME\n"
        " - DTYPE\n"
        "user_variable:\n"
        " - DEVICE_NAME\n")
    settings_file.write_text("MODEL,PARAM1\nTEST_MODEL,123\n")

    # Keys match VarsGenerator's keyword arguments.
    return {
        "defaults_path": str(defaults_file),
        "varlist_conf_path": str(varlist_file),
        "model_def_settings_path": str(settings_file),
    }


def test_build_context(monkeypatch, minimal_config):
    """Defaults and the MODEL env var are merged into the context."""
    monkeypatch.setenv("MODEL", "TEST_MODEL")
    generator = VarsGenerator(**minimal_config)
    context = generator.context
    assert context["DEVICE_NAME"] == "TEST_DEVICE"
    assert context["DTYPE"] == "bfloat16"
    assert context["MODEL"] == "TEST_MODEL"


def test_overwrite_params(monkeypatch, minimal_config):
    """An env var listed under user_variable overrides the default."""
    monkeypatch.setenv("MODEL", "TEST_MODEL")
    monkeypatch.setenv("DEVICE_NAME", "OVERRIDE_DEVICE")
    generator = VarsGenerator(**minimal_config)
    generator.overwrite_params()
    assert generator.context["DEVICE_NAME"] == "OVERRIDE_DEVICE"


def test_return_dict(monkeypatch, minimal_config):
    """Only names listed in output_vars make it into the result."""
    monkeypatch.setenv("MODEL", "TEST_MODEL")
    generator = VarsGenerator(**minimal_config)
    produced = generator.return_dict()
    assert "DEVICE_NAME" in produced
    assert "DTYPE" in produced
    # MODEL is in the context but absent from output_vars.
    assert "MODEL" not in produced
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
# SPDX-License-Identifier: Apache-2.0
import math

import vllm_autocalc_rules as rules


def test_calc_TENSOR_PARALLEL_SIZE():
    # A user-supplied value is passed through unchanged.
    assert rules.calc_TENSOR_PARALLEL_SIZE({'TENSOR_PARALLEL_SIZE': 4}) == 4


def test_calc_MAX_MODEL_LEN():
    assert rules.calc_MAX_MODEL_LEN({'MAX_MODEL_LEN': 1024}) == 1024


def test_calc_PT_HPU_ENABLE_LAZY_COLLECTIVES():
    # Lazy collectives turn on as soon as more than one card is involved.
    context = {'TENSOR_PARALLEL_SIZE': 2}
    assert rules.calc_PT_HPU_ENABLE_LAZY_COLLECTIVES(context) is True


def test_calc_MODEL_MEM_FROM_CONFIG():
    # A string from the model config is coerced to float.
    context = {'MODEL_MEM_FROM_CONFIG': "123.5"}
    assert rules.calc_MODEL_MEM_FROM_CONFIG(context) == 123.5


def test_calc_DEVICE_HPU_MEM():
    # Per-device memory is looked up by device name.
    context = {'DEVICE_NAME': 'GAUDI2', 'HPU_MEM': {'GAUDI2': 96}}
    assert rules.calc_DEVICE_HPU_MEM(context) == 96


def test_calc_TOTAL_GPU_MEM():
    # Total memory scales with the number of cards.
    context = {'TENSOR_PARALLEL_SIZE': 4, 'DEVICE_HPU_MEM': 96}
    assert rules.calc_TOTAL_GPU_MEM(context) == 384


def test_calc_MODEL_MEM_IN_GB():
    # 2 GiB in bytes with unit dtype factors converts back to 2.0.
    context = {
        'MODEL_MEM_FROM_CONFIG': 2 * 1024**3,
        'QUANT_DTYPE': 1,
        'MODEL_DTYPE': 1,
    }
    assert rules.calc_MODEL_MEM_IN_GB(context) == 2.0


def test_calc_USABLE_MEM():
    context = {
        'TOTAL_GPU_MEM': 384,
        'TENSOR_PARALLEL_SIZE': 4,
        'UNAVAILABLE_MEM_ABS': 10,
        'MODEL_MEM_IN_GB': 8,
        'PROFILER_MEM_OVERHEAD': 2,
    }
    # Per-card share minus reserved, model slice and profiler overhead.
    want = (384 / 4) - 10 - (8 / 4) - 2
    assert rules.calc_USABLE_MEM(context) == want


def test_calc_GPU_MEMORY_UTIL_TEMP():
    context = {'USABLE_MEM': 100, 'GPU_FREE_MEM_TARGET': 10}
    assert rules.calc_GPU_MEMORY_UTIL_TEMP(context) == 0.9


def test_calc_GPU_MEM_UTILIZATION():
    # The raw utilization is floored to two decimal places.
    context = {'GPU_MEMORY_UTIL_TEMP': 0.987}
    want = math.floor(0.987 * 100) / 100
    assert rules.calc_GPU_MEM_UTILIZATION(context) == want


def test_calc_KV_CACHE_PER_SEQ():
    context = {
        'MAX_MODEL_LEN': 128,
        'NUM_HIDDEN_LAYERS': 2,
        'HIDDEN_SIZE': 4,
        'NUM_KEY_VALUE_HEADS': 2,
        'CACHE_DTYPE_BYTES': 2,
        'NUM_ATTENTION_HEADS': 2,
    }
    # Bytes for K and V across all layers, expressed in GiB.
    want = ((2 * 128 * 2 * 4 * 2 * 2) / 2) / (1024 * 1024 * 1024)
    assert rules.calc_KV_CACHE_PER_SEQ(context) == want


def test_calc_EST_MAX_NUM_SEQS():
    context = {
        'TENSOR_PARALLEL_SIZE': 4,
        'USABLE_MEM': 100,
        'GPU_MEM_UTILIZATION': 0.9,
        'KV_CACHE_PER_SEQ': 0.5,
    }
    want = (4 * 100 * 0.9) / 0.5
    assert rules.calc_EST_MAX_NUM_SEQS(context) == want


def test_calc_EST_HPU_BLOCKS():
    context = {'MAX_MODEL_LEN': 128, 'EST_MAX_NUM_SEQS': 32, 'BLOCK_SIZE': 16}
    want = (128 * 32) / 16
    assert rules.calc_EST_HPU_BLOCKS(context) == want


def test_calc_DECODE_BS_RAMP_GRAPHS():
    context = {'VLLM_DECODE_BS_BUCKET_STEP': 16, 'VLLM_DECODE_BS_BUCKET_MIN': 2}
    # One graph per power-of-two step between min and step.
    want = 1 + int(math.log(16 / 2, 2))
    assert rules.calc_DECODE_BS_RAMP_GRAPHS(context) == want


def test_calc_DECODE_BS_STEP_GRAPHS():
    context = {'EST_MAX_NUM_SEQS': 64, 'VLLM_DECODE_BS_BUCKET_STEP': 8}
    want = max(0, int(1 + (64 - 8) / 8))
    assert rules.calc_DECODE_BS_STEP_GRAPHS(context) == want


def test_calc_DECODE_BLOCK_RAMP_GRAPHS():
    context = {
        'VLLM_DECODE_BLOCK_BUCKET_STEP': 16,
        'VLLM_DECODE_BLOCK_BUCKET_MIN': 2,
    }
    want = 1 + int(math.log(16 / 2, 2))
    assert rules.calc_DECODE_BLOCK_RAMP_GRAPHS(context) == want


def test_calc_DECODE_BLOCK_STEP_GRAPHS():
    context = {'EST_HPU_BLOCKS': 64, 'VLLM_DECODE_BLOCK_BUCKET_STEP': 8}
    want = max(0, int(1 + (64 - 8) / 8))
    assert rules.calc_DECODE_BLOCK_STEP_GRAPHS(context) == want


def test_calc_NUM_DECODE_GRAPHS():
    # Decode graphs combine batch-size and block counts multiplicatively.
    context = {
        'DECODE_BS_RAMP_GRAPHS': 2,
        'DECODE_BS_STEP_GRAPHS': 3,
        'DECODE_BLOCK_RAMP_GRAPHS': 4,
        'DECODE_BLOCK_STEP_GRAPHS': 5,
    }
    assert rules.calc_NUM_DECODE_GRAPHS(context) == (2 + 3) * (4 + 5)


def test_calc_PROMPT_BS_RAMP_GRAPHS():
    context = {
        'MAX_NUM_PREFILL_SEQS': 16,
        'VLLM_PROMPT_BS_BUCKET_STEP': 8,
        'VLLM_PROMPT_BS_BUCKET_MIN': 2,
    }
    # The ramp is capped by the smaller of prefill seqs and bucket step.
    want = 1 + int(math.log(min(16, 8) / 2, 2))
    assert rules.calc_PROMPT_BS_RAMP_GRAPHS(context) == want


def test_calc_PROMPT_BS_STEP_GRAPHS():
    context = {'MAX_NUM_PREFILL_SEQS': 32, 'VLLM_PROMPT_BS_BUCKET_STEP': 8}
    want = max(0, int(1 + (32 - 8) / 8))
    assert rules.calc_PROMPT_BS_STEP_GRAPHS(context) == want


def test_calc_PROMPT_SEQ_RAMP_GRAPHS():
    context = {
        'VLLM_PROMPT_SEQ_BUCKET_STEP': 16,
        'VLLM_PROMPT_SEQ_BUCKET_MIN': 2,
    }
    want = 1 + int(math.log(16 / 2, 2))
    assert rules.calc_PROMPT_SEQ_RAMP_GRAPHS(context) == want


def test_calc_PROMPT_SEQ_STEP_GRAPHS():
    context = {'MAX_MODEL_LEN': 64, 'VLLM_PROMPT_SEQ_BUCKET_STEP': 8}
    want = int(1 + (64 - 8) / 8)
    assert rules.calc_PROMPT_SEQ_STEP_GRAPHS(context) == want


def test_calc_EST_NUM_PROMPT_GRAPHS():
    context = {
        'PROMPT_BS_RAMP_GRAPHS': 2,
        'PROMPT_BS_STEP_GRAPHS': 3,
        'PROMPT_SEQ_RAMP_GRAPHS': 4,
        'PROMPT_SEQ_STEP_GRAPHS': 5,
    }
    want = ((2 + 3) * (4 + 5)) / 2
    assert rules.calc_EST_NUM_PROMPT_GRAPHS(context) == want


def test_calc_EST_GRAPH_PROMPT_RATIO():
    context = {'EST_NUM_PROMPT_GRAPHS': 10, 'NUM_DECODE_GRAPHS': 30}
    # Prompt share of all graphs, rounded up to two decimals.
    want = math.ceil(10 / (10 + 30) * 100) / 100
    assert rules.calc_EST_GRAPH_PROMPT_RATIO(context) == want


def test_calc_VLLM_GRAPH_PROMPT_RATIO():
    context = {'EST_GRAPH_PROMPT_RATIO': 0.5}
    # Clamped to [0.1, 0.9], then rounded up to one decimal.
    want = math.ceil(min(max(0.5, 0.1), 0.9) * 10) / 10
    assert rules.calc_VLLM_GRAPH_PROMPT_RATIO(context) == want


def test_calc_DECODE_GRAPH_TARGET_GB():
    context = {'NUM_DECODE_GRAPHS': 10, 'APPROX_MEM_PER_GRAPH_MB': 512}
    want = math.ceil(10 * 512 / 1024 * 10) / 10
    assert rules.calc_DECODE_GRAPH_TARGET_GB(context) == want


def test_calc_EST_GRAPH_RESERVE_MEM():
    context = {
        'DECODE_GRAPH_TARGET_GB': 5,
        'USABLE_MEM': 10,
        'GPU_MEM_UTILIZATION': 0.8,
        'VLLM_GRAPH_PROMPT_RATIO': 0.2,
    }
    want = math.ceil(5 / (10 * 0.8 * (1 - 0.2)) * 100) / 100
    assert rules.calc_EST_GRAPH_RESERVE_MEM(context) == want


def test_calc_VLLM_GRAPH_RESERVED_MEM():
    # The estimate is clamped into [0.01, 0.5].
    context = {'EST_GRAPH_RESERVE_MEM': 0.3}
    assert rules.calc_VLLM_GRAPH_RESERVED_MEM(context) == min(max(0.3, 0.01), 0.5)


def test_calc_KV_CACHE_MEM():
    context = {
        'USABLE_MEM': 10,
        'GPU_MEM_UTILIZATION': 0.8,
        'VLLM_GRAPH_RESERVED_MEM': 0.2,
    }
    # Whatever is left after graph reservation goes to the KV cache.
    assert rules.calc_KV_CACHE_MEM(context) == 10 * 0.8 * (1 - 0.2)


def test_calc_VLLM_DECODE_BLOCK_BUCKET_MAX():
    context = {'MAX_NUM_SEQS': 16, 'MAX_MODEL_LEN': 128}
    want = max(128, math.ceil((16 * 128) / 128))
    assert rules.calc_VLLM_DECODE_BLOCK_BUCKET_MAX(context) == want


def test_calc_VLLM_PROMPT_SEQ_BUCKET_MAX():
    # The prompt sequence bucket tops out at the model length.
    assert rules.calc_VLLM_PROMPT_SEQ_BUCKET_MAX({'MAX_MODEL_LEN': 4096}) == 4096
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# SPDX-License-Identifier: Apache-2.0
import math

import pytest
import vllm_autocalc_rules as rules


def test_calc_MAX_NUM_SEQS_user_provided():
    # An explicit positive value is taken as-is ...
    assert rules.calc_MAX_NUM_SEQS({'MAX_NUM_SEQS': 5}) == 5
    # ... but is raised to at least 1.
    assert rules.calc_MAX_NUM_SEQS({'MAX_NUM_SEQS': 0}) == 1


def test_calc_MAX_NUM_SEQS_fp8():
    context = {
        'MAX_NUM_SEQS': None,
        'TENSOR_PARALLEL_SIZE': 2,
        'KV_CACHE_MEM': 64,
        'KV_CACHE_PER_SEQ': 2,
        'DTYPE': 'fp8',
        'VLLM_DECODE_BS_BUCKET_STEP': 8,
        'MODEL': 'test',
    }
    # fp8 rounds the raw capacity DOWN to the bucket step (floor).
    capacity = (2 * 64 / 2)
    want = max(1, math.floor(capacity / 8)) * 8
    assert rules.calc_MAX_NUM_SEQS(context) == want


def test_calc_MAX_NUM_SEQS_non_fp8():
    context = {
        'MAX_NUM_SEQS': None,
        'TENSOR_PARALLEL_SIZE': 2,
        'KV_CACHE_MEM': 64,
        'KV_CACHE_PER_SEQ': 2,
        'DTYPE': 'bfloat16',
        'VLLM_DECODE_BS_BUCKET_STEP': 8,
        'MODEL': 'test',
    }
    # Non-fp8 dtypes round the capacity UP to the bucket step (ceil).
    capacity = (2 * 64 / 2)
    want = math.ceil(capacity / 8) * 8
    assert rules.calc_MAX_NUM_SEQS(context) == want


def test_calc_MAX_NUM_SEQS_vision_instruct_limit():
    # The Vision-Instruct model caps the result at 128 regardless of memory.
    context = {
        'MAX_NUM_SEQS': None,
        'TENSOR_PARALLEL_SIZE': 2,
        'KV_CACHE_MEM': 2048,
        'KV_CACHE_PER_SEQ': 2,
        'DTYPE': 'bfloat16',
        'VLLM_DECODE_BS_BUCKET_STEP': 8,
        'MODEL': 'meta-llama/Llama-3.2-11B-Vision-Instruct',
    }
    assert rules.calc_MAX_NUM_SEQS(context) == 128


def test_calc_MAX_NUM_SEQS_not_enough_memory():
    # Zero KV-cache memory cannot host a single sequence.
    context = {
        'MAX_NUM_SEQS': None,
        'TENSOR_PARALLEL_SIZE': 2,
        'KV_CACHE_MEM': 0,
        'KV_CACHE_PER_SEQ': 2,
        'DTYPE': 'bfloat16',
        'VLLM_DECODE_BS_BUCKET_STEP': 8,
        'MODEL': 'test',
    }
    with pytest.raises(ValueError):
        rules.calc_MAX_NUM_SEQS(context)
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.