Enabled test cases for disaggregate_prefill_v1 with LMCache on HPU, Gaudi NIC, and CPU offloading. #1370

Draft
wants to merge 14 commits into base: habana_main
24 changes: 24 additions & 0 deletions docker/Dockerfile.hpu.lmcache
@@ -0,0 +1,24 @@
FROM vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install --upgrade pip && \
pip install -v -r requirements-hpu.txt

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
ENV VLLM_SKIP_WARMUP=true
ENV VLLM_USE_V1=1

RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["bash"]
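
As a rough usage sketch (the image tag and run flags below are illustrative, not part of this PR), the image can be built from the repository root and started with access to the Gaudi devices:

# Build the HPU + LMCache test image (tag name is an assumption).
docker build -f docker/Dockerfile.hpu.lmcache -t vllm-hpu-lmcache .

# Run it with the Habana container runtime; flags follow the usual Gaudi container setup.
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all \
    --cap-add=sys_nice --net=host --ipc=host vllm-hpu-lmcache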
100 changes: 100 additions & 0 deletions examples/lmcache/cpu_offload_lmcache_v0.py
@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0

Review comment: Please provide a description of the PR, including why we need those examples on CPU.

Author reply: It's for our test only for now.
"""
This file demonstrates the example usage of cpu offloading
with LMCache.

Note that `lmcache` is needed to run this example.
Requirements: Linux, Python 3.10 or higher, CUDA 12.1.
For more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
import contextlib
import os
import time

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def setup_environment_variables():
# LMCache-related environment variables
# Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
# LMCache is set to use 256 tokens per chunk
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
# Enable local CPU backend in LMCache
os.environ["LMCACHE_LOCAL_CPU"] = "True"
# Set local CPU memory limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"


@contextlib.contextmanager
def build_llm_with_lmcache():
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
kv_transfer_config=ktc,
max_model_len=8000,
enable_chunked_prefill=True,
gpu_memory_utilization=0.8)

try:
yield llm
finally:
# Clean up lmcache backend
LMCacheEngineBuilder.destroy(ENGINE_NAME)


def print_output(
llm: LLM,
prompt: list[str],
sampling_params: SamplingParams,
req_str: str,
):
start = time.time()
outputs = llm.generate(prompt, sampling_params)
print("-" * 50)
for output in outputs:
generated_text = output.outputs[0].text
print(f"Generated text: {generated_text!r}")
print(f"Generation took {time.time() - start:.2f} seconds, "
f"{req_str} request done.")
print("-" * 50)


def main():
setup_environment_variables()

with build_llm_with_lmcache() as llm:

# This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts
shared_prompt = "Hello, how are you?" * 1000
first_prompt = [
shared_prompt + "Hello, my name is",
]
second_prompt = [
shared_prompt + "Tell me a very long story",
]

sampling_params = SamplingParams(temperature=0,
top_p=0.95,
max_tokens=10)

# Print the first output
print_output(llm, first_prompt, sampling_params, "first")

time.sleep(1)

# print the second output
print_output(llm, second_prompt, sampling_params, "second")


if __name__ == "__main__":
main()
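
A minimal sketch of how this example can be run inside the container built above (the path follows the symlinks created in the Dockerfile; HF_TOKEN may be needed to download the Mistral model from Hugging Face):

cd /workspace/examples/lmcache
python cpu_offload_lmcache_v0.py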
61 changes: 61 additions & 0 deletions examples/lmcache/cpu_offload_lmcache_v1.py
@@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of cpu offloading
with LMCache in vLLM v1.

Note that lmcache needs to be installed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
"""
import os

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# LMCache-related environment variables
# Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
# LMCache is set to use 256 tokens per chunk
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
# Enable local CPU backend in LMCache
os.environ["LMCACHE_LOCAL_CPU"] = "True"
# Set local CPU memory limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:65432"
# Set the serializer/deserializer between vllm and LMCache server
# `naive` indicates using raw bytes of the tensor without any compression
os.environ["LMCACHE_REMOTE_SERDE"] = "naive"

# This example script runs two requests with a shared prefix.
shared_prompt = "Hello, how are you?" * 1000
first_prompt = [
shared_prompt + "Hello, my name is",
]
second_prompt = [
shared_prompt + "Tell me a very long story",
]

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# Note that LMCache is not compatible with chunked prefill for now.
llm = LLM(model="/root/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct/",
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8)

# Should be able to see logs like the following:
# `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
# This indicates that the KV cache has been stored in LMCache.
outputs = llm.generate(first_prompt, sampling_params)
for output in outputs:
generated_text = output.outputs[0].text
print(f"Generated text: {generated_text!r}")

# Clean up lmcache backend
LMCacheEngineBuilder.destroy(ENGINE_NAME)
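
This v1 example points LMCACHE_REMOTE_URL at lm://localhost:65432, so an LMCache server is expected to be listening on that port before the script starts. A minimal sketch, using the same server module the disaggregated-prefill script below launches:

# Start the LMCache server on the port the example expects, then run the example.
python -m lmcache.v1.server localhost 65432 &
python cpu_offload_lmcache_v1.py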
@@ -0,0 +1,6 @@
local_cpu: False
max_local_cpu_size: 5.0
#local_disk:
max_local_disk_size: 0
remote_serde: naive
remote_url: "lm://localhost:8100"
@@ -0,0 +1,6 @@
local_cpu: False
max_local_cpu_size: 5.0
#local_disk:
max_local_disk_size: 0
remote_serde: naive
remote_url: "redis://localhost:6379"
@@ -5,9 +5,9 @@ max_local_disk_size: 0
remote_serde: NULL

enable_nixl: True
-nixl_role: "receiver"
+nixl_role: "RECEIVER"
nixl_peer_host: "localhost"
nixl_peer_port: 55555
nixl_buffer_size: 1073741824 # 1GB
-nixl_buffer_device: "cuda"
+nixl_buffer_device: "hpu"
nixl_enable_gc: True
@@ -5,9 +5,9 @@ max_local_disk_size: 0
remote_serde: NULL

enable_nixl: True
-nixl_role: "sender"
+nixl_role: "SENDER"
nixl_peer_host: "localhost"
nixl_peer_port: 55555
nixl_buffer_size: 1073741824 # 1GB
-nixl_buffer_device: "cuda"
+nixl_buffer_device: "hpu"
nixl_enable_gc: True
138 changes: 138 additions & 0 deletions examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_gaudi.sh
@@ -0,0 +1,138 @@
#!/bin/bash

echo "Warning: LMCache disaggregated prefill support for vLLM v1 is experimental and subject to change."


PIDS=()

# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"

check_hf_token() {
if [ -z "$HF_TOKEN" ]; then
echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
exit 1
fi
if [[ "$HF_TOKEN" != hf_* ]]; then
echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
exit 1
fi
echo "HF_TOKEN is set and valid."
}

check_num_gpus() {
# Check that at least 2 Gaudi devices are available via hl-smi.
num_gpus=$(hl-smi --query-gpu=name --format=csv,noheader | wc -l)
if [ "$num_gpus" -lt 2 ]; then
echo "You need at least 2 GPUs to run disaggregated prefill."
exit 1
else
echo "Found $num_gpus GPUs."
fi
}

ensure_python_library_installed() {
echo "Checking if $1 is installed..."
python -c "import $1" > /dev/null 2>&1
if [ $? -ne 0 ]; then
if [ "$1" == "nixl" ]; then
echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
else
echo "$1 is not installed. Please install it via pip install $1."
fi
exit 1
else
echo "$1 is installed."
fi
}

cleanup() {
echo "Stopping everything…"
trap - INT TERM # prevent re-entrancy
kill -- -$$ # negative PID == “this whole process-group”
wait # reap children so we don't leave zombies
exit 0
}

wait_for_server() {
local port=$1
local timeout_seconds=1200
local start_time=$(date +%s)

echo "Waiting for server on port $port..."

while true; do
if curl -s "localhost:${port}/v1/completions" > /dev/null; then
return 0
fi

local now=$(date +%s)
if (( now - start_time >= timeout_seconds )); then
echo "Timeout waiting for server"
return 1
fi

sleep 1
done
}


main() {
#check_hf_token
check_num_gpus
ensure_python_library_installed lmcache
#ensure_python_library_installed nixl
ensure_python_library_installed pandas
ensure_python_library_installed datasets
ensure_python_library_installed vllm

trap cleanup INT
trap cleanup USR1
trap cleanup TERM

echo "Launching prefiller, decoder and proxy..."
echo "Please check prefiller.log, decoder.log and proxy.log for logs."

python -m lmcache.v1.server localhost 2000 2>&1 &

bash disagg_vllm_launcher_gaudi.sh prefiller \
> >(tee prefiller.log) 2>&1 &
prefiller_pid=$!
PIDS+=($prefiller_pid)

bash disagg_vllm_launcher_gaudi.sh decoder \
> >(tee decoder.log) 2>&1 &
decoder_pid=$!
PIDS+=($decoder_pid)

python3 disagg_proxy_server.py \
--host localhost \
--port 1000 \
--prefiller-host localhost \
--prefiller-port 1100 \
--decoder-host localhost \
--decoder-port 1200 \
> >(tee proxy.log) 2>&1 &
proxy_pid=$!
PIDS+=($proxy_pid)

wait_for_server 1100
wait_for_server 1200
wait_for_server 1000

echo "All servers are up. Starting benchmark..."

# begin benchmark
cd ../../../benchmarks/
python benchmark_serving.py --port 1000 --seed $(date +%s) \
--model /mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct/ \
--dataset-name random --random-input-len 8000 --random-output-len 200 \
--num-prompts 50 --burstiness 100 --request-rate 3.6 | tee benchmark.log

echo "Benchmarking done. Cleaning up..."

cleanup

}

main
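
To try the end-to-end flow, run the script from its directory; it launches the LMCache server, the prefiller and decoder vLLM instances, and the proxy on port 1000, then runs the serving benchmark against the proxy:

cd examples/lmcache/disagg_prefill_lmcache_v1
bash disagg_example_gaudi.sh
# Progress goes to prefiller.log, decoder.log and proxy.log next to the script,
# and benchmark.log under benchmarks/.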