forked from vllm-project/vllm
Enabled test cases for disaggregate_prefill_v1 with LMCache on HPU, Gaudi NIC, and CPU offloading. #1370
Draft
hsubramony wants to merge 14 commits into habana_main from ace_production_rebase_examples
Changes from all commits (14 commits)
6a0f5fb  Enabed test cases for disaggregate_prefill_v1 with lmcache on hpu, ga… (libinta)
284bd54  Enable LMCache for cpuoffloading, LMCache docker support, enable lmca… (libinta)
d74e788  code review updates (hsubramony)
557c1da  code cleanup (hsubramony)
57cec0d  v1 support changes to examples (hsubramony)
ee5dc9e  more cleanup (hsubramony)
7f552c1  code review cleanup (hsubramony)
2cfea8f  Merge branch 'habana_main' into dev/ace_production_rebase (michalkuligowski)
b004be7  Added missing config for TP x run in 1P/1D case (#1408) (yeonsily)
a79ff84  Formal fix for a PD case (#1410) (yeonsily)
be89c03  Workaround to enable warm-up on lmcache run (#1417) (yeonsily)
ea388a1  Add TP2 test scripts for 1P/1D (#1409) (yeonsily)
56e1444  Merge branch 'pr-branch' into debug (libinta)
709904c  Add redis test case. (libinta)
@@ -0,0 +1,24 @@
FROM vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install --upgrade pip && \
    pip install -v -r requirements-hpu.txt

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
ENV VLLM_SKIP_WARMUP=true
ENV VLLM_USE_V1=1

RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["bash"]
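
For reference, a minimal sketch of how this image might be built and started on a Gaudi host. The image tag vllm-hpu-lmcache is an illustrative assumption (the Dockerfile path is not shown in this view), and the runtime flags follow the usual Habana container setup:

# build the image from the repository root; the tag is a placeholder
docker build -f <path-to-this-Dockerfile> -t vllm-hpu-lmcache .
# start an interactive container with access to the Gaudi devices
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all \
    --net=host --ipc=host vllm-hpu-lmcache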
@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of cpu offloading
with LMCache.

Note that `lmcache` is needed to run this example.
Requirements: Linux, Python 3.10 or higher, CUDA 12.1
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
import contextlib
import os
import time

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def setup_environment_variables():
    # LMCache-related environment variables
    # Use experimental features in LMCache
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
    # LMCache is set to use 256 tokens per chunk
    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
    # Enable local CPU backend in LMCache
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    # Set local CPU memory limit to 5.0 GB
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"


@contextlib.contextmanager
def build_llm_with_lmcache():
    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
              kv_transfer_config=ktc,
              max_model_len=8000,
              enable_chunked_prefill=True,
              gpu_memory_utilization=0.8)

    try:
        yield llm
    finally:
        # Clean up lmcache backend
        LMCacheEngineBuilder.destroy(ENGINE_NAME)


def print_output(
    llm: LLM,
    prompt: list[str],
    sampling_params: SamplingParams,
    req_str: str,
):
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
    print(f"Generation took {time.time() - start:.2f} seconds, "
          f"{req_str} request done.")
    print("-" * 50)


def main():
    setup_environment_variables()

    with build_llm_with_lmcache() as llm:

        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
        shared_prompt = "Hello, how are you?" * 1000
        first_prompt = [
            shared_prompt + "Hello, my name is",
        ]
        second_prompt = [
            shared_prompt + "Tell me a very long story",
        ]

        sampling_params = SamplingParams(temperature=0,
                                         top_p=0.95,
                                         max_tokens=10)

        # Print the first output
        print_output(llm, first_prompt, sampling_params, "first")

        time.sleep(1)

        # Print the second output
        print_output(llm, second_prompt, sampling_params, "second")


if __name__ == "__main__":
    main()
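
A minimal sketch of how this example might be invoked; the filename cpu_offload_lmcache.py is an assumption, since the diff view above does not show the file path:

# install the required library and run the example
pip install lmcache
python cpu_offload_lmcache.py

Because both prompts share the same long prefix, the second request should hit the CPU-offloaded KV cache and report a noticeably shorter generation time than the first.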
@@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of cpu offloading
with LMCache in vLLM v1.

Note that lmcache needs to be installed to run this example.
Learn more about LMCache at https://github.com/LMCache/LMCache.
"""
import os

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# LMCache-related environment variables
# Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
# LMCache is set to use 256 tokens per chunk
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
# Enable local CPU backend in LMCache
os.environ["LMCACHE_LOCAL_CPU"] = "True"
# Set local CPU memory limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
# Point LMCache at the remote LMCache server
os.environ["LMCACHE_REMOTE_URL"] = "lm://localhost:65432"
# Set the serializer/deserializer between vllm and LMCache server
# `naive` indicates using raw bytes of the tensor without any compression
os.environ["LMCACHE_REMOTE_SERDE"] = "naive"

# This example script runs two requests with a shared prefix.
shared_prompt = "Hello, how are you?" * 1000
first_prompt = [
    shared_prompt + "Hello, my name is",
]
second_prompt = [
    shared_prompt + "Tell me a very long story",
]

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

ktc = KVTransferConfig.from_cli(
    '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# Note that LMCache is not compatible with chunked prefill for now.
llm = LLM(model="/root/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct/",
          kv_transfer_config=ktc,
          max_model_len=8000,
          gpu_memory_utilization=0.8)

# Should be able to see logs like the following:
# `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
# This indicates that the KV cache has been stored in LMCache.
outputs = llm.generate(first_prompt, sampling_params)
for output in outputs:
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")

# Clean up lmcache backend
LMCacheEngineBuilder.destroy(ENGINE_NAME)
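
Since LMCACHE_REMOTE_URL points at lm://localhost:65432, an LMCache server has to be listening on that port before this script runs. A minimal sketch, mirroring the server launch used later in disagg_example_gaudi.sh (the example's filename here is an assumption):

# start the LMCache server on the port referenced by LMCACHE_REMOTE_URL
python -m lmcache.v1.server localhost 65432 &
# then run the example itself
python cpu_offload_lmcache_v1.py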
6 changes: 6 additions & 0 deletions
examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-config-lm.yaml
@@ -0,0 +1,6 @@
local_cpu: False
max_local_cpu_size: 5.0
#local_disk:
max_local_disk_size: 0
remote_serde: naive
remote_url: "lm://localhost:8100"
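
A sketch of how a config file like this might be wired in: LMCache can typically pick up a YAML config through the LMCACHE_CONFIG_FILE environment variable instead of individual LMCACHE_* variables. The launch step below is an assumption, since disagg_vllm_launcher_gaudi.sh is not rendered in this view:

# point LMCache at the YAML config instead of individual env vars
export LMCACHE_CONFIG_FILE=examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-config-lm.yaml
# ...then launch the prefiller/decoder the same way disagg_vllm_launcher_gaudi.sh does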
6 changes: 6 additions & 0 deletions
examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-config-redis.yaml
@@ -0,0 +1,6 @@
local_cpu: False
max_local_cpu_size: 5.0
#local_disk:
max_local_disk_size: 0
remote_serde: naive
remote_url: "redis://localhost:6379"
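
For the Redis-backed config (see the "Add redis test case." commit), a Redis instance has to be reachable at the URL above. A minimal local setup using standard Redis commands, not part of this PR:

# start a local Redis server on the default port and verify it responds
redis-server --port 6379 --daemonize yes
redis-cli -p 6379 ping   # expect: PONG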
138 changes: 138 additions & 0 deletions
examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_gaudi.sh
@@ -0,0 +1,138 @@
#!/bin/bash

echo "Warning: LMCache disaggregated prefill support for vLLM v1 is experimental and subject to change."


PIDS=()

# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"

check_hf_token() {
    if [ -z "$HF_TOKEN" ]; then
        echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
        exit 1
    fi
    if [[ "$HF_TOKEN" != hf_* ]]; then
        echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
        exit 1
    fi
    echo "HF_TOKEN is set and valid."
}

check_num_gpus() {
    # Check that at least 2 Gaudi devices are available via hl-smi.
    num_gpus=$(hl-smi --query-gpu=name --format=csv,noheader | wc -l)
    if [ "$num_gpus" -lt 2 ]; then
        echo "You need at least 2 GPUs to run disaggregated prefill."
        exit 1
    else
        echo "Found $num_gpus GPUs."
    fi
}

ensure_python_library_installed() {
    echo "Checking if $1 is installed..."
    python -c "import $1" > /dev/null 2>&1
    if [ $? -ne 0 ]; then
        if [ "$1" == "nixl" ]; then
            echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
        else
            echo "$1 is not installed. Please install it via pip install $1."
        fi
        exit 1
    else
        echo "$1 is installed."
    fi
}

cleanup() {
    echo "Stopping everything…"
    trap - INT TERM   # prevent re-entrancy
    kill -- -$$       # negative PID == “this whole process-group”
    wait              # reap children so we don't leave zombies
    exit 0
}

wait_for_server() {
    local port=$1
    local timeout_seconds=1200
    local start_time=$(date +%s)

    echo "Waiting for server on port $port..."

    while true; do
        if curl -s "localhost:${port}/v1/completions" > /dev/null; then
            return 0
        fi

        local now=$(date +%s)
        if (( now - start_time >= timeout_seconds )); then
            echo "Timeout waiting for server"
            return 1
        fi

        sleep 1
    done
}


main() {
    #check_hf_token
    check_num_gpus
    ensure_python_library_installed lmcache
    #ensure_python_library_installed nixl
    ensure_python_library_installed pandas
    ensure_python_library_installed datasets
    ensure_python_library_installed vllm

    trap cleanup INT
    trap cleanup USR1
    trap cleanup TERM

    echo "Launching prefiller, decoder and proxy..."
    echo "Please check prefiller.log, decoder.log and proxy.log for logs."

    # Start the LMCache server (shared remote KV backend)
    python -m lmcache.v1.server localhost 2000 2>&1 &

    bash disagg_vllm_launcher_gaudi.sh prefiller \
        > >(tee prefiller.log) 2>&1 &
    prefiller_pid=$!
    PIDS+=($prefiller_pid)

    bash disagg_vllm_launcher_gaudi.sh decoder \
        > >(tee decoder.log) 2>&1 &
    decoder_pid=$!
    PIDS+=($decoder_pid)

    python3 disagg_proxy_server.py \
        --host localhost \
        --port 1000 \
        --prefiller-host localhost \
        --prefiller-port 1100 \
        --decoder-host localhost \
        --decoder-port 1200 \
        > >(tee proxy.log) 2>&1 &
    proxy_pid=$!
    PIDS+=($proxy_pid)

    wait_for_server 1100
    wait_for_server 1200
    wait_for_server 1000

    echo "All servers are up. Starting benchmark..."

    # begin benchmark
    cd ../../../benchmarks/
    python benchmark_serving.py --port 1000 --seed $(date +%s) \
        --model /mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct/ \
        --dataset-name random --random-input-len 8000 --random-output-len 200 \
        --num-prompts 50 --burstiness 100 --request-rate 3.6 | tee benchmark.log

    echo "Benchmarking done. Cleaning up..."

    cleanup

}

main
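
Once the script reports that all servers are up, the proxy on port 1000 can also be exercised directly with an OpenAI-style completion request. A minimal sketch; the prompt and max_tokens are arbitrary, and the model value is assumed to match the path the launcher serves:

# send a completion request through the disaggregated-prefill proxy
curl -s localhost:1000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct/", "prompt": "Hello, my name is", "max_tokens": 32}'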
Comment: Please provide a description of the PR, including why we need these examples on CPU.
Reply: It's for our test only for now.