Commit 2085d1d

Merge branch 'main' into dev-cb-heterogeneous-tkv
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
2 parents: a58087f + 11562e9

15 files changed: 124 additions, 239 deletions

.github/workflows/build_docker.yml

Lines changed: 6 additions & 2 deletions
@@ -10,8 +10,12 @@ on:
   pull_request:
     branches:
       - "main"
-    paths-ignore:
-      - "**.md"
+    paths:
+      - ".github/workflows/build_docker.yml"
+      - "docker/**"
+      - "vllm_spyre/**/*.py"
+      - "pyproject.toml"
+      - "uv.lock"
   release:
     types: [published]

.github/workflows/test.yml

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ jobs:
           files: |
             .github/workflows/test.yml
             pyproject.toml
+            uv.lock
             tests/**/*.py
             vllm_spyre/**/*.py

.github/workflows/type_check.yaml

Lines changed: 31 additions & 22 deletions
@@ -1,43 +1,52 @@
 name: Type Check
 
 on:
-  # Trigger the workflow on push or pull request, but only for the main branch.
-  # Don't use pull_request.paths filter since this workflow is required for
-  # all pull requests on main irrespective of file type or location.
+  # Don't use `paths` or `paths-ignore` filter since this workflow is required
+  # for all pull requests on main irrespective of file type or location
+  # Use `changed-src-files` step to determine if source code was changed
   pull_request:
     branches:
       - main
   push:
     branches:
       - main
-    paths:
-      - '**/*.py'
-      - '.github/workflows/type_check.yaml'
-      - 'tools/type_check.sh'
-      - 'pyproject.toml'
 
 jobs:
   type-check:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12"]
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+      - name: "Checkout"
+        uses: actions/checkout@v4
+
+      - name: "Get changed source files"
+        id: changed-src-files
+        uses: tj-actions/changed-files@v46
+        with: # Avoid using single or double quotes for multiline patterns
+          files: |
+            .github/workflows/type_check.yaml
+            tools/type_check.sh
+            pyproject.toml
+            **.py
+
+      - name: "Set up Python ${{ matrix.python-version }}"
+        if: steps.changed-src-files.outputs.any_changed == 'true'
+        uses: astral-sh/setup-uv@v5
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          # TODO: use `uv`
-          python -m pip install --upgrade pip
-          pip install mypy==1.11.1
-          pip install types-setuptools
-          pip install types-PyYAML
-          pip install types-requests
-          pip install types-setuptools
-      - name: Mypy
+          enable-cache: true
+          ignore-nothing-to-cache: true
+          cache-dependency-glob: |
+            pyproject.toml
+
+      - name: "Install dependencies"
+        if: steps.changed-src-files.outputs.any_changed == 'true'
+        run: uv sync --frozen --only-group lint
+
+      - name: "Run mypy"
+        if: steps.changed-src-files.outputs.any_changed == 'true'
         run: |
           echo "::add-matcher::.github/workflows/matchers/mypy.json"
           tools/type_check.sh 1 ${{ matrix.python-version }}

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -193,3 +193,6 @@ benchmarks/*.json
 # Linting
 actionlint
 shellcheck*/
+
+# version file generated by setuptools-scm
+/vllm_spyre/_version.py
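
The newly ignored file is generated at build time by setuptools-scm, so it should not live in the repository. As a hedged aside (the distribution name "vllm-spyre" is an assumption here), the recorded version can be read through standard packaging metadata instead of importing the generated module:

# Sketch: read the setuptools-scm-derived version via packaging metadata.
# Assumes the project is installed as the "vllm-spyre" distribution.
from importlib.metadata import version

print(version("vllm-spyre"))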

docker/Dockerfile.amd64

Lines changed: 0 additions & 2 deletions
@@ -72,8 +72,6 @@ ENV COMPILATION_MODE=offline_decoder \
     FLEX_COMPUTE=SENTIENT \
     FLEX_DEVICE=PF \
     FLEX_OVERWRITE_NMB_FRAME=1 \
-    FLEX_UNLINK_DEVMEM=false \
-    FLEX_RDMA_MODE_FULL=1 \
     TOKENIZERS_PARALLELISM=false \
     TORCH_SENDNN_LOG=WARNING

docs/.nav.yml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ nav:
   - Kubernetes: deploying/k8s.md
   - Examples:
     - Offline Inference: examples/offline_inference
-    - Other: examples/other
+    - Online Inference: examples/online_inference
   - User Guide:
     - Configuration: user_guide/configuration.md
     - Environment Variables: user_guide/env_vars.md
examples/offline_inference (continuous batching CPU example — file path not shown)

@@ -1,16 +1,29 @@
+"""
+This example shows how to run offline inference using continuous batching
+on CPU.
+"""
+
+import argparse
 import os
 import platform
 import time
 
 from vllm import LLM, SamplingParams
 
-# RUN with fms branch: https://github.com/foundation-model-stack/
-# foundation-model-stack/tree/paged_attn_mock
+# Continuous batching currently requires installing the branch
+# https://github.com/foundation-model-stack/foundation-model-stack/tree/paged_attn_mock
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model", type=str, default="/models/llama-194m")
+parser.add_argument("--max_model_len", type=int, default=2048)
+parser.add_argument("--max_num_seqs", type=int, default=2)
+parser.add_argument("--tp", type=int, default=1)
+args = parser.parse_args()
 
 max_tokens1 = 65
 max_tokens2 = 67
 max_tokens3 = 7
-max_num_seqs = 2  # defines max batch size
+max_num_seqs = args.max_num_seqs  # defines the max batch size
 
 if platform.machine() == "arm64":
     print("Detected arm64 running environment. "
@@ -19,58 +32,40 @@
           "locally on arm64.")
     os.environ["HF_HUB_OFFLINE"] = "1"
 
-# defining here to be able to run/debug directly from VSC (not via terminal)
-os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
+if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
+    os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
 os.environ['VLLM_SPYRE_USE_CB'] = '1'
 os.environ['VLLM_SPYRE_HETEROGEN_TKV'] = '0'
 os.environ['VLLM_USE_V1'] = '1'
 
-# Sample prompts.
 template = (
     "Below is an instruction that describes a task. Write a response that "
    "appropriately completes the request. Be polite in your response to the "
    "user.\n\n### Instruction:\n{}\n\n### Response:")
 
-prompt1 = template.format(
-    "Provide a list of instructions for preparing chicken soup for a family "
-    "of four.")
-
-prompt2 = template.format("Provide instructions for preparing chicken soup.")
-
-prompt3 = template.format(
-    "Provide a list of instructions for preparing chicken soup for a family.")
-
-prompts = [
-    prompt1,
-    prompt2,
-    prompt3,
+instructions = [
+    "Provide a list of instructions for preparing chicken soup for a family" + \
+    " of four.",
+    "Provide instructions for preparing chicken soup.",
+    "Provide a list of instructions for preparing chicken soup for a family.",
 ]
 
-# Create a sampling params object.
-sampling_params1 = SamplingParams(max_tokens=max_tokens1,
-                                  temperature=0.0,
-                                  ignore_eos=True)
-
-sampling_params2 = SamplingParams(max_tokens=max_tokens2,
-                                  temperature=0.0,
-                                  ignore_eos=True)
+prompts = [template.format(instr) for instr in instructions]
 
-sampling_params3 = SamplingParams(max_tokens=max_tokens3,
-                                  temperature=0.0,
-                                  ignore_eos=True)
+max_tokens_list = [max_tokens1, max_tokens2, max_tokens3]
 
 sampling_params = [
-    sampling_params1,
-    sampling_params2,
-    sampling_params3,
+    SamplingParams(max_tokens=mt, temperature=0.0, ignore_eos=True)
+    for mt in max_tokens_list
 ]
 
 # Create an LLM.
-llm = LLM(model="/models/llama-194m",
-          tokenizer="/models/llama-194m",
-          max_model_len=2048,
+llm = LLM(model=args.model,
+          tokenizer=args.model,
+          max_model_len=args.max_model_len,
           block_size=2048,
-          max_num_seqs=max_num_seqs)
+          max_num_seqs=max_num_seqs,
+          tensor_parallel_size=args.tp)
 
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
@@ -81,9 +76,11 @@
       (len(outputs[0].outputs[0].token_ids), time.time() - t0))
 print("===============")
 for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    print(output.outputs[0])
 print("===============")
 for output in outputs:
-    print(output.outputs[0])
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"\nPrompt:\n {prompt!r}")
+    print(f"\nGenerated text:\n {generated_text!r}\n")
+    print("-----------------------------------")

examples/offline_inference/offline_inference_multi_spyre.py renamed to examples/offline_inference/multi_spyre_inference.py

Lines changed: 6 additions & 2 deletions
@@ -1,3 +1,8 @@
+"""
+This example shows how to use Spyre with vLLM for running offline inference
+with multiple cards.
+"""
+
 import gc
 import os
 import platform
@@ -18,13 +23,12 @@
 os.environ["VLLM_SPYRE_WARMUP_NEW_TOKENS"] = str(max_tokens)
 os.environ['VLLM_SPYRE_WARMUP_BATCH_SIZES'] = '1'
 
-# stuff for multi-spyre
+# Multi-spyre related variables
 os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 os.environ["DISTRIBUTED_STRATEGY_IGNORE_MODULES"] = "WordEmbedding"
 os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "12355"
 
-# Sample prompts.
 template = (
     "Below is an instruction that describes a task. Write a response that "
     "appropriately completes the request. Be polite in your response to the "

examples/offline_inference/offline_inference_spyre.py renamed to examples/offline_inference/spyre_inference.py

Lines changed: 8 additions & 6 deletions
@@ -1,3 +1,7 @@
+"""
+This example shows how to use Spyre with vLLM for running offline inference.
+"""
+
 import os
 import platform
 import time
@@ -17,19 +21,17 @@
 os.environ["VLLM_SPYRE_WARMUP_NEW_TOKENS"] = str(max_tokens)
 os.environ['VLLM_SPYRE_WARMUP_BATCH_SIZES'] = '1'
 
-# Sample prompts.
 template = (
     "Below is an instruction that describes a task. Write a response that "
     "appropriately completes the request. Be polite in your response to the "
     "user.\n\n### Instruction:\n{}\n\n### Response:")
-prompt1 = template.format(
-    "Provide a list of instructions for preparing chicken soup for a family "
-    "of four.")
 prompts = [
-    prompt1,
+    template.format(
+        "Provide a list of instructions for preparing chicken soup for a" + \
+        " family of four.",
+    )
 ]
 
-# Create a sampling params object.
 sampling_params = SamplingParams(max_tokens=max_tokens,
                                  temperature=0.0,
                                  ignore_eos=True)
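
For context on the renamed file's structure: the warmup shape variables are exported before the engine is created, and a single SamplingParams is reused for the whole static batch. A hedged sketch with illustrative values (the real example defines its own prompt, token budget, and model path):

# Sketch of the static-batching setup used by spyre_inference.py.
# The token budget and model path below are illustrative assumptions.
import os

max_tokens = 20
os.environ["VLLM_SPYRE_WARMUP_NEW_TOKENS"] = str(max_tokens)
os.environ["VLLM_SPYRE_WARMUP_BATCH_SIZES"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="/models/llama-194m")  # assumed local model path
sampling_params = SamplingParams(max_tokens=max_tokens,
                                 temperature=0.0,
                                 ignore_eos=True)
outputs = llm.generate(["Write a short greeting."], sampling_params)
print(outputs[0].outputs[0].text)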
