Commit 4a19c04

v1/offloading: Add worker-side CPU support
This commit adds worker-side support for CPU offloading. It uses the swap_blocks function to perform the actual copying between CPU and GPU, and supports any CPU block size that is a multiple of the GPU block size.

Signed-off-by: Or Ozeri <oro@il.ibm.com>
1 parent b1cfec2 commit 4a19c04
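For orientation, here is a minimal sketch (distilled from the new test below) of how the added pieces compose for a GPU-to-CPU offload. The layer name, sizes, and block ids are illustrative only, and a CUDA device plus the new offloading modules are assumed:

import torch

from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.offloading.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.offloading.worker.cpu import (create_cpu_tensors,
                                           generate_tensors_transfer_function)

# illustrative sizes: each CPU block holds 3 GPU blocks (48 = 3 * 16 tokens)
gpu_block_size, cpu_block_size, num_cpu_blocks = 16, 48, 256

# per-layer GPU KV caches, shaped by the attention backend
shape = FlashAttentionBackend.get_kv_cache_shape(64, gpu_block_size, 8, 64)
gpu_caches = {
    "layer 0": torch.rand(shape, dtype=torch.bfloat16, device="cuda:0"),
}

# allocate matching CPU tensors and build a GPU -> CPU transfer function
gpu_tensors, cpu_tensors = create_cpu_tensors(gpu_caches, gpu_block_size,
                                              cpu_block_size, num_cpu_blocks)
offload = generate_tensors_transfer_function(gpu_tensors, cpu_tensors,
                                             FlashAttentionBackend,
                                             gpu_block_size, cpu_block_size)

# offload GPU blocks 4, 7, 9 into CPU block 2 (block ids are illustrative)
src_specs = [GPULoadStoreSpec(b) for b in (4, 7, 9)]
dst_specs = [CPULoadStoreSpec(2)]
assert offload((src_specs, dst_specs)) is True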

File tree

3 files changed (+289, -0 lines)

tests/v1/offloading/test_cpu.py

Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random

import pytest
import torch

from vllm.platforms import current_platform
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.offloading.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.offloading.worker.cpu import (create_cpu_tensors,
                                           generate_tensors_transfer_function)

NUM_GPU_BLOCKS = [64]
NUM_CPU_BLOCKS = [256]
GPU_BLOCK_SIZES = [16]
GPU_BLOCKS_PER_CPU_BLOCK = [1, 3]
HEAD_SIZES = [64]
NUM_HEADS = [8]
NUM_LAYERS = [4]
DTYPES = [torch.bfloat16]
SEEDS = [0]
CUDA_DEVICES = ['cuda:0']
NUM_MAPPINGS = [3]


@pytest.mark.parametrize("gpu_to_cpu", [True, False])
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("gpu_block_size", GPU_BLOCK_SIZES)
@pytest.mark.parametrize("gpu_blocks_per_cpu_block", GPU_BLOCKS_PER_CPU_BLOCK)
@pytest.mark.parametrize("num_gpu_blocks", NUM_GPU_BLOCKS)
@pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_transfer(
    gpu_to_cpu: bool,
    num_mappings: int,
    head_size: int,
    num_heads: int,
    gpu_block_size: int,
    gpu_blocks_per_cpu_block: int,
    num_gpu_blocks: int,
    num_cpu_blocks: int,
    num_layers: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    current_platform.seed_everything(seed)

    # create per-layer GPU KV caches
    attn_backend = FlashAttentionBackend
    gpu_cache_shape = attn_backend.get_kv_cache_shape(num_gpu_blocks,
                                                      gpu_block_size,
                                                      num_heads, head_size)
    gpu_caches = {}
    for i in range(num_layers):
        gpu_caches[f'layer {i}'] = torch.rand(gpu_cache_shape,
                                              dtype=dtype,
                                              device=device)

    # create CPU KV caches
    cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size
    gpu_tensors, cpu_tensors = create_cpu_tensors(gpu_caches, gpu_block_size,
                                                  cpu_block_size,
                                                  num_cpu_blocks)

    # select block mappings
    gpu_blocks = random.sample(range(num_gpu_blocks),
                               num_mappings * gpu_blocks_per_cpu_block)
    cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings)

    # convert cpu blocks to gpu block size
    cpu_blocks_in_gpu_block_size = []
    for cpu_block in cpu_blocks:
        base_block_id = cpu_block * gpu_blocks_per_cpu_block
        for i in range(gpu_blocks_per_cpu_block):
            cpu_blocks_in_gpu_block_size.append(i + base_block_id)

    # set transfer direction
    if gpu_to_cpu:
        src_kv_caches = gpu_tensors
        dst_kv_caches = cpu_tensors
        src_block_size = gpu_block_size
        dst_block_size = cpu_block_size
        src_spec_class = GPULoadStoreSpec
        dst_spec_class = CPULoadStoreSpec
        src_blocks = gpu_blocks
        dst_blocks = cpu_blocks
        src_blocks_in_gpu_block_size = gpu_blocks
        dst_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_size_in_gpu_blocks = num_cpu_blocks * gpu_blocks_per_cpu_block
    else:
        src_kv_caches = cpu_tensors
        dst_kv_caches = gpu_tensors
        src_block_size = cpu_block_size
        dst_block_size = gpu_block_size
        src_spec_class = CPULoadStoreSpec
        dst_spec_class = GPULoadStoreSpec
        src_blocks = cpu_blocks
        dst_blocks = gpu_blocks
        src_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_blocks_in_gpu_block_size = gpu_blocks
        dst_size_in_gpu_blocks = num_gpu_blocks

    # build dst -> src mapping
    dst_to_src = {}
    for src_block, dst_block in zip(src_blocks_in_gpu_block_size,
                                    dst_blocks_in_gpu_block_size):
        dst_to_src[dst_block] = src_block

    # build transfer specs
    src_specs = [src_spec_class(block_id) for block_id in src_blocks]
    dst_specs = [dst_spec_class(block_id) for block_id in dst_blocks]

    # create transfer function
    transfer_func = generate_tensors_transfer_function(src_kv_caches,
                                                       dst_kv_caches,
                                                       attn_backend,
                                                       src_block_size,
                                                       dst_block_size)

    # clone src and dst tensors before transfer
    orig_src_caches = [x.clone() for x in src_kv_caches]
    orig_dst_caches = [x.clone() for x in dst_kv_caches]

    # call transfer function
    assert transfer_func((src_specs, dst_specs)) is True

    # verify src tensors did not change
    for orig_tensor, tensor in zip(orig_src_caches, src_kv_caches):
        assert torch.equal(orig_tensor, tensor)

    # verify dst tensors
    for dst_block in range(dst_size_in_gpu_blocks):
        src_block_candidate = dst_to_src.get(dst_block)
        for src_cache, dst_cache, orig_dst_cache in zip(
                src_kv_caches, dst_kv_caches, orig_dst_caches):
            # iterate over key, value
            for i in range(2):
                if src_block_candidate is not None:
                    expected_value = src_cache[i][src_block_candidate]
                else:
                    expected_value = orig_dst_cache[i][dst_block]
                torch.testing.assert_close(dst_cache[i][dst_block].cpu(),
                                           expected_value.cpu())

vllm/v1/attention/backends/flash_attn.py

Lines changed: 13 additions & 0 deletions
@@ -86,6 +86,19 @@ def get_kv_cache_shape(
             raise ValueError("Block size must be a multiple of 16.")
         return (2, num_blocks, block_size, num_kv_heads, head_size)
 
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: torch.Tensor,
+    ) -> None:
+        src_key_cache = src_kv_cache[0]
+        dst_key_cache = dst_kv_cache[0]
+        ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
+        src_value_cache = src_kv_cache[1]
+        dst_value_cache = dst_kv_cache[1]
+        ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst)
+
     @staticmethod
     def get_kv_cache_stride_order() -> tuple[int, ...]:
         # `stride_order` indicates the permutation that gets
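The src_to_dst argument consumed by this new hook is the block-mapping tensor built by the worker code below: an int64 tensor of (source_block, destination_block) pairs, applied once to the key plane (kv_cache[0]) and once to the value plane (kv_cache[1]). As a hedged illustration of the intended semantics only (not the actual ops.swap_blocks kernel; the name swap_blocks_reference is hypothetical):

import torch

# illustrative mapping: copy source block 4 -> destination block 6, etc.
src_to_dst = torch.tensor([[4, 6], [7, 7], [9, 8]], dtype=torch.int64)


def swap_blocks_reference(src_kv_cache: torch.Tensor,
                          dst_kv_cache: torch.Tensor,
                          src_to_dst: torch.Tensor) -> None:
    # rough equivalent of the hook above: copy the listed blocks for both
    # the key plane ([0]) and the value plane ([1]), across devices
    src_ids = src_to_dst[:, 0].to(src_kv_cache.device)
    dst_ids = src_to_dst[:, 1].to(dst_kv_cache.device)
    for plane in range(2):  # 0 = keys, 1 = values
        dst_kv_cache[plane][dst_ids] = (
            src_kv_cache[plane][src_ids].to(dst_kv_cache.device))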

vllm/v1/offloading/worker/cpu.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterator

import torch

from vllm.attention import AttentionBackend
from vllm.v1.offloading.abstract import LoadStoreSpec
from vllm.v1.offloading.mediums import BlockIDLoadStoreSpec
from vllm.v1.offloading.worker.worker import TransferFunction, TransferSpec


def create_cpu_tensors(
    gpu_kv_caches: dict[str, torch.Tensor],
    gpu_block_size: int,
    cpu_block_size: int,
    num_cpu_blocks: int,
) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
    """
    Create tensors for the CPU KV cache.

    Args:
        gpu_kv_caches: The per-layer GPU KV cache tensors
        gpu_block_size: Number of tokens per GPU block
        cpu_block_size: Number of tokens per CPU block
        num_cpu_blocks: The number of CPU blocks to allocate

    Note:
        - The GPU block size must divide the CPU block size.
        - The shape of the GPU KV cache must be (2, num_blocks, ...)

    Returns:
        Matching per-layer lists of (gpu_tensors, cpu_tensors).
    """
    assert cpu_block_size % gpu_block_size == 0

    gpu_tensors = []
    cpu_tensors = []
    for gpu_tensor in gpu_kv_caches.values():
        gpu_shape = gpu_tensor.shape
        assert len(gpu_shape) >= 4  # (2, num_blocks, ..., ...)
        assert gpu_shape[0] == 2

        cpu_shape = list(gpu_shape)
        cpu_shape[1] = num_cpu_blocks * (cpu_block_size // gpu_block_size)

        gpu_tensors.append(gpu_tensor)
        cpu_tensors.append(
            torch.zeros(cpu_shape, dtype=gpu_tensor.dtype, device="cpu"))

    return gpu_tensors, cpu_tensors


def block_ids(specs_list: list[LoadStoreSpec],
              block_size_factor: int) -> Iterator[int]:
    """
    Convert a list of BlockIDLoadStoreSpec to matching block ids,
    assuming each spec is composed of block_size_factor actual blocks.

    For example, if spec_list = [0, 1, 3] and block_size_factor = 4,
    then it yields [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15],
    since 0 maps to [0, 1, 2, 3],
    1 maps to [4, 5, 6, 7],
    and 3 maps to [12, 13, 14, 15].
    """
    for spec in specs_list:
        assert isinstance(spec, BlockIDLoadStoreSpec)
        base_block_id = spec.block_id * block_size_factor
        for i in range(block_size_factor):
            yield base_block_id + i


def generate_tensors_transfer_function(
    src_tensors: list[torch.Tensor],
    dst_tensors: list[torch.Tensor],
    attn_backend: type[AttentionBackend],
    src_block_size: int,
    dst_block_size: int,
) -> TransferFunction:
    """
    Generate a function for transferring from one KV cache to another.

    Args:
        src_tensors: the per-layer tensors of the source KV cache.
        dst_tensors: the per-layer tensors of the destination KV cache.
        attn_backend: the attention backend for both caches.
        src_block_size: the block size of the source KV cache.
        dst_block_size: the block size of the destination KV cache.

    Returns:
        A function for executing transfers between the caches.

    Note: one of src_block_size, dst_block_size must divide the other.
    """
    assert len(src_tensors) == len(dst_tensors)

    min_block_size = min(src_block_size, dst_block_size)
    max_block_size = max(src_block_size, dst_block_size)
    assert max_block_size % min_block_size == 0

    src_block_size_factor = src_block_size // min_block_size
    dst_block_size_factor = dst_block_size // min_block_size

    def transfer_function(spec: TransferSpec) -> bool:
        src_blocks_specs_list, dst_blocks_specs_list = spec

        assert (len(src_blocks_specs_list) *
                src_block_size_factor == len(dst_blocks_specs_list) *
                dst_block_size_factor)

        src_to_dst_list: list[tuple[int, int]] = list(
            zip(block_ids(src_blocks_specs_list, src_block_size_factor),
                block_ids(dst_blocks_specs_list, dst_block_size_factor)))
        src_to_dst = torch.tensor(src_to_dst_list,
                                  device="cpu",
                                  dtype=torch.int64).view(-1, 2)

        # iterate over layers
        for src_tensor, dst_tensor in zip(src_tensors, dst_tensors):
            attn_backend.swap_blocks(src_tensor, dst_tensor, src_to_dst)

        # always successful
        return True

    return transfer_function
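To make the mixed block-size bookkeeping concrete, here is a small sketch of the expansion performed by block_ids() for the case the test exercises, where one CPU block spans three GPU blocks; the block ids are illustrative:

from vllm.v1.offloading.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.offloading.worker.cpu import block_ids

gpu_block_size, cpu_block_size = 16, 48

# both sides are expressed in units of the smaller (here: GPU) block size
min_block_size = min(gpu_block_size, cpu_block_size)   # 16
gpu_factor = gpu_block_size // min_block_size          # 1
cpu_factor = cpu_block_size // min_block_size          # 3

# offloading GPU blocks 4, 7, 9 into CPU block 2: each spec expands by its
# factor, and zipping the two streams yields the per-block swap mapping
gpu_ids = list(block_ids([GPULoadStoreSpec(b) for b in (4, 7, 9)],
                         gpu_factor))
cpu_ids = list(block_ids([CPULoadStoreSpec(2)], cpu_factor))
assert gpu_ids == [4, 7, 9]
assert cpu_ids == [6, 7, 8]   # CPU block 2 -> GPU-sized sub-blocks 6, 7, 8
assert list(zip(gpu_ids, cpu_ids)) == [(4, 6), (7, 7), (9, 8)]

In the same GPU-sized units, create_cpu_tensors sizes dim 1 of each CPU tensor as num_cpu_blocks * cpu_factor blocks.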
